# Initial Python Setup

In [None]:
import pandas as pd
import requests as req
import os

from matplotlib import pyplot as plt

from zipfile import ZipFile
from io import BytesIO
from IPython.display import display

# Setting Variables
Here we set a few variables (the download URL, the CSV file name, etc.) and pandas display options to keep the import code tidy.

In [None]:
# Download location of the survey archive and the name of the CSV member to read from it.
url = "https://info.stackoverflowsolutions.com/rs/719-EMH-566/images/stack-overflow-developer-survey-2022.zip"
file_name = "survey_results_public.csv"

# Raise pandas' display limits so wide frames are not elided in cell output.
for option_name, option_value in (("display.max_columns", 500), ("display.max_rows", 75)):
    pd.set_option(option_name, option_value)

# Importing the Dataset

In [None]:
# Download the survey archive. Fail loudly on an HTTP error instead of handing an
# error page to ZipFile, and bound the wait so a stalled connection cannot hang the kernel.
response = req.get(url, timeout=60)
response.raise_for_status()

# Read the CSV member straight out of the in-memory archive; the context managers
# close both the archive and the member handle once the frame has been loaded.
with ZipFile(BytesIO(response.content)) as zip_file, zip_file.open(file_name) as csv_member:
    df = pd.read_csv(csv_member)

display(df)

In [None]:
# Preview the compensation-related columns side by side.
comp_columns = ["ConvertedCompYearly", "Currency", "CompTotal", "CompFreq"]
display(df[comp_columns])

# Plot ConvertedCompYearly for every row against the frame's index.
# Explicit labels let the figure stand on its own, and plt.show() keeps the
# Line2D repr out of the cell output.
fig, ax = plt.subplots()
ax.plot(df["ConvertedCompYearly"])
ax.set(
    title="ConvertedCompYearly (all rows)",
    xlabel="Row index",
    ylabel="ConvertedCompYearly",
)
plt.show()

# Cleaning the Data

In order to use this dataset, we will need to clean it up and interpret some of the variables.

## Dummy Variables

Many of the variables in the data are multi-select lists, where the user can choose none, any, or all of the options. These are stored as a single string separated by a `;` delimiter.

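We can use the pandas string method `Series.str.get_dummies`, called on a single column,

```python
df["column_name"].str.get_dummies(';')
```

to split these up into dummy variables (one indicator column per option).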

In [None]:
# Expand each ';'-delimited multi-select column into one indicator column per option.
# Prefixes label the source question for the learn-from and dev-type dummies.
dummy_languages = df["LanguageHaveWorkedWith"].str.get_dummies(';')
dummy_learncodeoffline = df["LearnCode"].str.get_dummies(";").add_prefix("Learn From ")
dummy_learncodeonline = df["LearnCodeOnline"].str.get_dummies(";").add_prefix("Learn From ")
dummy_devtype = df["DevType"].str.get_dummies(";").add_prefix("Dev Type: ")

dummy_frames = [dummy_languages, dummy_learncodeoffline, dummy_learncodeonline, dummy_devtype]
dummy_columns = [name for frame in dummy_frames for name in frame.columns]

# Drop dummy columns left behind by an earlier execution before concatenating, so that
# re-running this cell does not keep appending duplicate columns to df.
# NOTE(review): assumes no original survey column shares a name with a generated dummy column.
df = pd.concat([df.drop(columns=dummy_columns, errors="ignore"), *dummy_frames], axis=1)
display(df)

In [None]:
# Show only the "Learn From ..." indicator columns: offline sources first, then online ones.
learn_columns = [*dummy_learncodeoffline.columns, *dummy_learncodeonline.columns]
display(df[learn_columns])

## Looking at when any of the currency variables are NaN or not NaN

In [None]:
# Rows where the converted yearly compensation is missing although a raw total was reported.
# Both conditions are combined into one mask and applied with .loc: chaining
# df[mask_a][mask_b] applies a mask built on the full frame to an already-filtered
# frame, which makes pandas reindex the mask and emit a UserWarning.
missing_converted = df["ConvertedCompYearly"].isna() & df["CompTotal"].notna()
display(df.loc[missing_converted, ["ConvertedCompYearly", "Currency", "CompTotal", "CompFreq"]])

In [None]:
# Rows that do have a converted yearly compensation value, limited to the compensation columns.
has_converted = df["ConvertedCompYearly"].notna()
display(df.loc[has_converted, ["ConvertedCompYearly", "Currency", "CompTotal", "CompFreq"]])

In [None]:
# Rows paid in US dollars that have a converted yearly compensation value.
# A single combined mask with .loc replaces the chained df[mask_a][mask_b] form,
# whose second mask no longer matches the filtered frame's index and triggers
# pandas' "Boolean Series key will be reindexed" warning.
usd_with_converted = df["ConvertedCompYearly"].notna() & (df["Currency"] == "USD\tUnited States dollar")
display(df.loc[usd_with_converted, ["ConvertedCompYearly", "Currency", "CompTotal", "CompFreq"]])

## Filtering to United States


In [None]:
# Keep only the rows whose Country is the United States.
in_united_states = df["Country"].eq("United States of America")
df1 = df.loc[in_united_states]
display(df1)

In [None]:
# United States rows that report a currency other than the US dollar.
reports_other_currency = df1["Currency"].ne("USD\tUnited States dollar") & df1["Currency"].notna()
display(df1.loc[reports_other_currency, ["Currency", "ConvertedCompYearly", "CompFreq"]])

## Filtering to only USD

In [None]:
# Narrow the United States rows further to those reporting compensation in US dollars.
paid_in_usd = df1["Currency"].eq("USD\tUnited States dollar")
df2 = df1.loc[paid_in_usd]

usd_preview_columns = ["Currency", "ConvertedCompYearly", "CompFreq", "CompTotal"]
display(df2[usd_preview_columns])

In [None]:
# USD rows that reported both a total and a frequency but still lack a converted yearly value.
unconverted_despite_inputs = (
    df2["ConvertedCompYearly"].isna()
    & df2["CompTotal"].notna()
    & df2["CompFreq"].notna()
)
display(df2.loc[unconverted_despite_inputs, ["Currency", "ConvertedCompYearly", "CompFreq", "CompTotal"]])

# Not filtering for USD, but filtering to United States where ConvertedCompYearly is not NaN

In [None]:
# Keep only the United States rows (df1) that have a ConvertedCompYearly value;
# rows where it is NaN are dropped.
df3 = df1[df1["ConvertedCompYearly"].notna()]

# Preview the compensation and education columns together with the language and dev-type dummies.
display(df3[["ConvertedCompYearly","EdLevel"] + list(dummy_languages.columns) + list(dummy_devtype.columns)])

# Plot the remaining compensation values against the frame's index. Explicit labels let
# the figure stand on its own, and plt.show() keeps the Line2D repr out of the cell output.
fig, ax = plt.subplots()
ax.plot(df3["ConvertedCompYearly"])
ax.set(
    title="ConvertedCompYearly (United States rows, non-missing)",
    xlabel="Row index",
    ylabel="ConvertedCompYearly",
)
plt.show()

In [None]:
# Disabled export step (kept for reference; nothing in this cell executes).
# If uncommented, it would create a "tmp" directory when one does not exist and write
# the ConvertedCompYearly, EdLevel, language-dummy and dev-type-dummy columns of df3
# to tmp/data_out.csv.
# if not os.path.exists("tmp"):
#  os.mkdir("tmp")
# df_out = df3[["ConvertedCompYearly","EdLevel"] + list(dummy_languages.columns) + list(dummy_devtype.columns)]
#df_out.to_csv(os.path.join("tmp","data_out.csv"))