# Initial Python Setup

In [None]:
import pandas as pd
import numpy as np
import requests as req

from matplotlib import pyplot as plot

from zipfile import ZipFile
from io import BytesIO

# Setting Variables
Here we set a few variables (the URL of the survey archive and the name of the CSV file inside it) to keep the import code tidy.

In [None]:
# Download location of the Stack Overflow Developer Survey 2022 zip archive.
# The literal is split across two lines purely for readability.
url = (
    "https://info.stackoverflowsolutions.com/rs/719-EMH-566/images/"
    "stack-overflow-developer-survey-2022.zip"
)
# Name of the CSV file we want from that archive.
file_name = "survey_results_public.csv"

# Importing the Dataset

In [None]:
# Download the survey archive. The timeout stops the cell from hanging
# forever on a stalled connection, and raise_for_status() turns an HTTP
# error (404, 403, ...) into a clear exception instead of letting an error
# page reach ZipFile and fail there with a confusing BadZipFile.
response = req.get(url, timeout=30)
response.raise_for_status()

# The archive lives entirely in memory. `zip_file` is intentionally left
# open so other members can still be read from it afterwards.
zip_file = ZipFile(BytesIO(response.content))

# Stream the CSV member straight into pandas rather than copying it into a
# second in-memory buffer first.
with zip_file.open(file_name) as csv_file:
    df = pd.read_csv(csv_file)
display(df)

# Cleaning the Data

In order to use this dataset, we will need to clean it up and interpret some of the variables

## Dummy Variables

Many of the variables in the data are multi-select lists, where the respondent can choose none, any, or all of the options. These are stored as a single string, with the selected options separated by a `;` delimiter.

We can use the string method of a dataframe column (a pandas `Series`)

```python
df["ColumnName"].str.get_dummies(';')
```

to split each of these columns up into dummy variables, one 0/1 column per option

In [None]:
# Expand each multi-select column (options joined by ";") into one 0/1
# indicator column per option, tagging every new column with a prefix that
# names its group.
# NOTE(review): the LearnCode and LearnCodeOnline groups share the prefix
# "[LearnCode] ", so their columns cannot be told apart by name in df1 —
# confirm this is intended.
dummy_languages, dummy_devtype, dummy_learncodeoffline, dummy_learncodeonline = (
    df[source_column].str.get_dummies(sep=";").add_prefix(column_prefix)
    for source_column, column_prefix in (
        ("LanguageHaveWorkedWith", "[Language] "),
        ("DevType", "[DevType] "),
        ("LearnCode", "[LearnCode] "),
        ("LearnCodeOnline", "[LearnCode] "),
    )
)

# Append all indicator columns to the right of the original survey columns.
df1 = pd.concat(
    [
        df,
        dummy_languages,
        dummy_devtype,
        dummy_learncodeoffline,
        dummy_learncodeonline,
    ],
    axis="columns",
)
display(df1)

By keeping the dummy dataframes in their own variables, we can easily get the names of all the columns in a group through the `columns` attribute

```python
dummy_learncodeoffline.columns
```

This makes it significantly easier to select the variables we want when running regressions