In [None]:
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# ignore possible warnings
import warnings
warnings.filterwarnings('ignore')

In [None]:
df = pd.read_csv('datasets/survey_results_public.csv')

In [None]:
df.head()

In [None]:
df.shape

In [None]:
# Keep only the columns used by the salary model and give the compensation
# column a shorter name. (An earlier variant of this cell also kept 'RemoteWork'.)
df = df[['Country', 'EdLevel', 'YearsCode', 'Employment', 'ConvertedCompYearly']].rename(
    columns={'ConvertedCompYearly': 'Salary'}
)

In [None]:
df

In [None]:
# null counts in each column
df.isnull().sum()

In [None]:
#drop based on NaN salary                                  # NO NEED
# df = df[df['Salary'].notnull()]

In [None]:
# df

In [None]:
# Drop every row that has a NaN in any of the selected columns.
# Rebinding (instead of inplace=True) leaves the previous frame untouched and
# keeps this cell safe to chain or re-run.
df = df.dropna()

In [None]:
df

In [None]:
df.info()

In [None]:
df.isnull().sum()

In [None]:
df['Employment'].unique()

In [None]:
# Restrict the data to respondents whose Employment is exactly 'Employed, full-time'.
df = df.loc[df['Employment'].eq('Employed, full-time')]

In [None]:
df

In [None]:
# Employment now holds a single value for every row, so it is removed
# before the remaining columns are used as model features.
df = df.drop(columns=['Employment'])

In [None]:
df

In [None]:
df.info()

In [None]:
df['Country'].value_counts()

In [None]:
# Pool rarely occurring categories (e.g. countries with few respondents) under
# one label, since very small groups may cause problems for the algorithm.
def shorten_categories(categories, cutoff):
    """Build a mapping that keeps frequent categories and pools the rest.

    Parameters
    ----------
    categories : pandas.Series
        Counts indexed by category label, e.g. the result of ``value_counts()``.
    cutoff : int or float
        Minimum count a category needs in order to keep its own label.

    Returns
    -------
    dict
        Maps every label in ``categories.index`` either to itself (count is at
        least ``cutoff``) or to the literal string ``'other'``.
    """
    return {
        label: (label if count >= cutoff else 'other')
        for label, count in zip(categories.index, categories.values)
    }

In [None]:
# Countries with fewer than 300 rows are pooled under a single label;
# the last line shows the resulting per-country row counts.
country_map = shorten_categories(df['Country'].value_counts(), cutoff=300)
df = df.assign(Country=df['Country'].map(country_map))
df['Country'].value_counts()

In [None]:
# Salary distribution per country, before any salary filtering.
fig, ax = plt.subplots(figsize=(12, 7))
df.boxplot(column='Salary', by='Country', ax=ax)
fig.suptitle('Salary (US$) v Country')
ax.set_title('')  # clear the per-axes title that boxplot adds
ax.set_ylabel('Salary')
ax.tick_params(axis='x', labelrotation=90)
plt.show()

In [None]:
# Keep salaries in the inclusive range [10000, 150000] and drop the pooled
# low-count countries. shorten_categories labels that bucket 'other'
# (lowercase); the previous comparison against 'Other' never matched, so the
# bucket was silently kept in the training data.
salary_in_range = df['Salary'].between(10000, 150000)
named_country = df['Country'] != 'other'
# .copy() makes the result an independent frame, so the column assignments in
# later cells do not operate on a slice of the unfiltered data.
df = df[salary_in_range & named_country].copy()

In [None]:
df

In [None]:
# Same plot as before, redrawn after the salary/country filtering above.
fig, ax = plt.subplots(figsize=(12, 7))
df.boxplot(column='Salary', by='Country', ax=ax)
fig.suptitle('Salary (US$) v Country')
ax.set_title('')  # clear the per-axes title that boxplot adds
ax.set_ylabel('Salary')
ax.tick_params(axis='x', labelrotation=90)
plt.show()

In [None]:
df['YearsCode'].unique()

In [None]:
# Turn a YearsCode answer into a number: the two textual answers get fixed
# values ('Less than 1 year' -> 0.5, 'More than 50 years' -> 50).
def clean_experience(x):
    """Convert a YearsCode survey answer to a numeric number of years.

    Parameters
    ----------
    x : str
        Either one of the two textual answers or a value ``float()`` can parse.

    Returns
    -------
    int or float
        50 for 'More than 50 years', 0.5 for 'Less than 1 year',
        otherwise ``float(x)``.
    """
    textual_answers = {
        'More than 50 years': 50,
        'Less than 1 year': 0.5,
    }
    if x in textual_answers:
        return textual_answers[x]
    return float(x)

# Convert every YearsCode answer to a number, element by element.
df['YearsCode'] = df['YearsCode'].map(clean_experience)

In [None]:
df['YearsCode'].unique()

In [None]:
df['EdLevel'].unique()

In [None]:
# Collapse the detailed education answers into four coarse groups.
def clean_education(x):
    """Map a raw EdLevel answer to one of four education groups.

    The checks are substring matches applied in order: Master's, then
    Bachelor's, then professional / other doctoral degrees ('Post grad').
    Anything else becomes 'Less than Bachelors'.

    Parameters
    ----------
    x : str
        Raw education-level answer.

    Returns
    -------
    str
        'Master’s degree', 'Bachelor’s degree', 'Post grad' or
        'Less than Bachelors'.
    """
    for degree in ('Master’s degree', 'Bachelor’s degree'):
        if degree in x:
            return degree
    if any(marker in x for marker in ('Professional degree', 'Other doctoral')):
        return 'Post grad'
    return 'Less than Bachelors'

# Replace every raw EdLevel answer with its coarse education group.
df['EdLevel'] = df['EdLevel'].map(clean_education)

In [None]:
df['EdLevel'].unique()

## Encoding education and country as integers (one unique integer code per category)

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Replace the education group names with integer codes.
# The fitted encoder is kept in le_education because it is reused later to
# encode new inputs and is saved together with the model.
le_education = LabelEncoder()
df['EdLevel'] = le_education.fit_transform(df['EdLevel'])
# Show the distinct codes that were produced.
df['EdLevel'].unique()

In [None]:
# Replace the country names with integer codes.
# The fitted encoder is kept in le_country because it is reused later to
# encode new inputs and is saved together with the model.
le_country = LabelEncoder()
df['Country'] = le_country.fit_transform(df['Country'])
# Show the distinct codes that were produced.
df['Country'].unique()

In [None]:
df

# train the model

In [None]:
# Features: every remaining column except the target. Target: Salary.
X = df.drop(columns=['Salary'])
y = df['Salary']

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# First model: a linear regression fitted on all rows of X.
# No hold-out split is made, so errors computed below are training-set errors.
linear_reg = LinearRegression()
linear_reg.fit(X, y.values)

In [None]:
y_pred = linear_reg.predict(X)

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np

In [None]:
error = np.sqrt(mean_squared_error(y,y_pred))

In [None]:
error

### Second model

In [None]:
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Second model: a decision tree fitted on all rows of X.
# random_state is fixed so repeated runs build the same tree.
dec_tree_reg = DecisionTreeRegressor(random_state = 0)
dec_tree_reg.fit(X,y.values)

In [None]:
y_pred = dec_tree_reg.predict(X)

In [None]:
# Root-mean-squared error of the current predictions against y (the same rows
# the model was fitted on), printed as a dollar amount.
error = np.sqrt(mean_squared_error(y, y_pred))
print(f"${error:,.02f}")

## third model

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Third model: a random forest fitted on all rows of X.
# random_state is fixed so repeated runs build the same forest.
random_forest_reg = RandomForestRegressor(random_state = 0)
random_forest_reg.fit(X,y.values)

In [None]:
y_pred = random_forest_reg.predict(X)

In [None]:
# Root-mean-squared error of the current predictions against y (the same rows
# the model was fitted on), printed as a dollar amount.
error = np.sqrt(mean_squared_error(y, y_pred))
print(f"${error:,.02f}")

## Fourth model

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Candidate tree depths to compare; None is scikit-learn's "no depth limit" setting.
max_depth = [None, 2, 4, 6, 8, 10, 12]
parameters = {"max_depth" : max_depth}

# Cross-validated search over max_depth for a decision tree, scored by
# negated mean squared error (the search picks the highest score).
regressor = DecisionTreeRegressor(random_state = 0)
gs = GridSearchCV(regressor, parameters, scoring = 'neg_mean_squared_error')
gs.fit(X, y.values)

In [None]:
# GridSearchCV was created with its default refit=True, so best_estimator_ is
# already fitted on the full (X, y) with the winning max_depth. The former
# extra regressor.fit(X, y.values) only rebuilt the same tree and is dropped.
regressor = gs.best_estimator_

# RMSE on the rows the model was fitted on (an optimistic estimate),
# printed as a dollar amount.
y_pred = regressor.predict(X)
error = np.sqrt(mean_squared_error(y, y_pred))
print("${:,.02f}".format(error))

In [None]:
X

In [None]:
# One hand-written sample in the training column order: country, edlevel, yearscode.
# Mixing str and int makes np.array build an array of strings (15 becomes '15').
# NOTE: this rebinds X, so the training feature matrix is no longer available
# under that name after this cell; rebuild X before refitting any model.
X = np.array([['United States of America','Master’s degree',15]])
X

In [None]:
# Replace the country and education labels of the sample with the integer
# codes of the fitted encoders; the codes are written back into the same array.
# NOTE: not re-runnable as-is, because after the first run X holds codes
# rather than the original labels.
X[:, 0] = le_country.transform(X[:, 0])
X[:, 1] = le_education.transform(X[:, 1])
# Cast the whole array to float before passing it to predict().
X = X.astype(float)
X

In [None]:
y_pred = regressor.predict(X)
y_pred

# Extracting (saving) the model

In [None]:
import pickle

In [None]:
# Bundle the tuned regressor with both fitted label encoders and write the
# bundle to saved_steps.pkl, so a prediction can later be made from raw labels.
data = dict(model=regressor, le_country=le_country, le_education=le_education)
with open('saved_steps.pkl', mode='wb') as file:
    pickle.dump(data, file)

In [None]:
# Read the bundle back from disk to check that it round-trips.
# NOTE: pickle.load can execute arbitrary code; only load files you created
# yourself or otherwise trust.
with open('saved_steps.pkl','rb') as file:
    data = pickle.load(file)

# Unpack the bundle. le_country and le_education are rebound to the
# unpickled encoders; the model gets its own name, regressor_loaded.
regressor_loaded = data['model']
le_country = data['le_country']
le_education = data['le_education']

In [None]:
y_pred = regressor_loaded.predict(X)
y_pred