In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns



# Load the survey results CSV; the path is relative to the working directory.
df = pd.read_csv(filepath_or_buffer="survey_results_public.csv")

In [2]:
# Treat a missing education level as its own category. The result is assigned
# back to the column rather than calling fillna(inplace=True) on the
# df['EdLevel'] selection: the in-place form is chained assignment, which pandas
# warns about and which does not update df when Copy-on-Write is active.
df['EdLevel'] = df['EdLevel'].fillna('Something else')

In [3]:
 # Remove columns that are not used later. Rebinding df to the returned frame
 # (instead of inplace=True) keeps the step explicit and chain-friendly.
 df = df.drop(columns=['US_State', 'UK_Country', 'ResponseId', 'CompTotal'])

In [4]:
# Shorten the long-form UK label to 'United Kingdom'.
df['Country'] = df['Country'].replace(
    {'United Kingdom of Great Britain and Northern Ireland': 'United Kingdom'}
)

In [5]:
# Give the yearly converted compensation column a shorter name.
df = df.rename(columns={"ConvertedCompYearly": "Salary"})

In [6]:
# Keep only the rows whose Country is one of the names listed here.
kept_countries = [
    'Slovakia', 'Netherlands', 'Russian Federation', 'Austria', 'United Kingdom',
    'Sweden', 'Spain', 'Germany', 'France', 'Switzerland',
    'Poland', 'Ukraine', 'Portugal', 'Italy', 'Bulgaria',
    'Greece', 'Ireland', 'Hungary', 'Belgium', 'Albania',
    'Romania', 'Lithuania', 'Slovenia', 'Croatia', 'Czech Republic',
    'Denmark', 'Serbia', 'Estonia', 'Finland', 'Bosnia and Herzegovina',
    'Norway', 'Belarus', 'Luxembourg', 'Malta', 'Cyprus',
    'Latvia', 'Iceland', 'Republic of Moldova', 'Montenegro', 'Monaco',
    'Liechtenstein',
]
df = df.loc[df['Country'].isin(kept_countries)]

In [7]:
# Narrow the frame to the columns used from here on, in this order.
selected_columns = ['EdLevel', 'Country', 'YearsCodePro', "Employment", 'Age', 'Age1stCode', 'Salary']
df = df.loc[:, selected_columns]

In [8]:
# Keep only respondents whose Employment value is exactly "Employed full-time".
df = df.loc[df["Employment"].eq("Employed full-time")]

In [9]:
# Discard every row that has a missing value in any remaining column.
df = df.dropna(axis=0, how="any")

In [10]:
# Keep salaries within [10000, 250000]; both bounds are inclusive.
df = df[df["Salary"].between(10000, 250000)]

In [11]:
def experience(x):
    """Convert a years-of-experience label into a number.

    The two textual buckets are mapped to fixed values
    ('Less than 1 year' -> 0.5, 'More than 50 years' -> 55); anything else
    is passed to ``float`` and therefore must be float-convertible.

    Raises:
        ValueError / TypeError: propagated from ``float`` when ``x`` is
        neither one of the two buckets nor convertible to a float.
    """
    if x == 'Less than 1 year':
        return 0.5
    return 55 if x == 'More than 50 years' else float(x)

# Convert every YearsCodePro label to its numeric value, element by element.
df['YearsCodePro'] = df['YearsCodePro'].map(experience)

In [12]:
def education(x):
    """Collapse an education-level label into one of four coarse categories.

    Matching is by substring, checked in this order:
      * contains 'Bachelor’s degree'  -> 'Bachelor’s degree'
      * contains 'Master’s degree'    -> 'Master’s degree'
      * contains 'Professional degree' or 'Other doctoral degree' -> 'Post Grad'
      * anything else                 -> 'Less than a Bachelors'

    The function is idempotent: each of its four outputs maps to itself, so
    applying it a second time (e.g. re-running the notebook cell) leaves the
    column unchanged instead of relabelling 'Post Grad' as
    'Less than a Bachelors'.

    Args:
        x: the label to classify; must support the ``in`` operator (a string).

    Returns:
        One of the four category strings listed above.

    Raises:
        TypeError: if ``x`` does not support substring tests (e.g. a float NaN).
    """
    # 'Post Grad' contains none of the substrings below, so without this guard
    # an already-collapsed value would fall through to the default category.
    if x == 'Post Grad':
        return 'Post Grad'
    if 'Bachelor’s degree' in x:
        return 'Bachelor’s degree'
    if 'Master’s degree' in x:
        return 'Master’s degree'
    if 'Professional degree' in x or 'Other doctoral degree' in x:
        return 'Post Grad'
    return 'Less than a Bachelors'

# Collapse every EdLevel label into its coarse category, element by element.
df["EdLevel"] = df['EdLevel'].map(education)

In [13]:
# Employment and Age1stCode are not used as model features; remove both at once.
df = df.drop(columns=["Employment", 'Age1stCode'])

In [14]:
from sklearn.preprocessing import LabelEncoder

In [15]:
# Integer-encode the education categories. The fitted encoder is kept in le_ed
# so that new inputs can later be encoded with the same mapping.
le_ed = LabelEncoder().fit(df['EdLevel'])
df['EdLevel'] = le_ed.transform(df['EdLevel'])

In [16]:
# Integer-encode the Age labels; le_age keeps the fitted label-to-code mapping.
le_age = LabelEncoder().fit(df['Age'])
df['Age'] = le_age.transform(df['Age'])

In [17]:
# Integer-encode the country names; le_coun keeps the fitted name-to-code mapping.
le_coun = LabelEncoder().fit(df['Country'])
df['Country'] = le_coun.transform(df['Country'])

In [18]:
# Salary is the regression target; every other remaining column is a feature.
y = df["Salary"]
X = df.drop(columns=["Salary"])

In [19]:
from sklearn.model_selection import train_test_split

In [20]:
from sklearn.metrics import mean_squared_error, r2_score
from math import sqrt

In [21]:
# Hold out 20% of the rows for evaluation. The split is seeded so that the
# train/test partition, and therefore the metrics computed from it, are the
# same on every run of the notebook.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [22]:
from sklearn.ensemble import RandomForestRegressor

In [23]:
# Configure the forest: 600 bootstrapped trees, each limited to depth 150 and
# considering sqrt(n_features) candidates per split; random_state fixes the seed.
rfr1 = RandomForestRegressor(
    n_estimators=600,
    max_depth=150,
    max_features='sqrt',
    bootstrap=True,
    random_state=42,
)

# Train on the underlying NumPy arrays rather than the pandas objects.
rfr1.fit(X_train.values, y_train.values)

In [27]:
# Predict salaries for the held-out rows, passing a plain array as at fit time.
pred_rfr = rfr1.predict(X_test.to_numpy())

In [28]:
# Report the hold-out error (root of the mean squared error) and R², rounded to 2 decimals.
rmse = sqrt(mean_squared_error(y_test, pred_rfr))
print(f"RMSE: {round(rmse, 2)}")
r_squared = r2_score(y_test, pred_rfr)
print(f"R_squared: {round(r_squared, 2)}")

RMSE: 26342.26
R_squared: 0.34


In [29]:
# One hand-written sample in the feature order of X: EdLevel, Country, YearsCodePro, Age.
# Mixing strings with the integer 10 makes NumPy build a string-typed array.
sample_respondent = ["Master’s degree", "Switzerland", 10, "35-44 years old"]
Xn = np.array([sample_respondent])

In [30]:
# Display the sample; every entry, including the number 10, is stored as a string here.
Xn

array([['Master’s degree', 'Switzerland', '10', '35-44 years old']],
      dtype='<U21')

In [31]:
# Replace each categorical column of the sample with the integer code produced
# by its fitted encoder (column 0: education, 1: country, 3: age), then convert
# the whole string-typed array to floats for the model.
for col, encoder in ((0, le_ed), (1, le_coun), (3, le_age)):
    Xn[:, col] = encoder.transform(Xn[:, col])
Xn = Xn.astype(float)
Xn

array([[ 2., 37., 10.,  2.]])

In [32]:
# Predict the salary for the encoded sample with the trained forest and show it.
y_pred_k = rfr1.predict(X=Xn)
y_pred_k

array([118156.86481926])

In [33]:
import pickle

In [34]:
# Bundle the trained model with the three fitted encoders and pickle them
# together, so a consumer can encode new inputs the same way before predicting.
rfmod = dict(model=rfr1, le_coun=le_coun, le_ed=le_ed, le_age=le_age)
with open('salary_mod1.pkl', mode='wb') as file1:
    pickle.dump(rfmod, file1)

In [35]:
# Read the bundle back from disk.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open('salary_mod1.pkl', mode='rb') as file1:
    rfmod = pickle.load(file1)

# Unpack the reloaded model and encoders into the names used for prediction.
rf, le_coun, le_ed, le_age = (
    rfmod[key] for key in ("model", "le_coun", "le_ed", "le_age")
)

In [36]:
# Round-trip check: predict with the model that was reloaded from the pickle
# (rf), not the in-memory original (rfr1); using rfr1 here would say nothing
# about whether the saved file works.
val_test = rf.predict(Xn)
val_test

array([118156.86481926])