In [1]:
import pandas as pd
import json

In [51]:
with open('salaryData.json') as f:
    data = json.load(f)

dataset = pd.DataFrame(data)

# columns not used in model
columns = ['timestamp', 'otherdetails', 'dmaid', 'rowNumber', 
           'yearsatcompany', 'company', 'gender', 'cityid',
           'level', 'tag', 'bonus', 'basesalary', 'stockgrantvalue'] # TODO: consider adding these to model
dataset.drop(columns, inplace=True, axis=1)
dataset

Unnamed: 0,title,totalyearlycompensation,location,yearsofexperience
0,Product Manager,127,"Redwood City, CA",1.5
1,Software Engineer,100,"San Francisco, CA",5
2,Product Manager,310,"Seattle, WA",8
3,Software Engineering Manager,200,"Redmond, WA",9
4,Software Engineer,173,"Vancouver, BC, Canada",11
...,...,...,...,...
33718,Software Engineer,197,"London, EN, United Kingdom",5
33719,Software Engineer,250,"Austin, TX",5
33720,Product Designer,260,"Mountain View, CA",5
33721,Hardware Engineer,260,"Austin, TX",16


In [64]:
# filter out cities with less than 100 datapoints
dataset_top_cities = dataset.groupby('location').filter(lambda x: len(x) >= 100)

In [102]:
# categorical data
# categorical_cols = ['title', 'location'] 

# one hot encoding
# dataset_encoded = pd.get_dummies(dataset_top_cities, columns = categorical_cols, drop_first = True)
# dataset_encoded

In [147]:
# multivariate linear regression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder

# independent and dependent variables
features = ['yearsofexperience', 'location', 'title']
target = 'totalyearlycompensation'

X = dataset_top_cities[features]
Y = dataset_top_cities[target]

In [148]:
e = OneHotEncoder(drop='first')

X = e.fit_transform(X).toarray()

# 80/20 split- 20% training data
X_train, X_test, Y_train, Y_test = train_test_split( X, Y, test_size=0.2, random_state=0)

In [149]:
# define model I am using
model = LinearRegression()

In [152]:
# training process
model.fit(X_train,Y_train)

LinearRegression()

In [153]:
# mean absolute value for training data
data = Y_train
predict = model.predict(X_train)
training_error = mean_absolute_error(data, predict)

In [154]:
# mean absolute value for test data
test_data = Y_test
predict_data = model.predict(X_test)
test_data_error = mean_absolute_error(data, predict)

In [155]:
# we need some metric to measure the accuracy of our regression model
from sklearn.metrics import r2_score

# on training data
true_value = Y_train
predicted_val = model.predict(X_train)
accuracy = r2_score(true_value, predicted_val)

In [157]:
# on test data
true_value2 = Y_test
predicted_val2 = model.predict(X_test)
accuracy2 = r2_score(true_value2, predicted_val2)

In [158]:
print('This model accounts for {}% of the training data with mean data error of {}'.format(round(accuracy2*100,2), round(training_error,2)))
print('This model accounts for {}% of the testing data with mean data error of {}'.format(round(accuracy*100,2), round(test_data_error,2)))

This model accounts for -1.8218516691429687e+20% of the training data with mean data error of 66.42
This model accounts for 39.49% of the testing data with mean data error of 66.42
