## Imports and CSV Reading

In [503]:
# Import necessary libraries
import math
import statistics as stat
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import hvplot.pandas
from statsmodels.regression.linear_model import OLS
from statsmodels.tools import add_constant
from sklearn import linear_model
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.metrics import accuracy_score
from mlxtend.feature_selection import SequentialFeatureSelector

# Load the dataset from a CSV file, using a path relative to the notebook's location
file_path = "../Data/Latest_Data_Science_Salaries.csv"
df = pd.read_csv(file_path)

# Display the first few rows of the dataset to check that it loaded as expected
df.head()

Unnamed: 0,Job Title,Employment Type,Experience Level,Expertise Level,Salary,Salary Currency,Company Location,Salary in USD,Employee Residence,Company Size,Year
0,Data Engineer,Full-Time,Senior,Expert,210000,United States Dollar,United States,210000,United States,Medium,2023
1,Data Engineer,Full-Time,Senior,Expert,165000,United States Dollar,United States,165000,United States,Medium,2023
2,Data Engineer,Full-Time,Senior,Expert,185900,United States Dollar,United States,185900,United States,Medium,2023
3,Data Engineer,Full-Time,Senior,Expert,129300,United States Dollar,United States,129300,United States,Medium,2023
4,Data Scientist,Full-Time,Senior,Expert,140000,United States Dollar,United States,140000,United States,Medium,2023


## Clean the Data

In [633]:
# Keep only full-time employees who reside in the United States.
# The location filter is applied to Employee Residence, not Company Location,
# so rows whose company is located outside the United States can remain.
is_full_time = df['Employment Type'] == 'Full-Time'
is_us_resident = df['Employee Residence'] == 'United States'
new_df = df[is_full_time & is_us_resident].reset_index(drop = True)
new_df

Unnamed: 0,Job Title,Employment Type,Experience Level,Expertise Level,Salary,Salary Currency,Company Location,Salary in USD,Employee Residence,Company Size,Year
0,Data Engineer,Full-Time,Senior,Expert,210000,United States Dollar,United States,210000,United States,Medium,2023
1,Data Engineer,Full-Time,Senior,Expert,165000,United States Dollar,United States,165000,United States,Medium,2023
2,Data Engineer,Full-Time,Senior,Expert,185900,United States Dollar,United States,185900,United States,Medium,2023
3,Data Engineer,Full-Time,Senior,Expert,129300,United States Dollar,United States,129300,United States,Medium,2023
4,Data Scientist,Full-Time,Senior,Expert,140000,United States Dollar,United States,140000,United States,Medium,2023
...,...,...,...,...,...,...,...,...,...,...,...
2437,Applied Machine Learning Scientist,Full-Time,Mid,Intermediate,423000,United States Dollar,United States,423000,United States,Large,2021
2438,Data Specialist,Full-Time,Senior,Expert,165000,United States Dollar,United States,165000,United States,Large,2021
2439,Data Scientist,Full-Time,Senior,Expert,412000,United States Dollar,United States,412000,United States,Large,2020
2440,Principal Data Scientist,Full-Time,Mid,Intermediate,151000,United States Dollar,United States,151000,United States,Large,2021


In [634]:
# Drop the columns that are not used as model inputs: Employment Type, Expertise Level,
# Salary Currency, Employee Residence, and Salary in USD
# NOTE(review): 'Salary' is kept as the target while 'Salary in USD' is dropped; if any
# remaining row has a non-USD Salary Currency the target mixes currencies - confirm intended
df_clean = new_df.drop(columns = ['Employment Type', 'Expertise Level', 'Salary Currency', 'Employee Residence', 'Salary in USD'])
df_clean

Unnamed: 0,Job Title,Experience Level,Salary,Company Location,Company Size,Year
0,Data Engineer,Senior,210000,United States,Medium,2023
1,Data Engineer,Senior,165000,United States,Medium,2023
2,Data Engineer,Senior,185900,United States,Medium,2023
3,Data Engineer,Senior,129300,United States,Medium,2023
4,Data Scientist,Senior,140000,United States,Medium,2023
...,...,...,...,...,...,...
2437,Applied Machine Learning Scientist,Mid,423000,United States,Large,2021
2438,Data Specialist,Senior,165000,United States,Large,2021
2439,Data Scientist,Senior,412000,United States,Large,2020
2440,Principal Data Scientist,Mid,151000,United States,Large,2021


In [635]:
# Count rows by whether the company is located in the United States
# (True = company in the United States, False = company located elsewhere)
df_clean['Company Location'].eq('United States').value_counts()

True     2437
False       5
Name: Company Location, dtype: int64

In [636]:
# Count the rows of the raw data for employees who reside in the United States but do
# not work full-time (the True count is the number of such rows)
not_full_time = df['Employment Type'].ne('Full-Time')
resides_in_us = df['Employee Residence'].eq('United States')
(not_full_time & resides_in_us).value_counts()

False    3289
True       11
dtype: int64

In [637]:
# Store the number of rows for each job title (value_counts sorts from most to least
# frequent) and display the 20 most common titles
titles_counts = df_clean['Job Title'].value_counts()
titles_counts.head(20)

Data Engineer                     554
Data Scientist                    463
Data Analyst                      355
Machine Learning Engineer         214
Analytics Engineer                108
Research Scientist                 80
Data Architect                     76
Data Science Manager               53
ML Engineer                        48
Applied Scientist                  47
Research Engineer                  46
Machine Learning Scientist         35
Data Manager                       27
Data Analytics Manager             21
Business Intelligence Engineer     20
Data Specialist                    18
BI Developer                       16
Data Science Consultant            16
BI Analyst                         13
Computer Vision Engineer           11
Name: Job Title, dtype: int64

In [638]:
# Bin rare job titles: every title that appears in fewer than `cutoff_value` rows
# is relabelled "Other"
cutoff_value = 50

titles_to_replace = titles_counts[titles_counts < cutoff_value].index.tolist()

# Relabel all rare titles in a single vectorised pass rather than running one
# full-column replace per title
is_rare_title = df_clean['Job Title'].isin(titles_to_replace)
df_clean['Job Title'] = df_clean['Job Title'].mask(is_rare_title, "Other")

# Check to make sure binning was successful
df_clean['Job Title'].value_counts()

Data Engineer                554
Other                        539
Data Scientist               463
Data Analyst                 355
Machine Learning Engineer    214
Analytics Engineer           108
Research Scientist            80
Data Architect                76
Data Science Manager          53
Name: Job Title, dtype: int64

In [639]:
# Count the rows for each company location to see how many rows belong to companies
# located outside of the United States
location_counts = df_clean['Company Location'].value_counts()
location_counts

United States    2437
Japan               1
Australia           1
Germany             1
Canada              1
France              1
Name: Company Location, dtype: int64

In [640]:
# Bin rare company locations: every location that appears in fewer than `cutoff_value`
# rows is relabelled "Other"
cutoff_value = 1000

location_to_replace = location_counts[location_counts < cutoff_value].index.tolist()

# Relabel all rare locations in a single vectorised pass rather than running one
# full-column replace per location
is_rare_location = df_clean['Company Location'].isin(location_to_replace)
df_clean['Company Location'] = df_clean['Company Location'].mask(is_rare_location, "Other")

# Check to make sure binning was successful
df_clean['Company Location'].value_counts()

United States    2437
Other               5
Name: Company Location, dtype: int64

## Encoding Qualitative Variables

In [641]:
# Display the cleaned dataframe (job titles and company locations binned) before encoding
df_clean

Unnamed: 0,Job Title,Experience Level,Salary,Company Location,Company Size,Year
0,Data Engineer,Senior,210000,United States,Medium,2023
1,Data Engineer,Senior,165000,United States,Medium,2023
2,Data Engineer,Senior,185900,United States,Medium,2023
3,Data Engineer,Senior,129300,United States,Medium,2023
4,Data Scientist,Senior,140000,United States,Medium,2023
...,...,...,...,...,...,...
2437,Other,Mid,423000,United States,Large,2021
2438,Other,Senior,165000,United States,Large,2021
2439,Data Scientist,Senior,412000,United States,Large,2020
2440,Other,Mid,151000,United States,Large,2021


In [642]:
# Display value counts of job titles to list the categories that need a numeric code
df_clean['Job Title'].value_counts()

Data Engineer                554
Other                        539
Data Scientist               463
Data Analyst                 355
Machine Learning Engineer    214
Analytics Engineer           108
Research Scientist            80
Data Architect                76
Data Science Manager          53
Name: Job Title, dtype: int64

In [643]:
# Code job titles into numeric values, one integer per category
# NOTE(review): Job Title is a nominal category, so a single integer-coded column imposes
# an arbitrary ordering on the titles when used as a regression predictor - confirm intended
job_title_codes = {
    'Data Analyst': 0,
    'Data Engineer': 1,
    'Analytics Engineer': 2,
    'Data Scientist': 3,
    'Other': 4,
    'Data Architect': 5,
    'Research Scientist': 6,
    'Machine Learning Engineer': 7,
    'Data Science Manager': 8,
}
for title, code in job_title_codes.items():
    df_clean.loc[df_clean['Job Title'] == title, 'Job Title'] = code

In [644]:
# Display value counts of experience levels to list the categories that need a numeric code
df_clean['Experience Level'].value_counts()

Senior       1734
Mid           440
Entry         151
Executive     117
Name: Experience Level, dtype: int64

In [645]:
# Code experience levels into numeric values, numbered in the order
# Entry, Mid, Senior, Executive
experience_codes = {'Entry': 0, 'Mid': 1, 'Senior': 2, 'Executive': 3}
for level, code in experience_codes.items():
    df_clean.loc[df_clean['Experience Level'] == level, 'Experience Level'] = code

In [646]:
# Code company location into a 0/1 indicator: 0 = United States, 1 = Other
location_codes = {'United States': 0, 'Other': 1}
for location, code in location_codes.items():
    df_clean.loc[df_clean['Company Location'] == location, 'Company Location'] = code

In [647]:
# Display the number of rows for each company size category
df_clean['Company Size'].value_counts()

Medium    2176
Large      230
Small       36
Name: Company Size, dtype: int64

In [648]:
# Code company size into numeric values, numbered in the order Small, Medium, Large
size_codes = {'Small': 0, 'Medium': 1, 'Large': 2}
for size, code in size_codes.items():
    df_clean.loc[df_clean['Company Size'] == size, 'Company Size'] = code

## Linear Regression

In [649]:
# Display the cleaned dataframe with every categorical column replaced by its numeric code
df_clean

Unnamed: 0,Job Title,Experience Level,Salary,Company Location,Company Size,Year
0,1,2,210000,0,1,2023
1,1,2,165000,0,1,2023
2,1,2,185900,0,1,2023
3,1,2,129300,0,1,2023
4,3,2,140000,0,1,2023
...,...,...,...,...,...,...
2437,4,1,423000,0,2,2021
2438,4,2,165000,0,2,2021
2439,3,2,412000,0,2,2020
2440,4,1,151000,0,2,2021


In [650]:
# Store the dependent (response) variable, Salary, in y and create a dataframe X
# holding the independent (predictor) variables
y = df_clean['Salary']
X = df_clean.drop(columns = ['Salary'])

# Display the predictor dataframe
X

Unnamed: 0,Job Title,Experience Level,Company Location,Company Size,Year
0,1,2,0,1,2023
1,1,2,0,1,2023
2,1,2,0,1,2023
3,1,2,0,1,2023
4,3,2,0,1,2023
...,...,...,...,...,...
2437,4,1,0,2,2021
2438,4,2,0,2,2021
2439,3,2,0,2,2020
2440,4,1,0,2,2021


In [652]:
# Import the train_test_split function from scikit-learn's model_selection module
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets; test_size is left at its default and
# random_state = 1 is passed so the same split can be reproduced
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1)

In [653]:
# Fit the multiple linear regression model on the training split and show its coefficients
mlr_model = linear_model.LinearRegression().fit(X_train, y_train)
print(mlr_model.coef_)

# Renumber the test targets from 0 so they line up row-for-row with the predictions,
# then store the predicted salaries for the test split
y_test = y_test.reset_index(drop = True)
y_pred = mlr_model.predict(X_test)

[  7653.95102058  33143.24725875 -27375.4392221    9772.68752532
   7913.18082336]


In [654]:
# Build a dataframe of predictions, actual test values, and residuals
# (residual = actual - predicted)
residuals = y_test - y_pred
pred_df = pd.DataFrame(y_pred, columns = ['Predictions']).assign(
    **{'Testing Data': y_test, 'Residuals': residuals}
)
pred_df

Unnamed: 0,Predictions,Testing Data,Residuals
0,179694.215786,140000,-39694.215786
1,87808.672096,90000,2191.327904
2,164127.083942,247500,83372.916058
3,141165.230880,123648,-17517.230880
4,202656.068848,311000,108343.931152
...,...,...,...
606,123589.115465,106500,-17089.115465
607,179694.215786,225000,45305.784214
608,164127.083942,149850,-14277.083942
609,173640.541665,165000,-8640.541665


In [655]:
# Calculate the R^2 value on the test split and display the residuals vs. fitted plot,
# which is used to judge whether the residual spread is constant across predictions
print('R-squared =', r2_score(y_test, y_pred))
pred_df.hvplot.scatter(x = 'Predictions', y = 'Residuals', title = 'Residuals vs. Fitted Plot')

R-squared = 0.13578201666963385


The residuals vs. fitted plot for the model with all variables shows heteroscedasticity, meaning the variance of the residuals (the error term) is not constant across the range of fitted values. This is an issue because it makes the usual standard errors unreliable, which invalidates tests of statistical significance. To address this, we used stepwise variable selection with the Akaike Information Criterion (AIC) and the Bayesian Information Criterion (BIC) to test whether our model could be improved by simplifying it.

## Model Selection

In [656]:
# Calculate AIC and BIC for the original model (all predictors)
# NOTE(review): `residuals` is y_test - y_pred, so both scores are computed on the
# test split rather than on the data the model was fit to - confirm this is intended
mse1 = stat.mean(residuals ** 2)    # mean squared residual
n1 = len(residuals)                 # number of observations scored
k1 = (len(mlr_model.coef_) + 1)     # coefficients + 1 (presumably for the intercept)

# Terms shared by both criteria
log_mse_term1 = n1*math.log(mse1)
log_2pi_term1 = n1*math.log(2*math.pi)

# AIC penalises each parameter by 2, BIC by log(n)
aic1 = (2*k1 + log_mse_term1 + log_2pi_term1 + n1)
bic1 = (k1*math.log(n1) + log_mse_term1 + log_2pi_term1 + n1)
print('AIC is', aic1)
print('BIC is', bic1)

AIC is 15130.30527232319
BIC is 15156.79585407822


In [657]:
# Set up for the AIC/BIC comparison loop: the candidate columns, their positions,
# and empty lists to collect one score per reduced model
mod_col = list(X.columns)
col_range = range(len(mod_col))
AIC_list, BIC_list = [], []

In [658]:
# Leave-one-out comparison: refit the model with each predictor removed in turn and
# record the AIC and BIC of every reduced model, in the column order of `mod_col`
# Reset the accumulators here so re-running this cell does not append duplicate scores
AIC_list = []
BIC_list = []

for col in mod_col:
    # Same rows and random_state as the full model's split, minus the column under test
    X2 = X.drop(columns = col)
    X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y, random_state = 1)
    # Renumber the test targets from 0 so they line up with the predictions
    y2_test = y2_test.reset_index(drop = True)
    mlr_model2 = linear_model.LinearRegression()
    mlr_model2.fit(X2_train, y2_train)
    y2_pred = mlr_model2.predict(X2_test)
    residuals2 = y2_test - y2_pred
    # NOTE(review): the scores below use test-split residuals - confirm this is intended
    mse = stat.mean(residuals2 ** 2)
    n = len(residuals2)
    k = (len(mlr_model2.coef_) + 1)
    aic = (2*k + n*math.log(mse) + n*math.log(2*math.pi) + n)
    bic = (k*math.log(n) + n*math.log(mse) + n*math.log(2*math.pi) + n)
    AIC_list.append(aic)
    BIC_list.append(bic)

In [659]:
# Print the AIC and BIC of each reduced model (one entry per dropped column, in the
# column order of X) to compare against the model with no dropped variables
print(AIC_list)
print(BIC_list)

[15167.177641529357, 15177.986751854707, 15127.257447126394, 15130.30059721027, 15127.617764475222]
[15189.253126325215, 15200.062236650565, 15149.332931922252, 15152.376082006127, 15149.69324927108]


In [700]:
# Row labels: the variable dropped from each candidate model ('None' = nothing dropped)
vl = [['Job Title', 'Experience Level', 'Company Location', 'Company Size', 'Year', 'None']]

# AIC scores: the five reduced models followed by the original model
AIC_list_display = [AIC_list[i] for i in range(5)] + [aic1]

# BIC scores in the same order
BIC_list_display = [BIC_list[i] for i in range(5)] + [bic1]

# Create dataframe of AIC and BIC scores, indexed by the dropped variable
bsvs = pd.DataFrame({'AIC': AIC_list_display, 'BIC': BIC_list_display})
bsvs.set_index(vl)

Unnamed: 0,AIC,BIC
Job Title,15167.177642,15189.253126
Experience Level,15177.986752,15200.062237
Company Location,15127.257447,15149.332932
Company Size,15130.300597,15152.376082
Year,15127.617764,15149.693249
,15130.305272,15156.795854


In [590]:
# Display the predictor dataframe
# NOTE(review): the saved output of this cell has an out-of-sequence execution count and
# shows different Job Title codes than the earlier display of X, so it looks stale -
# re-run the notebook from a fresh kernel to refresh it
X

Unnamed: 0,Job Title,Experience Level,Company Location,Company Size,Year
0,0,2,0,1,2023
1,0,2,0,1,2023
2,0,2,0,1,2023
3,0,2,0,1,2023
4,1,2,0,1,2023
...,...,...,...,...,...
2437,8,1,0,2,2021
2438,8,2,0,2,2021
2439,1,2,0,2,2020
2440,8,1,0,2,2021


In [660]:
# Drop Company Location, the variable whose removal gave the lowest AIC and BIC,
# and refit the model on the same train/test split settings
X2 = X.drop(columns = ['Company Location'])
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y, random_state = 1)
y2_test = y2_test.reset_index(drop = True)
mlr_model2 = linear_model.LinearRegression().fit(X2_train, y2_train)
y2_pred = mlr_model2.predict(X2_test)
residuals2 = y2_test - y2_pred


In [661]:
# Build a dataframe of the second model's predictions, actual test values, and residuals
pred2_df = pd.DataFrame(y2_pred, columns = ['Predictions']).assign(
    **{'Testing Data': y2_test, 'Residuals': residuals2}
)
pred2_df

Unnamed: 0,Predictions,Testing Data,Residuals
0,179675.702222,140000,-39675.702222
1,88075.908227,90000,1924.091773
2,164077.246304,247500,83422.753696
3,141139.659675,123648,-17491.659675
4,202613.288851,311000,108386.711149
...,...,...,...
606,123579.920084,106500,-17079.920084
607,179675.702222,225000,45324.297778
608,164077.246304,149850,-14227.246304
609,173148.730369,165000,-8148.730369


In [662]:
# Calculate the R^2 value of the second model on the test split (printed without a label)
# and display its residuals vs. fitted plot
print(r2_score(y2_test, y2_pred))
pred2_df.hvplot.scatter(x = 'Predictions', y = 'Residuals', title = 'Residuals vs. Fitted Plot')

0.13726282410505308


In [663]:
# Calculate AIC and BIC for the second model
# NOTE(review): `residuals2` is y2_test - y2_pred, so both scores are computed on the
# test split rather than on the data the model was fit to - confirm this is intended
mse2 = stat.mean(residuals2 ** 2)    # mean squared residual
n2 = len(residuals2)                 # number of observations scored
k2 = (len(mlr_model2.coef_) + 1)     # coefficients + 1 (presumably for the intercept)

# Terms shared by both criteria
log_mse_term2 = n2*math.log(mse2)
log_2pi_term2 = n2*math.log(2*math.pi)

# AIC penalises each parameter by 2, BIC by log(n)
aic2 = (2*k2 + log_mse_term2 + log_2pi_term2 + n2)
bic2 = (k2*math.log(n2) + log_mse_term2 + log_2pi_term2 + n2)
print('AIC is', aic2)
print('BIC is', bic2)

AIC is 15127.257447126394
BIC is 15149.332931922252


In [664]:
# Set up for the second AIC/BIC comparison loop: the remaining candidate columns,
# their positions, and empty lists to collect one score per reduced model
mod_col2 = list(X2.columns)
col_range2 = range(len(mod_col2))
AIC_list2, BIC_list2 = [], []

In [665]:
# Leave-one-out comparison on the second model: refit with each remaining predictor
# removed in turn and record the AIC and BIC, in the column order of `mod_col2`
# Reset the accumulators here so re-running this cell does not append duplicate scores
AIC_list2 = []
BIC_list2 = []

for col in mod_col2:
    # Same rows and random_state as the earlier splits, minus the column under test
    X3 = X2.drop(columns = col)
    X3_train, X3_test, y3_train, y3_test = train_test_split(X3, y, random_state = 1)
    # Renumber the test targets from 0 so they line up with the predictions
    y3_test = y3_test.reset_index(drop = True)
    mlr_model3 = linear_model.LinearRegression()
    mlr_model3.fit(X3_train, y3_train)
    y3_pred = mlr_model3.predict(X3_test)
    residuals3 = y3_test - y3_pred
    # NOTE(review): the scores below use test-split residuals - confirm this is intended
    mse = stat.mean(residuals3 ** 2)
    n = len(residuals3)
    k = (len(mlr_model3.coef_) + 1)
    aic = (2*k + n*math.log(mse) + n*math.log(2*math.pi) + n)
    bic = (k*math.log(n) + n*math.log(mse) + n*math.log(2*math.pi) + n)
    AIC_list2.append(aic)
    BIC_list2.append(bic)

In [666]:
# Display AIC and BIC scores (one entry per dropped column, in the column order of X2)
# to determine which variable to drop next
print(AIC_list2)
print(BIC_list2)

[15164.300128825485, 15174.581834938754, 15127.417188157877, 15124.633205899421]
[15181.960516662171, 15192.24222277544, 15145.077575994563, 15142.293593736107]


In [667]:
# Display the predictor dataframe of the second model (Company Location removed)
X2

Unnamed: 0,Job Title,Experience Level,Company Size,Year
0,1,2,1,2023
1,1,2,1,2023
2,1,2,1,2023
3,1,2,1,2023
4,3,2,1,2023
...,...,...,...,...
2437,4,1,2,2021
2438,4,2,2,2021
2439,3,2,2,2020
2440,4,1,2,2021


In [668]:
# Drop Year, the variable whose removal gave the lowest AIC and BIC,
# and refit the model on the same train/test split settings
X3 = X2.drop(columns = ['Year'])
X3_train, X3_test, y3_train, y3_test = train_test_split(X3, y, random_state = 1)
y3_test = y3_test.reset_index(drop = True)
mlr_model3 = linear_model.LinearRegression().fit(X3_train, y3_train)
y3_pred = mlr_model3.predict(X3_test)
residuals3 = y3_test - y3_pred

In [669]:
# Build a dataframe of the third model's predictions, actual test values, and residuals
pred3_df = pd.DataFrame(y3_pred, columns = ['Predictions']).assign(
    **{'Testing Data': y3_test, 'Residuals': residuals3}
)
pred3_df

Unnamed: 0,Predictions,Testing Data,Residuals
0,177079.921097,140000,-37079.921097
1,103853.070556,90000,-13853.070556
2,169346.478579,247500,78153.521421
3,146146.151024,123648,-22498.151024
4,200280.248651,311000,110719.751349
...,...,...,...
606,119874.265036,106500,-13374.265036
607,177079.921097,225000,47920.078903
608,169346.478579,149850,-19496.478579
609,182296.114624,165000,-17296.114624


In [670]:
# Calculate the R^2 value of the third model on the test split (printed without a label)
# and display its residuals vs. fitted plot
print(r2_score(y3_test, y3_pred))
pred3_df.hvplot.scatter(x = 'Predictions', y = 'Residuals', title = 'Residuals vs. Fitted Plot')

0.13814380789100278


In [671]:
# Calculate AIC and BIC for the third model
# NOTE(review): `residuals3` is y3_test - y3_pred, so both scores are computed on the
# test split rather than on the data the model was fit to - confirm this is intended
mse3 = stat.mean(residuals3 ** 2)    # mean squared residual
n3 = len(residuals3)                 # number of observations scored
k3 = (len(mlr_model3.coef_) + 1)     # coefficients + 1 (presumably for the intercept)

# Terms shared by both criteria
log_mse_term3 = n3*math.log(mse3)
log_2pi_term3 = n3*math.log(2*math.pi)

# AIC penalises each parameter by 2, BIC by log(n)
aic3 = (2*k3 + log_mse_term3 + log_2pi_term3 + n3)
bic3 = (k3*math.log(n3) + log_mse_term3 + log_2pi_term3 + n3)
print('AIC is', aic3)
print('BIC is', bic3)

AIC is 15124.633205899421
BIC is 15142.293593736107


In [672]:
# Set up for the third AIC/BIC comparison loop: the remaining candidate columns,
# their positions, and empty lists to collect one score per reduced model
mod_col3 = list(X3.columns)
col_range3 = range(len(mod_col3))
AIC_list3, BIC_list3 = [], []

In [673]:
# Leave-one-out comparison on the third model: refit with each remaining predictor
# removed in turn and record the AIC and BIC, in the column order of `mod_col3`
# Reset the accumulators here so re-running this cell does not append duplicate scores
AIC_list3 = []
BIC_list3 = []

for col in mod_col3:
    # Same rows and random_state as the earlier splits, minus the column under test
    X4 = X3.drop(columns = col)
    X4_train, X4_test, y4_train, y4_test = train_test_split(X4, y, random_state = 1)
    # Renumber the test targets from 0 so they line up with the predictions
    y4_test = y4_test.reset_index(drop = True)
    mlr_model4 = linear_model.LinearRegression()
    mlr_model4.fit(X4_train, y4_train)
    y4_pred = mlr_model4.predict(X4_test)
    residuals4 = y4_test - y4_pred
    # NOTE(review): the scores below use test-split residuals - confirm this is intended
    mse = stat.mean(residuals4 ** 2)
    n = len(residuals4)
    k = (len(mlr_model4.coef_) + 1)
    aic = (2*k + n*math.log(mse) + n*math.log(2*math.pi) + n)
    bic = (k*math.log(n) + n*math.log(mse) + n*math.log(2*math.pi) + n)
    AIC_list3.append(aic)
    BIC_list3.append(bic)

In [674]:
# Display AIC and BIC scores (one entry per dropped column, in the column order of X3)
# to determine which variable to drop next
print(AIC_list3)
print(BIC_list3)

[15161.408936102831, 15174.622071123327, 15124.011282184994]
[15174.654226980345, 15187.867362000841, 15137.256573062508]


In [675]:
# Display the predictor dataframe of the third model (Company Location and Year removed)
X3

Unnamed: 0,Job Title,Experience Level,Company Size
0,1,2,1
1,1,2,1
2,1,2,1
3,1,2,1
4,3,2,1
...,...,...,...
2437,4,1,2
2438,4,2,2
2439,3,2,2
2440,4,1,2


In [676]:
# Drop Company Size, the variable whose removal gave the lowest AIC and BIC,
# and refit the model on the same train/test split settings
X4 = X3.drop(columns = ['Company Size'])
X4_train, X4_test, y4_train, y4_test = train_test_split(X4, y, random_state = 1)
y4_test = y4_test.reset_index(drop = True)
mlr_model4 = linear_model.LinearRegression().fit(X4_train, y4_train)
y4_pred = mlr_model4.predict(X4_test)
residuals4 = y4_test - y4_pred

In [677]:
# Build a dataframe of the fourth model's predictions, actual test values, and residuals
pred4_df = pd.DataFrame(y4_pred, columns = ['Predictions']).assign(
    **{'Testing Data': y4_test, 'Residuals': residuals4}
)
pred4_df

Unnamed: 0,Predictions,Testing Data,Residuals
0,177465.857768,140000,-37465.857768
1,109844.970890,90000,-19844.970890
2,169697.328555,247500,77802.671445
3,146391.740913,123648,-22743.740913
4,200771.445410,311000,110228.554590
...,...,...,...
606,120349.826688,106500,-13849.826688
607,177465.857768,225000,47534.142232
608,169697.328555,149850,-19847.328555
609,177465.857768,165000,-12465.857768


In [678]:
# Calculate the R^2 value of the fourth model on the test split and display its
# residuals vs. fitted plot
print('R-squared =', r2_score(y4_test, y4_pred))
pred4_df.hvplot.scatter(x = 'Predictions', y = 'Residuals', title = 'Residuals vs. Fitted Plot')

R-squared = 0.13619774571927246


In [679]:
# Calculate AIC and BIC for the fourth model
# NOTE(review): `residuals4` is y4_test - y4_pred, so both scores are computed on the
# test split rather than on the data the model was fit to - confirm this is intended
mse4 = stat.mean(residuals4 ** 2)    # mean squared residual
n4 = len(residuals4)                 # number of observations scored
k4 = (len(mlr_model4.coef_) + 1)     # coefficients + 1 (presumably for the intercept)

# Terms shared by both criteria
log_mse_term4 = n4*math.log(mse4)
log_2pi_term4 = n4*math.log(2*math.pi)

# AIC penalises each parameter by 2, BIC by log(n)
aic4 = (2*k4 + log_mse_term4 + log_2pi_term4 + n4)
bic4 = (k4*math.log(n4) + log_mse_term4 + log_2pi_term4 + n4)
print('AIC is', aic4)
print('BIC is', bic4)

AIC is 15124.011282184994
BIC is 15137.256573062508


In [684]:
# Set up for the fourth AIC/BIC comparison loop: the remaining candidate columns,
# their positions, and empty lists to collect one score per reduced model
mod_col4 = list(X4.columns)
col_range4 = range(len(mod_col4))
AIC_list4, BIC_list4 = [], []

In [685]:
# Leave-one-out comparison on the fourth model: refit with each remaining predictor
# removed in turn and record the AIC and BIC, in the column order of `mod_col4`
# Reset the accumulators here so re-running this cell does not append duplicate scores
AIC_list4 = []
BIC_list4 = []

for col in mod_col4:
    # Same rows and random_state as the earlier splits, minus the column under test
    X5 = X4.drop(columns = col)
    X5_train, X5_test, y5_train, y5_test = train_test_split(X5, y, random_state = 1)
    # Renumber the test targets from 0 so they line up with the predictions
    y5_test = y5_test.reset_index(drop = True)
    mlr_model5 = linear_model.LinearRegression()
    mlr_model5.fit(X5_train, y5_train)
    y5_pred = mlr_model5.predict(X5_test)
    residuals5 = y5_test - y5_pred
    # NOTE(review): the scores below use test-split residuals - confirm this is intended
    mse = stat.mean(residuals5 ** 2)
    n = len(residuals5)
    k = (len(mlr_model5.coef_) + 1)
    aic = (2*k + n*math.log(mse) + n*math.log(2*math.pi) + n)
    bic = (k*math.log(n) + n*math.log(mse) + n*math.log(2*math.pi) + n)
    AIC_list4.append(aic)
    BIC_list4.append(bic)

In [686]:
# Display AIC and BIC scores (one entry per dropped column, in the column order of X4)
# to determine whether dropping another variable improves on the fourth model
print(AIC_list4)
print(BIC_list4)

[15161.171598345509, 15172.546820945778]
[15170.001792263853, 15181.377014864122]


In [691]:
# Display the predictor dataframe of the fourth model (Job Title and Experience Level only)
X4

Unnamed: 0,Job Title,Experience Level
0,1,2
1,1,2
2,1,2
3,1,2
4,3,2
...,...,...
2437,4,1
2438,4,2
2439,3,2
2440,4,1


In [698]:
# Row labels: the variable dropped from each candidate model ('None' = nothing dropped)
vl2 = [['Job Title', 'Experience Level', 'None']]

# AIC scores: the two reduced models followed by the fourth model itself
AIC_list_display2 = [AIC_list4[i] for i in range(2)] + [aic4]

# BIC scores in the same order
BIC_list_display2 = [BIC_list4[i] for i in range(2)] + [bic4]

# Create dataframe of AIC and BIC scores, indexed by the dropped variable
bsvs2 = pd.DataFrame({'AIC': AIC_list_display2, 'BIC': BIC_list_display2})
bsvs2.set_index(vl2)

Unnamed: 0,AIC,BIC
Job Title,15161.171598,15170.001792
Experience Level,15172.546821,15181.377015
,15124.011282,15137.256573
