# Problem Statement :

## The following data contains Salaries of University of Vermont (UVM) faculty from 2009 to 2021. Create a model to predict the future 'Base Pay' of the faculty within the University.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score

# Models
from sklearn.linear_model import LinearRegression, Ridge

In [2]:
df = pd.read_csv('university_salaries.csv')
df.head()

Unnamed: 0,Year,Name,Primary Job Title,Base Pay,Department,College
0,2010,"Abaied, Jamie L.",Assistant Professor,64000.0,Department of Psychological Science,CAS
1,2011,"Abaied, Jamie L.",Assistant Professor,64000.0,Department of Psychological Science,CAS
2,2012,"Abaied, Jamie L.",Assistant Professor,65229.0,Department of Psychological Science,CAS
3,2013,"Abaied, Jamie L.",Assistant Professor,66969.0,Department of Psychological Science,CAS
4,2014,"Abaied, Jamie L.",Assistant Professor,68658.0,Department of Psychological Science,CAS


In [3]:
df.shape

(14470, 6)

## Part 1

## Without using sklearn Pipeline

In [4]:
data = df.copy()

## Preprocessing

In [5]:
data.isnull().sum()

Year                 0
Name                 0
Primary Job Title    0
Base Pay             0
Department           0
College              0
dtype: int64

### No null values present within the dataset

### 'Name' is a unique identifier, hence we can drop this feature

In [6]:
data.drop(['Name'], axis=1, inplace=True)

In [7]:
df1 = data.copy()
df1.head()

Unnamed: 0,Year,Primary Job Title,Base Pay,Department,College
0,2010,Assistant Professor,64000.0,Department of Psychological Science,CAS
1,2011,Assistant Professor,64000.0,Department of Psychological Science,CAS
2,2012,Assistant Professor,65229.0,Department of Psychological Science,CAS
3,2013,Assistant Professor,66969.0,Department of Psychological Science,CAS
4,2014,Assistant Professor,68658.0,Department of Psychological Science,CAS


## Encoding

In [8]:
pjt_dummies = pd.get_dummies(df1['Primary Job Title'], drop_first=True)
dept_dummies = pd.get_dummies(df1['Department'], drop_first=True)
col_dummies = pd.get_dummies(df1['College'], drop_first=True)

In [9]:
df1 = pd.concat([df1, pjt_dummies, dept_dummies, col_dummies], axis=1)

In [10]:
df1.drop(['Primary Job Title', 'Department', 'College'], axis=1, inplace=True)

In [11]:
df1.head()

Unnamed: 0,Year,Base Pay,Academic Srvcs Professional Sr,Academic Srvcs Professonal Sr,Acting Director,Acting Director Dana Medical Library,Admin Analyst/Planner,Admin Analyst/Planner Sr,Admin Facilities Professnl,Admin Leave - Former Administrator,...,CAS,CEMS,CESS,CNHS,COM,Department of Ext,LCOMEO,Learning and Info Tech,Library,RSENR
0,2010,64000.0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,2011,64000.0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,2012,65229.0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,2013,66969.0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,2014,68658.0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [12]:
df1.dtypes

Year                                int64
Base Pay                          float64
Academic Srvcs Professional Sr      uint8
Academic Srvcs Professonal Sr       uint8
Acting Director                     uint8
                                   ...   
Department of Ext                   uint8
LCOMEO                              uint8
Learning and Info Tech              uint8
Library                             uint8
RSENR                               uint8
Length: 267, dtype: object

## Train - Test Split

In [13]:
X = df1.drop(['Base Pay'], axis=1)
X.head()

Unnamed: 0,Year,Academic Srvcs Professional Sr,Academic Srvcs Professonal Sr,Acting Director,Acting Director Dana Medical Library,Admin Analyst/Planner,Admin Analyst/Planner Sr,Admin Facilities Professnl,Admin Leave - Former Administrator,Admin Leave - Professor,...,CAS,CEMS,CESS,CNHS,COM,Department of Ext,LCOMEO,Learning and Info Tech,Library,RSENR
0,2010,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,2011,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,2012,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,2013,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,2014,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [14]:
y = df1['Base Pay']
y.head()

0    64000.0
1    64000.0
2    65229.0
3    66969.0
4    68658.0
Name: Base Pay, dtype: float64

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

## Feature Scaling : Standardizing

In [16]:
sc = StandardScaler()

In [17]:
X_train = sc.fit_transform(X_train)
X_test = sc.fit_transform(X_test)

In [18]:
X_train = pd.DataFrame(X_train, columns=X.columns)
X_train.head()

Unnamed: 0,Year,Academic Srvcs Professional Sr,Academic Srvcs Professonal Sr,Acting Director,Acting Director Dana Medical Library,Admin Analyst/Planner,Admin Analyst/Planner Sr,Admin Facilities Professnl,Admin Leave - Former Administrator,Admin Leave - Professor,...,CAS,CEMS,CESS,CNHS,COM,Department of Ext,LCOMEO,Learning and Info Tech,Library,RSENR
0,-1.283773,-0.014365,-0.026882,-0.014365,0.0,-0.014365,-0.026882,-0.020317,0.0,-0.010157,...,-0.583147,-0.255489,4.1833,-0.208267,-0.937769,-0.071277,-0.033705,-0.026882,-0.108114,-0.170229
1,1.369346,-0.014365,-0.026882,-0.014365,0.0,-0.014365,-0.026882,-0.020317,0.0,-0.010157,...,1.714832,-0.255489,-0.239046,-0.208267,-0.937769,-0.071277,-0.033705,-0.026882,-0.108114,-0.170229
2,-0.104609,-0.014365,-0.026882,-0.014365,0.0,-0.014365,-0.026882,-0.020317,0.0,-0.010157,...,1.714832,-0.255489,-0.239046,-0.208267,-0.937769,-0.071277,-0.033705,-0.026882,-0.108114,-0.170229
3,-0.988982,-0.014365,-0.026882,-0.014365,0.0,-0.014365,-0.026882,-0.020317,0.0,-0.010157,...,-0.583147,-0.255489,-0.239046,-0.208267,1.066361,-0.071277,-0.033705,-0.026882,-0.108114,-0.170229
4,1.369346,-0.014365,-0.026882,-0.014365,0.0,-0.014365,-0.026882,-0.020317,0.0,-0.010157,...,-0.583147,-0.255489,-0.239046,4.80152,-0.937769,-0.071277,-0.033705,-0.026882,-0.108114,-0.170229


In [19]:
X_test = pd.DataFrame(X_test, columns=X.columns)
X_test.head()

Unnamed: 0,Year,Academic Srvcs Professional Sr,Academic Srvcs Professonal Sr,Acting Director,Acting Director Dana Medical Library,Admin Analyst/Planner,Admin Analyst/Planner Sr,Admin Facilities Professnl,Admin Leave - Former Administrator,Admin Leave - Professor,...,CAS,CEMS,CESS,CNHS,COM,Department of Ext,LCOMEO,Learning and Info Tech,Library,RSENR
0,0.768282,0.0,-0.020468,0.0,-0.014471,0.0,-0.014471,0.0,-0.020468,0.0,...,-0.589601,-0.247179,-0.233034,4.808452,-0.919091,-0.07254,-0.014471,-0.025071,-0.111839,-0.188572
1,-0.731111,0.0,-0.020468,0.0,-0.014471,0.0,-0.014471,0.0,-0.020468,0.0,...,-0.589601,-0.247179,-0.233034,-0.207967,1.088032,-0.07254,-0.014471,-0.025071,-0.111839,-0.188572
2,-0.131354,0.0,-0.020468,0.0,-0.014471,0.0,-0.014471,0.0,-0.020468,0.0,...,-0.589601,-0.247179,-0.233034,-0.207967,1.088032,-0.07254,-0.014471,-0.025071,-0.111839,-0.188572
3,1.06816,0.0,-0.020468,0.0,-0.014471,0.0,-0.014471,0.0,-0.020468,0.0,...,-0.589601,4.045649,-0.233034,-0.207967,-0.919091,-0.07254,-0.014471,-0.025071,-0.111839,-0.188572
4,-0.431232,0.0,-0.020468,0.0,-0.014471,0.0,-0.014471,0.0,-0.020468,0.0,...,-0.589601,-0.247179,-0.233034,-0.207967,1.088032,-0.07254,-0.014471,-0.025071,-0.111839,-0.188572


## Training

## Linear Regression

In [20]:
lr = LinearRegression()

In [21]:
lr.fit(X_train, y_train)

LinearRegression()

In [22]:
lr.score(X_test, y_test)

-2.8405926914640654e+26

In [23]:
y_pred = lr.predict(X_test)

In [24]:
from sklearn.metrics import mean_squared_error

In [25]:
mean_squared_error(y_test, y_pred)

6.3163164804034216e+35

## Linear Regression using Ridge Regularizer

In [26]:
ridge = Ridge()

## Hyperparameter Tuning

In [27]:
params = {
    'alpha' : [1.0, 1.3, 1.5, 2.0, 2.5]
}

In [28]:
grid = GridSearchCV(ridge, param_grid=params, n_jobs=1, cv=5)

In [29]:
grid.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=Ridge(), n_jobs=1,
             param_grid={'alpha': [1.0, 1.3, 1.5, 2.0, 2.5]})

In [30]:
grid.best_estimator_

Ridge(alpha=2.5)

In [31]:
# The model will be trained with the best value of alpha which is 2.5
grid.score(X_test, y_test)

0.6270202937526

## Summary

### It is clear that using Ridge Regularizer with Linear Regression model gives us dramatically better results than using just a Linear Regression model

-------------------------------------------------------------------------------------------------------------------------------

## Part 2

## Using sklearn Pipeline

In [32]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [33]:
df2 = df.copy()
df2.head()

Unnamed: 0,Year,Name,Primary Job Title,Base Pay,Department,College
0,2010,"Abaied, Jamie L.",Assistant Professor,64000.0,Department of Psychological Science,CAS
1,2011,"Abaied, Jamie L.",Assistant Professor,64000.0,Department of Psychological Science,CAS
2,2012,"Abaied, Jamie L.",Assistant Professor,65229.0,Department of Psychological Science,CAS
3,2013,"Abaied, Jamie L.",Assistant Professor,66969.0,Department of Psychological Science,CAS
4,2014,"Abaied, Jamie L.",Assistant Professor,68658.0,Department of Psychological Science,CAS


In [34]:
df2.drop(['Name'], axis=1, inplace=True)

In [35]:
df2.head()

Unnamed: 0,Year,Primary Job Title,Base Pay,Department,College
0,2010,Assistant Professor,64000.0,Department of Psychological Science,CAS
1,2011,Assistant Professor,64000.0,Department of Psychological Science,CAS
2,2012,Assistant Professor,65229.0,Department of Psychological Science,CAS
3,2013,Assistant Professor,66969.0,Department of Psychological Science,CAS
4,2014,Assistant Professor,68658.0,Department of Psychological Science,CAS


In [36]:
X2 = df2.copy()
X2 = df2.drop(['Base Pay'], axis=1)

In [37]:
X2.head()

Unnamed: 0,Year,Primary Job Title,Department,College
0,2010,Assistant Professor,Department of Psychological Science,CAS
1,2011,Assistant Professor,Department of Psychological Science,CAS
2,2012,Assistant Professor,Department of Psychological Science,CAS
3,2013,Assistant Professor,Department of Psychological Science,CAS
4,2014,Assistant Professor,Department of Psychological Science,CAS


In [38]:
y = df2['Base Pay']
y.head()

0    64000.0
1    64000.0
2    65229.0
3    66969.0
4    68658.0
Name: Base Pay, dtype: float64

### Creating Pipelines

In [39]:
cont_features = ['Year']
cat_features = ['Primary Job Title', 'Department', 'College']

In [40]:
# Pipeline for continuous features

cont_pipeline = Pipeline([
    ('scaler', StandardScaler())
])

# Pipeline for categorical features

cat_pipeline = Pipeline([
    ('onehot', OneHotEncoder(sparse=False, handle_unknown='ignore'))
]) # Using drop = 'first' threw error because of handle_unknown='ignore'

In [41]:
# Final Preprocessed Pipeline

preprocess_pipeline = ColumnTransformer([
    ('continuous', cont_pipeline, cont_features),
    ('categorical', cat_pipeline, cat_features)
], remainder='passthrough')

In [42]:
X_preprocessed = preprocess_pipeline.fit_transform(X2)

In [43]:
X_preprocessed

array([[-1.59543219,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-1.29901064,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-1.00258908,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.77594028,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 1.07236184,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 1.3687834 ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [44]:
X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, y, test_size=0.33, random_state=0)

In [45]:
ridge2 = Ridge()

In [46]:
params = {
    'alpha' : [1.0, 1.3, 1.5, 2.0, 2.5]
}

In [47]:
grid2 = GridSearchCV(ridge2, param_grid=params, n_jobs=1, cv=5)

In [48]:
grid2.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=Ridge(), n_jobs=1,
             param_grid={'alpha': [1.0, 1.3, 1.5, 2.0, 2.5]})

In [49]:
# The model will be trained with the best value of alpha which is 2.5
grid2.score(X_test, y_test)

0.5984523699292518

In [50]:
grid2.best_params_

{'alpha': 1.0}

In [51]:
score = cross_val_score(grid2, X_preprocessed, y, cv=10)

In [52]:
score

array([0.55456138, 0.62059348, 0.57427439, 0.50679336, 0.44208033,
       0.45081681, 0.4780654 , 0.50915747, 0.48290192, 0.55466507])

In [53]:
score.mean()

0.5173909609226672