In [None]:
'''Problem Statement: Predict the salary if the employee using the features provided in the data.'''

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import OneHotEncoder,StandardScaler,LabelEncoder
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn import metrics
from sklearn.ensemble import AdaBoostRegressor,RandomForestRegressor

import warnings
warnings.filterwarnings('ignore')

In [2]:
data = pd.read_csv('https://raw.githubusercontent.com/dsrscientist/dataset3/main/Salaries.csv')
data.head()

Unnamed: 0,rank,discipline,yrs.since.phd,yrs.service,sex,salary
0,Prof,B,19,18,Male,139750
1,Prof,B,20,16,Male,173200
2,AsstProf,B,4,3,Male,79750
3,Prof,B,45,39,Male,115000
4,Prof,B,40,41,Male,141500


## Lables
- Rank : Rank of the Professor(Assistant Professor, Associate Professor, Professor).
- Discipline : Discipline of Professor.
- yrs.since.phd : Years since PHD.
- yrs.service : Years in Service.
- Sex : Gender of professor.

## Target
- Salary : Salary of professor based on the labels.

In [3]:
data.shape

(397, 6)

Shape of the dataset.

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 397 entries, 0 to 396
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   rank           397 non-null    object
 1   discipline     397 non-null    object
 2   yrs.since.phd  397 non-null    int64 
 3   yrs.service    397 non-null    int64 
 4   sex            397 non-null    object
 5   salary         397 non-null    int64 
dtypes: int64(3), object(3)
memory usage: 18.7+ KB


Indformation of Dataset.
- Range index : 0 to 396
- Total columns : 6
- Non null value : no null
- dtypes : int(3),object(3)

In [5]:
data.isna().sum()

rank             0
discipline       0
yrs.since.phd    0
yrs.service      0
sex              0
salary           0
dtype: int64

No null values present in any column

In [6]:
data.describe()

Unnamed: 0,yrs.since.phd,yrs.service,salary
count,397.0,397.0,397.0
mean,22.314861,17.61461,113706.458438
std,12.887003,13.006024,30289.038695
min,1.0,0.0,57800.0
25%,12.0,7.0,91000.0
50%,21.0,16.0,107300.0
75%,32.0,27.0,134185.0
max,56.0,60.0,231545.0


Description of the dataset.

In [7]:
print(data['rank'].unique())
print(data['discipline'].unique())
print(data['sex'].unique())

['Prof' 'AsstProf' 'AssocProf']
['B' 'A']
['Male' 'Female']


Unique elements in particular columns.

- Using label encoder for encoding the object values.

In [8]:
lb = LabelEncoder()

In [9]:
df = lb.fit_transform(data['rank'])
data['rank'] = df
data['rank'].unique()

array([2, 1, 0])

In [10]:
dfd = lb.fit_transform(data['discipline'])
data['discipline'] = dfd
data['discipline'].unique()

array([1, 0])

In [11]:
dfs = lb.fit_transform(data['sex'])
data['sex'] = dfs
data['sex'].unique()

array([1, 0])

In [12]:
data.head()

Unnamed: 0,rank,discipline,yrs.since.phd,yrs.service,sex,salary
0,2,1,19,18,1,139750
1,2,1,20,16,1,173200
2,1,1,4,3,1,79750
3,2,1,45,39,1,115000
4,2,1,40,41,1,141500


In [13]:
x = data.drop('salary',axis = 1)
y = data.salary

Seperation labels and target

In [16]:
scaler = StandardScaler()
x_scaled = scaler.fit_transform(x)
x_scaled

array([[ 0.64925739,  0.91540317, -0.25754973,  0.02966908,  0.3300584 ],
       [ 0.64925739,  0.91540317, -0.17985426, -0.12429986,  0.3300584 ],
       [-0.67256406,  0.91540317, -1.42298184, -1.12509795,  0.3300584 ],
       ...,
       [ 0.64925739, -1.09241483,  1.52944617,  0.56856036,  0.3300584 ],
       [ 0.64925739, -1.09241483,  0.20862311, -0.20128433,  0.3300584 ],
       [-0.67256406, -1.09241483, -1.11219995, -1.04811348,  0.3300584 ]])

Applying Standard Scaler on labels to scaled the values present.

## Training the Model

**- Train Test Split**

In [17]:
x_train,x_test,y_train,y_test = train_test_split(x_scaled, y, test_size = 0.20, random_state = 30)

**- Creating The function for the training and testing score, so that, we can get the results just nby calling the function**

In [18]:
def score(reg, x_train, x_test, y_train, y_test, train = True):
    if train:
        y_pred = reg.predict(x_train)
        print('\n ----- Train Result ----- \n')
        print('R2 Score:', metrics.r2_score(y_train,y_pred))
    
    elif train == False:
        pred = reg.predict(x_test)
        print('\n ----- Test Result ----- \n')
        print('R2 Score:', metrics.r2_score(y_test,pred))

**- Model Instantiating**

In [30]:
ada = AdaBoostRegressor()
rf = RandomForestRegressor()

**- Model Trainning For AdaBoost**

In [20]:
ada.fit(x_train,y_train)
score(ada, x_train,x_test,y_train,y_test,train = True)
score(ada, x_train,x_test,y_train,y_test,train = False)


 ----- Train Result ----- 

R2 Score: 0.4985370397779263

 ----- Test Result ----- 

R2 Score: 0.3678456363063437


Score for Adaboost.

**- Model Trainning For Random Forest**

In [21]:
rf.fit(x_train,y_train)
score(rf, x_train,x_test,y_train,y_test,train = True)
score(rf, x_train,x_test,y_train,y_test,train = False)


 ----- Train Result ----- 

R2 Score: 0.8656506758366306

 ----- Test Result ----- 

R2 Score: 0.4231849705628815


Score of Random Forest

In [22]:
data.head()

Unnamed: 0,rank,discipline,yrs.since.phd,yrs.service,sex,salary
0,2,1,19,18,1,139750
1,2,1,20,16,1,173200
2,1,1,4,3,1,79750
3,2,1,45,39,1,115000
4,2,1,40,41,1,141500


In [23]:
x.head()

Unnamed: 0,rank,discipline,yrs.since.phd,yrs.service,sex
0,2,1,19,18,1
1,2,1,20,16,1
2,1,1,4,3,1
3,2,1,45,39,1
4,2,1,40,41,1


In [25]:
print('Score : ', rf.predict(scaler.transform([[2,1,20,16,1]])))

Score :  [158349.03083333]


**As the Random Forest is giving the better result, so will go with Random Forest Regressor**

**- Hyperparameter Tuning**

In [26]:
param = {'n_estimators': range(1,10),
         'max_depth':range(1,10),
         'max_leaf_nodes' : [2,4,6],
        }

In [42]:
grid = GridSearchCV(estimator = rf ,param_grid = param )

In [36]:
grid.fit(x_train,y_train)

GridSearchCV(estimator=RandomForestRegressor(),
             param_grid={'max_depth': range(1, 10), 'max_leaf_nodes': [2, 4, 6],
                         'n_estimators': range(1, 100)})

In [37]:
best = grid.best_params_
best

{'max_depth': 3, 'max_leaf_nodes': 6, 'n_estimators': 37}

In [38]:
rfh = RandomForestRegressor(max_depth = 3, max_leaf_nodes = 6, n_estimators = 37)

**- Post tunning Result**

In [39]:
rfh.fit(x_train,y_train)
score(rfh, x_train,x_test,y_train,y_test,train = True)
score(rfh, x_train,x_test,y_train,y_test,train = False)


 ----- Train Result ----- 

R2 Score: 0.49233071380089555

 ----- Test Result ----- 

R2 Score: 0.45430665984908014


Post tunning result of Random Forest Regressor