In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stat
from sklearn.model_selection import train_test_split,cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import roc_auc_score, precision_score, recall_score, confusion_matrix, classification_report
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the raw dataset; the path is resolved relative to the working directory.
df = pd.read_excel(io='Pred_Data.xlsx')

In [3]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(500, 7)

In [4]:
# Count missing values per column (isna is the canonical spelling of isnull).
df.isna().sum()

Degree             0
GPA                0
Experience         0
Job Location       0
Salary Hike        0
Change in Level    0
Joined             0
dtype: int64

In [None]:
# drop_duplicates() returns a new frame; without the assignment the
# de-duplicated result was displayed and thrown away, leaving df unchanged.
# The index is rebuilt so later positional joins see a contiguous RangeIndex.
df = df.drop_duplicates().reset_index(drop=True)

In [5]:
# Print column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Degree           500 non-null    int64  
 1   GPA              500 non-null    float64
 2   Experience       500 non-null    int64  
 3   Job Location     500 non-null    int64  
 4   Salary Hike      500 non-null    int64  
 5   Change in Level  500 non-null    int64  
 6   Joined           500 non-null    int64  
dtypes: float64(1), int64(6)
memory usage: 27.5 KB


In [6]:
# Split the frame into predictors and target. The target label is spelled
# with a trailing space ('Joined ') everywhere in this notebook.
# `columns=` replaces the positional axis argument, which pandas 2.x rejects.
X = df.drop(columns='Joined ')
# A 1-D Series (rather than a one-column frame) is the shape scikit-learn
# expects for y and avoids its column-vector conversion warning.
y = df['Joined ']

### Train Test Split

In [7]:
# Hold out 20% of the rows for the final evaluation; the seed fixes the split.
xtr, xte, ytr, yte = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=30,
)

### Data Preprocessing

In [8]:
# X Train

In [9]:
# Fit the one-hot encoder on the training categoricals only; drop='first'
# removes one indicator per feature. The five hard-coded labels below must
# match the number of indicator columns the encoder produces.
ohe = OneHotEncoder(drop='first')
cat_x = pd.DataFrame(
    ohe.fit_transform(xtr[['Degree', 'Job Location ']].to_numpy()).toarray(),
    columns=['Deg_1', 'Deg_2', 'Deg_3', 'Loc_1', 'Loc2'],
)

In [10]:
# Append the already-numeric 'Change in Level' column to the encoded
# categoricals. reset_index(drop=True) realigns the shuffled training rows to
# the 0..n-1 index of cat_x; it replaces reset_index().drop('index', 1), whose
# positional axis argument is rejected by pandas 2.x.
cat_x = cat_x.join(xtr[['Change in Level']].reset_index(drop=True))

In [11]:
# Learn the scaling statistics from the training split only, then apply them.
sc = StandardScaler().fit(xtr[['GPA', 'Experience', 'Salary Hike']])
num_x = pd.DataFrame(sc.transform(xtr[['GPA', 'Experience', 'Salary Hike']]))

In [12]:
# Restore the feature names lost when the scaler returned a bare array.
num_x = num_x.rename(
    columns=dict(zip(num_x.columns, ['GPA', 'Experience', 'Salary Hike']))
)

In [13]:
# Final training matrix: scaled numerics followed by the encoded categoricals.
# Both frames share the same 0..n-1 index, so a column-wise concat lines up.
trans_x = pd.concat([num_x, cat_x], axis=1)

In [14]:
# X Test

In [15]:
# Encode the test categoricals with the encoder fitted on the training split
# (transform only, no refit), using the same column labels as the training
# matrix.
te_cat_x = pd.DataFrame(
    ohe.transform(xte[['Degree', 'Job Location ']].to_numpy()).toarray(),
    columns=['Deg_1', 'Deg_2', 'Deg_3', 'Loc_1', 'Loc2'],
)
# reset_index(drop=True) realigns the test rows to 0..n-1 before the join; it
# replaces reset_index().drop('index', 1), which fails on pandas 2.x because
# the axis can no longer be passed positionally.
te_cat_x = te_cat_x.join(xte[['Change in Level']].reset_index(drop=True))

In [16]:
# Scale the test numerics with the statistics learned from the training split.
te_num_x = pd.DataFrame(
    sc.transform(xte[['GPA', 'Experience', 'Salary Hike']]),
    columns=['GPA', 'Experience', 'Salary Hike'],
)

In [17]:
# Test matrix in the same column order as the training matrix.
te_x = pd.concat([te_num_x, te_cat_x], axis=1)

### Applying Classification Models

In [18]:
# Logistic Regression

In [19]:
# Baseline: default logistic regression scored by 5-fold ROC-AUC on the
# training split.
lr = LogisticRegression()
cv_lr = cross_val_score(estimator=lr, X=trans_x, y=ytr, scoring='roc_auc', cv=5)

In [20]:
# Mean and spread (population std) of the fold scores.
bias_lr = cv_lr.mean()
var_lr = cv_lr.std()

In [21]:
# (mean, std) of the five ROC-AUC fold scores; despite the variable names
# these are not a bias/variance decomposition.
bias_lr,var_lr

(0.5328947779862414, 0.04093490909877808)

In [22]:
# KNN

In [23]:
# Default k-nearest-neighbours classifier, scored by 5-fold ROC-AUC.
knn = KNeighborsClassifier()
cv_knn = cross_val_score(estimator=knn, X=trans_x, y=ytr, scoring='roc_auc', cv=5)

In [24]:
# Mean and spread (population std) of the fold scores.
bias_knn = cv_knn.mean()
var_knn = cv_knn.std()

In [25]:
# (mean, std) of the five ROC-AUC fold scores for KNN.
bias_knn,var_knn

(0.5319089274546591, 0.07580142255948209)

In [26]:
# Random Forest

In [27]:
# Default random forest with a fixed seed, scored by 5-fold ROC-AUC.
rf = RandomForestClassifier(random_state=30)
cv_rf = cross_val_score(estimator=rf, X=trans_x, y=ytr, scoring='roc_auc', cv=5)

In [28]:
# Mean and spread (population std) of the fold scores.
bias_rf = cv_rf.mean()
var_rf = cv_rf.std()

In [29]:
# (mean, std) of the five ROC-AUC fold scores for the untuned random forest.
bias_rf, var_rf

(0.57116647123202, 0.04873269758201851)

In [30]:
# Gradient Boosting

In [31]:
# Default gradient boosting with a fixed seed, scored by 5-fold ROC-AUC.
gb = GradientBoostingClassifier(random_state=30)
cv_gb = cross_val_score(estimator=gb, X=trans_x, y=ytr, scoring='roc_auc', cv=5)

In [32]:
# Mean and spread (population std) of the fold scores.
bias_gb = cv_gb.mean()
var_gb = cv_gb.std()

In [33]:
# (mean, std) of the five ROC-AUC fold scores for gradient boosting.
bias_gb, var_gb

(0.5178787523452157, 0.05254498044455308)

In [34]:
# Tuned Random Forest using GridSearch

In [35]:
# Exhaustive search over split criterion and tree depth (2 x 49 candidates),
# ranked by cross-validated ROC-AUC on the training split.
params = {
    'criterion': ['entropy', 'gini'],
    'max_depth': np.arange(1, 50),
}
gv_rf = GridSearchCV(estimator=rf, param_grid=params, scoring='roc_auc')
gv_rf.fit(X=trans_x, y=ytr)

GridSearchCV(estimator=RandomForestClassifier(random_state=30),
             param_grid={'criterion': ['entropy', 'gini'],
                         'max_depth': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49])},
             scoring='roc_auc')

In [36]:
# Parameter combination the grid search ranked best by ROC-AUC.
gv_rf.best_params_

{'criterion': 'entropy', 'max_depth': 12}

In [37]:
# Build the tuned forest from the parameters the grid search actually chose,
# instead of hand-copying them from a previous run's output; the copied values
# would silently go stale if the data or the grid changed.
tuned_rf = RandomForestClassifier(**gv_rf.best_params_, random_state=30)
# NOTE: this score is measured on the same training data the search tuned on,
# so it is an optimistic estimate.
cv_tuned_rf = cross_val_score(tuned_rf, trans_x, ytr, scoring='roc_auc', cv=5)

In [38]:
# Mean and spread (population std) of the fold scores.
bias_tuned_rf = cv_tuned_rf.mean()
var_tuned_rf = cv_tuned_rf.std()

In [39]:
# (mean, std) of the five ROC-AUC fold scores for the tuned random forest.
bias_tuned_rf, var_tuned_rf

(0.5829238586616635, 0.036919990040177104)

In [40]:
# Fitting Tuned Random Forest to the Data

In [41]:
# Train the tuned forest on the full training split.
tuned_rf.fit(X=trans_x, y=ytr)

RandomForestClassifier(criterion='entropy', max_depth=12, random_state=30)

In [42]:
# Hard class predictions for the held-out test rows.
pred = tuned_rf.predict(X=te_x)

In [43]:
# Rows are true classes, columns are predicted classes.
confusion_matrix(y_true=yte, y_pred=pred)

array([[27, 27],
       [20, 26]], dtype=int64)

In [44]:
# Precision for the positive class (label 1).
precision_score(y_true=yte, y_pred=pred)

0.49056603773584906

In [45]:
# Recall for the positive class (label 1).
recall_score(y_true=yte, y_pred=pred)

0.5652173913043478

In [46]:
# Per-class precision/recall/F1 on the held-out test rows.
report_text = classification_report(y_true=yte, y_pred=pred)
print(report_text)

              precision    recall  f1-score   support

           0       0.57      0.50      0.53        54
           1       0.49      0.57      0.53        46

    accuracy                           0.53       100
   macro avg       0.53      0.53      0.53       100
weighted avg       0.54      0.53      0.53       100



In [None]:
# Applying the tuned model to randomly generated (synthetic) candidate data

In [96]:
# Observed min/max of every integer-coded feature; these bound the random
# generator below. GPA is generated separately and 'Joined ' is the target, so
# both are excluded. `columns=` replaces the positional axis argument (rejected
# by pandas 2.x) and the chained form avoids an in-place drop on a .loc slice.
min_max = df.describe().loc[['min', 'max'], :].drop(columns=['GPA', 'Joined '])
selected_df = pd.DataFrame()
def Data(x, bounds=None, seed=None):
    """Generate ``x`` random synthetic candidate rows.

    For every column of ``bounds`` an integer is drawn from the inclusive range
    ``[min, max]`` of that column. ``GPA`` is drawn as an integer in
    ``[171, 399]`` and divided by 100, i.e. 1.71 to 3.99.

    Each call returns a fresh frame. The previous version wrote into a shared
    module-level frame, so a second call with a different ``x`` failed with a
    length mismatch.

    Parameters
    ----------
    x : int
        Number of rows to generate.
    bounds : pandas.DataFrame, optional
        Frame with index labels ``'min'`` and ``'max'`` and one column per
        integer feature. Defaults to the notebook-level ``min_max``.
    seed : int, optional
        Seed for a private ``RandomState``. When omitted, NumPy's global
        random state is used, as before.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the columns of ``bounds`` followed by ``GPA``.
    """
    if bounds is None:
        bounds = min_max
    rng = np.random if seed is None else np.random.RandomState(seed)
    generated = {}
    for col in bounds.columns:
        # describe() yields floats; randint needs integer bounds.
        low = int(bounds.loc['min', col])
        high = int(bounds.loc['max', col])
        generated[col] = rng.randint(low, high + 1, x)
    generated['GPA'] = rng.randint(171, 400, x) / 100
    return pd.DataFrame(generated)

In [97]:
# Seed NumPy's global generator so the 20 synthetic rows (and therefore the
# predictions below) are reproducible on Restart & Run All; 30 matches the
# random_state used elsewhere in this notebook.
np.random.seed(30)
df_actual = Data(20)
# Put the columns in the same order as the training predictors. The original
# line reindexed an undefined name `z`, which raises NameError on a fresh
# kernel.
df_actual = df_actual.reindex(columns = ['Degree', 'GPA', 'Experience', 'Job Location ', 'Salary Hike',
       'Change in Level'])

In [99]:
# Encode the generated rows with the already-fitted encoder. Note that this
# reuses (and overwrites) the te_cat_x name that held the test-split features.
te_cat_x = pd.DataFrame(
    ohe.transform(df_actual[['Degree', 'Job Location ']].to_numpy()).toarray(),
    columns=['Deg_1', 'Deg_2', 'Deg_3', 'Loc_1', 'Loc2'],
)
# reset_index(drop=True) replaces reset_index().drop('index', 1), whose
# positional axis argument is rejected by pandas 2.x.
te_cat_x = te_cat_x.join(df_actual[['Change in Level']].reset_index(drop=True))

In [101]:
# Scale the generated numerics with the scaler fitted on the training split.
te_num_x = pd.DataFrame(
    sc.transform(df_actual[['GPA', 'Experience', 'Salary Hike']]),
    columns=['GPA', 'Experience', 'Salary Hike'],
)

In [102]:
# Feature matrix for the generated rows, in the training column order.
te_x = pd.concat([te_num_x, te_cat_x], axis=1)

In [104]:
# Predict the join outcome for each generated row.
pred = tuned_rf.predict(X=te_x)

In [None]:
# Final Predictions

In [105]:
# Predicted labels for the generated rows, one entry per row.
pred

array([0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1],
      dtype=int64)