In [1]:
#importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,precision_score, recall_score, f1_score, roc_auc_score,confusion_matrix
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler #handling feature scaling
from sklearn.preprocessing import OneHotEncoder

In [3]:
#load the dataset
df=pd.read_csv("D:/campus_placement/notebooks/data/placement.csv")

In [4]:
df.head()

Unnamed: 0,sl_no,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,workex,etest_p,specialisation,mba_p,status,salary
0,1,M,67.0,Others,91.0,Others,Commerce,58.0,Sci&Tech,No,55.0,Mkt&HR,58.8,Placed,270000.0
1,2,M,79.33,Central,78.33,Others,Science,77.48,Sci&Tech,Yes,86.5,Mkt&Fin,66.28,Placed,200000.0
2,3,M,65.0,Central,68.0,Central,Arts,64.0,Comm&Mgmt,No,75.0,Mkt&Fin,57.8,Placed,250000.0
3,4,M,56.0,Central,52.0,Central,Science,52.0,Sci&Tech,No,66.0,Mkt&HR,59.43,Not Placed,
4,5,M,85.8,Central,73.6,Central,Commerce,73.3,Comm&Mgmt,No,96.8,Mkt&Fin,55.5,Placed,425000.0


In [5]:
df.shape

(215, 15)

In [6]:
df.duplicated().sum()

0

In [7]:
df.isnull().sum()


sl_no              0
gender             0
ssc_p              0
ssc_b              0
hsc_p              0
hsc_b              0
hsc_s              0
degree_p           0
degree_t           0
workex             0
etest_p            0
specialisation     0
mba_p              0
status             0
salary            67
dtype: int64

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 215 entries, 0 to 214
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   sl_no           215 non-null    int64  
 1   gender          215 non-null    object 
 2   ssc_p           215 non-null    float64
 3   ssc_b           215 non-null    object 
 4   hsc_p           215 non-null    float64
 5   hsc_b           215 non-null    object 
 6   hsc_s           215 non-null    object 
 7   degree_p        215 non-null    float64
 8   degree_t        215 non-null    object 
 9   workex          215 non-null    object 
 10  etest_p         215 non-null    float64
 11  specialisation  215 non-null    object 
 12  mba_p           215 non-null    float64
 13  status          215 non-null    object 
 14  salary          148 non-null    float64
dtypes: float64(6), int64(1), object(8)
memory usage: 25.3+ KB


In [9]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
sl_no,215.0,108.0,62.209324,1.0,54.5,108.0,161.5,215.0
ssc_p,215.0,67.303395,10.827205,40.89,60.6,67.0,75.7,89.4
hsc_p,215.0,66.333163,10.897509,37.0,60.9,65.0,73.0,97.7
degree_p,215.0,66.370186,7.358743,50.0,61.0,66.0,72.0,91.0
etest_p,215.0,72.100558,13.275956,50.0,60.0,71.0,83.5,98.0
mba_p,215.0,62.278186,5.833385,51.21,57.945,62.0,66.255,77.89
salary,148.0,288655.405405,93457.45242,200000.0,240000.0,265000.0,300000.0,940000.0


In [10]:
df= df.drop(['sl_no','salary'],axis=1)

In [30]:
df['status'].unique()

array(['Placed', 'Not Placed'], dtype=object)

In [31]:
df['status']= df['status'].map({'Placed':1, 'Not Placed': 0})

In [32]:
df

Unnamed: 0,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,workex,etest_p,specialisation,mba_p,status
0,M,67.00,Others,91.00,Others,Commerce,58.00,Sci&Tech,No,55.0,Mkt&HR,58.80,1
1,M,79.33,Central,78.33,Others,Science,77.48,Sci&Tech,Yes,86.5,Mkt&Fin,66.28,1
2,M,65.00,Central,68.00,Central,Arts,64.00,Comm&Mgmt,No,75.0,Mkt&Fin,57.80,1
3,M,56.00,Central,52.00,Central,Science,52.00,Sci&Tech,No,66.0,Mkt&HR,59.43,0
4,M,85.80,Central,73.60,Central,Commerce,73.30,Comm&Mgmt,No,96.8,Mkt&Fin,55.50,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
210,M,80.60,Others,82.00,Others,Commerce,77.60,Comm&Mgmt,No,91.0,Mkt&Fin,74.49,1
211,M,58.00,Others,60.00,Others,Science,72.00,Sci&Tech,No,74.0,Mkt&Fin,53.62,1
212,M,67.00,Others,67.00,Others,Commerce,73.00,Comm&Mgmt,Yes,59.0,Mkt&Fin,69.72,1
213,F,74.00,Others,66.00,Others,Commerce,58.00,Comm&Mgmt,No,70.0,Mkt&HR,60.23,1


In [33]:
X=df.drop(labels=["status"],axis=1)

In [34]:
X

Unnamed: 0,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,workex,etest_p,specialisation,mba_p
0,M,67.00,Others,91.00,Others,Commerce,58.00,Sci&Tech,No,55.0,Mkt&HR,58.80
1,M,79.33,Central,78.33,Others,Science,77.48,Sci&Tech,Yes,86.5,Mkt&Fin,66.28
2,M,65.00,Central,68.00,Central,Arts,64.00,Comm&Mgmt,No,75.0,Mkt&Fin,57.80
3,M,56.00,Central,52.00,Central,Science,52.00,Sci&Tech,No,66.0,Mkt&HR,59.43
4,M,85.80,Central,73.60,Central,Commerce,73.30,Comm&Mgmt,No,96.8,Mkt&Fin,55.50
...,...,...,...,...,...,...,...,...,...,...,...,...
210,M,80.60,Others,82.00,Others,Commerce,77.60,Comm&Mgmt,No,91.0,Mkt&Fin,74.49
211,M,58.00,Others,60.00,Others,Science,72.00,Sci&Tech,No,74.0,Mkt&Fin,53.62
212,M,67.00,Others,67.00,Others,Commerce,73.00,Comm&Mgmt,Yes,59.0,Mkt&Fin,69.72
213,F,74.00,Others,66.00,Others,Commerce,58.00,Comm&Mgmt,No,70.0,Mkt&HR,60.23


In [35]:
y=df[["status"]]

In [36]:
y

Unnamed: 0,status
0,1
1,1
2,1
3,0
4,1
...,...
210,1
211,1
212,1
213,1


In [37]:
numerical_columns = X.select_dtypes(exclude=["object"]).columns


In [38]:
numerical_columns

Index(['ssc_p', 'hsc_p', 'degree_p', 'etest_p', 'mba_p'], dtype='object')

In [39]:
categorical_columns = X.select_dtypes(include="object").columns


In [40]:
categorical_columns

Index(['gender', 'ssc_b', 'hsc_b', 'hsc_s', 'degree_t', 'workex',
       'specialisation'],
      dtype='object')

In [41]:
print("Categories in 'gender' variable:     ",end=" " )
print(df['gender'].unique())

print("Categories in 'ssc_b' variable:     ",end=" " )
print(df['ssc_b'].unique())

print("Categories in 'hsc_b' variable:  ",end=" ")
print(df['hsc_b'].unique())

print("Categories in'hsc_s' variable:",end=" " )
print(df['hsc_s'].unique())

print("Categories in  'degree_t' variable:     ",end=" " )
print(df[ 'degree_t'].unique())

print("Categories in 'workex' variable:     ",end=" " )
print(df['workex'].unique())

print("Categories in 'specialisation' variable:     ",end=" " )
print(df['specialisation'].unique())



Categories in 'gender' variable:      ['M' 'F']
Categories in 'ssc_b' variable:      ['Others' 'Central']
Categories in 'hsc_b' variable:   ['Others' 'Central']
Categories in'hsc_s' variable: ['Commerce' 'Science' 'Arts']
Categories in  'degree_t' variable:      ['Sci&Tech' 'Comm&Mgmt' 'Others']
Categories in 'workex' variable:      ['No' 'Yes']
Categories in 'specialisation' variable:      ['Mkt&HR' 'Mkt&Fin']


In [42]:
gender_categories= ['M' 'F']
ssc_b_categories= ['Others' 'Central']
hsc_b_categories= ['Others' 'Central']
hsc_s_categories= ['Commerce' 'Science' 'Arts']
degree_t_categories=['Sci&Tech' 'Comm&Mgmt' 'Others']
workex_categories= ['No' 'Yes']
specialisation_categories= ['Mkt&HR' 'Mkt&Fin']


In [43]:
from sklearn.impute import SimpleImputer   ##handling missing values
from sklearn.preprocessing import StandardScaler #handling feature scaling
from sklearn.preprocessing import OneHotEncoder

from sklearn.compose import ColumnTransformer

In [44]:
numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", oh_transformer, categorical_columns),
         ("StandardScaler", numeric_transformer, numerical_columns),        
    ]
)

In [45]:
X = preprocessor.fit_transform(X)

In [46]:
X

array([[ 0.        ,  1.        ,  0.        , ..., -1.14010225,
        -1.29109087, -0.59764672],
       [ 0.        ,  1.        ,  1.        , ...,  1.51326671,
         1.08715679,  0.6876202 ],
       [ 0.        ,  1.        ,  1.        , ..., -0.32284282,
         0.21890765, -0.76947385],
       ...,
       [ 0.        ,  1.        ,  0.        , ...,  0.90304633,
        -0.98909117,  1.27870553],
       [ 1.        ,  0.        ,  0.        , ..., -1.14010225,
        -0.15859198, -0.35193393],
       [ 0.        ,  1.        ,  1.        , ..., -1.82115177,
         1.27590661, -0.3536522 ]])

In [47]:
# separate dataset into train and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)
X_train.shape, X_test.shape

((172, 21), (43, 21))

In [48]:
def evaluate_model(true, predicted):
    acc = accuracy_score(true, predicted) # Calculate Accuracy
    f1 = f1_score(true, predicted) # Calculate F1-score
    precision = precision_score(true, predicted) # Calculate Precision
    recall = recall_score(true, predicted)  # Calculate Recall
    roc_auc = roc_auc_score(true, predicted) #Calculate Roc
    return acc, f1 , precision, recall, roc_auc

In [49]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,precision_score, recall_score, f1_score, roc_auc_score,confusion_matrix
models = {
     "Logistic Regression": LogisticRegression(),
    "RandomForestClassifier":RandomForestClassifier(n_estimators =14, criterion = 'entropy', random_state = 41)
}
models_list = []
accuracy_list = []
  
for i in range(len(list(models))):
        model = list(models.values())[i]
        model.fit(X_train, y_train) # Train model

        # Make predictions
        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)

        # Training set performance
        model_train_accuracy, model_train_f1,model_train_precision,\
        model_train_recall,model_train_rocauc_score=evaluate_model(y_train ,y_train_pred)
        
        # Test set performance
        model_test_accuracy,model_test_f1,model_test_precision,\
        model_test_recall,model_test_rocauc_score=evaluate_model(y_test, y_test_pred)
        
        print(list(models.keys())[i])
        models_list.append(list(models.keys())[i])

        print('Model performance for Training set')
        print("- Accuracy: {:.4f}".format(model_train_accuracy))
        print('- F1 score: {:.4f}'.format(model_train_f1)) 
        print('- Precision: {:.4f}'.format(model_train_precision))
        print('- Recall: {:.4f}'.format(model_train_recall))
        print('- Roc Auc Score: {:.4f}'.format(model_train_rocauc_score))
        print('----------------------------------')

        print('Model performance for Test set')
        print('- Accuracy: {:.4f}'.format(model_test_accuracy))
        print('- F1 score: {:.4f}'.format(model_test_f1))
        print('- Precision: {:.4f}'.format(model_test_precision))
        print('- Recall: {:.4f}'.format(model_test_recall))
        print('- Roc Auc Score: {:.4f}'.format(model_test_rocauc_score))
        accuracy_list.append(model_test_accuracy)
    
        print('='*35)
        print('\n')

Logistic Regression
Model performance for Training set
- Accuracy: 0.8953
- F1 score: 0.9244
- Precision: 0.9091
- Recall: 0.9402
- Roc Auc Score: 0.8701
----------------------------------
Model performance for Test set
- Accuracy: 0.8837
- F1 score: 0.9206
- Precision: 0.9062
- Recall: 0.9355
- Roc Auc Score: 0.8427


RandomForestClassifier
Model performance for Training set
- Accuracy: 1.0000
- F1 score: 1.0000
- Precision: 1.0000
- Recall: 1.0000
- Roc Auc Score: 1.0000
----------------------------------
Model performance for Test set
- Accuracy: 0.8140
- F1 score: 0.8750
- Precision: 0.8485
- Recall: 0.9032
- Roc Auc Score: 0.7433




  y = column_or_1d(y, warn=True)
  return fit_method(estimator, *args, **kwargs)
