In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, accuracy_score,classification_report


In [None]:
# Load the placement dataset and drop the sl_no column, which is not useful.
df = pd.read_csv("Placement_Data_Full_Class.csv").drop(columns=['sl_no'])

# Encode the target column numerically: 'Not Placed' -> 0, 'Placed' -> 1.
status_map = {'Not Placed': 0, 'Placed': 1}
df['status'] = df['status'].map(status_map)

## EDA

In [None]:
# Column names remaining after `sl_no` was dropped.
df.columns  

In [None]:
# Columns plotted as category shares (includes the already-mapped target).
categorical_columns = [
    'gender',
    'ssc_b',
    'hsc_b',
    'hsc_s',
    'degree_t',
    'workex',
    'specialisation',
    'status',
]

# Columns plotted as continuous distributions.
numerical_columns = [
    'ssc_p',
    'hsc_p',
    'degree_p',
    'etest_p',
    'mba_p',
    'salary',
]

In [None]:
# Histogram with a KDE overlay for every numerical feature, on a 2x3 grid.
plt.figure(figsize=(25, 15))
plt.suptitle("Distributions for Numerical Features", fontsize=50)
for position, column in enumerate(numerical_columns, start=1):
    plt.subplot(2, 3, position)
    sns.histplot(df[column], kde=True, color='green')
    plt.title(f'{column} distribution')
plt.show()

In [None]:
# One pie chart per categorical feature on a 2x4 grid, showing each
# category's share of the rows.
plt.figure(figsize=(25, 15))
plt.suptitle("Distributions for Categorical Features", fontsize=50)
for position, column in enumerate(categorical_columns, start=1):
    category_counts = df[column].value_counts()
    plt.subplot(2, 4, position)
    plt.pie(category_counts, labels=category_counts.index, autopct='%1.1f%%', startangle=90)
    plt.title(f'{column} distribution')

plt.show()
    

In [None]:
# Correlation heatmap of the numerical features together with the target.
corr_columns = ['ssc_p', 'hsc_p', 'degree_p', 'etest_p', 'mba_p', 'salary', 'status']
df_for_corr = df[corr_columns]
corr_matrix = df_for_corr.corr()

plt.figure(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', linewidths=2)
plt.show()

## Interesting finding: the correlation between salary and placement status is 0. Looking at the value counts of placement status and at the rows where salary is null shows that wherever salary is null, placement status is 0, i.e. no salary was offered to the candidate. This makes sense, because the column description says "Salary offered by corporate to candidates". If we dropped the rows with a null salary, only one class would remain and we would trivially get 100% accuracy, which is why we impute null salaries with 0 instead. Note that after this imputation salary is 0 exactly when placement status is 0, so salary on its own reveals the target.

In [None]:
# Rows where no salary was recorded.
df.loc[df['salary'].isna()]

In [None]:
# Class balance of the target (0 = Not Placed, 1 = Placed after the mapping above).
df['status'].value_counts()

In [None]:
# Replace missing salaries with 0 (no salary was offered to these candidates).
df = df.assign(salary=df['salary'].fillna(0))

## Data Encoding Categorical Columns

In [None]:
# columns to binary (0/1) encode => gender,ssc_b,hsc_b,workex,specialisation
# columns to one hot encode => hsc_s,degree_t

In [None]:
# Binary (0/1) encodings for the two-valued categorical columns.
map_gender = {'M': 0, 'F': 1}
map_ssc_b = {'Others': 0, 'Central': 1}
map_hsc_b = {'Others': 0, 'Central': 1}
map_workex = {'No': 0, 'Yes': 1}
map_specialisation = {'Mkt&HR': 0, 'Mkt&Fin': 1}

# Column name -> mapping applied to that column.
binary_encodings = {
    'gender': map_gender,
    'ssc_b': map_ssc_b,
    'hsc_b': map_hsc_b,
    'workex': map_workex,
    'specialisation': map_specialisation,
}
for column_name, value_map in binary_encodings.items():
    # Series.map yields NaN for any value that is missing from the mapping.
    df[column_name] = df[column_name].map(value_map)

In [None]:
# One-hot encode the multi-valued categorical columns.
# The prefix 'one_hot_' combined with get_dummies' default '_' separator
# produces names such as 'one_hot__Arts' (double underscore).
one_hot_cols = ['hsc_s', 'degree_t']

# "* 1" turns the indicator columns into integers.
dummy_frames = [pd.get_dummies(df[col], prefix='one_hot_') * 1 for col in one_hot_cols]

# Swap the source columns for their indicator columns, appended in the same
# order as one_hot_cols.
df = pd.concat([df.drop(columns=one_hot_cols), *dummy_frames], axis=1)


## Train Test Split and Scaling the data

In [None]:
# Features and target for the neural-network experiments.
# `salary` is excluded from the features: after the null imputation it is 0
# exactly for the candidates who were not placed, so it encodes the target and
# would leak the label into the model.
X = df.drop(columns=['status', 'salary'])
y = df['status']

# 80/20 hold-out split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)

In [None]:
# Feature standardizer; it is fitted on the training split only in the next cell.
scaler = StandardScaler()

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits so no test information is used in fitting.
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Display the unscaled feature frame: only X_train / X_test were replaced by
# scaled arrays, X itself was not modified by the scaler.
X

## Training different neural network architectures

In [None]:
# Candidate MLP architectures, keyed by a short model name.
# random_state is fixed so that the random parts of training (e.g. weight
# initialisation) -- and therefore the reported metrics -- are reproducible
# across runs.
model_architecture_options = {
    'model_1': MLPClassifier(hidden_layer_sizes=(17, 17, 17, 17), max_iter=200, random_state=101),
    'model_2': MLPClassifier(hidden_layer_sizes=(5, 5, 23, 25), max_iter=200, random_state=101),
    'model_3': MLPClassifier(hidden_layer_sizes=(17, 17, 8, 8, 17, 17), max_iter=200, random_state=101),
    'model_4': MLPClassifier(hidden_layer_sizes=(8, 8, 17, 17, 8), max_iter=200, random_state=101),
}

In [None]:
# Fit every candidate architecture on the training split and report its
# performance on the held-out test split.
for model_type, model in model_architecture_options.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)

    # scikit-learn metric functions expect (y_true, y_pred) in that order.
    # Passing the predictions first transposes the confusion matrix and swaps
    # precision with recall in the classification report.
    print(f'Model: {model_type}')
    print(f'Confusion Matrix: \n{confusion_matrix(y_test, preds)}\n')
    print(f'Classification Report: \n{classification_report(y_test, preds)}\n')
    print(f'Accuracy Score: \n{accuracy_score(y_test, preds)}\n')
    print('***********************************************')

## Training Different Decision Trees

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from graphviz import Source
from IPython.display import SVG

### With Salary

In [None]:
# Depth-5 tree on a small feature subset that includes `salary`.
# NOTE(review): after the null imputation salary is 0 exactly for unplaced
# candidates, so this tree can separate the classes on salary alone.
feature_cols = ['gender', 'ssc_p', 'ssc_b', 'degree_p', 'salary', 'mba_p']
X = df[feature_cols]

# random_state is fixed so tie-breaking between equally good splits is
# deterministic and the plotted tree / importances are reproducible.
treeclf = DecisionTreeClassifier(max_depth=5, random_state=101)
treeclf.fit(X, y)  # fitted on the full dataset; no hold-out evaluation here

# Render the fitted tree inline as an SVG via graphviz.
dot_source = tree.export_graphviz(treeclf, out_file=None,
                                  feature_names=feature_cols,
                                  class_names=['0', '1'], filled=True)
graph = Source(dot_source)
svg = SVG(graph.pipe(format='svg'))
display(svg)

# Feature importances reported by the fitted tree.
pd.DataFrame({'feature': feature_cols, 'importance': treeclf.feature_importances_})

### Max depth 3-5 with a few features

In [None]:
# Depth-3 tree on a small feature subset (salary excluded).
feature_cols = ['gender', 'ssc_p', 'ssc_b', 'degree_p', 'mba_p']
X = df[feature_cols]

# random_state is fixed so tie-breaking between equally good splits is
# deterministic and the plotted tree / importances are reproducible.
treeclf = DecisionTreeClassifier(max_depth=3, random_state=101)
treeclf.fit(X, y)  # fitted on the full dataset; no hold-out evaluation here

# Render the fitted tree inline as an SVG via graphviz.
dot_source = tree.export_graphviz(treeclf, out_file=None,
                                  feature_names=feature_cols,
                                  class_names=['0', '1'], filled=True)
graph = Source(dot_source)
svg = SVG(graph.pipe(format='svg'))
display(svg)

# Feature importances reported by the fitted tree.
pd.DataFrame({'feature': feature_cols, 'importance': treeclf.feature_importances_})

In [None]:
# Depth-4 tree on a small feature subset (salary excluded).
feature_cols = ['gender', 'ssc_p', 'ssc_b', 'degree_p', 'mba_p']
X = df[feature_cols]

# random_state is fixed so tie-breaking between equally good splits is
# deterministic and the plotted tree / importances are reproducible.
treeclf = DecisionTreeClassifier(max_depth=4, random_state=101)
treeclf.fit(X, y)  # fitted on the full dataset; no hold-out evaluation here

# Render the fitted tree inline as an SVG via graphviz.
dot_source = tree.export_graphviz(treeclf, out_file=None,
                                  feature_names=feature_cols,
                                  class_names=['0', '1'], filled=True)
graph = Source(dot_source)
svg = SVG(graph.pipe(format='svg'))
display(svg)

# Feature importances reported by the fitted tree.
pd.DataFrame({'feature': feature_cols, 'importance': treeclf.feature_importances_})

In [None]:
# Depth-5 tree on a small feature subset (salary excluded).
feature_cols = ['gender', 'ssc_p', 'ssc_b', 'degree_p', 'mba_p']
X = df[feature_cols]

# random_state is fixed so tie-breaking between equally good splits is
# deterministic and the plotted tree / importances are reproducible.
treeclf = DecisionTreeClassifier(max_depth=5, random_state=101)
treeclf.fit(X, y)  # fitted on the full dataset; no hold-out evaluation here

# Render the fitted tree inline as an SVG via graphviz.
dot_source = tree.export_graphviz(treeclf, out_file=None,
                                  feature_names=feature_cols,
                                  class_names=['0', '1'], filled=True)
graph = Source(dot_source)
svg = SVG(graph.pipe(format='svg'))
display(svg)

# Feature importances reported by the fitted tree.
pd.DataFrame({'feature': feature_cols, 'importance': treeclf.feature_importances_})

### One Hot Encoded

In [None]:
# Depth-3 tree using only the one-hot encoded columns.
feature_cols = ['one_hot__Arts', 'one_hot__Commerce', 'one_hot__Science', 'one_hot__Comm&Mgmt', 'one_hot__Others', 'one_hot__Sci&Tech']
X = df[feature_cols]

# random_state is fixed so tie-breaking between equally good splits is
# deterministic and the plotted tree / importances are reproducible.
treeclf = DecisionTreeClassifier(max_depth=3, random_state=101)
treeclf.fit(X, y)  # fitted on the full dataset; no hold-out evaluation here

# Render the fitted tree inline as an SVG via graphviz.
dot_source = tree.export_graphviz(treeclf, out_file=None,
                                  feature_names=feature_cols,
                                  class_names=['0', '1'], filled=True)
graph = Source(dot_source)
svg = SVG(graph.pipe(format='svg'))
display(svg)

# Feature importances reported by the fitted tree.
pd.DataFrame({'feature': feature_cols, 'importance': treeclf.feature_importances_})

In [None]:
# Depth-4 tree using only the one-hot encoded columns.
feature_cols = ['one_hot__Arts', 'one_hot__Commerce', 'one_hot__Science', 'one_hot__Comm&Mgmt', 'one_hot__Others', 'one_hot__Sci&Tech']
X = df[feature_cols]

# random_state is fixed so tie-breaking between equally good splits is
# deterministic and the plotted tree / importances are reproducible.
treeclf = DecisionTreeClassifier(max_depth=4, random_state=101)
treeclf.fit(X, y)  # fitted on the full dataset; no hold-out evaluation here

# Render the fitted tree inline as an SVG via graphviz.
dot_source = tree.export_graphviz(treeclf, out_file=None,
                                  feature_names=feature_cols,
                                  class_names=['0', '1'], filled=True)
graph = Source(dot_source)
svg = SVG(graph.pipe(format='svg'))
display(svg)

# Feature importances reported by the fitted tree.
pd.DataFrame({'feature': feature_cols, 'importance': treeclf.feature_importances_})

In [None]:
# Depth-5 tree using only the one-hot encoded columns.
feature_cols = ['one_hot__Arts', 'one_hot__Commerce', 'one_hot__Science', 'one_hot__Comm&Mgmt', 'one_hot__Others', 'one_hot__Sci&Tech']
X = df[feature_cols]

# random_state is fixed so tie-breaking between equally good splits is
# deterministic and the plotted tree / importances are reproducible.
treeclf = DecisionTreeClassifier(max_depth=5, random_state=101)
treeclf.fit(X, y)  # fitted on the full dataset; no hold-out evaluation here

# Render the fitted tree inline as an SVG via graphviz.
dot_source = tree.export_graphviz(treeclf, out_file=None,
                                  feature_names=feature_cols,
                                  class_names=['0', '1'], filled=True)
graph = Source(dot_source)
svg = SVG(graph.pipe(format='svg'))
display(svg)

# Feature importances reported by the fitted tree.
pd.DataFrame({'feature': feature_cols, 'importance': treeclf.feature_importances_})

### Best 2 From Each Previous Section

In [None]:
# Depth-5 tree on two one-hot columns plus two percentage columns.
feature_cols = ['one_hot__Commerce', 'one_hot__Others', 'ssc_p', 'mba_p']
X = df[feature_cols]

# random_state is fixed so tie-breaking between equally good splits is
# deterministic and the plotted tree / importances are reproducible.
treeclf = DecisionTreeClassifier(max_depth=5, random_state=101)
treeclf.fit(X, y)  # fitted on the full dataset; no hold-out evaluation here

# Render the fitted tree inline as an SVG via graphviz.
dot_source = tree.export_graphviz(treeclf, out_file=None,
                                  feature_names=feature_cols,
                                  class_names=['0', '1'], filled=True)
graph = Source(dot_source)
svg = SVG(graph.pipe(format='svg'))
display(svg)

# Feature importances reported by the fitted tree.
pd.DataFrame({'feature': feature_cols, 'importance': treeclf.feature_importances_})

### All Features

In [None]:
# Depth-5 tree on the percentage and binary-encoded columns
# (salary and the one-hot columns are not part of this list).
feature_cols = ['gender', 'ssc_p', 'ssc_b', 'hsc_p', 'hsc_b', 'degree_p', 'workex', 'etest_p', 'specialisation', 'mba_p']
X = df[feature_cols]

# random_state is fixed so tie-breaking between equally good splits is
# deterministic and the plotted tree / importances are reproducible.
treeclf = DecisionTreeClassifier(max_depth=5, random_state=101)
treeclf.fit(X, y)  # fitted on the full dataset; no hold-out evaluation here

# Render the fitted tree inline as an SVG via graphviz.
dot_source = tree.export_graphviz(treeclf, out_file=None,
                                  feature_names=feature_cols,
                                  class_names=['0', '1'], filled=True)
graph = Source(dot_source)
svg = SVG(graph.pipe(format='svg'))
display(svg)

# Feature importances reported by the fitted tree.
pd.DataFrame({'feature': feature_cols, 'importance': treeclf.feature_importances_})

In [None]:
# Depth-10 tree on the percentage and binary-encoded columns
# (salary and the one-hot columns are not part of this list).
feature_cols = ['gender', 'ssc_p', 'ssc_b', 'hsc_p', 'hsc_b', 'degree_p', 'workex', 'etest_p', 'specialisation', 'mba_p']
X = df[feature_cols]

# random_state is fixed so tie-breaking between equally good splits is
# deterministic and the plotted tree / importances are reproducible.
treeclf = DecisionTreeClassifier(max_depth=10, random_state=101)
treeclf.fit(X, y)  # fitted on the full dataset; no hold-out evaluation here

# Render the fitted tree inline as an SVG via graphviz.
dot_source = tree.export_graphviz(treeclf, out_file=None,
                                  feature_names=feature_cols,
                                  class_names=['0', '1'], filled=True)
graph = Source(dot_source)
svg = SVG(graph.pipe(format='svg'))
display(svg)

# Feature importances reported by the fitted tree.
pd.DataFrame({'feature': feature_cols, 'importance': treeclf.feature_importances_})

### Best 4

In [None]:
# Depth-10 tree on four percentage columns.
feature_cols = ['ssc_p', 'hsc_p', 'degree_p', 'mba_p']
X = df[feature_cols]

# random_state is fixed so tie-breaking between equally good splits is
# deterministic and the plotted tree / importances are reproducible.
treeclf = DecisionTreeClassifier(max_depth=10, random_state=101)
treeclf.fit(X, y)  # fitted on the full dataset; no hold-out evaluation here

# Render the fitted tree inline as an SVG via graphviz.
dot_source = tree.export_graphviz(treeclf, out_file=None,
                                  feature_names=feature_cols,
                                  class_names=['0', '1'], filled=True)
graph = Source(dot_source)
svg = SVG(graph.pipe(format='svg'))
display(svg)

# Feature importances reported by the fitted tree.
pd.DataFrame({'feature': feature_cols, 'importance': treeclf.feature_importances_})

In [None]:
# Depth-5 tree on four percentage columns.
feature_cols = ['ssc_p', 'hsc_p', 'degree_p', 'mba_p']
X = df[feature_cols]

# random_state is fixed so tie-breaking between equally good splits is
# deterministic and the plotted tree / importances are reproducible.
treeclf = DecisionTreeClassifier(max_depth=5, random_state=101)
treeclf.fit(X, y)  # fitted on the full dataset; no hold-out evaluation here

# Render the fitted tree inline as an SVG via graphviz.
dot_source = tree.export_graphviz(treeclf, out_file=None,
                                  feature_names=feature_cols,
                                  class_names=['0', '1'], filled=True)
graph = Source(dot_source)
svg = SVG(graph.pipe(format='svg'))
display(svg)

# Feature importances reported by the fitted tree.
pd.DataFrame({'feature': feature_cols, 'importance': treeclf.feature_importances_})

In [None]:
# Depth-3 tree on four percentage columns.
feature_cols = ['ssc_p', 'hsc_p', 'degree_p', 'mba_p']
X = df[feature_cols]

# random_state is fixed so tie-breaking between equally good splits is
# deterministic and the plotted tree / importances are reproducible.
treeclf = DecisionTreeClassifier(max_depth=3, random_state=101)
treeclf.fit(X, y)  # fitted on the full dataset; no hold-out evaluation here

# Render the fitted tree inline as an SVG via graphviz.
dot_source = tree.export_graphviz(treeclf, out_file=None,
                                  feature_names=feature_cols,
                                  class_names=['0', '1'], filled=True)
graph = Source(dot_source)
svg = SVG(graph.pipe(format='svg'))
display(svg)

# Feature importances reported by the fitted tree.
pd.DataFrame({'feature': feature_cols, 'importance': treeclf.feature_importances_})