# Part-1: Data Exploration and Pre-processing

### Installing libraries

In [1]:
#for plotting missing values
!pip install missingno
!pip install xgboost



### Importing Dependencies

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import missingno as missing

import warnings
warnings.filterwarnings('ignore')

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
# NOTE: plot_confusion_matrix was removed in scikit-learn 1.2; it is kept here only
# for cells that still call it and must be dropped when upgrading past 1.1.
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    accuracy_score,
    plot_confusion_matrix,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split, ShuffleSplit, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

### Importing Data

In [None]:
# Load the dataset; the path is relative to the notebook's working directory.
df=pd.read_csv('collegePlace.csv')

In [None]:
#5 sample rows of data (no random_state is passed, so the rows differ between runs)
df.sample(5)

### Shape & Size of Data

In [None]:
# (number of rows, number of columns)
df.shape

In [None]:
# Total number of cells in the frame (rows * columns).
# The disabled line below looks like a manual cross-check of that product — verify against df.shape.
#print(2966*8)
df.size

### Name of Columns

In [None]:
# Column labels of the loaded frame
df.columns

### Data Types of Features

In [None]:
# dtype of every column; object dtype marks the text columns
df.dtypes

**Observations : Two features are categorical type, we will encode them later.**

In [None]:
# One-shot summary: dtypes, non-null counts and memory usage
df.info()

### Unique number of values in a particular feature

In [None]:
# Number of distinct values in each column
df.nunique()

### Value Counts

In [None]:
# Percentage share of every distinct value, printed column by column
# (the loop runs over all columns, not only the categorical ones).
n_rows = df.shape[0]
for col in df.columns:
    pct = (df[col].value_counts()/n_rows)*100
    print(f'{col}\n{pct}\n')

### Missing Values

In [None]:
# Count of missing entries per column (isna is the same check as isnull)
df.isna().sum()

**Observations : There are no missing values.**

In [None]:
# Bar chart of non-null counts per column, drawn in black.
missing.bar(df, color=(0, 0, 0))
plt.title(label='Non-Missing Values', fontsize=45, y=1.15)

### Duplicate rows

In [None]:
# Number of rows that exactly repeat an earlier row
df.duplicated().sum()

In [None]:
# Dropping duplicates is currently disabled, so duplicated rows stay in df.
#df.drop_duplicates(inplace=True)

In [None]:
# Shape is unchanged from the first check because the drop_duplicates call is commented out
df.shape

### Statistical Description of Numerical & Categorical Features

In [None]:
# Summary statistics of the non-object columns, rounded to 2 decimals
df.describe(exclude='object').round(2)

In [None]:
# Summary (count / unique / top / freq) of the columns that are neither float nor int64
df.describe(exclude=['float', 'int64']).round(2)

### Correlation 

In [None]:
# Pairwise correlation of the numeric columns only.
# df still holds the text columns Gender and Stream at this point; pandas >= 2.0 raises
# on them instead of silently skipping them, so select the numeric columns explicitly.
df.select_dtypes(include='number').corr()

##### Observations : CGPA & Internships looks highly correlated with Placements

### Exploratory Data Analysis

#### UNIVARIATE ANALYSIS

##### Find distribution of age, mean age of students

In [None]:
# Histogram of student ages with dashed guides at the mean (red) and median (yellow).
fig = px.histogram(data_frame=df, x='Age', title="<b>Age of Students</b>")

age_guides = [(df['Age'].mean(), "red"), (df['Age'].median(), "yellow")]
for guide_x, guide_color in age_guides:
    fig.add_vline(x=guide_x, line_width=2, line_dash="dash", line_color=guide_color)

fig.show()

##### Observations : 
1. The most common age among the students is 21
2. Since Age is positively skewed, mean>median
3. There seems to have some outliers of age more than 24 (isn't that strange, Engineering students with this age)

In [None]:
# Mean, median and mode of Age.
# mode() returns a Series (there can be several modes); interpolating it directly
# prints the Series repr (index and dtype) inside the message, so list the values instead.
age = df['Age']
print(f'Mean - {age.mean()}')
print(f'Median - {age.median()}')
print(f'Mode-{age.mode().tolist()}')

##### Find ratio of males and females

In [None]:
# Donut chart of the share of each Gender value.
px.pie(
    data_frame=df,
    names='Gender',
    color='Gender',
    hole=0.5,
    title="<b>Gender Ratio</b>",
    template="plotly_dark",
)

##### Observations : Most of the students are Male, very few are females.

##### Display word cloud for streams taken by students

In [None]:
# Disabled: neither plot_word_cloud nor `data` is defined anywhere in this notebook.
#plot_word_cloud(data, "Stream")

In [None]:
# Number of students per stream, one colour per stream.
px.histogram(
    data_frame=df,
    x='Stream',
    color='Stream',
)

##### Observations : Most of the students belong to Computer Science and very few to Civil Engineering.

##### Find distribution of internships done by students

In [None]:
# Number of students per internship count, one colour per count.
px.histogram(
    data_frame=df,
    x='Internships',
    color='Internships',
)

##### Observations : 
1. Highest number of students haven't done any internship.
2. very few students have done more than one internship

##### Find distribution of CGPA acquired by students in their Engineering

In [None]:
# Number of students per CGPA value, one colour per value.
px.histogram(
    data_frame=df,
    x='CGPA',
    color='CGPA',
)

##### Observations : 
1. Highest number of students have 7 CGPA, second highest is 8 CGPA
2. Very few students are there having 9 CGPA

##### Find ratio of students living in hostel or not

In [None]:
# Donut chart of the share of each Hostel value.
px.pie(
    data_frame=df,
    names='Hostel',
    color='Hostel',
    hole=0.5,
    title="<b>Hostlers Ratio</b>",
    template="plotly_dark",
)

##### Observations : Most of the students are local residents & very few stay in hostels.

##### Find ratio of students having backlogs or not

In [None]:
# Donut chart of the share of each HistoryOfBacklogs value.
px.pie(
    data_frame=df,
    names='HistoryOfBacklogs',
    color='HistoryOfBacklogs',
    hole=0.5,
    title="<b>Backlogs Ratio</b>",
    template="plotly_dark",
)

##### Observations : Very few students have backlogs(sounds great!)

##### Find the ratio of students who got placed to those who did not get placed

In [None]:
# A notebook cell only renders its last expression, so the histogram used to be built and
# discarded. Call .show() on each figure so both the counts and the ratio are displayed.
px.histogram(x='PlacedOrNot', data_frame=df, color='PlacedOrNot').show()
px.pie(title="<b>Placement Ratio</b>", names='PlacedOrNot', data_frame=df, color='PlacedOrNot', hole=0.5, template = "plotly_dark").show()

##### Observation : The number of students who got placed is approximately the same as the number who did not, hence the target is almost balanced.

#### BIVARIATE ANALYSIS

##### Find gender-wise average age

In [None]:
# Age histogram split by gender, with a violin marginal and a dashed black line
# at the overall mean age.
mean_age = df['Age'].mean()

fig = px.histogram(
    data_frame=df,
    x='Age',
    color="Gender",
    marginal='violin',
    title="<b>Average Age Gender wise</b>",
)
fig.update_traces(marker=dict(opacity=0.7))
fig.add_vline(x=mean_age, line_width=2, line_dash="dash", line_color="black")
fig.show()

##### Observations : 
1. Males are more in number than females in every age category.

##### What is the effect of age on getting placed

In [None]:
# Effect of age on getting placed: grouped bars per age, one bar per PlacedOrNot value.
# The colour sequence has a single entry, so the groups are told apart by fill pattern.
fig = px.histogram(
    data_frame=df,
    x="Age",
    color="PlacedOrNot",
    pattern_shape="PlacedOrNot",
    barmode='group',
    color_discrete_sequence=['blue'],
    template='plotly_dark',
)
fig.update_layout(bargap=0.2)
fig.show()

##### Observations : 
1. Students having ages "21" & "22" have more chances of getting placed through campus placements.
2. Students whose age is between 28-30 have fewer chances of getting placed.

##### How gender is affecting placement

In [None]:
# Effect of gender on placement: grouped bars per gender, one bar per PlacedOrNot value.
# The colour sequence has a single entry, so the groups are told apart by fill pattern.
fig = px.histogram(
    data_frame=df,
    x="Gender",
    color="PlacedOrNot",
    pattern_shape="PlacedOrNot",
    barmode='group',
    color_discrete_sequence=['blue'],
    template='plotly_dark',
)
fig.update_layout(bargap=0.2)
fig.show()

##### Observations : Generally males are getting placed more than females

##### Which stream students have best and worst placement records

In [None]:
# Placement record per stream: student counts per stream, coloured by PlacedOrNot.
title_style = dict(
    title_x=0.5,
    title_font=dict(size=20),
    uniformtext_minsize=15,
)

fig = px.histogram(
    data_frame=df,
    x="Stream",
    color="PlacedOrNot",
    title="<b>Counts of Stream</b>",
    template='plotly_dark',
)
fig.update_layout(**title_style)
fig.show()

##### Observations : 
1. Most of the students who got placed are from Computer Science and Information Technology Stream.
2. Very few students from the "Civil" and "Electrical" streams got placed successfully.

##### How the number of internships done is affecting placement

In [None]:
# Effect of internship count on placement: grouped bars per count, one bar per PlacedOrNot value.
# The colour sequence has a single entry, so the groups are told apart by fill pattern.
fig = px.histogram(
    data_frame=df,
    x="Internships",
    color="PlacedOrNot",
    pattern_shape="PlacedOrNot",
    barmode='group',
    color_discrete_sequence=['blue'],
    template='plotly_dark',
)
fig.update_layout(bargap=0.2)
fig.show()

##### Observations :
1. Most of students who have done 1 or even no internships have got placed
2. Very few students have done more than 1 internship and got successfully placed

In [None]:
# Rows of students who have done no internship
zero_internships = df.loc[df['Internships'].eq(0)]

In [None]:
# Placement outcome counts among students with zero internships.
fig = px.histogram(
    zero_internships,
    x='PlacedOrNot',
    color='PlacedOrNot',
    template='plotly_dark',
    title="<b>Zero Internship Experience Vs Placement</b>",
)
fig.update_layout(bargap=0.2)
fig.show()

##### Observations : 
1. Even after doing no internship, ratio of getting placed or not is almost the same.

##### What role CGPA is playing in getting placed

In [None]:
# Overlaid KDE curves of CGPA for placed (orange) and not-placed (blue) students.
fig = plt.figure(figsize=(10,7))
ax = fig.add_subplot(111)
ax.set_title('Placement Distribution By CGPA', size=28)

# In-plot labels, positioned in data coordinates (x = CGPA, y = density).
label_font = {'fontproperties': 'Serif', 'size': '20', 'weight': 'bold'}
ax.text(9, 0.5, 'Placed', {**label_font, 'color': 'orange'}, alpha=0.9)
ax.text(4.5, 0.5, 'Not Placed', {**label_font, 'color': 'blue'}, alpha=0.9)

# `fill=True` is the current spelling of the deprecated `shade=True`; the drawn curves are the same.
sns.kdeplot(data=df[df['PlacedOrNot']==1], x='CGPA', fill=True, ax=ax, color='orange', alpha=1)
sns.kdeplot(data=df[df['PlacedOrNot']==0], x='CGPA', fill=True, ax=ax, color='blue', alpha=0.5)

ax.set_xlabel("CGPA", fontdict={'fontproperties': 'Serif','size': '25','weight': 'bold'})
plt.show()

##### Observations : 
1. 100% students who have CGPA > 7 got placed
2. approximately no students who have CGPA <6 got placed
3. Very few students having CGPA 6 or 7 got placed

In [None]:
# Split the students at the mean CGPA: strictly above vs. at-or-below.
cgpa_mean = df['CGPA'].mean()
is_above_avg = df['CGPA'] > cgpa_mean

cgpa_above_avg = df[is_above_avg]
cgpa_below_avg = df[df['CGPA'] <= cgpa_mean]

In [None]:
# Display the subset of students whose CGPA is above the mean
cgpa_above_avg

In [None]:
# Placement outcome per CGPA value among students above the mean CGPA.
# The colour sequence has a single entry, so the groups are told apart by fill pattern.
fig = px.histogram(
    cgpa_above_avg,
    x="CGPA",
    color="PlacedOrNot",
    pattern_shape="PlacedOrNot",
    barmode='group',
    color_discrete_sequence=['blue'],
    template='plotly_dark',
    title="<b>Above Average CGPA Vs Placement</b>",
)
fig.update_layout(bargap=0.2)
fig.show()

##### Observations : 100% students who have CGPA > 7 got placed

In [None]:
# Placement outcome per CGPA value among students at or below the mean CGPA.
fig = px.histogram(
    cgpa_below_avg,
    x='CGPA',
    color='PlacedOrNot',
    barmode='group',
    template='plotly_dark',
    title="<b>Below Average CGPA Vs Placement</b>",
)

# Bar gap is left at the plotly default here.
#fig.update_layout(bargap=0.2)

fig.show()

##### Observations : Very few students having CGPA 5 got placed

##### Does living in a hostel have an impact on getting placed

In [None]:
# Does living in a hostel have an impact on getting placed:
# grouped bars per Hostel value, coloured by PlacedOrNot.
fig = px.histogram(
    data_frame=df,
    x='Hostel',
    color='PlacedOrNot',
    barmode='group',
).update_layout(bargap=0.2)
fig.show()

**Observations : Local residents are the ones who got more placements, while hostellers are approximately equally split between placed and not placed.**

##### What is the effect of backlogs on placements

In [None]:
# Effect of backlogs on placement: grouped bars per HistoryOfBacklogs value,
# coloured by PlacedOrNot.
fig = px.histogram(
    data_frame=df,
    x='HistoryOfBacklogs',
    color='PlacedOrNot',
    barmode='group',
).update_layout(bargap=0.2)
fig.show()

**Observations : Most of the students who don't have backlogs got placed.**

#### Multivariate Analysis

In [None]:
# Per-stream summary: mean Age and CGPA, total Internships and total placed students.
by_stream = df.groupby('Stream')
stream_wise = by_stream.agg(
    Age=('Age', 'mean'),
    Internships=('Internships', 'sum'),
    CGPA=('CGPA', 'mean'),
    PlacedOrNot=('PlacedOrNot', 'sum'),
)

# Highlight the largest value in each column.
stream_wise.style.highlight_max()

In [None]:
# Grouped bars of the per-stream summary, one bar per aggregated column.
px.bar(
    stream_wise,
    barmode='group',
    template="plotly_dark",
    title="<b>Stream wise Analyzing</b>",
)

**Observations : CS & IT students have done maximum internships and got maximum placements.**

##### Pairplot

In [None]:
#Display pair plot between all the features, coloured by placement outcome.
# `height` is the current name of the deprecated `size` argument (facet height in inches).
sns.pairplot(df, hue='PlacedOrNot', height=3, palette='Blues_r')
plt.show()

#### Heatmap

In [None]:
#heatmap of the pairwise correlations between the numeric columns.
# df still holds the text columns Gender and Stream here; pandas >= 2.0 raises on them
# in corr() instead of silently skipping them, so select the numeric columns explicitly.
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True)

**Observations : CGPA & Internships are highly correlated with Placements.**

# Part-2: Working with models

#### Perform encoding on stream features

In [None]:
# Distinct Gender labels and how many there are.
gender_col = df['Gender']
print(gender_col.unique())
print(gender_col.nunique())

In [None]:
# Distinct Stream labels and how many there are.
stream_col = df['Stream']
print(stream_col.unique())
print(stream_col.nunique())

In [None]:
# One-hot encode Gender and Stream; drop_first removes the first level of each
# so the remaining indicator columns are not redundant.
gender_dummy, stream_dummy = (
    pd.get_dummies(df[column], drop_first=True)
    for column in ('Gender', 'Stream')
)

In [None]:
# Inspect the encoded Gender indicator column(s)
gender_dummy

In [None]:
# Inspect the encoded Stream indicator columns
stream_dummy

In [None]:
# Swap the text columns for their indicator columns.
# Note: this rebinds df, so the cells that read df['Gender'] / df['Stream'] cannot be
# re-run after this one without reloading the data.
numeric_part = df.drop(columns=['Gender', 'Stream'])
df = pd.concat([numeric_part, gender_dummy, stream_dummy], axis='columns')
df

##### Rearrange features

In [None]:
# Put the predictors first and the target (PlacedOrNot) last.
ordered_columns = [
    'Age',
    'Male',
    'Electronics And Communication',
    'Computer Science',
    'Information Technology',
    'Mechanical',
    'Electrical',
    "Internships",
    "CGPA",
    'Hostel',
    'HistoryOfBacklogs',
    'PlacedOrNot',
]
df = df[ordered_columns]

df

#### Scaling the dataset

In [None]:
# Standardise every predictor (all columns except the target).
# Note: the scaler is fitted on all rows, before any train/test split.
predictors = df.drop(columns='PlacedOrNot')
scaler = StandardScaler()
scaled_features = scaler.fit_transform(predictors)
scaled_features

In [None]:
# Wrap the scaled array back into a frame with the predictor column names
# (every column of df except the trailing target).
predictor_names = df.columns[:-1]
df_predictors = pd.DataFrame(data=scaled_features, columns=predictor_names)
df_predictors

#### Visualize correlation among the features, also check if there is multicollinearity present in data

In [None]:
# Annotated correlation heatmap of the encoded frame (predictors and target).
fig = plt.figure(figsize=(10, 10))
corr_matrix = df.corr()
sns.heatmap(data=corr_matrix, annot=True)

##### Observation : Internship and CGPA is highly correlated with dependent feature i.e PlacedOrNot

#### Split the data into train and test and check the shapes

In [None]:
#Splitting data into train and test
# Split the unscaled predictors first and fit the scaler on the training rows only, so the
# test rows do not leak into the mean/std used for standardisation. The same rows are
# selected as before because the row count and random_state are unchanged.
features = df.drop('PlacedOrNot', axis=1)
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    features, df['PlacedOrNot'], test_size = 0.25, random_state = 0)

train_scaler = StandardScaler().fit(X_train_raw)
X_train = pd.DataFrame(train_scaler.transform(X_train_raw),
                       columns=features.columns, index=X_train_raw.index)
X_test = pd.DataFrame(train_scaler.transform(X_test_raw),
                      columns=features.columns, index=X_test_raw.index)

In [None]:
# Report the shape of each of the four split pieces, one per line.
split_parts = (
    ('X_train', X_train),
    ('X_test', X_test),
    ('y_train', y_train),
    ('y_test', y_test),
)
for part_name, part in split_parts:
    print(f'Shape of {part_name} - {part.shape}')

In [None]:
# First rows of the training predictors
X_train.head()

In [None]:
# First rows of the test predictors
X_test.head()

In [None]:
# First values of the training target
y_train.head()

In [None]:
# First values of the test target
y_test.head()

#### Apply Logistic Regression, Decision Tree, Random Forest, XGBoost, K Neighbors Classifier

In [None]:
# Name -> unfitted estimator, all with default hyper-parameters.
# NOTE(review): this dict is not referenced by any later cell in this notebook;
# the training loops iterate over `model_list` instead.
models = {"LogisticRegression":LogisticRegression(),
         "DecisionTree":DecisionTreeClassifier(),
         "RandomForest":RandomForestClassifier(),
         "XgBoost": XGBClassifier(),
         "KNeighborsClassifier":KNeighborsClassifier()}

In [None]:
# (name, unfitted estimator) pairs compared in the loops that follow.
# The tree-based scikit-learn estimators are randomised; fixing random_state makes
# their reported accuracies reproducible across runs.
model_list = [("LogisticRegression", LogisticRegression()),
         ("DecisionTree", DecisionTreeClassifier(random_state=0)),
         ("RandomForest", RandomForestClassifier(random_state=0)),
         ("XgBoost", XGBClassifier()),
         ("KNeighborsClassifier", KNeighborsClassifier())]

#### Visualize model scores of all the algorithms applied

In [None]:
# accuracy score on train dataset for all models
# (fit returns the estimator itself, so its result does not need to be kept)
for model_name, model in model_list:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_train)
    print(f'{model_name} : {accuracy_score(y_train,y_pred)}')

In [None]:
# accuracy score on test dataset for all models
# Each model is (re)fitted on the training split so this cell also works on its own;
# fit returns the estimator itself, so its result does not need to be kept.
for model_name, model in model_list:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f'{model_name} : {accuracy_score(y_test,y_pred)}')

#### Perform Hyper parameter tuning 

In [None]:
# Candidate estimators and the hyper-parameter grid searched for each.
# random_state is fixed so repeated runs select the same best parameters.
algos = {
    'Random Forest':{
        'model':RandomForestClassifier(random_state=0),
        'params':{
            'n_estimators':[10,20,30,40],
            'criterion':['gini','entropy'],
            'max_depth':[10,20,30],
            'min_samples_split':[2,4,6]
        }
    },
    'Decision Tree':{
        'model':DecisionTreeClassifier(random_state=0),
        'params':{
            'criterion':['gini','entropy'],
            'max_depth':[10,20,30],
            'splitter':['best','random'],
        }
    }
}

# One result row per algorithm; kept as a list so `scores` is only ever a DataFrame.
score_rows = []

# 10 random 80/20 re-splits of whatever data is handed to the search.
cv = ShuffleSplit(n_splits=10,test_size=0.2,random_state=42)

for model_name,config in algos.items():
    gd = GridSearchCV(estimator=config['model'], param_grid=config['params'], cv=cv, return_train_score=False)
    # Search on the training split only: tuning on all rows would let the held-out test
    # rows influence the chosen parameters and make the later test accuracy optimistic.
    gd.fit(X_train, y_train)
    score_rows.append({'model_name':model_name,'best_score':gd.best_score_,'best_params':gd.best_params_})

# After the loop `gd` refers to the last search only (the 'Decision Tree' entry).
scores = pd.DataFrame(score_rows)
scores.head()

#### Check for best estimator and hyper parameter 

In [None]:
# `gd` is the search fitted in the last loop iteration, i.e. the 'Decision Tree' entry of
# `algos` — not necessarily the best-scoring algorithm overall (compare with `scores`).
gd.best_estimator_

In [None]:
# Best parameter combination of the last search only (the 'Decision Tree' entry of `algos`)
gd.best_params_

#### Training model with best hyper parameters

In [None]:
# Final model. The hyper-parameters are hard-coded rather than read from gd.best_params_,
# so check them against the search output after re-running it.
# random_state is fixed so the accuracy and confusion matrix are reproducible.
classifier = DecisionTreeClassifier(max_depth=10, criterion='gini', splitter='best', random_state=0)
classifier

In [None]:
# Fit the final tree on the training split and report accuracy on the test split.
y_pred = classifier.fit(X_train, y_train).predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {test_accuracy}")

####  Plot Confusion matrix

In [None]:
# Confusion matrix of the final classifier on the test split.
# plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in 1.2;
# ConfusionMatrixDisplay.from_estimator is its replacement and takes the same arguments.
ConfusionMatrixDisplay.from_estimator(classifier,
                                      X_test, y_test,
                                      cmap = plt.cm.Blues,
                                      display_labels = ['Not Placed', 'Placed'])
plt.grid(False)
plt.show();

##### Accuracy = (TP+TN)/(TP+TN+FP+FN)

In [None]:
#Accuracy (in %), computed from the fitted classifier instead of counts copied by hand
# from an earlier confusion-matrix plot, which go stale whenever the model is re-trained.
accuracy_score(y_test, classifier.predict(X_test))*100

##### Precision = TP/(TP+FP)
Focuses on False Positives

In [None]:
# Precision (in %) for the positive class (PlacedOrNot == 1), computed from the fitted
# classifier instead of counts copied by hand from an earlier confusion-matrix plot.
precision_score(y_test, classifier.predict(X_test))*100

##### Recall= TP/(TP+FN)
Focuses on False Negatives

In [None]:
# Recall (in %) for the positive class (PlacedOrNot == 1), computed from the fitted
# classifier instead of counts copied by hand from an earlier confusion-matrix plot.
recall_score(y_test, classifier.predict(X_test))*100

##### Observation : 

**END OF DOCUMENT**