# Assignment 2

### Imports

In [1]:
import pandas as  pd

## 1 - Data Handling

### Loading Dataset

In [2]:
# Read the student placement CSV into the working DataFrame
df = pd.read_csv(filepath_or_buffer='college_student_placement_dataset.csv')


### Initial Checks

In [3]:
# Show the first five rows as a quick sanity check of the loaded data
df.head(n=5)

Unnamed: 0,College_ID,IQ,Prev_Sem_Result,CGPA,Academic_Performance,Internship_Experience,Extra_Curricular_Score,Communication_Skills,Projects_Completed,Placement
0,CLG0030,107,6.61,6.28,8,No,8,8,4,No
1,CLG0061,97,5.52,5.37,8,No,7,8,0,No
2,CLG0036,109,5.36,5.83,9,No,3,1,1,No
3,CLG0055,122,5.47,5.75,6,Yes,1,6,1,No
4,CLG0004,96,7.91,7.69,7,No,8,10,2,No


In [4]:
# Print column names, non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 10 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   College_ID              10000 non-null  object 
 1   IQ                      10000 non-null  int64  
 2   Prev_Sem_Result         10000 non-null  float64
 3   CGPA                    10000 non-null  float64
 4   Academic_Performance    10000 non-null  int64  
 5   Internship_Experience   10000 non-null  object 
 6   Extra_Curricular_Score  10000 non-null  int64  
 7   Communication_Skills    10000 non-null  int64  
 8   Projects_Completed      10000 non-null  int64  
 9   Placement               10000 non-null  object 
dtypes: float64(2), int64(5), object(3)
memory usage: 781.4+ KB


In [5]:

# Number of rows that are exact copies of an earlier row
n_duplicate_rows = df.duplicated().sum()
print("Duplicate rows:", n_duplicate_rows)

# Missing values per column, largest count first (displayed as the cell output)
missing_per_column = df.isna().sum()
missing_per_column.sort_values(ascending=False)

Duplicate rows: 0


College_ID                0
IQ                        0
Prev_Sem_Result           0
CGPA                      0
Academic_Performance      0
Internship_Experience     0
Extra_Curricular_Score    0
Communication_Skills      0
Projects_Completed        0
Placement                 0
dtype: int64

### Handling Missing Values & Duplicates

In [6]:

# Remove rows with missing values and exact duplicate rows. The checks above found
# neither in this dataset, so this is only a safeguard.
# Chaining returns a new frame instead of mutating in place, which keeps the cell
# safe to re-run; reset_index keeps the row index contiguous if rows are ever dropped.
df = df.dropna().drop_duplicates().reset_index(drop=True)

### Encoding Categorical Features

In [7]:

# Encode the two Yes/No columns as 1/0 with one shared mapping
yes_no_map = {'Yes': 1, 'No': 0}

for col in ['Internship_Experience', 'Placement']:
    # Strip surrounding whitespace so values such as ' Yes' still match the mapping
    encoded = df[col].str.strip().map(yes_no_map)

    # .map() yields NaN for any label missing from the mapping; fail loudly
    # instead of letting NaN flow silently into the model.
    unmapped = encoded.isna()
    if unmapped.any():
        raise ValueError(
            f"Unexpected values in '{col}': {df.loc[unmapped, col].unique().tolist()}"
        )

    df[col] = encoded.astype('int64')

df.head()

Unnamed: 0,College_ID,IQ,Prev_Sem_Result,CGPA,Academic_Performance,Internship_Experience,Extra_Curricular_Score,Communication_Skills,Projects_Completed,Placement
0,CLG0030,107,6.61,6.28,8,0,8,8,4,0
1,CLG0061,97,5.52,5.37,8,0,7,8,0,0
2,CLG0036,109,5.36,5.83,9,0,3,1,1,0
3,CLG0055,122,5.47,5.75,6,1,1,6,1,0
4,CLG0004,96,7.91,7.69,7,0,8,10,2,0


### Dropping Unused Columns

In [8]:
# College_ID is a string label (e.g. 'CLG0030'); drop it so only numeric columns remain
df = df.drop(columns=['College_ID'])


## 2 - Exploratory Data Analysis

### Basic statistics

In [11]:
import numpy as np

# Summary statistics (count, mean, std, min, quartiles, max) for every column
summary_stats = df.describe()
summary_stats

Unnamed: 0,IQ,Prev_Sem_Result,CGPA,Academic_Performance,Internship_Experience,Extra_Curricular_Score,Communication_Skills,Projects_Completed,Placement
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,99.4718,7.535673,7.532379,5.5464,0.3964,4.9709,5.5618,2.5134,0.1659
std,15.053101,1.447519,1.470141,2.873477,0.489174,3.160103,2.900866,1.715959,0.372009
min,41.0,5.0,4.54,1.0,0.0,0.0,1.0,0.0,0.0
25%,89.0,6.29,6.29,3.0,0.0,2.0,3.0,1.0,0.0
50%,99.0,7.56,7.55,6.0,0.0,5.0,6.0,3.0,0.0
75%,110.0,8.79,8.77,8.0,1.0,8.0,8.0,4.0,0.0
max,158.0,10.0,10.46,10.0,1.0,10.0,10.0,5.0,1.0


### Visualization

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Count of students placed vs not placed (Placement is encoded as 1 = Yes, 0 = No)
count_fig, count_ax = plt.subplots()
sns.countplot(x='Placement', data=df, ax=count_ax)
count_ax.set_title('Placement Count')
count_ax.set_xlabel('Placement (0 = No, 1 = Yes)')
count_ax.set_ylabel('Number of students')
# End with plt.show() so the cell renders only the figure, not a Text(...) repr
plt.show()

: 

### Interactive Plot (Plotly)

In [None]:
import plotly.express as px

# Aggregate first: one row per internship group holding the share of students placed.
# Passing the raw frame would draw one bar mark per student row instead of one bar
# per group. Placement is 0/1, so its mean is the placement rate.
placement_rate = (
    df.groupby('Internship_Experience', as_index=False)['Placement']
      .mean()
      .rename(columns={'Placement': 'Placement_Rate'})
)
# Show the 0/1 flag as No/Yes so Plotly treats it as a category (discrete colors)
placement_rate['Internship_Experience'] = placement_rate['Internship_Experience'].map({0: 'No', 1: 'Yes'})

# Interactive bar chart: placement rate by Internship Experience
fig = px.bar(
    placement_rate,
    x='Internship_Experience',
    y='Placement_Rate',
    color='Internship_Experience',
    barmode='group',
    labels={
        'Internship_Experience': 'Internship Experience',
        'Placement_Rate': 'Placement rate',
    },
)
fig.update_layout(title='Placement by Internship Experience')
fig.show()

: 

## 3 - Feature Engineering

### Splitting Data

In [None]:
# Target vector: the 'Placement' column
y = df['Placement']
# Feature matrix: every remaining column
X = df.drop(columns=['Placement'])

### Normalizing Data

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize every feature column of X with a StandardScaler.
# NOTE(review): fit_transform runs on the full X here, before the train/test split
# made in a later cell, so the scaling statistics also reflect rows that end up in
# the test set. Consider fitting the scaler on the training split only.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

### Train Test Splitting

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Split the raw (unscaled) features first. stratify=y keeps the Placement class
# ratio the same in both partitions, which matters because only about 17% of the
# rows are positive.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the scaler on the training rows only and reuse those statistics for the
# test rows, so no information about the test set leaks into preprocessing.
scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

## 4 - Model Training

### KNN Classifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Baseline k-nearest-neighbours model with default hyperparameters.
# fit() returns the estimator itself, so construction and training are chained.
knn = KNeighborsClassifier().fit(X_train, y_train)
# Labels predicted for the held-out test rows
knn_pred = knn.predict(X_test)

### Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Baseline decision tree; the fixed seed keeps the fitted tree reproducible
dt = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
# Labels predicted for the held-out test rows
dt_pred = dt.predict(X_test)

### Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Baseline random forest; the fixed seed keeps the fitted ensemble reproducible
rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)
# Labels predicted for the held-out test rows
rf_pred = rf.predict(X_test)

## 5 - Model Evaluation

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report, roc_curve, auc
import matplotlib.pyplot as plt

# Test-set predictions of each untuned classifier, keyed by the name shown in the report
models = {"KNN": knn_pred, "Decision Tree": dt_pred, "Random Forest": rf_pred}

# Single-number metrics, printed in this order for every model
scalar_metrics = (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
)

for name, pred in models.items():
    print(f"--- {name} ---")
    for label, metric_fn in scalar_metrics:
        print(label, metric_fn(y_test, pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, pred))
    print("Classification Report:\n", classification_report(y_test, pred))
    print("\n")

# ROC curve for the Random Forest, built from its predicted probability of class 1
rf_probs = rf.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, rf_probs)
roc_auc = auc(fpr, tpr)

roc_ax = plt.figure().gca()
roc_ax.plot(fpr, tpr, label=f'Random Forest (AUC = {roc_auc:.2f})')
roc_ax.plot([0, 1], [0, 1], 'k--')  # dashed diagonal from (0, 0) to (1, 1)
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC Curve')
roc_ax.legend(loc='lower right')
plt.show()

## 6 - Feature Importance

### Extracting & Visualizing Feature Importance

In [None]:
# Get feature importances from Random Forest
import pandas as pd
import matplotlib.pyplot as plt

feature_importances = rf.feature_importances_
feature_names = X.columns

# Create a DataFrame for easy plotting
fi_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
fi_df = fi_df.sort_values('Importance', ascending=False)

# Plot
plt.figure(figsize=(8,5))
plt.barh(fi_df['Feature'], fi_df['Importance'])
plt.xlabel('Importance')
plt.title('Feature Importance from Random Forest')
plt.gca().invert_yaxis()
plt.show()

## 7 - Hyperparameter Tuning

In [None]:
from sklearn.model_selection import RandomizedSearchCV


def _random_search(estimator, param_distributions, n_iter=10, cv=5, random_state=42):
    """Fit a RandomizedSearchCV for `estimator` on the training split.

    Parameters
    ----------
    estimator : scikit-learn estimator
        Unfitted model to tune.
    param_distributions : dict
        Hyperparameter names mapped to the candidate values to sample from.
    n_iter : int, default 10
        Number of hyperparameter combinations sampled.
    cv : int, default 5
        Number of cross-validation folds.
    random_state : int, default 42
        Seed for the sampling of combinations.

    Returns
    -------
    RandomizedSearchCV
        The search object after fitting on the notebook's X_train / y_train.
    """
    search = RandomizedSearchCV(
        estimator, param_distributions, n_iter=n_iter, cv=cv, random_state=random_state
    )
    search.fit(X_train, y_train)
    return search


# KNN tuning
# NOTE: with KNeighborsClassifier's default p=2, 'minkowski' is the same distance
# as 'euclidean', so those two candidates are equivalent.
knn_params = {
    'n_neighbors': range(1, 21),
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'minkowski']
}
knn_search = _random_search(KNeighborsClassifier(), knn_params)
knn_best = knn_search.best_estimator_
knn_best_pred = knn_best.predict(X_test)

# Decision Tree tuning
dt_params = {
    'max_depth': range(1, 21),
    'min_samples_split': range(2, 11)
}
dt_search = _random_search(DecisionTreeClassifier(random_state=42), dt_params)
dt_best = dt_search.best_estimator_
dt_best_pred = dt_best.predict(X_test)

# Random Forest tuning
rf_params = {
    'n_estimators': [50, 100, 150, 200],
    'max_depth': range(1, 21),
    'min_samples_split': range(2, 11)
}
rf_search = _random_search(RandomForestClassifier(random_state=42), rf_params)
rf_best = rf_search.best_estimator_
rf_best_pred = rf_best.predict(X_test)

# Best hyperparameter combination found for each model
print("Best KNN:", knn_search.best_params_)
print("Best Decision Tree:", dt_search.best_params_)
print("Best Random Forest:", rf_search.best_params_)

## 8 - Evaluating Tuned Models

In [None]:
print("\n--- Tuned Model Results ---")

# Test-set predictions of each tuned classifier, keyed by the name shown in the report
tuned_models = {
    "KNN (Tuned)": knn_best_pred,
    "Decision Tree (Tuned)": dt_best_pred,
    "Random Forest (Tuned)": rf_best_pred,
}

for name in tuned_models:
    pred = tuned_models[name]
    print(f"--- {name} ---")
    print("Accuracy:", accuracy_score(y_true=y_test, y_pred=pred))
    print("Precision:", precision_score(y_true=y_test, y_pred=pred))
    print("Recall:", recall_score(y_true=y_test, y_pred=pred))
    print("F1 Score:", f1_score(y_true=y_test, y_pred=pred))
    print("Confusion Matrix:\n", confusion_matrix(y_true=y_test, y_pred=pred))
    print("Classification Report:\n", classification_report(y_true=y_test, y_pred=pred))
    print("\n")

# ROC curve for the tuned Random Forest, built from its predicted probability of class 1
rf_best_probs = rf_best.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, rf_best_probs)
roc_auc = auc(fpr, tpr)

tuned_roc_ax = plt.figure().gca()
tuned_roc_ax.plot(fpr, tpr, label=f'Random Forest Tuned (AUC = {roc_auc:.2f})')
tuned_roc_ax.plot([0, 1], [0, 1], 'k--')  # dashed diagonal from (0, 0) to (1, 1)
tuned_roc_ax.set_xlabel('False Positive Rate')
tuned_roc_ax.set_ylabel('True Positive Rate')
tuned_roc_ax.set_title('ROC Curve (Tuned Random Forest)')
tuned_roc_ax.legend(loc='lower right')
plt.show()