# 📘 End-to-End ML Project (30 Cells)
College Student Dataset

This notebook covers:
- Data cleaning
- EDA (all major graphs)
- Feature engineering
- Model training & evaluation
- Cross-validation

Fully working & beginner-safe.

In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

import warnings
warnings.filterwarnings('ignore')


In [None]:

# Load the raw student dataset from a CSV in the working directory and preview the first rows.
DATA_PATH = 'college_student_management_data.csv'
df = pd.read_csv(DATA_PATH)
df.head()


In [None]:

# Dataset dimensions as a (row count, column count) tuple.
df.shape


In [None]:

# Column labels of the loaded frame, in file order.
df.columns


In [None]:

# Print per-column dtypes and non-null counts to spot missing data and text columns.
df.info()


In [None]:

# Summary statistics using DataFrame.describe() defaults.
df.describe()


In [None]:

# Count missing values per column before any imputation.
missing_mask = df.isna()
missing_mask.sum()


In [None]:

# Impute missing values column by column:
#   - text columns (object or pandas string dtype) get their most frequent value,
#   - every other column gets its mean.
# NOTE(review): this loop also touches the last column, which a later cell uses
# as the target; mean-filling a numeric class label would create a non-class
# value. Confirm the target has no missing values.
for col in df.columns:
    column = df[col]
    is_text = (
        pd.api.types.is_object_dtype(column)
        or pd.api.types.is_string_dtype(column)
    )
    if is_text:
        modes = column.mode()
        if modes.empty:
            # Column is entirely missing, so there is no mode to fill with.
            continue
        fill_value = modes.iloc[0]
    else:
        fill_value = column.mean()
    # Assign the filled column back. `df[col].fillna(..., inplace=True)` acts on
    # a temporary object and does not update `df` under pandas Copy-on-Write.
    df[col] = column.fillna(fill_value)


In [None]:

# Re-count missing values per column to confirm the imputation step took effect.
remaining_missing = df.isna()
remaining_missing.sum()


In [None]:

# Number of rows that exactly repeat an earlier row (the first occurrence is not counted).
duplicate_mask = df.duplicated(keep='first')
duplicate_mask.sum()


In [None]:

# Keep the first occurrence of each row and discard exact repeats; the original index labels are kept.
df = df.drop_duplicates(keep='first')


In [None]:

# Replace every text column (object or pandas string dtype) with integer codes.
# Each column gets its own fitted encoder, stored in `label_encoders`, so the
# codes can be mapped back with `label_encoders[col].inverse_transform(...)`.
# Reusing one encoder for all columns would keep only the last column's mapping.
label_encoders = {}
# `le` is rebound per column. After the loop it refers to the encoder of the
# last encoded column, or to this unfitted instance if nothing was encoded.
le = LabelEncoder()
for col in df.columns:
    column = df[col]
    is_text = (
        pd.api.types.is_object_dtype(column)
        or pd.api.types.is_string_dtype(column)
    )
    if is_text:
        le = LabelEncoder()
        df[col] = le.fit_transform(column)
        label_encoders[col] = le


In [None]:

# Pairwise correlations between all columns, drawn as a colour-coded grid.
# df.corr() expects numeric columns; the encoding step is expected to have converted text columns.
corr_matrix = df.corr()
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_matrix, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()


In [None]:

# Use the last column of the frame as the prediction target.
# NOTE(review): assumes the label is the final column of the CSV — confirm against the dataset.
target_col = df.columns[-1]
target_col


In [None]:

# Bar chart of how many rows fall into each target value, to check class balance.
target_values = df[target_col]
ax = sns.countplot(x=target_values)
ax.set_title('Target Distribution')
plt.show()


In [None]:

# One histogram per column on a shared 14x10 figure; pandas chooses the subplot grid.
df.hist(figsize=(14,10))
plt.show()


In [None]:

# One box plot per column except the last one, each on its own figure, to reveal outliers.
feature_names = df.columns[:-1]
for feature in feature_names:
    fig, ax = plt.subplots()
    sns.boxplot(x=df[feature], ax=ax)
    ax.set_title(feature)
    plt.show()


In [None]:

# Pairwise scatter plots (KDE on the diagonal) over at most 200 rows, to keep the grid fast to draw.
# The sample is seeded so the figure is identical on every Restart & Run All.
pairplot_sample = df.sample(n=min(200, len(df)), random_state=42)
sns.pairplot(pairplot_sample, diag_kind='kde')
plt.show()


In [None]:

# Separate the target column (y) from the remaining feature columns (X).
y = df[target_col]
X = df.drop(columns=target_col)


In [None]:

# Hold out 20% of the rows for testing. The split is seeded and stratified on y
# so both parts keep the same class proportions.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts


In [None]:

# Standardise features using statistics learned from the training rows only,
# then apply that same transform to both partitions.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


In [None]:

# Logistic regression with default settings: train on the scaled training rows
# and report accuracy on the held-out test rows.
lr = LogisticRegression().fit(X_train, y_train)
lr_pred = lr.predict(X_test)
lr_accuracy = accuracy_score(y_true=y_test, y_pred=lr_pred)
lr_accuracy


In [None]:

# Seeded decision tree: train on the training rows and report accuracy on the held-out test rows.
dt = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
dt_pred = dt.predict(X_test)
dt_accuracy = accuracy_score(y_true=y_test, y_pred=dt_pred)
dt_accuracy


In [None]:

# Seeded random forest with 300 trees: train on the training rows and report
# accuracy on the held-out test rows.
rf = RandomForestClassifier(n_estimators=300, random_state=42).fit(X_train, y_train)
rf_pred = rf.predict(X_test)
rf_accuracy = accuracy_score(y_true=y_test, y_pred=rf_pred)
rf_accuracy


In [None]:

# Text report for the random-forest test predictions.
rf_report = classification_report(y_test, rf_pred)
print(rf_report)


In [None]:

# Confusion matrix of the random-forest test predictions, drawn as an annotated
# heatmap with integer cell labels.
cm = confusion_matrix(y_test, rf_pred)
ax = sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()


In [None]:

# 5-fold cross-validation of the random-forest configuration on the full,
# unscaled feature frame; the mean fold score is displayed.
scores = cross_val_score(estimator=rf, X=X, y=y, cv=5)
mean_cv_score = scores.mean()
mean_cv_score


In [None]:

# Refit the forest on every row, then score it on those same rows.
# The displayed value is therefore training accuracy, not a held-out estimate.
# `rf` is overwritten here with a model trained on the unscaled frame X, whereas
# the earlier fit used the scaled X_train array.
rf.fit(X, y)
full_data_pred = rf.predict(X)
accuracy_score(y, full_data_pred)


## ✅ Project Summary
- Cleaned data
- Visualized using multiple plots
- Trained 3 ML models
- Random Forest performed best
- Cross-validation applied

## 🎓 Ready for submission / teaching / demo