<a href="https://colab.research.google.com/github/Himnshii/Machine-Learning/blob/main/celebal%20assignment%206/celebal_assignment_6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from scipy.stats import uniform
import warnings
warnings.filterwarnings("ignore")


In [12]:
# Load the raw student-performance records.
# NOTE(review): the path is absolute and looks like a Colab session directory;
# adjust it when running anywhere else.
csv_path = "/content/StudentsPerformance.csv"
df = pd.read_csv(csv_path)

In [15]:
# Feature engineering
# Row-wise mean of the three subject scores; this is the value the
# performance label is derived from.
df['average_score'] = (
    df.loc[:, ['math score', 'reading score', 'writing score']]
    .mean(axis='columns')
)
def performance_label(score, low_max=60, medium_max=80):
    """Map a numeric score to a coarse performance band.

    Parameters
    ----------
    score : float
        The score to classify.
    low_max : float, default 60
        Inclusive upper bound of the 'Low' band.
    medium_max : float, default 80
        Inclusive upper bound of the 'Medium' band.

    Returns
    -------
    str
        'Low' when ``score <= low_max``, 'Medium' when
        ``low_max < score <= medium_max``, otherwise 'High'.

    Raises
    ------
    ValueError
        If ``score`` is NaN. Every comparison with NaN is False, so without
        this guard a missing score would fall through to 'High'.
    """
    # NaN is the only float value that is not equal to itself.
    if score != score:
        raise ValueError("score is NaN; cannot assign a performance label")
    if score <= low_max:
        return 'Low'
    if score <= medium_max:
        return 'Medium'
    return 'High'

# Bucket every student into Low / Medium / High from their average score.
df['performance'] = df['average_score'].apply(performance_label)
# The label is computed from these score columns, so they must not remain
# as predictors. Reassign instead of using inplace=True: drop() then returns
# a new frame and the statement reads like the rest of the pipeline.
df = df.drop(columns=['math score', 'reading score', 'writing score', 'average_score'])

In [16]:
# Encode features and target
# get_dummies expands the object/categorical predictors into indicator
# columns (first level of each dropped); the label is integer-encoded.
X = pd.get_dummies(df.drop(columns=['performance']), drop_first=True)
le = LabelEncoder().fit(df['performance'])
y = le.transform(df['performance'])

# Train-test split
# Hold out 20% of the rows, stratified on the encoded label.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Feature scaling
# Fit on the training rows only, then apply the same statistics to both
# partitions so nothing from the test set reaches the scaler.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [17]:
# Initial model training
# Baseline: each classifier with default hyperparameters. The two
# randomized estimators (forest and tree) are seeded so the baseline
# numbers are reproducible on a fresh kernel and comparable with the
# seeded tuned forest.
models = {
    'LogisticRegression': LogisticRegression(max_iter=1000),
    'RandomForest': RandomForestClassifier(random_state=42),
    'SVM': SVC(),
    'KNN': KNeighborsClassifier(),
    'DecisionTree': DecisionTreeClassifier(random_state=42)
}

# Metrics reported with support-weighted averaging across the classes.
weighted_metrics = {
    'Precision': precision_score,
    'Recall': recall_score,
    'F1 Score': f1_score,
}

print("Initial Model Performance:")
for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    print(f"\n{name}")
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    for metric_name, metric_fn in weighted_metrics.items():
        # zero_division=0 scores a class that is never predicted as 0
        # explicitly instead of relying on a suppressed warning.
        score = metric_fn(y_test, y_pred, average='weighted', zero_division=0)
        print(f"{metric_name}: {score:.4f}")


Initial Model Performance:

LogisticRegression
Accuracy: 0.5450
Precision: 0.5190
Recall: 0.5450
F1 Score: 0.5012

RandomForest
Accuracy: 0.4900
Precision: 0.4654
Recall: 0.4900
F1 Score: 0.4728

SVM
Accuracy: 0.5300
Precision: 0.4393
Recall: 0.5300
F1 Score: 0.4578

KNN
Accuracy: 0.4850
Precision: 0.4522
Recall: 0.4850
F1 Score: 0.4636

DecisionTree
Accuracy: 0.4750
Precision: 0.4732
Recall: 0.4750
F1 Score: 0.4736


In [18]:
# Hyperparameter Tuning - Random Forest
# Exhaustive search over 3 * 3 * 2 * 2 = 36 combinations, each scored by
# weighted F1 under 5-fold cross-validation on the training partition.
param_grid_rf = dict(
    n_estimators=[50, 100, 150],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)

grid_rf = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid_rf,
    scoring='f1_weighted',
    cv=5,
    n_jobs=-1,
)
grid_rf.fit(X_train_scaled, y_train)


In [19]:
# Hyperparameter Tuning - SVM
# Randomized search: 10 sampled configurations, each scored by weighted F1
# under 5-fold cross-validation. C is drawn uniformly from [0.1, 10.1]
# (scipy's uniform spans loc .. loc + scale).
param_dist_svm = dict(
    C=uniform(loc=0.1, scale=10),
    kernel=['linear', 'rbf', 'poly'],
    gamma=['scale', 'auto'],
)

random_svm = RandomizedSearchCV(
    estimator=SVC(),
    param_distributions=param_dist_svm,
    n_iter=10,
    scoring='f1_weighted',
    cv=5,
    n_jobs=-1,
    random_state=42,
)
random_svm.fit(X_train_scaled, y_train)

In [20]:
# Evaluation of tuned models
# Both fitted searches are reported the same way, so drive the output from
# one mapping instead of repeating the print statements per model.
tuned_searches = {
    "Random Forest": grid_rf,
    "SVM": random_svm,
}

# Cross-validated selection results (measured on the training folds).
for label, search in tuned_searches.items():
    print(f"\nTuned {label} Best Params:", search.best_params_)
    print(f"Tuned {label} CV F1 Score:", search.best_score_)

# Held-out performance of the refitted best estimator of each search.
print("\nFinal Evaluation on Test Set:")
for label, search in tuned_searches.items():
    test_pred = search.best_estimator_.predict(X_test_scaled)
    print(f"\n{label} Test Report:")
    # zero_division=0 reports a class that is never predicted as 0.00
    # explicitly rather than through a suppressed UndefinedMetricWarning.
    print(classification_report(y_test, test_pred,
                                target_names=le.classes_,
                                zero_division=0))



Tuned Random Forest Best Params: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 150}
Tuned Random Forest CV F1 Score: 0.4689153435878007

Tuned SVM Best Params: {'C': np.float64(0.6808361216819946), 'gamma': 'auto', 'kernel': 'linear'}
Tuned SVM CV F1 Score: 0.4629677512731546

Final Evaluation on Test Set:

Random Forest Test Report:
              precision    recall  f1-score   support

        High       0.15      0.05      0.08        39
         Low       0.53      0.36      0.42        59
      Medium       0.54      0.77      0.63       102

    accuracy                           0.51       200
   macro avg       0.41      0.39      0.38       200
weighted avg       0.46      0.51      0.46       200


SVM Test Report:
              precision    recall  f1-score   support

        High       0.00      0.00      0.00        39
         Low       0.60      0.44      0.51        59
      Medium       0.54      0.83      0.66       102

    accurac