In [None]:
import pandas as pd

In [None]:
import numpy as np

In [None]:
# Load the raw dataset from the Colab working directory.
df = pd.read_csv(filepath_or_buffer="/content/Dataset - Dataset.csv")

In [None]:
# Preview ten random rows; the fixed seed keeps the preview identical across
# Restart & Run All (same seed as the train/test split).
df.sample(10, random_state=42)

In [None]:
! pip install ydata-profiling


In [None]:
from ydata_profiling import ProfileReport

# Build the automated EDA report for the raw frame and write it to eda_report.html.
profile = ProfileReport(df, explorative=True, title="EDA Report")
profile.to_file("eda_report.html")

In [None]:
# Remove the student_id column from the working frame.
df = df.drop(columns="student_id")

In [None]:
# Display the frame after dropping the student_id column.
df

In [None]:
# Remove exact duplicate rows. Re-assigning instead of mutating in place
# keeps the cell's data lineage explicit and chain-friendly.
df = df.drop_duplicates()

In [None]:
# Display the frame after removing duplicate rows.
df

In [None]:
# Columns holding categorical-like data (strings, pandas categoricals, booleans).
cat_cols = df.select_dtypes(include=["object", "category", "bool"]).columns

# Map each of those columns to the distinct values it contains, to spot
# inconsistent spellings before normalising them.
unique_cats = {
    name: values.unique()
    for name, values in df[cat_cols].items()
}
unique_cats


In [None]:
# Shared yes/no recoding used by several columns ("Nope" is folded into "N").
_YES_NO_CODES = {"Yes": "Y", "No": "N", "Nope": "N"}

# Per-column lookup tables: raw label -> canonical single-letter code.
value_maps = {
    "gender": {"Male": "M", "Female": "F", "FEMALE": "F", "Other": "O"},
    "scholarship": dict(_YES_NO_CODES),
    # "-" is recoded as a missing value rather than kept as a category.
    "extra_curricular": {**_YES_NO_CODES, "-": np.nan},
    "sports_participation": dict(_YES_NO_CODES),
}

In [None]:
# Normalise the labels of every column that has a lookup table in value_maps.
for column_name in value_maps:
    df[column_name] = df[column_name].replace(value_maps[column_name])

In [None]:
# Spot-check fifteen random rows after the label normalisation; seeded so the
# same rows are shown on every Restart & Run All.
df.sample(15, random_state=42)

In [None]:
# Column dtypes and non-null counts of the working frame.
df.info()

In [None]:
# Remove the parental_education column. Re-assigning instead of using
# inplace=True keeps the cell's data lineage explicit and chain-friendly.
df = df.drop(columns="parental_education")

In [None]:
# Class balance of the dropout column (used as the label further down).
df["dropout"].value_counts()

In [None]:
# First 15 rows of the working frame.
df.head(15)

In [None]:
from sklearn.model_selection import train_test_split
# Separate the label column from the predictors.
y = df["dropout"]
X = df.drop(columns="dropout")


In [None]:

# Hold out 20% of the rows for final evaluation; stratifying on the label keeps
# the class proportions the same in both parts, and the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Display the training features.
X_train

In [None]:
# Rows in which every single column is missing.
full_null_rows = df.loc[df.isna().all(axis="columns")]
full_null_rows

In [None]:
# Minimum number of non-null values a row must have: half the column count,
# rounded down.
threshold = df.shape[1] // 2

# Flag rows that fall short of that minimum.
rows_to_drop = df.notna().sum(axis=1) < threshold

# True if at least one such sparse row exists.
rows_to_drop.any()

In [None]:
# Summary statistics of the training features.
X_train.describe()

In [None]:
# Remove records with a negative family income.
#
# The train/test split has already been made at this point, so filtering `df`
# alone would leave the negative incomes in the data the models are fitted and
# scored on. The same rule is therefore applied to each split as well.
#
# Rows with a *missing* income are kept: a plain `>= 0` comparison is False for
# NaN and would silently discard them, although the preprocessing pipeline
# imputes family_income with the median.
def _has_valid_income(frame):
    """Boolean mask that is True where family_income is missing or non-negative."""
    income = frame["family_income"]
    return income.isna() | (income >= 0)


df = df[_has_valid_income(df)]

_train_ok = _has_valid_income(X_train)
X_train, y_train = X_train[_train_ok], y_train[_train_ok]

_test_ok = _has_valid_income(X_test)
X_test, y_test = X_test[_test_ok], y_test[_test_ok]


In [None]:
# Display the frame after the family_income filter.
df

In [None]:
# Summary statistics of the training features.
X_train.describe()

In [None]:
# Names of the numeric feature columns in the training split.
num_cols = X_train.select_dtypes(include=["number"]).columns
num_cols

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# One histogram (with a KDE overlay) per numeric training feature.
for col in num_cols:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.histplot(X_train[col], kde=True, ax=ax)
    ax.set_title(f"Distribution of {col}")
    plt.show()


In [None]:
# One box plot per numeric feature, drawn from the training split.
# `num_cols` is derived from X_train and the histograms use X_train too;
# plotting `df` here would mix in the held-out test rows and let them
# influence outlier decisions.
for col in num_cols:
    fig, ax = plt.subplots(figsize=(5, 3))
    sns.boxplot(x=X_train[col], ax=ax)
    ax.set_title(f"Boxplot of {col}")
    plt.show()


In [None]:
# Dtypes and non-null counts of the training features.
X_train.info()

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_validate, StratifiedKFold
from sklearn.base import BaseEstimator, TransformerMixin

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier

# Custom Transformer for Value Mapping
class CustomValueMapper(BaseEstimator, TransformerMixin):
    """Stateless transformer that rewrites column values via per-column lookup tables.

    Parameters
    ----------
    mapping_dict : dict
        ``{column_name: {old_value: new_value}}``. Columns named here but
        absent from the input are skipped.
    """

    def __init__(self, mapping_dict):
        self.mapping_dict = mapping_dict

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with each configured column passed through ``replace``.

        ``X`` must provide ``.copy()`` and ``.columns`` (e.g. a pandas
        DataFrame); the input object itself is not modified.
        """
        mapped = X.copy()
        present = [name for name in self.mapping_dict if name in mapped.columns]
        for name in present:
            mapped[name] = mapped[name].replace(self.mapping_dict[name])
        return mapped


# Numeric features imputed with the median.
median_cols = [
    'attendance_rate',
    'family_income',
]

# Numeric features imputed with the mean.
mean_cols = [
    'age',
    'cgpa',
    'past_failures',
    'study_hours_per_week',
    'assignments_submitted',
    'projects_completed',
    'total_activities',
]

# Categorical features (label-normalised, then one-hot encoded).
cat_cols = [
    'department',
    'gender',
    'scholarship',
    'extra_curricular',
    'sports_participation',
]

def _scaled_numeric_pipeline(strategy):
    """Impute missing values with `strategy`, then standardise the column."""
    return Pipeline(steps=[
        ('imputer', SimpleImputer(strategy=strategy)),
        ('scaler', StandardScaler()),
    ])


# Pipeline 1: Numerical (Outliers) -> Median + Standard Scaling
median_pipeline = _scaled_numeric_pipeline('median')

# Pipeline 2: Numerical (Normal) -> Mean + Standard Scaling
mean_pipeline = _scaled_numeric_pipeline('mean')

# Pipeline 3: Categorical -> normalise labels, fill gaps with the most frequent
# value, then one-hot encode with the first level dropped.
cat_pipeline = Pipeline(steps=[
    ('mapper', CustomValueMapper(value_maps)),
    ('imputer', SimpleImputer(strategy='most_frequent')),
    # Note: handle_unknown='ignore' usually conflicts with drop='first' in
    # older sklearn versions. If an error occurs, remove handle_unknown.
    ('encoder', OneHotEncoder(sparse_output=False, handle_unknown='ignore', drop='first')),
])

# Master preprocessor: route each column group to its pipeline. Columns that
# belong to none of the three groups are discarded (remainder='drop').
preprocessor = ColumnTransformer(
    [
        ('num_median', median_pipeline, median_cols),
        ('num_mean', mean_pipeline, mean_cols),
        ('cat', cat_pipeline, cat_cols),
    ],
    remainder='drop',
)

# ==========================================
# 3. Define Models to Spot Check
# ==========================================
# Candidate classifiers for the spot check, as (display name, estimator) pairs.
# The tree, forest and boosting estimators draw random numbers while fitting,
# so they are seeded to make the leaderboard reproducible across runs
# (same seed as the train/test split).
# class_weight='balanced' is set on the estimators that support it.
models = [
    ('Logistic Regression', LogisticRegression(max_iter=1000, class_weight='balanced')),
    ('Decision Tree', DecisionTreeClassifier(class_weight='balanced', random_state=42)),
    ('Random Forest', RandomForestClassifier(class_weight='balanced', random_state=42)),
    ('SVM', SVC(class_weight='balanced')),
    ('KNN', KNeighborsClassifier()),
    ('Gradient Boosting', GradientBoostingClassifier(random_state=42)),
    # NOTE(review): scale_pos_weight=4 presumably approximates the
    # negative/positive class ratio - confirm against y_train.value_counts().
    ('XGBoost', XGBClassifier(use_label_encoder=False, eval_metric='logloss', scale_pos_weight=4)),
]


In [None]:
scoring_metrics = ['accuracy', 'f1', 'precision', 'recall']
results_list = []

print("Running Cross-Validation on Training Set...\n")

for name, model in models:
    # Preprocessing lives inside the pipeline so it is re-fitted on the
    # training part of every fold and never sees that fold's validation rows.
    model_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model),
    ])

    # 5-fold cross-validation on the training split, using every core.
    cv_results = cross_validate(
        model_pipeline,
        X_train,
        y_train,
        scoring=scoring_metrics,
        cv=5,
        n_jobs=-1,
    )

    # One leaderboard row per model: the mean of each metric across the folds.
    row = {'Model': name}
    for label, metric in (
        ('Accuracy', 'accuracy'),
        ('F1 Score', 'f1'),
        ('Precision', 'precision'),
        ('Recall', 'recall'),
    ):
        row[label] = cv_results[f'test_{metric}'].mean()
    results_list.append(row)

# ==========================================
# Display the final leaderboard, best F1 first
# ==========================================
results_df = pd.DataFrame(results_list).sort_values(by='F1 Score', ascending=False)

print(results_df.round(4))

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score

# --- Logistic Regression Grid ---
# Search space for the logistic-regression step (named 'model' in the
# pipeline): only the regularisation strength C varies; penalty and solver
# are pinned.
lr_params = dict(
    model__C=[0.01, 0.1, 1, 10, 100],
    model__penalty=['l2'],
    model__solver=['lbfgs'],
)

def print_best_cv_results(grid_search, model_name):
    """Print the winning hyper-parameters of a search and their mean CV scores.

    Parameters
    ----------
    grid_search : object
        Fitted search exposing ``best_params_``, ``best_index_`` and a
        ``cv_results_`` mapping with ``mean_test_f1``, ``mean_test_precision``
        and ``mean_test_recall`` entries.
    model_name : str
        Label used in the printed header.

    Returns
    -------
    None
        The summary is written to standard output.
    """
    print(f"\n--- {model_name} TUNING RESULTS ---")
    print(f"Best Params: {grid_search.best_params_}")

    # Row of cv_results_ that belongs to the selected parameter combination.
    winner = grid_search.best_index_
    scores = grid_search.cv_results_
    best_f1, best_precision, best_recall = (
        scores[f"mean_test_{metric}"][winner]
        for metric in ("f1", "precision", "recall")
    )

    print(f"Best CV F1 Score:  {best_f1:.4f}")
    print(f"Corresponding CV Precision: {best_precision:.4f}")
    print(f"Corresponding CV Recall:    {best_recall:.4f}")

# ==========================================
# 2. Run Tuning for Logistic Regression
# ==========================================
print("Tuning Logistic Regression...")

# Preprocessing is a pipeline step, so it is re-fitted inside every CV fold.
lr_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', LogisticRegression(class_weight='balanced', max_iter=2000)),
])

# All three metrics are recorded; refit='f1' picks the winner by F1 and refits
# it on the whole training split.
grid_lr = GridSearchCV(
    estimator=lr_pipeline,
    param_grid=lr_params,
    scoring=['f1', 'precision', 'recall'],
    refit='f1',
    cv=5,
    n_jobs=-1,
)
grid_lr.fit(X_train, y_train)

# Cross-validated scores of the selected configuration.
print_best_cv_results(grid_lr, "Logistic Regression")

# Score the refitted winner on the held-out test split.
print("\n...Evaluating Logistic Regression on Test Set...")
y_pred_lr = grid_lr.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred_lr))




In [None]:

print(f"\n{'=' * 40}\n")
print("Tuning XGBoost...")

# Search space for the XGBoost step (named 'model' in the pipeline).
xgb_params = dict(
    model__n_estimators=[100, 200],
    model__max_depth=[3, 5, 7],
    model__learning_rate=[0.01, 0.1, 0.2],
    model__scale_pos_weight=[3, 4.5],
)

# Preprocessing is a pipeline step, so it is re-fitted inside every CV fold.
xgb_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', XGBClassifier(eval_metric='logloss', use_label_encoder=False)),
])

# All three metrics are recorded; refit='f1' picks the winner by F1 and refits
# it on the whole training split.
grid_xgb = GridSearchCV(
    estimator=xgb_pipeline,
    param_grid=xgb_params,
    scoring=['f1', 'precision', 'recall'],
    refit='f1',
    cv=5,
    n_jobs=-1,
)
grid_xgb.fit(X_train, y_train)

# Cross-validated scores of the selected configuration.
print_best_cv_results(grid_xgb, "XGBoost")

# Score the refitted winner on the held-out test split.
print("\n...Evaluating XGBoost on Test Set...")
y_pred_xgb = grid_xgb.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred_xgb))

In [None]:
import joblib
# Persist the fitted grid-search object (the whole search, not only its best
# estimator) to risk_model.pkl.
# NOTE(review): the pickled pipeline contains CustomValueMapper, which is
# defined in this notebook. Pickle stores classes by reference, so the code
# that loads risk_model.pkl must make the same class available under the same
# module path.
joblib.dump(grid_xgb, 'risk_model.pkl')

In [None]:
from google.colab import files
# Hand the saved model file to the google.colab download helper (this cell
# only works inside a Colab runtime).
files.download('risk_model.pkl')