<a href="https://colab.research.google.com/github/Nemit-jindal/Admission_ML/blob/main/Admission.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [37]:
import numpy as np
import pandas as pd
import joblib
import shap
import warnings
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, classification_report

In [38]:
df=pd.read_csv("combined.csv")

In [39]:
df.shape

(411849, 51)

In [40]:
df

Unnamed: 0,id,activation_status,projected_admission_year,utm_source,tag_name,program_data.name,campus_name,graduation_year,graduation_degree,graduation_specialisation,...,highest_qualification,scholarship_percentage,jee_score,gaurdian_name,gaurdian_no,guardian_relationship,gaurdian_occupation,category,seat_booked,refund
0,3985,auto_activated,2023,LY_Churn_24,Resurrected,MBA,,,,,...,Graduation,,,,,,,1,0,0
1,4427,auto_activated,2023,LY_Churn_24,Archive,MBA,UIM,2021.0,BA,,...,Graduation,,,,,,,,0,0
2,5682,auto_activated,2023,LY_Churn_24,Resurrected,MBA,,2021.0,BSc,,...,Graduation,,,,,,,1,0,0
3,6443,auto_activated,2023,LY_Churn_24,Archive,MBA,,,,,...,Graduation,,,,,,,,0,0
4,7656,auto_activated,2023,LY_Churn_24,Archive,MBA,,2021.0,BBA,,...,Graduation,,,,,,,,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
411844,3632000,auto_activated,2025,aff_1626Camp,Live,BTECH,SAGEB,,,,...,Pursuing 12th,,,pradeep Dubey,767bd7f46f5a28ca9aca76fed21915c6,Father,Other,1,0,0
411845,3632002,auto_activated,2025,aff_7116STUD,Live,BCA,,,,,...,,,,,,,,1,0,0
411846,3632012,auto_activated,2025,aff_3110TW_C,Live,MBA,,,,,...,,,,,,,,1,0,0
411847,3632018,auto_activated,2025,aff_1468Aija,Live,BTECH,,,,,...,,,,,,,,1,0,0


In [41]:
df = df.drop(columns=["tag_name"], errors="ignore")

In [42]:
df.shape

(411849, 50)

In [43]:
# 🔍 Handle Missing Values
numerical_cols = df.select_dtypes(include=["number"]).columns
categorical_cols = df.select_dtypes(include=["object"]).columns

In [44]:
# Fill missing values in numerical columns with mean
df[numerical_cols] = df[numerical_cols].fillna(df[numerical_cols].mean())


In [45]:
df[categorical_cols] = df[categorical_cols].apply(lambda col: col.fillna(col.mode()[0]))


In [46]:
df[categorical_cols] = df[categorical_cols].astype(str)


In [47]:
df_train, df_manual = train_test_split(df, test_size=0.1, random_state=42)


In [48]:
# 🎯 Define Features & Target
X = df_train.drop(columns=["seat_booked"])  # Drop only the target column
y = df_train["seat_booked"]

In [49]:
encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
X[categorical_cols] = encoder.fit_transform(X[categorical_cols])

In [50]:
# ⚖ Split Train-Test Data (80% Train, 20% Test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [51]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.astype(np.float32))
X_test_scaled = scaler.transform(X_test.astype(np.float32))

In [52]:
gbc = GradientBoostingClassifier(n_estimators=200, learning_rate=0.05, random_state=42)
gbc.fit(X_train_scaled, y_train)


In [53]:
explainer = shap.Explainer(gbc, X_train_scaled)
shap_values = explainer(X_train_scaled)
feature_importance = np.abs(shap_values.values).mean(axis=0)



In [54]:
top_10_indices = np.argsort(feature_importance)[-10:]
selected_features = X.columns[top_10_indices]


In [55]:
X_train_top10 = X_train[selected_features]
X_test_top10 = X_test[selected_features]


In [56]:
scaler_top10 = StandardScaler()
X_train_top10_scaled = scaler_top10.fit_transform(X_train_top10.astype(np.float32))
X_test_top10_scaled = scaler_top10.transform(X_test_top10.astype(np.float32))

In [57]:
gbc.fit(X_train_top10_scaled, y_train)

In [58]:
y_prob = gbc.predict_proba(X_test_top10_scaled)[:, 1]  # Probability of class 1


In [60]:
auc_score = roc_auc_score(y_test, y_prob)
print(f"🎯 AUC Score: {auc_score:.4f}")
print(classification_report(y_test, gbc.predict(X_test_top10_scaled)))

🎯 AUC Score: 0.9966
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     71874
           1       0.93      0.86      0.89      2259

    accuracy                           0.99     74133
   macro avg       0.96      0.93      0.95     74133
weighted avg       0.99      0.99      0.99     74133



In [63]:
model_path = "/content/gbc_model.pkl"
features_path = "/content/selected_features.npy"
predictions_path = "/content/predictions.csv"

# Save model
joblib.dump(gbc, model_path)
print(f"✅ Model saved at: {model_path}")

# Save selected features
np.save(features_path, selected_features)
print(f"✅ Selected features saved at: {features_path}")

# Save predictions
predictions_df = pd.DataFrame({"Actual": y_test, "Predicted_Probability": y_prob})
predictions_df.to_csv(predictions_path, index=False)
print(f"✅ Predictions saved at: {predictions_path}")

print("\n🎯 All files successfully saved in /content/")

✅ Model saved at: /content/gbc_model.pkl
✅ Selected features saved at: /content/selected_features.npy
✅ Predictions saved at: /content/predictions.csv

🎯 All files successfully saved in /content/


***Selected 10 features***

In [64]:
# Define the file path
features_path = "/content/selected_features.npy"

try:
    # Load the selected features correctly
    selected_features = np.load(features_path, allow_pickle=True)

    # Ensure it's an iterable list/array
    if isinstance(selected_features, np.ndarray):
        selected_features = selected_features.tolist()

    # Print the top 10 influential features
    print("🔝 Top 10 Most Influential Features:")
    for i, feature in enumerate(selected_features[:10], 1):
        print(f"{i}. {feature}")

except Exception as e:
    print(f"❌ Error loading selected features: {e}")

🔝 Top 10 Most Influential Features:
1. program_data.name
2. xii_year
3. xii_percentage
4. gaurdian_name
5. id
6. campus_name
7. x_year
8. activation_status
9. projected_admission_year
10. utm_source


In [65]:
gbc = joblib.load("/content/gbc_model.pkl")

# Load feature names
selected_features = np.load("/content/selected_features.npy", allow_pickle=True).tolist()

# Get feature importances from the trained model
feature_importances = gbc.feature_importances_

# Combine feature names with their importance
feature_importance_dict = dict(zip(selected_features, feature_importances))

# Sort features by importance
sorted_features = sorted(feature_importance_dict.items(), key=lambda x: x[1], reverse=True)

# Print the top 10 most influential features
print("🔝 Top 10 Features Influencing Probability:")
for i, (feature, importance) in enumerate(sorted_features[:10], 1):
    print(f"{i}. {feature}: {importance:.4f}")

🔝 Top 10 Features Influencing Probability:
1. utm_source: 0.4996
2. campus_name: 0.3501
3. projected_admission_year: 0.0692
4. activation_status: 0.0490
5. x_year: 0.0156
6. id: 0.0063
7. xii_year: 0.0036
8. xii_percentage: 0.0026
9. program_data.name: 0.0021
10. gaurdian_name: 0.0020
