In [1]:
import joblib
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import LabelEncoder, StandardScaler


# 1. LOAD THE DATASET

# Read the categorized-specializations table and discard the columns that are
# not needed in this cell.
df = pd.read_csv('./data/categorized_specializations.csv').drop(
    columns=['program', 'ugCollege', 'univName_rank', 'acceptance_rate', 'univ_state']
)

# Label encoding categorical columns
def encode_categorical_columns(df, categorical_columns):
    """
    Label-encode the listed columns of ``df`` in place.

    Each column is cast to ``str`` and then replaced by the output of a
    freshly fitted ``LabelEncoder``.

    Args:
        df: DataFrame to modify; the very same object is returned.
        categorical_columns: iterable of column names to encode.

    Returns:
        ``(df, encoders)`` where ``encoders`` maps every encoded column name
        to the encoder that was fitted on it.
    """
    fitted_encoders = {}
    for column_name in categorical_columns:
        encoder = LabelEncoder()
        # Encode the string form of the values so mixed-type columns are
        # handled uniformly.
        df[column_name] = encoder.fit_transform(df[column_name].astype(str))
        fitted_encoders[column_name] = encoder
    return df, fitted_encoders

# Scaling continuous columns
def scale_numerical_columns(df, numerical_columns):
    """
    Clean and scale the listed numeric columns of ``df`` in place.

    Infinite values are turned into NaN, every NaN is filled with the mean of
    its column, and the result is passed through a freshly fitted
    ``StandardScaler``.

    Args:
        df: DataFrame to modify; the very same object is returned.
        numerical_columns: list of column names to clean and scale.

    Returns:
        ``(df, scaler)`` with the scaler that was fitted on the cleaned columns.
    """
    # Treat +/-inf as missing so they are imputed together with real NaNs.
    without_infinities = df[numerical_columns].replace([np.inf, -np.inf], np.nan)
    df[numerical_columns] = without_infinities

    column_means = df[numerical_columns].mean()
    df[numerical_columns] = df[numerical_columns].fillna(column_means)

    standardizer = StandardScaler()
    df[numerical_columns] = standardizer.fit_transform(df[numerical_columns])
    return df, standardizer


# 2. MEMORY-BASED COLLABORATIVE FILTERING CLASS

from memory_cf import MemoryBasedCF


# 3. TRAINING THE CF MODEL

def train_collaborative_filtering_memory(df, user_col, item_col, rating_col):
    """
    Build the user x item rating table and fit a memory-based CF model on it.

    Steps:
      - Copy the three relevant columns, so the caller's DataFrame is never
        modified.
      - Cast user and item identifiers to ``str``.
      - Min-max normalize the ratings to the [0, 1] range.
      - Pivot to a user x item table (mean over duplicate pairs, 0 for pairs
        that do not occur).
      - Pass the table to ``MemoryBasedCF``; whatever similarity computation
        the model performs happens inside that class.

    Args:
        df: DataFrame holding one row per (user, item, rating) observation.
        user_col: name of the user identifier column.
        item_col: name of the item identifier column.
        rating_col: name of the numeric rating column.

    Returns:
        ``(cf_model, df_pivot)``: the ``MemoryBasedCF`` instance and the pivot
        table it was built from.
    """
    # Work on a private copy: the original implementation overwrote the
    # caller's columns as a side effect.
    interactions = df[[user_col, item_col, rating_col]].copy()
    interactions[user_col] = interactions[user_col].astype(str)
    interactions[item_col] = interactions[item_col].astype(str)

    ratings = interactions[rating_col]
    lowest = ratings.min()
    spread = ratings.max() - lowest
    if spread > 0:
        interactions[rating_col] = (ratings - lowest) / spread
    else:
        # A constant rating column has zero range, and dividing by it would
        # turn every rating into NaN. Treat each observed pair as a
        # full-strength interaction (1.0) so it stays distinguishable from
        # the 0 used for unobserved pairs; missing ratings stay missing.
        observed = ratings.notna()
        interactions[rating_col] = observed.astype(float).where(observed)

    df_pivot = interactions.pivot_table(
        index=user_col,
        columns=item_col,
        values=rating_col,
        aggfunc='mean',
        fill_value=0,
    )
    cf_model = MemoryBasedCF(df_pivot)
    print("✅ Memory-Based Collaborative Filtering model trained.")
    return cf_model, df_pivot

from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import accuracy_score, f1_score
import numpy as np

def evaluate_cf_model_precision_recall_accuracy_f1(cf_model, df_pivot, threshold=0.5):
    """
    Report precision, recall, accuracy and F1 for the collaborative filtering model.

    For every user in ``df_pivot`` the stored ratings and the scores returned
    by ``cf_model.predict(user)`` are binarized (1 when >= ``threshold``,
    otherwise 0). The flags of all users are pooled and scored with the four
    metrics, which are printed and returned.

    Args:
        cf_model: object exposing ``predict(user)``.
        df_pivot: user x item rating table; its index lists the users.
        threshold: minimum score for an item to count as relevant.

    Returns:
        ``(precision, recall, accuracy, f1)``.
    """
    def as_flags(scores):
        # 1 marks a relevant item, 0 an irrelevant one.
        return (scores >= threshold).astype(int)

    truth_flags, predicted_flags = [], []
    for user in df_pivot.index:
        truth_flags.extend(as_flags(df_pivot.loc[user].values))
        predicted_flags.extend(as_flags(cf_model.predict(user)))

    # Same call order as the printed report: precision, recall, accuracy, F1.
    precision, recall, accuracy, f1 = (
        metric(truth_flags, predicted_flags)
        for metric in (precision_score, recall_score, accuracy_score, f1_score)
    )

    print(f"📊 Collaborative Filtering Evaluation Metrics:")
    for label, value in (
        ("Precision", precision),
        ("Recall", recall),
        ("Accuracy", accuracy),
        ("F1 Score", f1),
    ):
        print(f"{label}: {value:.4f}")

    return precision, recall, accuracy, f1



# Fit the specialization CF model on (user, specialization, admit) rows.
cf_model, df_pivot = train_collaborative_filtering_memory(
    df,
    user_col='userName',
    item_col='specialization_category',
    rating_col='admit',
)

# Score the fitted model against the rating table it was built from.
precision, recall, accuracy, f1 = evaluate_cf_model_precision_recall_accuracy_f1(
    cf_model=cf_model,
    df_pivot=df_pivot,
)

# Persist the fitted model so later cells can reload it.
joblib.dump(
    cf_model,
    "models/major_models/cf_model_spec.pkl",
)

print("\n✅ Collaborative Filtering model saved successfully.")


✅ Memory-Based Collaborative Filtering model trained.
📊 Collaborative Filtering Evaluation Metrics:
Precision: 0.7365
Recall: 0.8207
Accuracy: 0.9997
F1 Score: 0.7763

✅ Collaborative Filtering model saved successfully.


In [18]:

def get_top_similar_users(user_id, similarity_matrix, top_n=5):
    """
    Return the ``top_n`` users most similar to ``user_id``.

    Args:
        user_id: label of the target user in ``similarity_matrix``.
        similarity_matrix: square DataFrame of user-to-user similarities.
        top_n: number of neighbours to keep.

    Returns:
        Series indexed by user label, sorted by descending similarity, with
        the target user's own entry removed.
    """
    return (
        similarity_matrix.loc[user_id]
        .drop(labels=user_id)  # a user is not their own neighbour
        .sort_values(ascending=False)
        .head(top_n)
    )

def get_top_recommendations(user_id, cf_model, top_n=5, item_labels=None):
    """
    Return the ``top_n`` highest-scored items for ``user_id``.

    Args:
        user_id: user passed to ``cf_model.predict``.
        cf_model: object exposing ``predict(user_id)`` that returns one score
            per item.
        top_n: number of recommendations to return.
        item_labels: item names in the same order as the predicted scores.
            When omitted, the columns of the notebook-level ``df_pivot`` are
            used, which is what this function always did before the
            parameter existed.

    Returns:
        List of ``(item_name, score)`` tuples ordered by descending score.

    Raises:
        ValueError: if the number of labels differs from the number of scores.
    """
    if item_labels is None:
        # Backward-compatible default; pass item_labels explicitly to avoid
        # depending on notebook-global state.
        item_labels = df_pivot.columns

    predictions = np.asarray(cf_model.predict(user_id))
    labels = np.asarray(item_labels, dtype=object)
    if len(labels) != len(predictions):
        # A longer label list would otherwise silently mislabel the scores.
        raise ValueError(
            f"Got {len(predictions)} predicted scores for {len(labels)} item labels."
        )

    ranking = np.argsort(predictions)[::-1][:top_n]
    return list(zip(labels[ranking], predictions[ranking]))

def format_similar_users_table(user_id, similarity_matrix, user_id_map, top_n=5):
    """
    Tabulate the nearest neighbours of ``user_id`` using display aliases.

    Args:
        user_id: label of the target user.
        similarity_matrix: square DataFrame of user-to-user similarities.
        user_id_map: mapping from user label to the alias shown in the table.
        top_n: number of neighbours to list.

    Returns:
        DataFrame with the columns 'Target User', 'Similar User' and
        'Cosine Similarity', one row per neighbour.
    """
    neighbours = get_top_similar_users(user_id, similarity_matrix, top_n)
    target_alias = user_id_map[user_id]
    neighbour_aliases = list(map(user_id_map.__getitem__, neighbours.index))

    table = {
        'Target User': target_alias,  # scalar, repeated on every row
        'Similar User': neighbour_aliases,
        'Cosine Similarity': neighbours.values,
    }
    return pd.DataFrame(table)

def format_top_recommendations_table(user_id, cf_model, top_n=5):
    """
    Tabulate the top recommendations for ``user_id``.

    Returns:
        DataFrame with the columns 'Recommended Specialization' and
        'Predicted Score', one row per recommendation in ranked order.
    """
    header = ['Recommended Specialization', 'Predicted Score']
    ranked_pairs = get_top_recommendations(user_id, cf_model, top_n)
    return pd.DataFrame(ranked_pairs, columns=header)


# Username to inspect; it is looked up in the pivot-table index below.
user_id = 'rishik.bazaz'

# Display alias for every user, numbered by position in the pivot-table index.
user_id_map = dict(
    (user, f"User_{i}") for i, user in enumerate(df_pivot.index)
)

# Compute similarity matrix for specialization CF
similarity_matrix = pd.DataFrame(
    data=cosine_similarity(df_pivot),
    index=df_pivot.index,
    columns=df_pivot.index,
)

if user_id not in df_pivot.index:
    print(f"❌ User ID '{user_id}' not found in dataset.")
else:
    similar_users_df = format_similar_users_table(user_id, similarity_matrix, user_id_map)
    recommendations_df = format_top_recommendations_table(user_id, cf_model)

    print(f"📌 Sample User: {user_id_map[user_id]}\n")
    print("Top Similar Users:")
    print(similar_users_df.to_string(index=False))

    print("\nTop Specialization Recommendations:")
    print(recommendations_df.to_string(index=False))


📌 Sample User: User_9476

Top Similar Users:
Target User Similar User  Cosine Similarity
  User_9476   User_13071           1.000000
  User_9476    User_1673           0.816497
  User_9476    User_7154           0.816497
  User_9476    User_6208           0.707107
  User_9476   User_13088           0.500000

Top Specialization Recommendations:
       Recommended Specialization  Predicted Score
                   rf engineering         0.412027
            microwave engineering         0.254098
                   circuit design         0.077705
                 electromagnetics         0.064983
integrated circuit ic engineering         0.040760


In [3]:
import os
import pandas as pd
import numpy as np
import joblib
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder, StandardScaler
import xgboost as xgb


# 1. LOAD SAVED MODELS & PREPROCESSING OBJECTS

# Directory holding every saved specialization artifact.
spec_save_dir = "models/major_models"

# Content-based models.
# NOTE(review): the RF pickle is indexed with [0], so it presumably stores a
# sequence whose first element is the classifier - confirm against the cell
# that wrote it.
rf_model = joblib.load(
    os.path.join(spec_save_dir, "rf_specialization.pkl")
)[0]

xgb_model = xgb.XGBClassifier()
xgb_model.load_model(
    os.path.join(spec_save_dir, "xgb_specialization.json")
)

# Collaborative-filtering model saved by the training cell.
cf_model = joblib.load(
    os.path.join(spec_save_dir, "cf_model_spec.pkl")
)

# Preprocessing objects: per-column label encoders, the feature scaler and the
# target label encoder.
label_encoders_spec, scaler_spec, le_y_spec = (
    joblib.load(os.path.join(spec_save_dir, artifact_name))
    for artifact_name in (
        "label_encoders_specialization.pkl",
        "scaler_specialization.pkl",
        "le_y_spec.pkl",
    )
)

print("✅ All specialization models and preprocessing objects loaded successfully.")


# 2. Load the test dataset
# NOTE(review): this reads the same CSV that the first cell trains the CF model
# on, so the 5000-row sample is not a held-out set - confirm this is intended.
df_test = (
    pd.read_csv('./data/categorized_specializations.csv')
    .sample(n=5000, random_state=42)
    .drop(columns=['program', 'ugCollege', 'univName_rank', 'acceptance_rate', 'univ_state'])
)

# Feature columns fed to the content-based models.
spec_categorical = ['major']
spec_numerical = [
    'researchExp', 'industryExp', 'toeflScore', 'internExp',
    'greV', 'greQ', 'greA', 'normalized_cgpa',
]

# Encode categorical features with the already-fitted encoders.
for col in spec_categorical:
    le = label_encoders_spec[col]
    df_test[col] = le.transform(df_test[col].astype(str))

# Numeric features: +/-inf -> NaN, NaN -> column mean of this sample, then the
# already-fitted scaler.
df_test[spec_numerical] = df_test[spec_numerical].replace([np.inf, -np.inf], np.nan)
numeric_means = df_test[spec_numerical].mean()
df_test[spec_numerical] = df_test[spec_numerical].fillna(numeric_means)
df_test[spec_numerical] = scaler_spec.transform(df_test[spec_numerical])

# Keep only rows whose target label the target encoder knows.
known_labels = set(le_y_spec.classes_)
df_test = df_test.loc[df_test['specialization_category'].isin(known_labels)]

# NOTE(review): 'userName' is not part of X_test, so the evaluation below falls
# back to its placeholder user for every row.
X_test = df_test[spec_categorical + spec_numerical]
y_test = df_test['specialization_category']



# 3. DEFINE HYBRID SPECIALIZATION RECOMMENDATION FUNCTION

def hybrid_specialization_recommendation(input_df, rf_model, xgb_model, cf_model, le_y, top_n=5):
    """
    Recommend specializations by blending content-based and collaborative scores.

    Steps:
      - Average the class probabilities of ``rf_model`` and ``xgb_model`` for
        the first row of ``input_df`` (the 'userName' column, if present, is
        not passed to the classifiers).
      - If 'userName' is present, score that user with ``cf_model`` and align
        the scores to ``le_y.classes_``; classes unknown to the CF model get
        0.0. Without 'userName' the collaborative scores are all zero.
      - Blend both score vectors with a variance-based weight ``alpha`` and
        return the labels of the ``top_n`` largest blended scores.

    Args:
        input_df: single-row DataFrame of model features, optionally with a
            'userName' column.
        rf_model: classifier exposing ``predict_proba``.
        xgb_model: classifier exposing ``predict_proba``.
        cf_model: object exposing ``predict(user)`` and ``item_ids``.
        le_y: fitted label encoder exposing ``classes_`` and
            ``inverse_transform``.
        top_n: number of recommendations to return (default 5, the value that
            used to be hard-coded). Values below 1 yield no recommendations.

    Returns:
        Array of decoded specialization labels, best first.
    """
    input_features = input_df.drop(columns=['userName'], errors='ignore')

    rf_probs = rf_model.predict_proba(input_features)[0]
    xgb_probs = xgb_model.predict_proba(input_features)[0]
    content_probs = (rf_probs + xgb_probs) / 2

    collab_probs = np.zeros_like(content_probs)
    if 'userName' in input_df.columns:
        input_user = input_df.iloc[0]['userName']
        # Align CF output with the label encoder's class order.
        cf_score_by_item = dict(zip(cf_model.item_ids, cf_model.predict(input_user)))
        collab_probs = np.array(
            [cf_score_by_item.get(class_label, 0.0) for class_label in le_y.classes_]
        )

    # Variance-based weighting; the small constant keeps the denominator
    # non-zero when both score vectors are flat.
    content_var = np.var(content_probs)
    alpha = content_var / (content_var + np.var(collab_probs) + 1e-5)
    final_probs = (content_probs * alpha) + (collab_probs * (1 - alpha))

    # Reverse first, then slice: identical to [-top_n:][::-1] for top_n >= 1,
    # but top_n == 0 returns nothing instead of every class.
    top_indices = np.argsort(final_probs)[::-1][:max(top_n, 0)]
    return le_y.inverse_transform(top_indices)


# 4. EVALUATE THE HYBRID MODEL

def evaluate_hybrid_specialization(X_test, y_test, rf_model, xgb_model, cf_model, le_y):
    """
    Evaluate the hybrid specialization recommender on a labelled set.

    For each row of ``X_test`` the recommender is called, its first
    recommendation is taken as the predicted specialization and compared with
    the true label. The overall accuracy and a per-class classification
    report are printed; nothing is returned.

    Args:
        X_test: DataFrame of model features, one sample per row. If it has no
            'userName' column, every sample is scored as "dummy_user".
        y_test: true specialization labels, aligned with ``X_test``.
        rf_model, xgb_model, cf_model: models forwarded to
            ``hybrid_specialization_recommendation``.
        le_y: fitted label encoder used to encode true and predicted labels.
    """
    y_test_encoded = le_y.transform(y_test)
    y_pred = []

    for row_pos in range(len(X_test)):
        # Double brackets keep the sample a single-row DataFrame.
        sample = X_test.iloc[[row_pos]].copy()

        # NOTE(review): with the placeholder user the collaborative term
        # depends entirely on how cf_model.predict treats an unknown user.
        if 'userName' not in sample.columns:
            sample['userName'] = "dummy_user"

        recs = hybrid_specialization_recommendation(sample, rf_model, xgb_model, cf_model, le_y)

        # -1 marks samples for which no recommendation was produced.
        y_pred.append(le_y.transform([recs[0]])[0] if len(recs) > 0 else -1)

    # Score only the samples that received a prediction.
    valid_idx = [i for i, p in enumerate(y_pred) if p != -1]
    y_true_valid = np.array([y_test_encoded[i] for i in valid_idx])
    y_pred_valid = np.array([y_pred[i] for i in valid_idx])

    acc = accuracy_score(y_true_valid, y_pred_valid)
    print("📊 Hybrid Specialization Model Evaluation:")
    print(f"Accuracy: {acc:.4f}")
    # zero_division=0 reports the same numbers as the default ("warn" also
    # scores undefined metrics as 0) without emitting a warning for every
    # class that is never predicted or never present.
    print(classification_report(y_true_valid, y_pred_valid, zero_division=0))
    
# Evaluate the hybrid recommender on the prepared test split.
evaluate_hybrid_specialization(
    X_test=X_test,
    y_test=y_test,
    rf_model=rf_model,
    xgb_model=xgb_model,
    cf_model=cf_model,
    le_y=le_y_spec,
)


✅ All specialization models and preprocessing objects loaded successfully.
📊 Hybrid Specialization Model Evaluation:
Accuracy: 0.8203
              precision    recall  f1-score   support

           0       0.89      1.00      0.94         8
           1       0.83      1.00      0.91        10
           2       0.73      0.51      0.60        43
           3       0.33      0.50      0.40         2
           4       0.80      1.00      0.89         4
           5       1.00      1.00      1.00         1
           6       0.81      1.00      0.90        13
           7       0.36      1.00      0.53         4
           8       0.94      0.57      0.71       118
           9       0.43      0.75      0.55         4
          10       0.33      1.00      0.50         1
          11       1.00      0.80      0.89         5
          12       0.76      0.81      0.79        16
          13       0.57      0.92      0.71        13
          14       1.00      0.80      0.89         5
 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [3]:
import os
import joblib
import shutil


from memory_cf import MemoryBasedCF


# Uncompressed artifacts are read from source_dir; compressed copies are
# written to compressed_dir, which is created on demand.
source_dir = "models/major_models"
compressed_dir = "models/major_models_compressed"
os.makedirs(compressed_dir, exist_ok=True)


print("🔄 Loading specialization models...")

# Load every pickled artifact, in the same order in which it is re-saved below.
rf_model, cf_model, scaler, label_encoders, le_y = (
    joblib.load(os.path.join(source_dir, pickle_name))
    for pickle_name in (
        "rf_specialization.pkl",
        "cf_model_spec.pkl",
        "scaler_specialization.pkl",
        "label_encoders_specialization.pkl",
        "le_y_spec.pkl",
    )
)

print("✅ Models loaded.")


print("💾 Saving compressed model files...")

# Re-dump each artifact under its original file name with compression level 3.
joblib.dump(
    rf_model, os.path.join(compressed_dir, "rf_specialization.pkl"), compress=3
)
joblib.dump(
    cf_model, os.path.join(compressed_dir, "cf_model_spec.pkl"), compress=3
)
joblib.dump(
    scaler, os.path.join(compressed_dir, "scaler_specialization.pkl"), compress=3
)
joblib.dump(
    label_encoders, os.path.join(compressed_dir, "label_encoders_specialization.pkl"), compress=3
)
joblib.dump(
    le_y, os.path.join(compressed_dir, "le_y_spec.pkl"), compress=3
)


# The XGBoost model is stored as JSON, so it is copied as-is.
print("📁 Copying xgb_specialization.json to compressed directory...")
shutil.copyfile(
    os.path.join(source_dir, "xgb_specialization.json"),
    os.path.join(compressed_dir, "xgb_specialization.json"),
)

print("✅ All specialization models saved and copied to 'major_models_compressed'.")


🔄 Loading specialization models...
✅ Models loaded.
💾 Saving compressed model files...
📁 Copying xgb_specialization.json to compressed directory...
✅ All specialization models saved and copied to 'major_models_compressed'.
