In [3]:
import pandas as pd
import numpy as np
import joblib
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_absolute_error, mean_squared_error


# 1. LOAD THE DATASET

df = pd.read_csv('./data/categorized_specializations.csv')

# Handle skew by removing low-count university names
min_samples_per_class = 50
valid_classes = df['univName'].value_counts()
valid_classes = valid_classes[valid_classes >= min_samples_per_class].index
df = df[df['univName'].isin(valid_classes)]
# One-hot encoding univ_state 
df = pd.get_dummies(df, columns=['univ_state'], drop_first=True)

# Label encoding categorical columns
def encode_categorical_columns(df, categorical_columns):
    label_encoders = {}
    for col in categorical_columns:
        le = LabelEncoder()
        df[col] = df[col].astype(str)
        df[col] = le.fit_transform(df[col])
        label_encoders[col] = le
    return df, label_encoders

# Scaling continuous columns
def scale_numerical_columns(df, numerical_columns):
    scaler = StandardScaler()
    df[numerical_columns] = df[numerical_columns].replace([np.inf, -np.inf], np.nan)
    df[numerical_columns] = df[numerical_columns].fillna(df[numerical_columns].mean())
    df[numerical_columns] = scaler.fit_transform(df[numerical_columns])
    return df, scaler


# 2. MEMORY-BASED COLLABORATIVE FILTERING CLASS

from memory_cf import MemoryBasedCF


# 3. TRAINING THE CF MODEL

def train_collaborative_filtering_memory(df, user_col, item_col, rating_col):
    """
    Trains a memory-based collaborative filtering model:
      - Creates a pivot table for users and items.
      - Normalizes ratings to be between 0 and 1.
      - Computes cosine similarity.
    Returns an instance of MemoryBasedCF.
    """
    df[user_col] = df[user_col].astype(str)
    df[item_col] = df[item_col].astype(str)
    
    # Normalize the rating column to a [0, 1] scale.
    df[rating_col] = (df[rating_col] - df[rating_col].min()) / (df[rating_col].max() - df[rating_col].min())
    
    df_pivot = df.pivot_table(index=user_col, columns=item_col, values=rating_col, aggfunc='mean', fill_value=0)
    cf_model = MemoryBasedCF(df_pivot)
    print("✅ Memory-Based Collaborative Filtering model trained.")
    return cf_model, df_pivot

from sklearn.metrics import precision_score, recall_score
import numpy as np

from sklearn.metrics import accuracy_score, f1_score

def evaluate_cf_model_precision_recall_accuracy_f1(cf_model, df_pivot, threshold=0.1):
    """
    Evaluates the collaborative filtering model using Precision, Recall, Accuracy, and F1-score based on 
    predicted and actual ratings, considering ratings above a threshold as relevant.
    """
    actual_relevance = []
    predicted_relevance = []
    
    for user in df_pivot.index:
        actual = df_pivot.loc[user].values
        predicted = cf_model.predict(user)
        
        # Convert ratings to relevance (1 if relevant, 0 if not)
        actual_relevance.extend((actual >= threshold).astype(int))
        predicted_relevance.extend((predicted >= threshold).astype(int))
    
    # Compute Precision, Recall, Accuracy, and F1-Score
    precision = precision_score(actual_relevance, predicted_relevance)
    recall = recall_score(actual_relevance, predicted_relevance)
    accuracy = accuracy_score(actual_relevance, predicted_relevance)
    f1 = f1_score(actual_relevance, predicted_relevance)

    print(f"📊 Collaborative Filtering Evaluation Metrics:")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1 Score: {f1:.4f}")
    
    return precision, recall, accuracy, f1



# 5. RUN TRAINING & EVALUATION WITH PRECISION/RECALL

# Train the collaborative filtering model
cf_model, df_pivot = train_collaborative_filtering_memory(df, 'userName', 'univName', 'admit')

# Evaluate the CF model using Precision, Recall, Accuracy, and F1-Score
precision, recall, accuracy, f1 = evaluate_cf_model_precision_recall_accuracy_f1(cf_model, df_pivot)

# Save the trained model
joblib.dump(cf_model, "models/university_models/cf_model_memory.pkl")

print("\n✅ Collaborative Filtering model saved successfully.")





def get_top_similar_users(user_id, similarity_matrix, top_n=3):
    similar_scores = similarity_matrix.loc[user_id].drop(user_id)  
    top_similar = similar_scores.sort_values(ascending=False).head(top_n)
    return top_similar

def get_top_recommendations(user_id, cf_model, top_n=3):
    predictions = cf_model.predict(user_id)
    sorted_indices = np.argsort(predictions)[::-1]  
    university_names = df_pivot.columns[sorted_indices][:top_n]
    scores = predictions[sorted_indices][:top_n]
    return list(zip(university_names, scores))

def format_similar_users_table(user_id, similarity_matrix, top_n=3):
    top_similar = get_top_similar_users(user_id, similarity_matrix, top_n)
    return pd.DataFrame({
        'Target User': user_id,
        'Similar User': top_similar.index,
        'Cosine Similarity': top_similar.values
    })

def format_top_recommendations_table(user_id, cf_model, top_n=3):
    top_recs = get_top_recommendations(user_id, cf_model, top_n)
    return pd.DataFrame(top_recs, columns=['Recommended University', 'Predicted Score'])


user_id = 'nishanthvasudeva'
similarity_matrix = pd.DataFrame(
    cosine_similarity(df_pivot),
    index=df_pivot.index,
    columns=df_pivot.index
)

if user_id in df_pivot.index:
    similar_users_df = format_similar_users_table(user_id, similarity_matrix)
    recommendations_df = format_top_recommendations_table(user_id, cf_model)

    print(f"📌 Sample User: {user_id}\n")
    print("Top Similar Users:")
    print(similar_users_df)

    print("\nTop University Recommendations:")
    print(recommendations_df)
else:
    print(f"❌ User ID '{user_id}' not found in dataset.")





✅ Memory-Based Collaborative Filtering model trained.
📊 Collaborative Filtering Evaluation Metrics:
Precision: 0.9759
Recall: 0.7358
Accuracy: 0.9898
F1 Score: 0.8390

✅ Collaborative Filtering model saved successfully.
📌 Sample User: nishanthvasudeva

Top Similar Users:
        Target User Similar User  Cosine Similarity
0  nishanthvasudeva      sharvin                1.0
1  nishanthvasudeva  suhaibsiraj                1.0
2  nishanthvasudeva   janhavi172                1.0

Top University Recommendations:
          Recommended University  Predicted Score
0  university of texas arlington         0.492752
1         wayne state university         0.109585
2     university of texas dallas         0.089681


In [8]:

def get_top_similar_users(user_id, similarity_matrix, top_n=3):
    similar_scores = similarity_matrix.loc[user_id].drop(user_id)  
    top_similar = similar_scores.sort_values(ascending=False).head(top_n)
    return top_similar

def get_top_recommendations(user_id, cf_model, top_n=3):
    predictions = cf_model.predict(user_id)
    sorted_indices = np.argsort(predictions)[::-1]  
    university_names = df_pivot.columns[sorted_indices][:top_n]
    scores = predictions[sorted_indices][:top_n]
    return list(zip(university_names, scores))

def format_similar_users_table(user_id, similarity_matrix, user_id_map, top_n=3):
    top_similar = get_top_similar_users(user_id, similarity_matrix, top_n)
    return pd.DataFrame({
        'Target User': user_id_map[user_id],
        'Similar User': [user_id_map[u] for u in top_similar.index],
        'Cosine Similarity': top_similar.values
    })

def format_top_recommendations_table(user_id, cf_model, top_n=3):
    top_recs = get_top_recommendations(user_id, cf_model, top_n)
    return pd.DataFrame(top_recs, columns=['Recommended University', 'Predicted Score'])


# Sample user for university recommendation
user_id = 'gvr'

# Create anonymized ID mapping for users in df_pivot
user_id_map = {user: f"User_{i}" for i, user in enumerate(df_pivot.index)}

# Compute similarity matrix for university CF
similarity_matrix = pd.DataFrame(
    cosine_similarity(df_pivot),
    index=df_pivot.index,
    columns=df_pivot.index
)

if user_id in df_pivot.index:
    similar_users_df = format_similar_users_table(user_id, similarity_matrix, user_id_map)
    recommendations_df = format_top_recommendations_table(user_id, cf_model)

    print(f"📌 Sample User: {user_id_map[user_id]}\n")
    print("Top Similar Users:")
    print(similar_users_df.to_string(index=False))

    print("\nTop University Recommendations:")
    print(recommendations_df.to_string(index=False))
else:
    print(f"❌ User ID '{user_id}' not found in dataset.")


📌 Sample User: User_4317

Top Similar Users:
Target User Similar User  Cosine Similarity
  User_4317    User_5489           0.866025
  User_4317    User_8276           0.816497
  User_4317   User_10568           0.816497

Top University Recommendations:
           Recommended University  Predicted Score
university of southern california         0.247129
               clemson university         0.087625
         arizona state university         0.064133


In [2]:
import os
import numpy as np
import pandas as pd
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report


# 1. LOAD SAVED MODELS & PREPROCESSING OBJECTS

save_dir = "models/university_models"

# Load the trained Random Forest model 
models_univ = joblib.load(os.path.join(save_dir, 'rf_university.pkl'))
rf_model = models_univ['Random Forest'][0]  

# Load collaborative filtering model (memory-based CF)
cf_model = joblib.load(os.path.join(save_dir, 'cf_model_memory.pkl'))

# Load preprocessing objects
scaler_univ = joblib.load(os.path.join(save_dir, 'scaler_university.pkl'))
label_encoders_univ = joblib.load(os.path.join(save_dir, 'label_encoders_university.pkl'))
le_y_univ = joblib.load(os.path.join(save_dir, 'le_y_univ.pkl'))
one_hot_columns = joblib.load(os.path.join(save_dir, "one_hot_columns_university.pkl"))

print("✅ All models and preprocessing objects loaded successfully.")

# Define the feature set used during training:
univ_categorical = ['ugCollege', 'specialization_category']
univ_numerical   = ['toeflScore', 'greV', 'greQ', 'greA', 'normalized_cgpa']


# 2. HYBRID RECOMMENDER FUNCTION

import numpy as np
def hybrid_university_recommendation(input_df, content_model, collab_model, label_encoder, 
                                     is_university=False, state=None, one_hot_columns=None):
    """
    Generates university recommendations by combining:
      - Content-based predictions (using the Random Forest classifier).
      - Collaborative filtering predictions (using memory-based CF).
    Includes an optional state filter for university recommendations.
    Returns the top 3 recommended universities (decoded).
    """
    input_features = input_df.drop(columns=['userName'], errors='ignore')
    
    # --- Content-based predictions ---
    if is_university:
        if state is None and one_hot_columns is not None:

            probs_list = []
            for state_col in one_hot_columns:
                temp_df = input_features.copy()
                temp_df[one_hot_columns] = 0  
                temp_df[state_col] = 1  
                try:
                    probs = content_model.predict_proba(temp_df)[0]
                except Exception:
                    probs = content_model.predict(temp_df)
                probs_list.append(probs)
            content_probs = np.mean(probs_list, axis=0)
        else:
            # Use the input_df directly if a specific state is selected
            try:
                content_probs = content_model.predict_proba(input_features)[0]
            except Exception:
                content_probs = content_model.predict(input_features)
    else:
        # Default content-based prediction
        try:
            content_probs = content_model.predict_proba(input_features)[0]
        except Exception:
            content_probs = content_model.predict(input_features)
    
    # --- Collaborative filtering predictions ---
    if 'userName' in input_df.columns:
        input_user = input_df.iloc[0]['userName']
        collab_probs = collab_model.predict(input_user)
    else:
        collab_probs = np.zeros_like(content_probs)
    
    # --- Combine predictions using variance-based weighting ---
    alpha = np.var(content_probs) / (np.var(content_probs) + np.var(collab_probs) + 1e-5)
    final_probs = (content_probs * alpha) + (collab_probs * (1 - alpha))
    
    # Select top 3 recommendations
    top_n = 3
    top_indices = np.argsort(final_probs)[-top_n:][::-1]
    recommendations = label_encoder.inverse_transform(top_indices)
    # If a specific state was requested, filter the recommendations accordingly.
    if state is not None:
        filtered_recommendations = [rec for rec in final_recommendations 
                                    if rec in df[df['univ_state'] == state]['univName'].values]
        if len(filtered_recommendations) < top_n:
            st.warning(f"⚠️ Only {len(filtered_recommendations)} universities found in {state}. Expanding recommendations.")
            filtered_recommendations = final_recommendations
        return filtered_recommendations
            
    return recommendations


# 3. EVALUATION FUNCTION FOR THE HYBRID MODEL

def evaluate_hybrid_model(X_test, y_test, rf_model, cf_model, le_y, one_hot_columns):
    """
    Evaluates the hybrid recommender system by:
      - For each test sample, obtaining hybrid recommendations.
      - Taking the top recommendation as the predicted label.
      - Comparing against the true label.
    Prints accuracy and a classification report.
    """
    y_test_encoded = le_y.transform(y_test)
    y_pred = []
    
    for i in range(len(X_test)):
        sample = X_test.iloc[[i]].copy()
        if 'userName' not in sample.columns:
            sample['userName'] = "dummy_user"
        
        recs = hybrid_university_recommendation(sample, rf_model, cf_model, le_y, one_hot_columns)
        # Use the top recommendation as the prediction
        if len(recs) > 0:
            pred = le_y.transform([recs[0]])[0]
        else:
            pred = -1
        y_pred.append(pred)
    
    # Remove invalid predictions if any
    valid_idx = [i for i, p in enumerate(y_pred) if p != -1]
    y_true_valid = np.array([y_test_encoded[i] for i in valid_idx])
    y_pred_valid = np.array([y_pred[i] for i in valid_idx])
    
    acc = accuracy_score(y_true_valid, y_pred_valid)
    print("\n📊 Hybrid Model Evaluation Results:")
    print(f"Hybrid Model Accuracy: {acc:.4f}")
    print(classification_report(y_true_valid, y_pred_valid))


# 4. PREPROCESS TEST DATA

df_test = pd.read_csv('./data/categorized_specializations.csv')

min_samples_per_class = 50
valid_classes = df_test['univName'].value_counts()
valid_classes = valid_classes[valid_classes >= min_samples_per_class].index
df_test = df_test[df_test['univName'].isin(valid_classes)]


def encode_categorical_columns(df, categorical_columns):
    label_encoders = {}
    for col in categorical_columns:
        le = LabelEncoder()
        df[col] = df[col].astype(str)
        df[col] = le.fit_transform(df[col])
        label_encoders[col] = le
    return df, label_encoders

def scale_numerical_columns(df, numerical_columns):
    scaler = StandardScaler()
    df[numerical_columns] = df[numerical_columns].replace([np.inf, -np.inf], np.nan)
    df[numerical_columns] = df[numerical_columns].fillna(df[numerical_columns].mean())
    df[numerical_columns] = scaler.fit_transform(df[numerical_columns])
    return df, scaler

df_test, _ = encode_categorical_columns(df_test, univ_categorical)
df_test, _ = scale_numerical_columns(df_test, univ_numerical)

df_test = pd.get_dummies(df_test, columns=['univ_state'], drop_first=True)

for col in one_hot_columns:
    if col not in df_test.columns:
        df_test[col] = 0

X_test = df_test[univ_categorical + univ_numerical + one_hot_columns]
y_test = df_test['univName']


# 5. EVALUATE THE HYBRID RECOMMENDER

evaluate_hybrid_model(X_test, y_test, rf_model, cf_model, le_y_univ, one_hot_columns)

✅ All models and preprocessing objects loaded successfully.

📊 Hybrid Model Evaluation Results:
Hybrid Model Accuracy: 0.8558
              precision    recall  f1-score   support

           0       0.95      0.97      0.96      3727
           1       0.54      0.69      0.60        54
           2       0.87      0.86      0.87      1600
           3       1.00      1.00      1.00       933
           4       0.57      0.67      0.62       518
           5       0.65      0.69      0.67       685
           6       0.89      0.93      0.91       461
           7       1.00      1.00      1.00      2368
           8       0.85      0.84      0.85       228
           9       0.79      0.82      0.80       100
          10       0.93      0.96      0.94       545
          11       0.59      0.62      0.61       502
          12       0.94      0.95      0.94      4149
          13       0.93      0.93      0.93      2242
          14       0.79      0.79      0.79       112
         

In [4]:
import os
import joblib


model_dir = "models/university_models"
compressed_dir = "models/university_models_compressed"
os.makedirs(compressed_dir, exist_ok=True)


print("🔄 Loading existing models...")
rf_model_dict = joblib.load(os.path.join(model_dir, "rf_university.pkl"))
cf_model = joblib.load(os.path.join(model_dir, "cf_model_memory.pkl"))
scaler = joblib.load(os.path.join(model_dir, "scaler_university.pkl"))
label_encoders = joblib.load(os.path.join(model_dir, "label_encoders_university.pkl"))
le_y = joblib.load(os.path.join(model_dir, "le_y_univ.pkl"))
one_hot_columns = joblib.load(os.path.join(model_dir, "one_hot_columns_university.pkl"))

print("✅ Models loaded.")


print("💾 Saving compressed versions...")

joblib.dump(rf_model_dict, os.path.join(compressed_dir, "rf_university.pkl"), compress=9)
joblib.dump(cf_model, os.path.join(compressed_dir, "cf_model_memory.pkl"), compress=9)
joblib.dump(scaler, os.path.join(compressed_dir, "scaler_university.pkl"), compress=9)
joblib.dump(label_encoders, os.path.join(compressed_dir, "label_encoders_university.pkl"), compress=9)
joblib.dump(le_y, os.path.join(compressed_dir, "le_y_univ.pkl"), compress=9)
joblib.dump(one_hot_columns, os.path.join(compressed_dir, "one_hot_columns_university.pkl"), compress=9)

print("✅ All models saved with compression at level 3.")


🔄 Loading existing models...
✅ Models loaded.
💾 Saving compressed versions...
✅ All models saved with compression at level 3.
