In [2]:
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import LabelEncoder, StandardScaler

# --------------------------
# 1. LOAD THE DATASET
# --------------------------
# Minimum number of rows a university needs in order to be kept.
min_samples_per_class = 50

df = pd.read_csv('./data/categorized_specializations.csv')

# Reduce class skew: keep only universities that appear often enough.
univ_counts = df['univName'].value_counts()
valid_classes = univ_counts.loc[univ_counts >= min_samples_per_class].index
frequent_rows = df['univName'].isin(valid_classes)

# One-hot encode univ_state, dropping the first level.
df = pd.get_dummies(df.loc[frequent_rows], columns=['univ_state'], drop_first=True)

# Label encoding categorical columns
def encode_categorical_columns(df, categorical_columns):
    """
    Label-encode the given categorical columns.

    Each column is cast to ``str`` and then encoded with its own freshly
    fitted ``LabelEncoder``.

    Parameters:
      - df: source DataFrame. It is NOT modified; encoding happens on a copy,
        so passing a filtered slice no longer writes into (or warns about)
        the caller's frame.
      - categorical_columns: iterable of column names to encode.

    Returns:
      (encoded_df, label_encoders) where ``label_encoders`` maps each column
      name to the encoder fitted on that column.
    """
    encoded = df.copy()
    label_encoders = {}
    for col in categorical_columns:
        encoder = LabelEncoder()
        # Cast first so missing values and mixed types become plain strings.
        encoded[col] = encoder.fit_transform(encoded[col].astype(str))
        label_encoders[col] = encoder
    return encoded, label_encoders

# Scaling continuous columns
def scale_numerical_columns(df, numerical_columns):
    """
    Standardise the given numerical columns with a freshly fitted StandardScaler.

    Infinite values are first turned into NaN, then every NaN is replaced by
    the mean of its column, and finally the columns are scaled.

    Parameters:
      - df: source DataFrame. It is NOT modified; scaling happens on a copy.
      - numerical_columns: iterable of column names to scale.

    Returns:
      (scaled_df, scaler). When ``numerical_columns`` is empty the copy is
      returned unchanged together with an unfitted scaler.
    """
    columns = list(numerical_columns)
    scaled = df.copy()
    scaler = StandardScaler()
    if not columns:
        # Nothing to scale; avoid fitting a scaler on zero features.
        return scaled, scaler

    cleaned = scaled[columns].replace([np.inf, -np.inf], np.nan)
    cleaned = cleaned.fillna(cleaned.mean())
    scaled[columns] = scaler.fit_transform(cleaned)
    return scaled, scaler

# --------------------------
# 2. MEMORY-BASED COLLABORATIVE FILTERING CLASS
# --------------------------
class MemoryBasedCF:
    """
    User-based, memory-based collaborative filter.

    Holds a user x item pivot table together with the pairwise cosine
    similarity between its rows (users).
    """
    def __init__(self, pivot):
        self.pivot = pivot
        self.user_ids = list(pivot.index)
        self.item_ids = list(pivot.columns)
        # Row-to-row (user-to-user) cosine similarity of the pivot table.
        self.similarity = cosine_similarity(pivot)

    def predict(self, input_user):
        """
        Score every item for ``input_user``.

        The scores are the similarity-weighted sum of all users' pivot rows.
        A user that is not in the pivot index gets the same weight for every
        known user. When the scores add up to a positive value they are
        divided by that total; otherwise they are returned as computed.
        """
        if input_user not in self.user_ids:
            # Unknown user: weight every known user equally.
            n_users = len(self.user_ids)
            weights = np.ones(n_users) / n_users
        else:
            weights = self.similarity[self.user_ids.index(input_user)]

        scores = np.dot(weights, self.pivot.values)
        total = scores.sum()
        return scores / total if total > 0 else scores

# --------------------------
# 3. TRAINING THE CF MODEL
# --------------------------
def train_collaborative_filtering_memory(df, user_col, item_col, rating_col):
    """
    Trains a memory-based collaborative filtering model:
      - Casts user and item identifiers to strings.
      - Min-max normalizes ratings to the [0, 1] range.
      - Builds a user x item pivot table (mean rating, missing pairs = 0).
      - Wraps the pivot in MemoryBasedCF, which computes cosine similarity.

    The input frame is not modified: only the three relevant columns are
    copied and transformed.

    Parameters:
      - df: DataFrame holding one row per (user, item, rating) observation.
      - user_col, item_col, rating_col: names of the respective columns.

    Returns:
      (cf_model, df_pivot): the MemoryBasedCF instance and the pivot table it
      was built from.
    """
    ratings = df[[user_col, item_col, rating_col]].copy()
    ratings[user_col] = ratings[user_col].astype(str)
    ratings[item_col] = ratings[item_col].astype(str)
    ratings[rating_col] = ratings[rating_col].astype(float)

    # Normalize the rating column to a [0, 1] scale.
    rating_min = ratings[rating_col].min()
    rating_span = ratings[rating_col].max() - rating_min
    if rating_span > 0:
        ratings[rating_col] = (ratings[rating_col] - rating_min) / rating_span
    else:
        # All observed ratings are identical, so min-max scaling would divide
        # by zero. Mark every observed rating as 1.0 so it stays distinct from
        # the 0 used for unobserved (user, item) pairs; missing ratings stay NaN.
        ratings[rating_col] = ratings[rating_col].where(ratings[rating_col].isna(), 1.0)

    df_pivot = ratings.pivot_table(
        index=user_col, columns=item_col, values=rating_col, aggfunc='mean', fill_value=0
    )
    cf_model = MemoryBasedCF(df_pivot)
    print("✅ Memory-Based Collaborative Filtering model trained.")
    return cf_model, df_pivot

# --------------------------
# 4. EVALUATION METRICS
# --------------------------
def evaluate_cf_model(cf_model, df_pivot):
    """
    Reports MAE and RMSE between the pivot-table ratings and the model output.

    For every user in ``df_pivot`` the stored row is compared element-wise
    with ``cf_model.predict(user)``; all users are pooled before the two
    metrics are computed. The metrics are printed and returned as
    ``(mae, rmse)``.

    NOTE(review): MemoryBasedCF.predict rescales its scores to sum to 1 while
    the pivot rows hold [0, 1] ratings, so the two sides are on different
    scales - confirm that this is the intended comparison.
    """
    truth, estimates = [], []
    for user_id in df_pivot.index:
        truth += list(df_pivot.loc[user_id].values)
        estimates += list(cf_model.predict(user_id))

    mae = mean_absolute_error(truth, estimates)
    rmse = np.sqrt(mean_squared_error(truth, estimates))

    report_lines = (
        "📊 Collaborative Filtering Evaluation Metrics:",
        f"Mean Absolute Error (MAE): {mae:.4f}",
        f"Root Mean Square Error (RMSE): {rmse:.4f}",
    )
    for line in report_lines:
        print(line)

    return mae, rmse

# --------------------------
# 5. RUN TRAINING & EVALUATION
# --------------------------
cf_model, df_pivot = train_collaborative_filtering_memory(df, 'userName', 'univName', 'admit')

# Evaluate the CF model
mae, rmse = evaluate_cf_model(cf_model, df_pivot)

# Save the trained model into the directory the hybrid-evaluation cell loads
# its artefacts from ("models/university_models"); previously the file was
# written to "models/", where that cell never looks for it.
# NOTE(review): the pickle references the MemoryBasedCF class defined in this
# notebook, so that class must be defined in whichever kernel loads the file.
cf_model_dir = os.path.join("models", "university_models")
os.makedirs(cf_model_dir, exist_ok=True)  # joblib.dump does not create directories
joblib.dump(cf_model, os.path.join(cf_model_dir, "cf_model_memory.pkl"))

print("\n✅ Collaborative Filtering model saved successfully.")


✅ Memory-Based Collaborative Filtering model trained.
📊 Collaborative Filtering Evaluation Metrics:
Mean Absolute Error (MAE): 0.0381
Root Mean Square Error (RMSE): 0.1542

✅ Collaborative Filtering model saved successfully.


In [None]:
import os
import numpy as np
import pandas as pd
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report

# --------------------------
# 1. LOAD SAVED MODELS & PREPROCESSING OBJECTS
# --------------------------
save_dir = "models/university_models"


def _load_artifact(file_name):
    """Load one joblib artefact stored under ``save_dir``."""
    return joblib.load(os.path.join(save_dir, file_name))


# Content-based model: the file holds a dict; the 'Random Forest' entry is a
# sequence whose first element is the fitted model.
models_univ = _load_artifact('rf_university.pkl')
rf_model = models_univ['Random Forest'][0]

# Memory-based collaborative filtering model.
# NOTE(review): unpickling needs the MemoryBasedCF class to be defined in the
# running kernel.
cf_model = _load_artifact('cf_model_memory.pkl')

# Preprocessing objects saved alongside the models.
scaler_univ = _load_artifact('scaler_university.pkl')
label_encoders_univ = _load_artifact('label_encoders_university.pkl')
le_y_univ = _load_artifact('le_y_univ.pkl')
one_hot_columns = _load_artifact("one_hot_columns_university.pkl")

print("✅ All models and preprocessing objects loaded successfully.")

# Feature columns fed to the content-based model, split by type.
univ_categorical = ['ugCollege', 'specialization_category']
univ_numerical   = ['toeflScore', 'greV', 'greQ', 'greA', 'normalized_cgpa']

# --------------------------
# 2. HYBRID RECOMMENDER FUNCTION
# --------------------------
def _align_collab_scores(collab_scores, collab_model, label_encoder):
    """Re-order CF scores so that position i refers to label_encoder.classes_[i]."""
    item_ids = getattr(collab_model, 'item_ids', None)
    class_names = getattr(label_encoder, 'classes_', None)
    if item_ids is None or class_names is None:
        # No names to align on; keep the scores in their given order.
        return collab_scores

    score_by_item = {str(item): score for item, score in zip(item_ids, collab_scores)}
    if not any(str(name) in score_by_item for name in class_names):
        # The two vocabularies do not overlap at all (e.g. different id
        # schemes); fall back to the positional scores instead of zeroing them.
        return collab_scores
    # Classes the CF model has never seen get a score of 0.
    return np.array([score_by_item.get(str(name), 0.0) for name in class_names], dtype=float)


def hybrid_university_recommendation(input_df, content_model, collab_model, label_encoder,
                                     one_hot_columns, top_n=3):
    """
    Generates university recommendations by combining:
      - Content-based predictions (class probabilities of ``content_model``).
      - Collaborative filtering predictions (``collab_model.predict(userName)``).

    Parameters:
      - input_df: DataFrame whose first row is the applicant. A 'userName'
        column, if present, is used for the CF lookup and removed before the
        content model is called.
      - content_model: classifier exposing ``predict_proba`` (preferred) or
        only ``predict``. Its outputs are treated as label-encoded class
        indices, as required by the final ``inverse_transform`` call.
      - collab_model: object with ``predict(user)``; when it also exposes
        ``item_ids`` the scores are aligned by name with
        ``label_encoder.classes_`` instead of being combined by position.
      - label_encoder: encoder used to decode class indices into names.
      - one_hot_columns: accepted for interface compatibility; not used here.
      - top_n: number of recommendations to return (default 3).

    Returns:
      The ``top_n`` highest-scoring universities, best first, decoded with
      ``label_encoder.inverse_transform``.

    Raises:
      ValueError: if the content and CF score vectors differ in length.
    """
    # Remove "userName" column if present (content model was trained without it)
    input_features = input_df.drop(columns=['userName'], errors='ignore')

    # --- Content-based predictions ---
    if hasattr(content_model, 'predict_proba'):
        # Errors raised here are real problems (e.g. feature mismatch) and are
        # no longer swallowed by a blanket except.
        content_probs = np.asarray(content_model.predict_proba(input_features)[0], dtype=float)
    else:
        # The model only yields a hard label: turn it into a one-hot vector so
        # it can be blended like a probability distribution.
        predicted_class = int(np.asarray(content_model.predict(input_features)).ravel()[0])
        content_probs = np.zeros(len(label_encoder.classes_), dtype=float)
        content_probs[predicted_class] = 1.0

    # --- Collaborative filtering predictions ---
    # For CF, we use the userName if present; otherwise, use zeros.
    if 'userName' in input_df.columns:
        input_user = input_df.iloc[0]['userName']
        collab_probs = np.asarray(collab_model.predict(input_user), dtype=float)
        collab_probs = _align_collab_scores(collab_probs, collab_model, label_encoder)
    else:
        collab_probs = np.zeros_like(content_probs)

    if collab_probs.shape != content_probs.shape:
        raise ValueError(
            "Content-based and collaborative score vectors have different shapes: "
            f"{content_probs.shape} vs {collab_probs.shape}"
        )

    # --- Combine predictions using variance-based weighting ---
    # The 1e-5 term keeps the ratio defined when both variances are zero.
    content_var = np.var(content_probs)
    alpha = content_var / (content_var + np.var(collab_probs) + 1e-5)
    final_probs = (content_probs * alpha) + (collab_probs * (1 - alpha))

    # Select the top recommendations, highest score first.
    top_n = max(int(top_n), 0)
    top_indices = np.argsort(final_probs)[::-1][:top_n]
    recommendations = label_encoder.inverse_transform(top_indices)

    return recommendations

# --------------------------
# 3. EVALUATION FUNCTION FOR THE HYBRID MODEL
# --------------------------
def evaluate_hybrid_model(X_test, y_test, rf_model, cf_model, le_y, one_hot_columns):
    """
    Scores the hybrid recommender on labelled samples.

    Each row of ``X_test`` is passed on its own to
    ``hybrid_university_recommendation``; the first recommendation is
    re-encoded with ``le_y`` and used as the predicted class. Rows for which
    no recommendation comes back are excluded from scoring. Accuracy and a
    classification report are printed; nothing is returned.
    """
    encoded_truth = le_y.transform(y_test)
    # CF needs a user name; rows without one get a placeholder.
    needs_placeholder_user = 'userName' not in X_test.columns

    predicted = []
    for row_pos in range(len(X_test)):
        row_frame = X_test.iloc[[row_pos]].copy()
        if needs_placeholder_user:
            row_frame['userName'] = "dummy_user"

        top_picks = hybrid_university_recommendation(row_frame, rf_model, cf_model, le_y, one_hot_columns)
        # -1 marks "no recommendation produced" and is filtered out below.
        predicted.append(le_y.transform([top_picks[0]])[0] if len(top_picks) > 0 else -1)

    kept = [pos for pos, label in enumerate(predicted) if label != -1]
    y_true_valid = np.array([encoded_truth[pos] for pos in kept])
    y_pred_valid = np.array([predicted[pos] for pos in kept])

    acc = accuracy_score(y_true_valid, y_pred_valid)
    print("\n📊 Hybrid Model Evaluation Results:")
    print(f"Hybrid Model Accuracy: {acc:.4f}")
    print(classification_report(y_true_valid, y_pred_valid))

# --------------------------
# 4. PREPROCESS TEST DATA
# --------------------------
df_test = pd.read_csv('./data/categorized_specializations.csv')

# Filter out universities with too few samples (as in training)
min_samples_per_class = 50
valid_classes = df_test['univName'].value_counts()
valid_classes = valid_classes[valid_classes >= min_samples_per_class].index
# Take an explicit copy: the encoding/scaling helpers below assign columns on
# this frame, which would otherwise be a filtered slice of the raw data and
# trigger pandas' SettingWithCopyWarning.
df_test = df_test[df_test['univName'].isin(valid_classes)].copy()

# --- Apply label encoding for categorical columns ---
def encode_categorical_columns(df, categorical_columns):
    """
    Label-encode the given categorical columns.

    Each column is cast to ``str`` and then encoded with its own freshly
    fitted ``LabelEncoder``.

    Parameters:
      - df: source DataFrame. It is NOT modified; encoding happens on a copy,
        so passing a filtered slice no longer writes into (or warns about)
        the caller's frame.
      - categorical_columns: iterable of column names to encode.

    Returns:
      (encoded_df, label_encoders) where ``label_encoders`` maps each column
      name to the encoder fitted on that column.
    """
    encoded = df.copy()
    label_encoders = {}
    for col in categorical_columns:
        encoder = LabelEncoder()
        # Cast first so missing values and mixed types become plain strings.
        encoded[col] = encoder.fit_transform(encoded[col].astype(str))
        label_encoders[col] = encoder
    return encoded, label_encoders

def scale_numerical_columns(df, numerical_columns):
    """
    Standardise the given numerical columns with a freshly fitted StandardScaler.

    Infinite values are first turned into NaN, then every NaN is replaced by
    the mean of its column, and finally the columns are scaled.

    Parameters:
      - df: source DataFrame. It is NOT modified; scaling happens on a copy.
      - numerical_columns: iterable of column names to scale.

    Returns:
      (scaled_df, scaler). When ``numerical_columns`` is empty the copy is
      returned unchanged together with an unfitted scaler.
    """
    columns = list(numerical_columns)
    scaled = df.copy()
    scaler = StandardScaler()
    if not columns:
        # Nothing to scale; avoid fitting a scaler on zero features.
        return scaled, scaler

    cleaned = scaled[columns].replace([np.inf, -np.inf], np.nan)
    cleaned = cleaned.fillna(cleaned.mean())
    scaled[columns] = scaler.fit_transform(cleaned)
    return scaled, scaler

# NOTE(review): the encoders and scaler are re-fitted on this data; the loaded
# label_encoders_univ / scaler_univ objects are not applied here - confirm
# this matches how the content model's training features were produced.
df_test, _ = encode_categorical_columns(df_test, univ_categorical)
df_test, _ = scale_numerical_columns(df_test, univ_numerical)

# One-hot encode "univ_state", dropping the first level.
df_test = pd.get_dummies(df_test, columns=['univ_state'], drop_first=True)

# Add any one-hot column recorded in one_hot_columns that is absent here, filled with 0.
absent_one_hot = [col for col in one_hot_columns if col not in df_test.columns]
for col in absent_one_hot:
    df_test[col] = 0

# Select features in the order: categorical, numerical, one-hot.
feature_order = univ_categorical + univ_numerical + one_hot_columns
X_test = df_test[feature_order]
y_test = df_test['univName']

# --------------------------
# 5. EVALUATE THE HYBRID RECOMMENDER
# --------------------------
evaluate_hybrid_model(X_test, y_test, rf_model, cf_model, le_y_univ, one_hot_columns)


✅ All models and preprocessing objects loaded successfully.
