In [2]:
import pandas as pd
import numpy as np
import joblib
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_absolute_error, mean_squared_error

# --------------------------
# 1. LOAD THE DATASET
# --------------------------
df = pd.read_csv('./data/categorized_specializations.csv')

# Reduce class skew: keep only universities that occur at least
# `min_samples_per_class` times in the raw data.
min_samples_per_class = 50
valid_classes = (
    df['univName']
    .value_counts()
    .loc[lambda counts: counts >= min_samples_per_class]
    .index
)
df = df[df['univName'].isin(valid_classes)]
# Expand `univ_state` into indicator columns (first category dropped).
df = pd.get_dummies(df, columns=['univ_state'], drop_first=True)

# Label encoding categorical columns
def encode_categorical_columns(df, categorical_columns):
    """
    Label-encode the given columns of `df`, fitting one encoder per column.

    Each column is cast to `str` and then replaced by the integer codes
    produced by a freshly fitted `LabelEncoder`.

    Parameters:
        df: DataFrame holding the columns to encode. It is modified in place.
        categorical_columns: iterable of column names to encode.

    Returns:
        (df, label_encoders) where `label_encoders` maps each column name to
        the `LabelEncoder` that was fitted on it.
    """
    # Imported locally: this cell's import block does not bring in
    # sklearn.preprocessing, so without this the function raises NameError
    # when the notebook is run on a fresh kernel.
    from sklearn.preprocessing import LabelEncoder

    label_encoders = {}
    for col in categorical_columns:
        le = LabelEncoder()
        df[col] = df[col].astype(str)
        df[col] = le.fit_transform(df[col])
        label_encoders[col] = le
    return df, label_encoders

# Scaling continuous columns
def scale_numerical_columns(df, numerical_columns):
    """
    Clean and standardize the given numeric columns of `df`.

    Infinite values are turned into NaN, NaNs are filled with the column mean,
    and the columns are then replaced by the output of a freshly fitted
    `StandardScaler`.

    Parameters:
        df: DataFrame holding the columns to scale. It is modified in place.
        numerical_columns: list of column names to clean and scale.

    Returns:
        (df, scaler) where `scaler` is the fitted `StandardScaler`.
    """
    # Imported locally: this cell's import block does not bring in
    # sklearn.preprocessing, so without this the function raises NameError
    # when the notebook is run on a fresh kernel.
    from sklearn.preprocessing import StandardScaler

    scaler = StandardScaler()
    df[numerical_columns] = df[numerical_columns].replace([np.inf, -np.inf], np.nan)
    df[numerical_columns] = df[numerical_columns].fillna(df[numerical_columns].mean())
    df[numerical_columns] = scaler.fit_transform(df[numerical_columns])
    return df, scaler

# --------------------------
# 2. MEMORY-BASED COLLABORATIVE FILTERING CLASS
# --------------------------
class MemoryBasedCF:
    """
    User-based (memory-based) collaborative filter.

    Stores a user x item rating table together with the user-user cosine
    similarity matrix computed from its rows.
    """
    def __init__(self, pivot):
        # Rating table: rows are users, columns are items (universities).
        self.pivot = pivot
        # Row i of this matrix holds the similarity of user i to every user.
        self.similarity = cosine_similarity(pivot)
        self.user_ids = list(pivot.index)
        self.item_ids = list(pivot.columns)

    def predict(self, input_user):
        """
        Score every item for `input_user`.

        Known users are scored with their own similarity row; unknown users
        get an equal weight for every stored user. The similarity-weighted
        sum of ratings is rescaled to sum to 1 when its total is positive,
        otherwise it is returned unchanged.
        """
        user_count = len(self.user_ids)
        is_unknown = input_user not in self.user_ids
        neighbour_weights = (
            np.ones(user_count) / user_count
            if is_unknown
            else self.similarity[self.user_ids.index(input_user)]
        )

        item_scores = np.dot(neighbour_weights, self.pivot.values)
        total = item_scores.sum()
        return item_scores / total if total > 0 else item_scores

# --------------------------
# 3. TRAINING THE CF MODEL
# --------------------------
def train_collaborative_filtering_memory(df, user_col, item_col, rating_col):
    """
    Trains a memory-based collaborative filtering model:
      - Min-max normalizes the rating column to [0, 1].
      - Builds a user x item pivot table of mean ratings (missing pairs = 0).
      - Wraps the pivot in MemoryBasedCF, which computes cosine similarity.

    Parameters:
        df: DataFrame containing the three columns below. It is NOT modified;
            the function works on a copy of those columns.
        user_col: name of the user identifier column (cast to str).
        item_col: name of the item column (cast to str).
        rating_col: name of the numeric rating column.

    Returns:
        (cf_model, df_pivot): the MemoryBasedCF instance and the pivot table
        it was built from.
    """
    # Copy only what is needed so the caller's frame keeps its original
    # dtypes and un-normalized ratings.
    ratings = df[[user_col, item_col, rating_col]].copy()
    ratings[user_col] = ratings[user_col].astype(str)
    ratings[item_col] = ratings[item_col].astype(str)

    # Normalize the rating column to a [0, 1] scale.
    rating_min = ratings[rating_col].min()
    rating_span = ratings[rating_col].max() - rating_min
    if rating_span > 0:
        ratings[rating_col] = (ratings[rating_col] - rating_min) / rating_span
    else:
        # Constant column: min-max scaling would be 0/0 and turn every rating
        # into NaN (leaving an empty pivot). Map non-zero ratings to 1.0 and
        # zero ratings to 0.0 instead, keeping missing values missing.
        ratings[rating_col] = (
            (ratings[rating_col] != 0).astype(float).where(ratings[rating_col].notna())
        )

    df_pivot = ratings.pivot_table(index=user_col, columns=item_col, values=rating_col, aggfunc='mean', fill_value=0)
    cf_model = MemoryBasedCF(df_pivot)
    print("✅ Memory-Based Collaborative Filtering model trained.")
    return cf_model, df_pivot

# --------------------------
# 4. EVALUATION METRICS
# --------------------------
def evaluate_cf_model(cf_model, df_pivot):
    """
    Reports MAE and RMSE between the pivot-table ratings and the model output.

    For every user (row) of `df_pivot`, the stored ratings are compared
    element-wise with `cf_model.predict(user)`; all users are pooled into a
    single pair of flat sequences before the two metrics are computed.

    Returns:
        (mae, rmse)
    """
    users = list(df_pivot.index)

    # Flatten user by user so both sequences share the same ordering.
    actual_ratings = [value for user in users for value in df_pivot.loc[user].values]
    predicted_ratings = [value for user in users for value in cf_model.predict(user)]

    mae = mean_absolute_error(actual_ratings, predicted_ratings)
    rmse = np.sqrt(mean_squared_error(actual_ratings, predicted_ratings))

    for line in (
        f"📊 Collaborative Filtering Evaluation Metrics:",
        f"Mean Absolute Error (MAE): {mae:.4f}",
        f"Root Mean Square Error (RMSE): {rmse:.4f}",
    ):
        print(line)

    return mae, rmse

# --------------------------
# 5. RUN TRAINING & EVALUATION
# --------------------------
cf_model, df_pivot = train_collaborative_filtering_memory(df, 'userName', 'univName', 'admit')

# Evaluate the CF model
mae, rmse = evaluate_cf_model(cf_model, df_pivot)

# Save the trained model.
# The hybrid-evaluation cell loads 'cf_model_memory.pkl' from
# "models/university_models", so write it there (creating the directory if
# needed) instead of into "models/", where it would never be found.
import os
cf_model_dir = "models/university_models"
os.makedirs(cf_model_dir, exist_ok=True)
joblib.dump(cf_model, os.path.join(cf_model_dir, "cf_model_memory.pkl"))

print("\n✅ Collaborative Filtering model saved successfully.")


✅ Memory-Based Collaborative Filtering model trained.
📊 Collaborative Filtering Evaluation Metrics:
Mean Absolute Error (MAE): 0.0381
Root Mean Square Error (RMSE): 0.1542

✅ Collaborative Filtering model saved successfully.


In [13]:
import os
import numpy as np
import pandas as pd
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report

# --------------------------
# 1. LOAD SAVED MODELS & PREPROCESSING OBJECTS
# --------------------------
save_dir = "models/university_models"

# All artifacts live in `save_dir`; load them in one pass. Each name on the
# left receives the file at the same position in the tuple on the right.
(
    models_univ,           # dict of trained content-based models
    cf_model,              # memory-based collaborative filtering model
    scaler_univ,           # preprocessing objects from here on
    label_encoders_univ,
    le_y_univ,
    one_hot_columns,
) = (
    joblib.load(os.path.join(save_dir, artifact_file))
    for artifact_file in (
        'rf_university.pkl',
        'cf_model_memory.pkl',
        'scaler_university.pkl',
        'label_encoders_university.pkl',
        'le_y_univ.pkl',
        "one_hot_columns_university.pkl",
    )
)

# The 'Random Forest' entry is a tuple; its first element is the fitted
# content-based model.
rf_model = models_univ['Random Forest'][0]

print("✅ All models and preprocessing objects loaded successfully.")

# Feature groups used during training.
univ_categorical = ['ugCollege', 'specialization_category']
univ_numerical   = ['toeflScore', 'greV', 'greQ', 'greA', 'normalized_cgpa']

# --------------------------
# 2. HYBRID RECOMMENDER FUNCTION
# --------------------------
def hybrid_university_recommendation(input_df, content_model, collab_model, label_encoder, one_hot_columns, top_n=3):
    """
    Generates university recommendations by combining:
      - Content-based predictions (class probabilities from `content_model`).
      - Collaborative filtering predictions (item scores from `collab_model`).

    Parameters:
        input_df: single-row DataFrame of features; may contain a 'userName'
            column, which is used for CF only and dropped for the classifier.
        content_model: classifier exposing `predict_proba` (or, failing that,
            `predict`).
        collab_model: object exposing `predict(user)`; if it also exposes
            `item_ids`, scores are re-ordered to match the label encoder.
        label_encoder: encoder whose `inverse_transform` maps class indices
            back to university names.
        one_hot_columns: accepted for call compatibility; not used here.
        top_n: number of recommendations to return (default 3).

    Returns:
        The `top_n` highest-scoring universities (decoded), best first.
    """
    # Remove "userName" column if present (content model was trained without it)
    input_features = input_df.drop(columns=['userName'], errors='ignore')

    # --- Content-based predictions ---
    try:
        content_probs = content_model.predict_proba(input_features)[0]
    except AttributeError:
        # Only fall back when the model has no predict_proba; any other
        # failure (e.g. wrong feature set) should surface, not be masked.
        content_probs = content_model.predict(input_features)

    # --- Collaborative filtering predictions ---
    # For CF, we use the userName if present; otherwise, use zeros.
    if 'userName' in input_df.columns:
        input_user = input_df.iloc[0]['userName']
        collab_probs = np.asarray(collab_model.predict(input_user), dtype=float)

        # CF scores are ordered by the model's item ids, classifier
        # probabilities by the encoder's classes. Re-index by name so the two
        # vectors describe the same university at every position; classes the
        # CF model has never seen get a score of 0.
        item_ids = getattr(collab_model, 'item_ids', None)
        classes = getattr(label_encoder, 'classes_', None)
        if item_ids is not None and classes is not None:
            score_by_item = {str(item): score for item, score in zip(item_ids, collab_probs)}
            collab_probs = np.array(
                [score_by_item.get(str(cls), 0.0) for cls in classes], dtype=float
            )
    else:
        collab_probs = np.zeros_like(content_probs)

    # --- Combine predictions using variance-based weighting ---
    # The source with the larger spread (more decisive scores) gets more weight.
    alpha = np.var(content_probs) / (np.var(content_probs) + np.var(collab_probs) + 1e-5)
    final_probs = (content_probs * alpha) + (collab_probs * (1 - alpha))

    # Best-first ordering, then cut; written this way so top_n=0 yields an
    # empty result instead of the whole list.
    top_indices = np.argsort(final_probs)[::-1][:top_n]
    recommendations = label_encoder.inverse_transform(top_indices)

    return recommendations

# --------------------------
# 3. EVALUATION FUNCTION FOR THE HYBRID MODEL
# --------------------------
def evaluate_hybrid_model(X_test, y_test, rf_model, cf_model, le_y, one_hot_columns):
    """
    Scores the hybrid recommender one test row at a time.

    For each row the top hybrid recommendation is taken as the predicted
    label (encoded with `le_y`) and compared with the encoded true label.
    Rows without any recommendation are excluded. Accuracy and a
    classification report are printed; nothing is returned.
    """
    y_test_encoded = le_y.transform(y_test)

    y_pred = []
    for row_pos in range(len(X_test)):
        row = X_test.iloc[[row_pos]].copy()
        # The CF part expects a "userName" column; supply a placeholder.
        if 'userName' not in row.columns:
            row['userName'] = "dummy_user"

        recs = hybrid_university_recommendation(row, rf_model, cf_model, le_y, one_hot_columns)
        # -1 marks "no recommendation" and is filtered out below.
        y_pred.append(le_y.transform([recs[0]])[0] if len(recs) > 0 else -1)

    # Keep only the (truth, prediction) pairs that have a real prediction.
    scored_pairs = [
        (truth, guess) for truth, guess in zip(y_test_encoded, y_pred) if guess != -1
    ]
    y_true_valid = np.array([truth for truth, _ in scored_pairs])
    y_pred_valid = np.array([guess for _, guess in scored_pairs])

    acc = accuracy_score(y_true_valid, y_pred_valid)
    print("\n📊 Hybrid Model Evaluation Results:")
    print(f"Hybrid Model Accuracy: {acc:.4f}")
    print(classification_report(y_true_valid, y_pred_valid))

# --------------------------
# 4. PREPROCESS TEST DATA
# --------------------------
df_test = pd.read_csv('./data/categorized_specializations.csv')

# Same class filter as in training: drop universities with fewer than
# `min_samples_per_class` rows.
min_samples_per_class = 50
valid_classes = df_test['univName'].value_counts().pipe(
    lambda counts: counts[counts >= min_samples_per_class].index
)
df_test = df_test.loc[df_test['univName'].isin(valid_classes)]

# --- Apply label encoding for categorical columns ---
def encode_categorical_columns(df, categorical_columns):
    """
    Label-encode the listed columns of `df` in place.

    Each column is cast to `str` and replaced by the integer codes of a
    newly fitted `LabelEncoder`.

    Returns:
        (df, encoders) with `encoders` mapping column name -> fitted encoder.
    """
    fitted_encoders = {}
    for column_name in categorical_columns:
        encoder = LabelEncoder()
        df[column_name] = df[column_name].astype(str)
        df[column_name] = encoder.fit_transform(df[column_name])
        fitted_encoders[column_name] = encoder
    return df, fitted_encoders

def scale_numerical_columns(df, numerical_columns):
    """
    Clean and standardize the listed numeric columns of `df` in place.

    Steps: +/-inf -> NaN, NaN -> column mean, then a newly fitted
    `StandardScaler` is applied.

    Returns:
        (df, scaler) with `scaler` being the fitted `StandardScaler`.
    """
    scaler = StandardScaler()
    finite_only = df[numerical_columns].replace([np.inf, -np.inf], np.nan)
    imputed = finite_only.fillna(finite_only.mean())
    df[numerical_columns] = scaler.fit_transform(imputed)
    return df, scaler

# Work on an independent copy so the column assignments below never write
# into a filtered view of the raw frame.
df_test = df_test.copy()

# Encode categoricals with the encoders loaded from disk rather than fitting
# new ones: the classifier only understands the integer codes it was trained
# with. `transform` raises on a category the saved encoder has never seen,
# which is preferable to silently assigning it a different code.
for col in univ_categorical:
    df_test[col] = label_encoders_univ[col].transform(df_test[col].astype(str))

# Same cleaning as at training time (+/-inf -> NaN -> column mean), then scale
# with the loaded scaler so features share the training mean and variance.
df_test[univ_numerical] = df_test[univ_numerical].replace([np.inf, -np.inf], np.nan)
df_test[univ_numerical] = df_test[univ_numerical].fillna(df_test[univ_numerical].mean())
df_test[univ_numerical] = scaler_univ.transform(df_test[univ_numerical])

# One-hot encode the "univ_state" column.
df_test = pd.get_dummies(df_test, columns=['univ_state'], drop_first=True)
# Ensure all one-hot columns from training exist in test data.
for col in one_hot_columns:
    if col not in df_test.columns:
        df_test[col] = 0
# Reorder columns to match training features.
X_test = df_test[univ_categorical + univ_numerical + one_hot_columns]
y_test = df_test['univName']

# --------------------------
# 5. EVALUATE THE HYBRID RECOMMENDER
# --------------------------
# NOTE(review): df_test is the full filtered CSV, not a held-out split, so the
# reported accuracy presumably includes rows the classifier was trained on -
# confirm before quoting these numbers.
evaluate_hybrid_model(X_test, y_test, rf_model, cf_model, le_y_univ, one_hot_columns)


✅ All models and preprocessing objects loaded successfully.

📊 Hybrid Model Evaluation Results:
Hybrid Model Accuracy: 0.8558
              precision    recall  f1-score   support

           0       0.95      0.97      0.96      3727
           1       0.54      0.69      0.60        54
           2       0.87      0.86      0.87      1600
           3       1.00      1.00      1.00       933
           4       0.57      0.67      0.62       518
           5       0.65      0.69      0.67       685
           6       0.89      0.93      0.91       461
           7       1.00      1.00      1.00      2368
           8       0.85      0.84      0.85       228
           9       0.79      0.82      0.80       100
          10       0.93      0.96      0.94       545
          11       0.59      0.62      0.61       502
          12       0.94      0.95      0.94      4149
          13       0.93      0.93      0.93      2242
          14       0.79      0.79      0.79       112
         

In [16]:
def evaluate_hybrid_model(X_test, y_test, rf_model, cf_model, le_y, one_hot_columns):
    """
    Scores the hybrid recommender row by row and shows sample predictions.

    The top hybrid recommendation for each test row is treated as the
    predicted label and compared with the true label. Rows that produced no
    recommendation are left out of the metrics. Prints accuracy, a
    classification report, and the first 20 (actual, predicted) pairs.
    """
    encoded_truth = le_y.transform(y_test)
    predicted_codes = []
    predictions_vs_actuals = []

    for pos in range(len(X_test)):
        row = X_test.iloc[[pos]].copy()
        # The CF part expects a "userName" column; supply a placeholder.
        if 'userName' not in row.columns:
            row['userName'] = "dummy_user"

        recs = hybrid_university_recommendation(row, rf_model, cf_model, le_y, one_hot_columns)

        # Top recommendation is the prediction; -1 / "None" mark "no result".
        has_rec = len(recs) > 0
        predicted_codes.append(le_y.transform([recs[0]])[0] if has_rec else -1)
        predicted_name = recs[0] if has_rec else "None"

        predictions_vs_actuals.append((y_test.iloc[pos], predicted_name))

    # Drop rows flagged with the -1 sentinel before computing metrics.
    scored = [
        (encoded_truth[pos], code)
        for pos, code in enumerate(predicted_codes)
        if code != -1
    ]
    y_true_valid = np.array([truth for truth, _ in scored])
    y_pred_valid = np.array([code for _, code in scored])

    acc = accuracy_score(y_true_valid, y_pred_valid)
    print("\n📊 Hybrid Model Evaluation Results:")
    print(f"Hybrid Model Accuracy: {acc:.4f}")
    print(classification_report(y_true_valid, y_pred_valid))

    # Side-by-side sample, limited to the first 20 rows for readability.
    print("\n🔍 Predictions vs Actual Values:")
    print("Actual University  |  Predicted University")
    print("-" * 50)
    for actual, predicted in predictions_vs_actuals[:20]:
        print(f"{actual:25} | {predicted}")


In [None]:
import os
import pandas as pd
import numpy as np
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE  
from sklearn.metrics.pairwise import cosine_similarity
import warnings

warnings.filterwarnings("ignore")

############################################
# Collaborative Filtering Functions
############################################

def compute_similarity(data):
    """
    Build the user-user cosine similarity table.
    :param data: Pandas DataFrame with one row per user and one column per university.
    :return: Square DataFrame whose rows and columns are both labelled by `data.index`.
    """
    user_labels = data.index
    pairwise = cosine_similarity(data)
    return pd.DataFrame(pairwise, index=user_labels, columns=user_labels)

def predict_major(user_id, data, similarity_matrix, top_n=1, n_neighbors=5):
    """
    Recommend universities for a user from the choices of their most similar users.
    :param user_id: Label of the user; must be present in `similarity_matrix`
                    (a KeyError is raised otherwise).
    :param data: User-university preference DataFrame (rows = users, columns = universities).
    :param similarity_matrix: Precomputed user-user similarity DataFrame.
    :param top_n: Number of top universities to recommend.
    :param n_neighbors: How many of the most similar users to aggregate over (default 5).
    :return: List of recommended universities, highest score first.
    """
    # Most similar users, excluding the user itself.
    similar_users = similarity_matrix[user_id].drop(user_id).nlargest(n_neighbors)

    # Similarity-weighted sum of each neighbour's university selections.
    weighted_scores = data.loc[similar_users.index].T.dot(similar_users)

    # Highest weighted scores win.
    return weighted_scores.nlargest(top_n).index.tolist()

def build_collaborative_filtering_model(df, user_col, univ_col):
    """
    Assemble the two tables used for collaborative filtering.
    The user-university table counts how often each user appears with each university.
    :param df: DataFrame with user and university info.
    :param user_col: Column name for user identifier.
    :param univ_col: Column name for university name.
    :return: similarity_matrix, user_univ_matrix
    """
    # rows = users, columns = universities, values = co-occurrence counts
    selection_counts = pd.crosstab(index=df[user_col], columns=df[univ_col])
    return compute_similarity(selection_counts), selection_counts

############################################
# Content-Based Preprocessing and Model Training
############################################

def encode_categorical_columns(df, categorical_columns):
    """
    Replace each listed column of `df` (in place) with integer label codes.

    A separate `LabelEncoder` is fitted per column on the column's values
    cast to `str`.

    Returns:
        (df, label_encoders) where label_encoders[column] is the fitted encoder.
    """
    encoders_by_column = {}
    for name in categorical_columns:
        column_encoder = LabelEncoder()
        df[name] = df[name].astype(str)
        df[name] = column_encoder.fit_transform(df[name])
        encoders_by_column[name] = column_encoder
    return df, encoders_by_column

def scale_numerical_columns(df, numerical_columns):
    """
    Standardize the listed numeric columns of `df` in place.

    Infinite entries are treated as missing, missing entries are filled with
    the column mean, and the result is passed through a newly fitted
    `StandardScaler`.

    Returns:
        (df, scaler) where `scaler` is the fitted `StandardScaler`.
    """
    scaler = StandardScaler()
    numeric_part = df[numerical_columns].replace([np.inf, -np.inf], np.nan)
    numeric_part = numeric_part.fillna(numeric_part.mean())
    df[numerical_columns] = scaler.fit_transform(numeric_part)
    return df, scaler



############################################
# Hybrid Recommendation Function
############################################

def hybrid_university_recommendation(user_id, input_df, content_model, similarity_matrix, user_univ_matrix, label_encoder,
                                     is_university=False, state=None, one_hot_columns=None):
    """
    Blend content-based and collaborative-filtering scores into top-3 university picks.

    :param user_id: Label of the user in `similarity_matrix` / `user_univ_matrix`.
    :param input_df: Single-row feature DataFrame for the content model.
    :param content_model: Classifier exposing `predict_proba` (or `predict`).
    :param similarity_matrix: User-user similarity DataFrame.
    :param user_univ_matrix: User-university preference DataFrame.
    :param label_encoder: Encoder mapping class indices to university names.
    :param is_university: Only True is implemented; otherwise None is returned.
    :param state: Optional state name used to filter the final recommendations.
                  Filtering reads the notebook-level DataFrame `df`.
    :param one_hot_columns: State indicator column names. When `state` is None
                  and this is non-empty, content probabilities are averaged
                  over each state indicator being switched on in turn.
    :return: Recommended universities (array, or list when state-filtered).
    """
    if not is_university:
        # No other recommendation type is implemented here.
        return None

    # --- Content-Based Prediction ---
    try:
        # An empty column list is treated like "no columns": averaging over
        # zero predictions would yield NaN probabilities.
        if state is None and one_hot_columns is not None and len(one_hot_columns) > 0:
            probs_list = []
            for state_col in one_hot_columns:
                temp_df = input_df.copy()
                temp_df[one_hot_columns] = 0  # reset all state one-hot columns
                temp_df[state_col] = 1         # set current state
                try:
                    probs = content_model.predict_proba(temp_df)[0]
                except Exception:
                    probs = content_model.predict(temp_df)
                probs_list.append(probs)
            content_probs = np.mean(probs_list, axis=0)
        else:
            try:
                content_probs = content_model.predict_proba(input_df)[0]
            except Exception:
                content_probs = content_model.predict(input_df)
    except Exception as e:
        print("Error in content model prediction:", e)
        content_probs = np.zeros(len(label_encoder.classes_))

    # --- Collaborative Filtering Prediction ---
    try:
        collab_recs = predict_major(user_id, user_univ_matrix, similarity_matrix, top_n=3)
        collab_probs = np.zeros_like(content_probs)
        # Each CF recommendation casts one vote for its class index.
        for rec in collab_recs:
            rec_index = np.where(label_encoder.classes_ == rec)[0]
            if len(rec_index) > 0:
                collab_probs[rec_index] += 1
        collab_probs = collab_probs / (len(collab_recs) if len(collab_recs) > 0 else 1)
    except Exception as e:
        print("Error in collaborative filtering:", e)
        collab_probs = np.zeros_like(content_probs)

    # --- Combine the Two Predictions ---
    # The source with the larger spread of scores receives the larger weight.
    alpha = np.var(content_probs) / (np.var(content_probs) + np.var(collab_probs) + 1e-5)
    final_probs = (content_probs * alpha) + (collab_probs * (1 - alpha))

    top_n = 3
    top_indices = np.argsort(final_probs)[-top_n:][::-1]
    final_recommendations = label_encoder.inverse_transform(top_indices)

    if state is not None:
        # The raw 'univ_state' column no longer exists once the frame has been
        # one-hot encoded, so look for either representation.
        one_hot_name = f"univ_state_{state}"
        if 'univ_state' in df.columns:
            universities_in_state = set(df.loc[df['univ_state'] == state, 'univName'])
        elif one_hot_name in df.columns:
            universities_in_state = set(df.loc[df[one_hot_name] == 1, 'univName'])
        else:
            # NOTE(review): with drop_first=True the first state has no
            # indicator column, so it cannot be told apart from an unknown
            # state here - confirm how that case should be handled.
            print(f"No state column found for {state}. Returning unfiltered recommendations.")
            return final_recommendations

        filtered_recommendations = [rec for rec in final_recommendations
                                    if rec in universities_in_state]
        if len(filtered_recommendations) < top_n:
            print(f"Only {len(filtered_recommendations)} universities found in {state}. Expanding recommendations.")
            filtered_recommendations = final_recommendations
        return filtered_recommendations

    return final_recommendations

############################################
# Main Section
############################################

if __name__ == "__main__":
    # --- Load Dataset ---
    df = pd.read_csv('./data/categorized_specializations.csv')
    
    # Filter to keep only university names with at least 50 samples
    min_samples_per_class = 50
    valid_classes = df['univName'].value_counts()
    valid_classes = valid_classes[valid_classes >= min_samples_per_class].index
    df = df[df['univName'].isin(valid_classes)]
    
    # One-hot encode the "univ_state" column (the raw column is replaced by
    # the "univ_state_*" indicators collected below)
    df = pd.get_dummies(df, columns=['univ_state'], drop_first=True)
    one_hot_columns = [col for col in df.columns if col.startswith("univ_state_")]
    
    # --- Prepare Data for Content-Based Model ---
    univ_categorical = ['ugCollege', 'specialization_category']
    univ_numerical = ['toeflScore', 'greV', 'greQ', 'greA', 'normalized_cgpa']
    
    # Both helpers fit on the full filtered frame, i.e. before the split below.
    df, label_encoders_univ = encode_categorical_columns(df, univ_categorical)
    df, scaler_univ = scale_numerical_columns(df, univ_numerical)
    
    # Build the feature set (include one-hot state columns)
    X_univ = df[univ_categorical + univ_numerical + one_hot_columns]
    y_univ = df['univName']
    
    X_univ_train, X_univ_test, y_univ_train, y_univ_test = train_test_split(
        X_univ, y_univ, test_size=0.2, random_state=42)
    
    # Apply SMOTE on the training set
    smote = SMOTE(random_state=42)
    X_univ_train_resampled, y_univ_train_resampled = smote.fit_resample(X_univ_train, y_univ_train)
    
    # --- Load or Train Content-Based Model ---
    save_dir = "models/university_models"
    os.makedirs(save_dir, exist_ok=True)
    
    try:
        models_univ = joblib.load(os.path.join(save_dir, 'rf_university.pkl'))
        # NOTE(review): on a successful load these two replace the scaler and
        # encoders fitted above, although `df` was already transformed with
        # the fresh ones - confirm the saved objects match the current data.
        scaler_univ = joblib.load(os.path.join(save_dir, 'scaler_university.pkl'))
        label_encoders_univ = joblib.load(os.path.join(save_dir, 'label_encoders_university.pkl'))
        le_y_univ = joblib.load(os.path.join(save_dir, 'le_y_univ.pkl'))
        print("Loaded saved content-based models.")
    except FileNotFoundError:
        # Retrain only when an artifact is genuinely missing. Any other load
        # failure (corrupt or incompatible pickle) is raised instead of being
        # reported as "not found" and silently overwritten.
        print("Saved models not found. Training content-based model.")
        # NOTE(review): `train_models` is not defined in the visible cells -
        # presumably it comes from another cell or module; confirm it is in
        # scope before relying on this branch.
        models_univ, le_y_univ = train_models(X_univ_train_resampled, X_univ_test, y_univ_train_resampled, y_univ_test, 'University')
        joblib.dump(models_univ, os.path.join(save_dir, 'rf_university.pkl'))
        joblib.dump(scaler_univ, os.path.join(save_dir, 'scaler_university.pkl'))
        joblib.dump(label_encoders_univ, os.path.join(save_dir, 'label_encoders_university.pkl'))
        joblib.dump(le_y_univ, os.path.join(save_dir, 'le_y_univ.pkl'))
    
    # The 'Random Forest' entry is a 2-tuple; only its first element is used.
    content_model, _ = models_univ['Random Forest']
    
    # --- Build Collaborative Filtering Model ---
    # Here we build a pivot table based on user selections of "univName"
    similarity_matrix, user_univ_matrix = build_collaborative_filtering_model(df, 'userName', 'univName')
    
    # --- Example Hybrid Recommendation ---
    # Pick a sample user from the collaborative filtering pivot table
    sample_user = user_univ_matrix.index[0]
    # For content-based input, we select one row from the test set
    sample_input = X_univ_test.iloc[[0]]  # keep as a DataFrame
    
    recommendations = hybrid_university_recommendation(
        user_id=sample_user,
        input_df=sample_input,
        content_model=content_model,
        similarity_matrix=similarity_matrix,
        user_univ_matrix=user_univ_matrix,
        label_encoder=le_y_univ,
        is_university=True,
        state=None,
        one_hot_columns=one_hot_columns
    )
    
    print(f"\nRecommended Universities for user '{sample_user}':", recommendations)
    
    # Persist the state indicator column names; the evaluation cell loads
    # this file to rebuild the same feature layout.
    joblib.dump(one_hot_columns, os.path.join(save_dir, "one_hot_columns_university.pkl"))
