In [64]:

# to do all the loss calculations, since automatic gradients are needed
import numpy as np

# Use helper packages
from AbstractBaseCollabFilterSGD import AbstractBaseCollabFilterSGD
from train_valid_test_loader import load_train_valid_test_datasets

from surprise import SVD
from surprise import Dataset, Reader, accuracy
from surprise.model_selection import cross_validate

# Some packages you might need (uncomment as necessary)
import pandas as pd
import matplotlib.pyplot as plt
print("hi")

hi


In [65]:
## Load the entire dev set in surprise's format
reader = Reader(
    line_format='user item rating', sep=',',
    rating_scale=(1, 5), skip_lines=1)

train_set = Dataset.load_from_file(
    '../data_movie_lens_100k/ratings_all_development_set.csv', reader=reader)

train_set = train_set.build_full_trainset()

# Use the SVD algorithm
n_factors = 50
## Fit model like our M3
model = SVD(n_factors=n_factors)
model.fit(train_set)

print("global mean:")
print(model.trainset.global_mean)
print("shape of bias_per_item: ")
print(model.bi.shape)
print("shape of bias_per_user: ")
print(model.bu.shape)
print("shape of U (per user vectors): ")
print(model.pu.shape) # pu is the user vector (ui)
print("shape of V (per item vectors): ")
print(model.qi.shape) # qi is the item vector (uv)


# 
user_info_df = pd.read_csv('../data_movie_lens_100k/user_info.csv')
movie_info_df = pd.read_csv('../data_movie_lens_100k/movie_info.csv')
movie_info_df = movie_info_df.drop('title', axis=1)

print("user info: ", user_info_df.shape)
print("movie shape: ", movie_info_df.shape)

# compare shapes

rated_movies = train_set.all_items()
rated_movie_ids = [int(train_set.to_raw_iid(i)) for i in rated_movies]  # Convert to raw ids

# Keep only movies in movie_info_df that have ratings
clean_movie_info_df = movie_info_df[movie_info_df['item_id'].isin(rated_movie_ids)]
print(f"Movies with ratings retained: {clean_movie_info_df.shape}")


# rename it so it's usable
users_U = model.pu
movies_V = model.qi
user_features = user_info_df
movie_features = clean_movie_info_df

ratings_y = train_set.all_ratings


global mean:
3.529480398257623
shape of bias_per_item: 
(1662,)
shape of bias_per_user: 
(943,)
shape of U (per user vectors): 
(943, 10)
shape of V (per item vectors): 
(1662, 10)
user info:  (943, 4)
movie shape:  (1681, 3)
Movies with ratings retained: (1662, 3)


In [60]:
# pipeline:

# 
# collapse U, V, user_feature, item_feature into one matrix to train

# put U and user_features together:
# Prepare user features
user_ids = list(range(0, 942))  # list of all user inner ids
user_features_list = []
for inner_id in user_ids:
    user_vector = users_U[inner_id]
    user_id = model.trainset.to_raw_uid(inner_id)
    # drop user IDs and original user IDs that aren't needed
    user_meta = user_info_df.loc[user_info_df['user_id'] == int(user_id)].drop(['user_id', 'orig_user_id'], axis=1)
    if not user_meta.empty:
        combined_features = np.concatenate([user_vector, user_meta.iloc[0].values])
        user_features_list.append(combined_features)

# Convert to DataFrame
user_features_df = pd.DataFrame(user_features_list, columns=[f'feature_{i}' for i in range(len(user_features_list[0]))])

# put V and item_features together:
# Prepare item features
item_ids = list(range(model.trainset.n_items))  # list of all item inner ids
item_features_list = []
for inner_id in item_ids:
    item_vector = model.qi[inner_id]
    item_id = model.trainset.to_raw_iid(inner_id)
    item_meta = movie_features.loc[movie_features['item_id'] == int(item_id)].drop(['item_id'], axis=1)
    if not item_meta.empty:
        combined_features = np.concatenate([item_vector, item_meta.iloc[0].values])
        item_features_list.append(combined_features)

# Convert to DataFrame
item_features_df = pd.DataFrame(item_features_list, columns=[f'feature_{i}' for i in range(len(item_features_list[0]))])



# after that for predcitions:

# on prediction:
    # take in user_id, item_id
    # get U and V from SVG algorithm 
    # get corresponding features of users/items
# condense the features and put into classifier for prediction

# get output of at least 4.5



In [61]:
# Generate user-item pairs with known ratings from your ratings dataset
ratings_df = pd.read_csv('../data_movie_lens_100k/ratings_all_development_set.csv')

# Merge user and item features into the ratings DataFrame
ratings_df = ratings_df.merge(user_features_df, left_on='user_id', right_index=True, how='left')
ratings_df = ratings_df.merge(item_features_df, left_on='item_id', right_index=True, how='left')

print(ratings_df[0:5])

# Finding rows where any column has NaN

# Substitute NaN values with 0 in the entire DataFrame
ratings_df.fillna(0, inplace=True)

ratings_df.fillna(0, inplace=True)
cleaned_ratings = ratings_df
# Optionally, check the first few rows to confirm the substitution
print(ratings_df.head(5))

   user_id  item_id  rating  feature_0_x  feature_1_x  feature_2_x  \
0      772       36       3     0.198406     0.087267     0.263516   
1      471      228       5     0.033688    -0.252558     0.047557   
2      641      401       4    -0.250974     0.209431    -0.011736   
3      312       98       4     0.211324    -0.025986    -0.082683   
4       58      504       5     0.162518    -0.089530     0.010671   

   feature_3_x  feature_4_x  feature_5_x  feature_6_x  ...  feature_2_y  \
0     0.131625     0.009342    -0.024939    -0.203090  ...    -0.003710   
1    -0.111557     0.042743     0.105481     0.098320  ...    -0.091628   
2    -0.070156     0.300620    -0.083992     0.137875  ...     0.068582   
3     0.069485     0.249209    -0.079235     0.016633  ...     0.040237   
4    -0.125048     0.130092    -0.067862     0.023112  ...     0.123669   

   feature_3_y  feature_4_y  feature_5_y  feature_6_y  feature_7_y  \
0     0.029526     0.022848    -0.108267     0.107757    -

In [62]:
# process yo data here
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold

feature_cols = [col for col in ratings_df.columns if col.startswith('feature')]

# Prepare data
X = cleaned_ratings[feature_cols]  # all your feature columns
y = cleaned_ratings['rating']      # or a binary column if you're classifying

y = (cleaned_ratings['rating'] >= 4.5).astype(int)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)


# Create a scaler object
scaler = StandardScaler()

# Fit on training data and transform both training and testing data
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

X = scaler.fit_transform(X)


Neural Network

In [33]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.metrics import AUC
from tensorflow.keras.regularizers import l1, l2, l1_l2
from sklearn.utils.class_weight import compute_class_weight, compute_sample_weight
from sklearn.metrics import balanced_accuracy_score


input_shape = X.shape[1] 

def create_model(input_shape):
    # Define a neural network model
    model = tf.keras.Sequential([
        Input((input_shape,)),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])
    

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[AUC(curve='ROC', name='auroc')])
    return model

model = create_model(input_shape)
# Calculate class weights
class_weights = compute_class_weight('balanced', classes=np.unique(y), y=y)
print("Class weights dictionary:", class_weights)
class_weight_dict = dict(enumerate(class_weights))


auc_scores = []
acc_scores = []
train_acc_scores = []
fold_no = 1
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # Create a new model (important to avoid leakage from previous iterations)
    model = create_model(X.shape[1])
    sample_weights = compute_sample_weight(class_weight=class_weight_dict, y=y_train)
    # Train model
    print(f'Training for fold {fold_no} ...')
    model.fit(X_train, y_train, epochs=10, batch_size=32, verbose=1, sample_weight=sample_weights)

    # calc balanced accuracy
    y_pred_probs = model.predict(X_test)
    y_train_pred_probs = model.predict(X_train)
    y_pred = (y_pred_probs > 0.5).astype(int)
    y_pred_train = (y_train_pred_probs > 0.5).astype(int)

    balanced_accuracy = balanced_accuracy_score(y_test, y_pred)
    train_balanced_accuracy = balanced_accuracy_score(y_train, y_pred_train)
    acc_scores.append(balanced_accuracy)
    train_acc_scores.append(train_balanced_accuracy)
    # Evaluate model
    scores = model.evaluate(X_test, y_test, verbose=0)
    print(f'Score for fold {fold_no}: {model.metrics_names[1]} of {scores[1]*100}%')
    auc_scores.append(scores[1])
    fold_no += 1

print(np.mean(auc_scores))
print(np.mean(acc_scores))
print(np.mean(train_acc_scores))

Class weights dictionary: [0.63437192 2.36050782]
Training for fold 1 ...
Epoch 1/10
[1m2250/2250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 444us/step - auroc: 0.5377 - loss: 0.6957
Epoch 2/10
[1m2250/2250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 417us/step - auroc: 0.5569 - loss: 0.6889
Epoch 3/10
[1m2250/2250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 418us/step - auroc: 0.5657 - loss: 0.6862
Epoch 4/10
[1m2250/2250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 470us/step - auroc: 0.5653 - loss: 0.6865
Epoch 5/10
[1m2250/2250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 442us/step - auroc: 0.5682 - loss: 0.6872
Epoch 6/10
[1m2250/2250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 466us/step - auroc: 0.5681 - loss: 0.6815
Epoch 7/10
[1m2250/2250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 459us/step - auroc: 0.5754 - loss: 0.6880
Epoch 8/10
[1m2250/2250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 

In [11]:
print(np.mean(train_acc_scores))

  arr = asanyarray(a)


ValueError: operands could not be broadcast together with shapes (71993,1) (71994,1) 

In [20]:
# neural network prediction


# pass in item_id, user_id into cool matrix thing
# Generate user-item pairs with known ratings from your ratings dataset
test_ratings_df = pd.read_csv('../data_movie_lens_100k/ratings_masked_leaderboard_set.csv')

# Merge user and item features into the ratings DataFrame
test_ratings_df = test_ratings_df.merge(user_features_df, left_on='user_id', right_index=True, how='left')
test_ratings_df = test_ratings_df.merge(item_features_df, left_on='item_id', right_index=True, how='left')

# Finding rows where any column has NaN
# cleaned_ratings = ratings_df.dropna()

X_test_true = test_ratings_df[feature_cols]
X_test_true = X_test_true.fillna(0)
print(X_test_true.shape)
y_test_true = test_ratings_df['rating']
X_test_true_scaled = scaler.transform(X_test_true)

# Predict on the test data
y_pred = model.predict(X_test_true_scaled)

# Convert y_pred to DataFrame
pred_df = pd.DataFrame(y_pred, columns=['prediction'])

# Write to a text file without header and index
pred_df.to_csv('predictions.txt', header=False, index=False)


(10000, 104)
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 321us/step


KNN Regressor

In [63]:

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_auc_score


# Initialize the KNeighborsRegressor
knn = KNeighborsClassifier(n_neighbors=5)



for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    

    knn.fit(X_train, y_train)

    # Predict on the test data
    y_pred = knn.predict(X_test)

    # Calculate the RMSE
    rmse = np.mean(np.square(y_pred - y_test))

    print(f'Root Mean Squared Error: {rmse}')
    # Calculate AUC
    auc = roc_auc_score(y_test, y_pred)
    print("AUC: {:.2f}".format(auc))

Root Mean Squared Error: 0.23762431246180343
AUC: 0.55
Root Mean Squared Error: 0.23412411800655591
AUC: 0.56
Root Mean Squared Error: 0.24091565729525502
AUC: 0.55
Root Mean Squared Error: 0.22796977441937993
AUC: 0.56
Root Mean Squared Error: 0.23385931770196688
AUC: 0.55


In [54]:
# make prediction 

# pass in item_id, user_id into cool matrix thing
# Generate user-item pairs with known ratings from your ratings dataset
test_ratings_df = pd.read_csv('../data_movie_lens_100k/ratings_masked_leaderboard_set.csv')

# Merge user and item features into the ratings DataFrame
test_ratings_df = test_ratings_df.merge(user_features_df, left_on='user_id', right_index=True, how='left')
test_ratings_df = test_ratings_df.merge(item_features_df, left_on='item_id', right_index=True, how='left')

# Finding rows where any column has NaN
# cleaned_ratings = ratings_df.dropna()

X_test_true = test_ratings_df[feature_cols]
X_test_true = X_test_true.fillna(0)
print(X_test_true.shape)
y_test_true = test_ratings_df['rating']
print(y_test_true.shape)
X_test_true_scaled = scaler.transform(X_test_true)

# Predict on the test data
# Obtain probabilities for the positive class

# Predict on the test data
y_pred = knn.predict_proba(X_test_true_scaled)[:, 1]

# Convert y_pred to DataFrame
pred_df = pd.DataFrame(y_pred, columns=['prediction'])

# Write to a text file without header and index
pred_df.to_csv('predictions_knn.txt', header=False, index=False)



(10000, 8)
(10000,)


ValueError: Input y_true contains NaN.