In [30]:

# to do all the loss calculations, since automatic gradients are needed
import numpy as np

# Use helper packages
from AbstractBaseCollabFilterSGD import AbstractBaseCollabFilterSGD
from train_valid_test_loader import load_train_valid_test_datasets

from surprise import SVD
from surprise import Dataset, Reader, accuracy
from surprise.model_selection import cross_validate

# Some packages you might need (uncomment as necessary)
import pandas as pd
import matplotlib.pyplot as plt
print("hi")

hi


In [31]:
## Load the entire dev set in surprise's format
reader = Reader(
    line_format='user item rating', sep=',',
    rating_scale=(1, 5), skip_lines=1)

train_set = Dataset.load_from_file(
    '../data_movie_lens_100k/ratings_all_development_set.csv', reader=reader)

train_set = train_set.build_full_trainset()

# Use the SVD algorithm
n_factors = 50
## Fit model like our M3
model = SVD(n_factors=n_factors)
model.fit(train_set)

print("global mean:")
print(model.trainset.global_mean)
print("shape of bias_per_item: ")
print(model.bi.shape)
print("shape of bias_per_user: ")
print(model.bu.shape)
print("shape of U (per user vectors): ")
print(model.pu.shape) # pu is the user vector (ui)
print("shape of V (per item vectors): ")
print(model.qi.shape) # qi is the item vector (uv)


# 
user_info_df = pd.read_csv('../data_movie_lens_100k/user_info.csv')
movie_info_df = pd.read_csv('../data_movie_lens_100k/movie_info.csv')
movie_info_df = movie_info_df.drop('title', axis=1)

print("user info: ", user_info_df.shape)
print("movie shape: ", movie_info_df.shape)

# compare shapes

rated_movies = train_set.all_items()
rated_movie_ids = [int(train_set.to_raw_iid(i)) for i in rated_movies]  # Convert to raw ids

# Keep only movies in movie_info_df that have ratings
clean_movie_info_df = movie_info_df[movie_info_df['item_id'].isin(rated_movie_ids)]
print(f"Movies with ratings retained: {clean_movie_info_df.shape}")


# rename it so it's usable
users_U = model.pu
movies_V = model.qi
user_features = user_info_df
movie_features = clean_movie_info_df

ratings_y = train_set.all_ratings


global mean:
3.529480398257623
shape of bias_per_item: 
(1662,)
shape of bias_per_user: 
(943,)
shape of U (per user vectors): 
(943, 50)
shape of V (per item vectors): 
(1662, 50)
user info:  (943, 4)
movie shape:  (1681, 3)
Movies with ratings retained: (1662, 3)


In [32]:
# pipeline:

# 
# collapse U, V, user_feature, item_feature into one matrix to train

# put U and user_features together:
# Prepare user features
user_ids = list(range(0, 942))  # list of all user inner ids
user_features_list = []
for inner_id in user_ids:
    user_vector = users_U[inner_id]
    user_id = model.trainset.to_raw_uid(inner_id)
    # drop user IDs and original user IDs that aren't needed
    user_meta = user_info_df.loc[user_info_df['user_id'] == int(user_id)].drop(['user_id', 'orig_user_id'], axis=1)
    if not user_meta.empty:
        combined_features = np.concatenate([user_vector, user_meta.iloc[0].values])
        user_features_list.append(combined_features)

# Convert to DataFrame
user_features_df = pd.DataFrame(user_features_list, columns=[f'feature_{i}' for i in range(len(user_features_list[0]))])

# put V and item_features together:
# Prepare item features
item_ids = list(range(model.trainset.n_items))  # list of all item inner ids
item_features_list = []
for inner_id in item_ids:
    item_vector = model.qi[inner_id]
    item_id = model.trainset.to_raw_iid(inner_id)
    item_meta = movie_features.loc[movie_features['item_id'] == int(item_id)].drop(['item_id'], axis=1)
    if not item_meta.empty:
        combined_features = np.concatenate([item_vector, item_meta.iloc[0].values])
        item_features_list.append(combined_features)

# Convert to DataFrame
item_features_df = pd.DataFrame(item_features_list, columns=[f'feature_{i}' for i in range(len(item_features_list[0]))])



# after that for predcitions:

# on prediction:
    # take in user_id, item_id
    # get U and V from SVG algorithm 
    # get corresponding features of users/items
# condense the features and put into classifier for prediction

# get output of at least 4.5



In [33]:
# Generate user-item pairs with known ratings from your ratings dataset
ratings_df = pd.read_csv('../data_movie_lens_100k/ratings_all_development_set.csv')

# Merge user and item features into the ratings DataFrame
ratings_df = ratings_df.merge(user_features_df, left_on='user_id', right_index=True, how='left')
ratings_df = ratings_df.merge(item_features_df, left_on='item_id', right_index=True, how='left')

print(ratings_df[0:5])

# Finding rows where any column has NaN

# Substitute NaN values with 0 in the entire DataFrame
ratings_df.fillna(0, inplace=True)

ratings_df.fillna(0, inplace=True)
cleaned_ratings = ratings_df
# Optionally, check the first few rows to confirm the substitution
print(ratings_df.head(5))

   user_id  item_id  rating  feature_0_x  feature_1_x  feature_2_x  \
0      772       36       3    -0.033616     0.095223    -0.118229   
1      471      228       5     0.205900     0.115125    -0.096749   
2      641      401       4    -0.004301    -0.125513    -0.183563   
3      312       98       4     0.253024    -0.163443    -0.000422   
4       58      504       5    -0.043214     0.105361    -0.040030   

   feature_3_x  feature_4_x  feature_5_x  feature_6_x  ...  feature_42_y  \
0     0.178290     0.005156    -0.039568     0.057018  ...      0.001816   
1     0.202390    -0.280768     0.084766    -0.022707  ...      0.105970   
2    -0.130856     0.048324     0.044227    -0.014226  ...     -0.144838   
3     0.060634     0.022261    -0.125041    -0.028649  ...      0.100479   
4    -0.078184     0.088726     0.019134     0.027954  ...     -0.172414   

   feature_43_y  feature_44_y  feature_45_y  feature_46_y  feature_47_y  \
0     -0.064778     -0.251904     -0.003687    

In [34]:
# process yo data here
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold

feature_cols = [col for col in ratings_df.columns if col.startswith('feature')]

# Prepare data
X = cleaned_ratings[feature_cols]  # all your feature columns
y = cleaned_ratings['rating']      # or a binary column if you're classifying

y = (cleaned_ratings['rating'] >= 4.5).astype(int)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)


# Create a scaler object
scaler = StandardScaler()

# Fit on training data and transform both training and testing data
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

X = scaler.fit_transform(X)


Neural Network

In [43]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.metrics import AUC
from tensorflow.keras.regularizers import l1, l2, l1_l2

input_shape = X.shape[1]

def create_model(input_shape):
    # Define a neural network model
    model = tf.keras.Sequential([
        Input(shape=(input_shape,)),
        tf.keras.layers.Dense(128, activation='relu', input_shape=(X_train_scaled.shape[1],), kernel_regularizer=l2(0.00005)),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(64, activation='relu', kernel_regularizer=l2(1)),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

model = create_model(input_shape)
auc_scores = []
fold_no = 1
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # Create a new model (important to avoid leakage from previous iterations)
    model = create_model(X.shape[1])
    
    # Train model
    print(f'Training for fold {fold_no} ...')
    model.fit(X_train, y_train, epochs=10, batch_size=32, verbose=1)
    
    # Evaluate model
    scores = model.evaluate(X_test, y_test, verbose=0)
    print(f'Score for fold {fold_no}: {model.metrics_names[1]} of {scores[1]*100}%')
    auc_scores.append(scores[1])
    fold_no += 1

print(np.mean(auc_scores))

Training for fold 1 ...
Epoch 1/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m2250/2250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.7668 - loss: 9.2250
Epoch 2/10
[1m2250/2250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - accuracy: 0.7904 - loss: 0.5190
Epoch 3/10
[1m2250/2250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - accuracy: 0.7873 - loss: 0.5192
Epoch 4/10
[1m2250/2250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - accuracy: 0.7869 - loss: 0.5159
Epoch 5/10
[1m2250/2250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.7898 - loss: 0.5090
Epoch 6/10
[1m2250/2250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - accuracy: 0.7848 - loss: 0.5103
Epoch 7/10
[1m2250/2250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - accuracy: 0.7910 - loss: 0.5036
Epoch 8/10
[1m2250/2250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.7925 - loss: 0.5000
Epoch 9/10
[1m2250/2250[0m [32m━

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m2250/2250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.7884 - loss: 9.2148
Epoch 2/10
[1m2250/2250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.7892 - loss: 0.5229
Epoch 3/10
[1m2250/2250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - accuracy: 0.7908 - loss: 0.5171
Epoch 4/10
[1m2250/2250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - accuracy: 0.7881 - loss: 0.5172
Epoch 5/10
[1m2250/2250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - accuracy: 0.7884 - loss: 0.5115
Epoch 6/10
[1m2250/2250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - accuracy: 0.7916 - loss: 0.5063
Epoch 7/10
[1m2250/2250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2ms/step - accuracy: 0.7898 - loss: 0.5025
Epoch 8/10
[1m2250/2250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.7893 - loss: 0.5029
Epoch 9/10
[1m2250/2250[0m [32m━

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m2250/2250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 1ms/step - accuracy: 0.7870 - loss: 9.1690
Epoch 2/10
[1m2250/2250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.7901 - loss: 0.5214
Epoch 3/10
[1m2250/2250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - accuracy: 0.7894 - loss: 0.5195
Epoch 4/10
[1m2250/2250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - accuracy: 0.7903 - loss: 0.5152
Epoch 5/10
[1m2250/2250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.7909 - loss: 0.5102
Epoch 6/10
[1m2250/2250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - accuracy: 0.7909 - loss: 0.5075
Epoch 7/10
[1m2250/2250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - accuracy: 0.7905 - loss: 0.5063
Epoch 8/10
[1m2250/2250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.7883 - loss: 0.5054
Epoch 9/10
[1m2250/2250[0m [32m━

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m2250/2250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.7796 - loss: 9.3185
Epoch 2/10
[1m2250/2250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.7847 - loss: 0.5270
Epoch 3/10
[1m2250/2250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.7833 - loss: 0.5259
Epoch 4/10
[1m2250/2250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - accuracy: 0.7874 - loss: 0.5175
Epoch 5/10
[1m2250/2250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - accuracy: 0.7900 - loss: 0.5115
Epoch 6/10
[1m2250/2250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - accuracy: 0.7854 - loss: 0.5131
Epoch 7/10
[1m2250/2250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - accuracy: 0.7882 - loss: 0.5071
Epoch 8/10
[1m2250/2250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.7901 - loss: 0.5020
Epoch 9/10
[1m2250/2250[0m [32m━

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m2250/2250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 1ms/step - accuracy: 0.7479 - loss: 9.3425
Epoch 2/10
[1m2250/2250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.7873 - loss: 0.5237
Epoch 3/10
[1m2250/2250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - accuracy: 0.7855 - loss: 0.5211
Epoch 4/10
[1m2250/2250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - accuracy: 0.7863 - loss: 0.5177
Epoch 5/10
[1m2250/2250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - accuracy: 0.7855 - loss: 0.5150
Epoch 6/10
[1m2250/2250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - accuracy: 0.7862 - loss: 0.5105
Epoch 7/10
[1m2250/2250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - accuracy: 0.7845 - loss: 0.5094
Epoch 8/10
[1m2250/2250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - accuracy: 0.7897 - loss: 0.5046
Epoch 9/10
[1m2250/2250[0m [32m━

In [49]:
# neural network prediction


# pass in item_id, user_id into cool matrix thing
# Generate user-item pairs with known ratings from your ratings dataset
test_ratings_df = pd.read_csv('../data_movie_lens_100k/ratings_masked_leaderboard_set.csv')

# Merge user and item features into the ratings DataFrame
test_ratings_df = test_ratings_df.merge(user_features_df, left_on='user_id', right_index=True, how='left')
test_ratings_df = test_ratings_df.merge(item_features_df, left_on='item_id', right_index=True, how='left')

# Finding rows where any column has NaN
# cleaned_ratings = ratings_df.dropna()

X_test_true = test_ratings_df[feature_cols]
X_test_true = X_test_true.fillna(0)
print(X_test_true.shape)
y_test_true = test_ratings_df['rating']
X_test_true_scaled = scaler.transform(X_test_true)

# Predict on the test data
y_pred = model.predict(X_test_true_scaled)
y_pred = (y_pred >= 0.5).astype(int)

# Convert y_pred to DataFrame
pred_df = pd.DataFrame(y_pred, columns=['prediction'])

# Write to a text file without header and index
pred_df.to_csv('predictions.txt', header=False, index=False)


(10000, 104)
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 764us/step
[[0]
 [0]
 [0]
 [0]
 [0]]


KNN Regressor

In [None]:

from sklearn.neighbors import KNeighborsRegressor



# Initialize the KNeighborsRegressor
knn = KNeighborsRegressor(n_neighbors=5)

# Train the model
knn.fit(X_train_scaled, y_train)

# Predict on the test data
y_pred = knn.predict(X_test_scaled)

# Calculate the RMSE
rmse = np.mean(np.square(y_pred - y_test))
print(f'Root Mean Squared Error: {rmse}')

In [88]:
# make prediction 

# pass in item_id, user_id into cool matrix thing
# Generate user-item pairs with known ratings from your ratings dataset
test_ratings_df = pd.read_csv('../data_movie_lens_100k/ratings_masked_leaderboard_set.csv')

# Merge user and item features into the ratings DataFrame
test_ratings_df = test_ratings_df.merge(user_features_df, left_on='user_id', right_index=True, how='left')
test_ratings_df = test_ratings_df.merge(item_features_df, left_on='item_id', right_index=True, how='left')

# Finding rows where any column has NaN
# cleaned_ratings = ratings_df.dropna()

X_test_true = test_ratings_df[feature_cols]
X_test_true = X_test_true.fillna(0)
print(X_test_true.shape)
y_test_true = test_ratings_df['rating']
X_test_true_scaled = scaler.transform(X_test_true)

# Predict on the test data
y_pred = knn.predict(X_test_true_scaled)

predicted_df = pd.read_csv('../data_movie_lens_100k/ratings_masked_leaderboard_set.csv')

predicted_df['rating'] = y_pred

binary_df = {
    'binary' : []
}

predicted_df['rating'] = predicted_df['rating'].astype(float)
# Create a new 'binary' column based on the condition
predicted_df['binary'] = np.where(predicted_df['rating'] >= 4.5, 1, 0)
# Export the DataFrame to a new CSV file
predicted_df['binary'].to_csv('predictions.csv', index=False)



(10000, 105)
