In [70]:
import numpy as np
import pandas as pd

from sklearn.metrics.pairwise import cosine_similarity

In [71]:
# Load the training and validation splits; the preview printed below shows
# one (user_id, business_id, stars) record per row.
train_df, valid_df = (
    pd.read_csv(csv_path)
    for csv_path in ('data/train.csv', 'data/valid.csv')
)
print(train_df.head())

                  user_id             business_id  stars
0  BG9EcvTb1xrsNNwW9_TV-Q  5PLTCfyndcWM44-yQH-YRg      4
1  AJYhxhRRHlITRRpnpa6uRA  2ukJ2tNmtWfHvpGjYMgVnA      4
2  d0FbFaZuJWfai0RYsUS7qw  LdK6Tqr9-QK8eNezCUes9A      5
3  WYpPk2TiiTb7Fz0S5_ZPxQ  J1bG9ezYHMkzqV1yRsVtQQ      4
4  X-rdf2BAcnhbfz8U0bRAWQ  6swZ2vF-r8TllIuB25eQCQ      5


In [72]:
# Dense user x business rating matrix. pivot_table aggregates with its
# default (mean) when a (user_id, business_id) pair occurs more than once,
# and pairs with no rating are filled with 0.
user_item_rating_df = train_df.pivot_table(
    values='stars',
    index='user_id',
    columns='business_id',
    fill_value=0,
)

In [73]:
# Item-item cosine similarity: transpose so each business is a row vector of
# user ratings, then label both axes of the result with the business ids.
item_similarity = pd.DataFrame(
    data=cosine_similarity(user_item_rating_df.transpose()),
    index=user_item_rating_df.columns,
    columns=user_item_rating_df.columns,
)

In [74]:
def predict_rating(df, user, item, similarity=None, default=0):
    """Predict a user's rating for an item via item-based collaborative filtering.

    The prediction is the similarity-weighted average of the ratings the user
    has already given::

        sum(rating_j * sim(item, j)) / sum(sim(item, j))

    taken over every item ``j`` whose entry in the user's row of ``df`` is > 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Rating matrix with users on the index and items on the columns.
        Entries that are not > 0 are treated as "not rated".
    user, item
        Labels looked up in ``df.index`` and ``df.columns`` respectively.
    similarity : pandas.DataFrame, optional
        Item-by-item similarity matrix labelled with item ids on both axes.
        When omitted, the notebook-level ``item_similarity`` is used.
    default : number, optional
        Value returned when no prediction can be made (unknown user, unknown
        item, or the similarities to the user's rated items sum to zero).
        Defaults to 0.

    Returns
    -------
    number
        The weighted-average rating, or ``default`` when it cannot be computed.
    """
    # Cold start: the user or the item never appeared in the rating matrix.
    if user not in df.index or item not in df.columns:
        return default

    # Resolve the global lazily so the cold-start path above never needs it.
    if similarity is None:
        similarity = item_similarity

    # Look the user's row up once and keep only the items they actually rated.
    user_row = df.loc[user]
    ratings = user_row[user_row > 0]
    similarities = similarity.loc[item, ratings.index]

    # No similarity mass (also covers a user with no rated items): the
    # weighted average is undefined, so fall back instead of dividing by zero.
    total_similarity = similarities.sum()
    if total_similarity == 0:
        return default
    return (ratings * similarities).sum() / total_similarity

In [75]:
# Score every (user_id, business_id) pair in the validation split. All
# predictions are collected in one pass and the frame is built once;
# concatenating a one-row frame per iteration re-copies the accumulated
# result each time and is quadratic in the number of rows.
predicted_stars = [
    predict_rating(user_item_rating_df, user_id, business_id)
    for user_id, business_id in zip(valid_df['user_id'], valid_df['business_id'])
]
pred_df = pd.DataFrame({
    'user_id': valid_df['user_id'].to_numpy(),
    'business_id': valid_df['business_id'].to_numpy(),
    'stars': predicted_stars,
})

# save prediction
pred_df.to_csv('valid_pred.csv', index=False)