## Loading Libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import PCA, SparsePCA
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import zscore
import math

## Load Data

In [None]:
df_22_org = pd.read_excel("HW4_Data_2022.xlsx", skiprows=range(1,4), header=0, sheet_name="Form Responses 1")

In [None]:
df_21_org  = pd.read_excel("HW4_Data_2022.xlsx", skiprows=range(1,4), header=0, sheet_name="DBMI Data")

In [None]:
# Tidy the column headers: trim surrounding whitespace everywhere and, for the
# 2022 form responses, also remove the square brackets wrapped around the names.
col_list_22 = [header.strip().strip("[]") for header in df_22_org.columns]
df_22_org.columns = col_list_22
df_21_org.columns = [header.strip() for header in df_21_org.columns]

In [None]:
df_22_org.head()

In [None]:
df_21_org.head()

In [None]:
# Row position -> "First Last", built from the two name columns of the form.
index_to_name = dict(
    enumerate(
        list(
            df_22_org["Please write your first name below"]
            + " "
            + df_22_org["Please write your last name below"]
        )
    )
)
# Reverse lookup; when two rows share a full name the later row position wins.
name_to_index = {full_name: position for position, full_name in index_to_name.items()}

In [None]:
len(name_to_index.keys())

In [None]:
# Handle missing values and remove the non-rating columns (2022 responses):
#   1. "Did Not Watch" becomes the string '0' so the frame can be cast to float,
#   2. the identifying / survey-metadata columns are dropped,
#   3. everything is cast to float and any remaining NaN becomes 0.
df_22_org = (
    df_22_org
    .replace('Did Not Watch', '0', regex=True)
    .drop(
        columns=[
            'Serial Number',
            'Please write your first name below',
            'Please write your last name below',
            'Please write your section number below (1,2)',
            'Please write your group number or letter below',
        ]
    )
    .astype(float)
    .fillna(0)
)
df_22_org.info()


# Same clean-up for the frame read from the "DBMI Data" sheet: map "Did Not Watch"
# to '0', drop the 'User' identifier column, cast to float and fill NaN with 0.
df_21_org.replace('Did Not Watch', '0', regex=True, inplace=True)
df_21_org.drop(['User'], axis=1, inplace=True)
# NOTE(review): only the columns from position 1 onward are cast; column 0 keeps
# whatever dtype it had. Assigning through .iloc may also leave the column dtypes
# unchanged depending on the pandas version -- TODO confirm with the .info()
# output below that every rating column is actually float.
df_21_org.iloc[:, 1:]  = df_21_org.iloc[:, 1:].astype(float)
df_21_org.fillna(0, inplace=True)
df_21_org.info()

In [None]:
df_22_org.head()

## User Based Filtering

In [None]:
u_similarity_matrix_22_org=cosine_similarity(df_22_org)

In [None]:
u_similarity_matrix_22_org.shape

## Testing Out Nearest Neighbor Logic Using Weighted Cosine-Similarity on Raw ratings

In [None]:
# Scratch check of the neighbour selection for one user / one movie.
username, movie, k = "Hanqiu Yu", "La La Land", 10
user_index = name_to_index[username]
# Take the k + 1 highest-similarity rows (unordered) and drop the user's own row,
# which is normally among them.
indices = [
    candidate
    for candidate in np.argpartition(u_similarity_matrix_22_org[user_index], -(k + 1))[-(k + 1):]
    if candidate != user_index
]

In [None]:
u_similarity_matrix_22_org[user_index][indices]


In [None]:
np.dot(df_22_org.iloc[indices].T.loc["La La Land"].values.reshape(1, -1), u_similarity_matrix_22_org[user_index][indices])/sum(u_similarity_matrix_22_org[user_index][indices])

## Original Data User-User using Nearest Neighbors

In [None]:
def get_similar_users_index(username, k=10):
    """Return the rating rows of the users most similar to `username`.

    Similarity is read from ``u_similarity_matrix_22_org`` (cosine similarity
    between the rows of ``df_22_org``).

    Args:
        username: key into ``name_to_index``.
        k: number of neighbours wanted. ``k + 1`` candidates are selected and the
            user's own row is removed when it is among them, mirroring the
            neighbour selection used in the other cells of this notebook.

    Returns:
        The selected rows of ``df_22_org`` (in no particular order).

    Raises:
        KeyError: if `username` is not in ``name_to_index``.
    """
    user_index = name_to_index[username]
    # The original referenced `similarity_matrix_22` / `df_22`, which are never
    # defined in this notebook; the 2022 objects are the `_org` ones.
    user_similarities = u_similarity_matrix_22_org[user_index]
    candidates = np.argpartition(user_similarities, -(k + 1))[-(k + 1):]
    neighbours = [candidate for candidate in candidates if candidate != user_index]

    return df_22_org.iloc[neighbours]


In [None]:
index = name_to_index["abc"]
df_22_org.iloc[index]["Dunkirk"]

# Question 2 Original Data User-User

In [None]:
df_21_org.columns = [item.strip() for item in df_21_org.columns]

In [None]:
df_combined = pd.concat([df_22_org, df_21_org])

In [None]:
df_combined

In [None]:
np.where(df_combined['A Serious Man'] > 0)

The logic here is as follows:

The nearest-neighbour prediction fails when the selected neighbours have no ratings for the movie, so for these movies we consider only the users who have actually rated the respective movie.
We compute the cosine similarity between each of those users and the target user, and predict the rating as the similarity-weighted average of their ratings.

In [None]:
u_similarity_matrix_combined =cosine_similarity(df_combined)


def predict_rating_rare_movies(username, movie, k=10):
    """Predict a rating for `movie` from every user who has a positive rating for it.

    The prediction is the average of those users' ratings, weighted by their
    cosine similarity to `username` and normalised by the sum of the weights.

    Args:
        username: key into ``name_to_index``.
        movie: column label of the movie in ``df_combined``.
        k: not used by this function.

    Returns:
        The first element of the weighted-average array.

    Side effects:
        Prints the selected ratings and the matching similarities.
    """
    user_index = name_to_index[username]
    # Row positions whose stored rating for this movie is > 0.
    rater_positions = np.where(df_combined[movie] > 0)
    raters = df_combined.iloc[rater_positions]
    ratings_movie = raters.T.loc[movie].values.reshape(1, -1)
    print(ratings_movie)
    rater_similarities = u_similarity_matrix_combined[user_index][rater_positions]
    print(rater_similarities)
    weighted_average = np.dot(ratings_movie, rater_similarities) / sum(rater_similarities)
    return weighted_average[0]

# Print one prediction per (team member, rare movie) pair, members in the outer order.
team_names = ["ABC", "DEF", "GHI", "JKL"]
movies = ["Son of Saul", "Winter's Bone", "A Serious Man"]
for name, movie in [(member, title) for member in team_names for title in movies]:
    print(f"Prediction for {name} for the movie {movie} is: {predict_rating_rare_movies(name, movie)}")
    print("------------------------------------------------------------------------------")

## Question 3 User-User Just Average

In [None]:
# Average rating per movie: column total divided by the number of rows whose
# rating is > 0 (zero entries add nothing to the total and are not counted).
avg_rating_avatar, avg_rating_twows, avg_rating_inception = (
    df_combined[title].sum() / np.where(df_combined[title] > 0)[0].size
    for title in ("Avatar", "The Wolf of Wall Street", "Inception")
)

In [None]:
print(f"Avatar rating: {avg_rating_avatar}")
print(f"The Wolf of Wall Street rating: {avg_rating_twows}")
print(f"Inception rating: {avg_rating_inception}")

## Question 3 Original Data Item-Item KNN 

In [None]:
df_combined.T.index

In [None]:
# Movie-movie cosine similarity: transposing puts one movie per row.
i_similarity_matrix = cosine_similarity(df_combined.T)
# Column label -> row/column position inside i_similarity_matrix.
movie_to_index = {title: position for position, title in enumerate(df_combined.columns)}

In [None]:
# Scratch check of the item-item neighbour selection for one movie.
movie, k = "The Wolf of Wall Street", 5
movie_index = movie_to_index[movie]
# k + 1 highest-similarity movies (unordered) minus the movie itself.
indices = [
    candidate
    for candidate in np.argpartition(i_similarity_matrix[movie_index], -(k + 1))[-(k + 1):]
    if candidate != movie_index
]

In [None]:
df_combined

In [None]:
def predict_rating_item(username, movie, k=5):
    """Item-item prediction for `movie` from its most similar movies.

    Each neighbour movie contributes its mean over all rows of ``df_combined``;
    the means are averaged with the movie-movie cosine similarities as weights.

    Args:
        username: not used; the result depends only on `movie` and `k`.
        movie: key into ``movie_to_index``.
        k: number of neighbour movies wanted (``k + 1`` candidates are taken and
            the movie itself is dropped when it is among them).

    Returns:
        The similarity-weighted average of the neighbour movies' mean ratings.
    """
    movie_index = movie_to_index[movie]
    movie_similarities = i_similarity_matrix[movie_index]
    neighbour_movies = [
        candidate
        for candidate in np.argpartition(movie_similarities, -(k + 1))[-(k + 1):]
        if candidate != movie_index
    ]
    neighbour_means = df_combined.T.iloc[neighbour_movies].mean(axis=1).values
    weights = movie_similarities[neighbour_movies]
    return np.dot(neighbour_means, weights) / sum(weights)

# Print one item-item prediction per (team member, movie) pair.
team_names = ["MNO", "PQR", "STU"]
movies = ["Avatar", "Inception", "The Wolf of Wall Street"]
for name, movie in [(member, title) for member in team_names for title in movies]:
    print(f"Prediction for {name} for the movie {movie} is: {predict_rating_item(name, movie)}")
    print("------------------------------------------------------------------------------")

## Question 3 Original Data Item-Item KNN ZScored

In [None]:
df_combined_z_scored = pd.read_csv("item_item_z_scored.csv")

In [None]:
# Label the z-scored columns with the movie titles. col_list_22 still holds the
# full pre-drop header list of the 2022 sheet, so [5:] skips five leading entries
# -- presumably the five identifying columns dropped earlier; TODO confirm they
# are the first five headers and that the CSV has exactly the remaining columns.
df_combined_z_scored.columns = col_list_22[5:]
# Transpose so that each row is a movie. NOTE(review): re-running this cell
# transposes the frame again, so it is not idempotent.
df_combined_z_scored = df_combined_z_scored.T

In [None]:
# Scratch check of the neighbour selection on the z-scored item-item similarities.
movie, k = "The Wolf of Wall Street", 5
i_similarity_matrix_z_scored = cosine_similarity(df_combined_z_scored)
movie_index = movie_to_index[movie]
# k + 1 highest-similarity movies (unordered) minus the movie itself.
indices = [
    candidate
    for candidate in np.argpartition(i_similarity_matrix_z_scored[movie_index], -(k + 1))[-(k + 1):]
    if candidate != movie_index
]

In [None]:
df_combined.T.iloc[indices]

In [None]:
# Mean z-scored rating of each neighbour movie. df_combined_z_scored was already
# transposed to one-row-per-movie when it was prepared, so the neighbour movies are
# selected directly by row position; transposing again here would pick rows of the
# wrong axis. This matches what predict_rating_item does with the same frame.
ratings_movie = df_combined_z_scored.iloc[indices].mean(axis=1).values
print(ratings_movie)
cosine_similarity_top_k_movies = i_similarity_matrix_z_scored[movie_index][indices]
# Similarities of z-scored data can be negative, so normalise by the sum of their
# absolute values (as predict_rating_item does), then undo the z-scoring with the
# target movie's std and mean taken from df_combined.
res = df_combined.T.iloc[movie_index].std() * np.dot(ratings_movie, cosine_similarity_top_k_movies)/sum(abs(cosine_similarity_top_k_movies)) + df_combined.T.iloc[movie_index].mean()

In [None]:
np.dot(ratings_movie, cosine_similarity_top_k_movies)/sum(cosine_similarity_top_k_movies)

In [None]:
df_combined.T.iloc[34].mean()

In [None]:
def predict_rating_item(movie, k=5):
    """Item-item prediction for `movie` computed on the z-scored ratings.

    The neighbour movies' mean z-scores are averaged with the z-scored
    movie-movie similarities as weights (normalised by the sum of absolute
    weights), then rescaled with the std and mean of the movie's row in
    ``df_combined.T``.

    NOTE: this definition replaces the earlier ``predict_rating_item(username,
    movie, k)`` once the cell has run.

    Args:
        movie: key into ``movie_to_index``.
        k: number of neighbour movies wanted (``k + 1`` candidates are taken and
            the movie itself is dropped when it is among them).

    Returns:
        The rescaled prediction.
    """
    movie_index = movie_to_index[movie]
    movie_similarities = i_similarity_matrix_z_scored[movie_index]
    neighbour_movies = [
        candidate
        for candidate in np.argpartition(movie_similarities, -(k + 1))[-(k + 1):]
        if candidate != movie_index
    ]
    neighbour_means = df_combined_z_scored.iloc[neighbour_movies].mean(axis=1).values
    weights = movie_similarities[neighbour_movies]
    target_ratings = df_combined.T.iloc[movie_index]
    return target_ratings.std() * np.dot(neighbour_means, weights) / sum(abs(weights)) + target_ratings.mean()
    
# Print the z-scored item-item prediction for each (team member, movie) pair;
# the prediction itself only depends on the movie.
team_names = ["MNO", "PQR", "STU"]
movies = ["Avatar", "Inception", "The Wolf of Wall Street"]
for name, movie in [(member, title) for member in team_names for title in movies]:
    print(f"Prediction for {name} for the movie {movie} is: {predict_rating_item(movie)}")
    print("------------------------------------------------------------------------------")

## Question 4 User-User KNN

In [None]:
df_22_ref = pd.read_excel("HW4_Data_2022.xlsx", skiprows=range(1,4), header=0, sheet_name="Form Responses 2")

In [None]:
df_22_ref.shape

In [None]:
# Tidy the refreshed sheet's headers: trim whitespace and surrounding brackets.
# Note that this rebinds col_list_22, which earlier held the headers of df_22_org.
col_list_22 = [header.strip().strip("[]") for header in df_22_ref.columns]
df_22_ref.columns = col_list_22

In [None]:
df_22_ref.tail(3)["Mad Max: Fury Road"]

In [None]:
# Rebuild the row position <-> "First Last" maps from the refreshed responses.
index_to_name = dict(
    enumerate(
        list(
            df_22_ref["Please write your first name below"]
            + " "
            + df_22_ref["Please write your last name below"]
        )
    )
)
# Reverse lookup; when two rows share a full name the later row position wins.
name_to_index = {full_name: position for position, full_name in index_to_name.items()}

In [None]:
# Handle missing values and remove the non-rating columns (refreshed responses):
# "Did Not Watch" -> '0', drop the identifying columns, cast to float, NaN -> 0.
df_22_ref = (
    df_22_ref
    .replace('Did Not Watch', '0', regex=True)
    .drop(
        columns=[
            'Serial Number',
            'Please write your first name below',
            'Please write your last name below',
            'Please write your section number below (1,2)',
            'Please write your group number or letter below',
        ]
    )
    .astype(float)
    .fillna(0)
)

In [None]:
df_combined_ref = pd.concat([df_22_ref, df_21_org])

In [None]:
df_combined_ref

In [None]:
u_similarity_matrix_ref = cosine_similarity(df_combined_ref)

In [None]:
u_similarity_matrix_ref.shape

In [None]:
df_combined_ref.iloc[indices].T.loc[movie].values.reshape(1, -1)

In [None]:
def predict_rating(username, movie, k=10):
    """User-user prediction of `username`'s rating for `movie` on the refreshed data.

    The ratings the most similar users gave to `movie` are averaged with their
    cosine similarities to the user as weights, normalised by the sum of the
    absolute weights.

    Args:
        username: key into ``name_to_index``.
        movie: column label of the movie in ``df_combined_ref``.
        k: number of neighbours wanted (``k + 1`` candidates are taken and the
            user's own row is dropped when it is among them).

    Returns:
        The first element of the weighted-average array.
    """
    user_index = name_to_index[username]
    user_similarities = u_similarity_matrix_ref[user_index]
    neighbours = [
        candidate
        for candidate in np.argpartition(user_similarities, -(k + 1))[-(k + 1):]
        if candidate != user_index
    ]
    neighbour_ratings = df_combined_ref.iloc[neighbours].T.loc[movie].values.reshape(1, -1)
    weights = user_similarities[neighbours]
    weighted_average = np.dot(neighbour_ratings, weights) / sum(abs(weights))
    return weighted_average[0]

# Print one user-user prediction per (team member, movie) pair.
team_names = ["MNO", "PQR", "STU"]
movies = ["Avatar", "Inception", "The Wolf of Wall Street"]
for name, movie in [(member, title) for member in team_names for title in movies]:
    print(f"Prediction for {name} for the movie {movie} is: {predict_rating(name, movie)}")
    print("------------------------------------------------------------------------------")

## Question 4 KNN User-User ZScore

In [None]:
df_22_ref_z_scored = pd.read_excel("./hw4_P2_refresh.xlsx", sheet_name="Normalized User ")

In [None]:
# Pin the three team members to fixed row positions.
# NOTE(review): 98/99/100 are hard-coded; presumably these are the members' row
# positions in the data used below -- TODO confirm against the sheet.
name_to_index.update({"MNO": 98, "PQR": 99, "STU": 100})

In [None]:
df_22_ref_z_scored.drop(["User-Normalized"], axis=1, inplace=True)

In [None]:
df_22_ref_z_scored = df_22_ref_z_scored.T
df_22_ref_z_scored.columns = df_21_org.columns

In [None]:
df_22_ref_z_scored.fillna(0, inplace=True)

In [None]:
# NOTE(review): this rebuilds df_combined_ref from the same two raw rating frames
# as the earlier Question 4 cell. The z-scored frame prepared just above
# (df_22_ref_z_scored) is not referenced here or in any later visible cell --
# TODO confirm whether it was meant to be used in this section.
df_combined_ref = pd.concat([df_22_ref, df_21_org])

In [None]:
df_combined_ref

In [None]:
u_similarity_matrix_ref = cosine_similarity(df_combined_ref)

In [None]:
df_22_ref.iloc[99].sum()/df_22_ref.iloc[np.where(df_22_ref.iloc[99] > 0)].shape[0]

In [None]:
df_22_ref.iloc[99]

In [None]:
mean = df_combined.iloc[99].sum()/df_combined.iloc[np.where(df_22_ref.iloc[99] > 0)].shape[0]
sum((df_22_ref.iloc[99][np.where(df_22_ref.iloc[99] > 0)[0]] - mean) ** 2)/df_22_ref.iloc[np.where(df_22_ref.iloc[99] > 0)].shape[0]

In [None]:
df_combined.iloc[99].sum()/df_combined.iloc[np.where(df_combined.iloc[99] > 0)].shape[0]

In [None]:
df_22_ref.iloc[100]

In [None]:
abs(u_similarity_matrix_ref[user_index][indices])

In [None]:
(df_22_ref.iloc[100][np.where(df_22_ref.iloc[100] > 0)[0]] - mean) ** 2

In [None]:
name_to_index

In [None]:
def predict_rating(username, movie, k=10):
    """User-user prediction for `movie`, rescaled with the user's own mean and std.

    The neighbours' ratings for `movie` are averaged with their cosine
    similarities to the user as weights (normalised by the sum of absolute
    weights). That value is multiplied by the sample standard deviation of the
    user's positive ratings and shifted by their mean.

    NOTE: this definition replaces the earlier ``predict_rating`` once the cell
    has run.

    Args:
        username: key into ``name_to_index``.
        movie: column label of the movie in ``df_combined_ref``.
        k: number of neighbours wanted (``k + 1`` candidates are taken and the
            user's own row is dropped when it is among them).

    Returns:
        The first element of the rescaled prediction array.

    Side effects:
        Prints the user's mean, standard deviation and the un-rescaled weighted
        average.
    """
    user_index = name_to_index[username]
    indices = np.argpartition(u_similarity_matrix_ref[user_index], -(k+1))[-(k+1):]
    indices = [i for i in indices if i != user_index]
    ratings_movie = df_combined_ref.iloc[indices].T.loc[movie].values.reshape(1, -1)
    cosine_similarity_top_k_users = u_similarity_matrix_ref[user_index][indices]

    # NOTE(review): neighbours come from df_combined_ref but the user's own
    # statistics are read from df_combined -- TODO confirm this mix is intended.
    user_ratings = df_combined.iloc[user_index]
    # A boolean mask selects the rated (> 0) entries. The previous code indexed
    # the label-indexed Series with integer positions (deprecated in recent
    # pandas) and counted the entries by row-indexing the DataFrame with column
    # positions, which raises IndexError once a position exceeds the row count.
    rated = user_ratings[user_ratings > 0]
    n_rated = len(rated)
    mean = user_ratings.sum() / n_rated
    # Sample standard deviation (n - 1 in the denominator); undefined when the
    # user has fewer than two positive ratings.
    stddev = math.sqrt(sum((rated - mean) ** 2) / (n_rated - 1))
    print(f"Mean: {mean}")
    print(f"Std Dev: {stddev}")

    numerator = np.dot(ratings_movie, cosine_similarity_top_k_users)
    denominator = sum(abs(cosine_similarity_top_k_users))
    print(f"{numerator / denominator}")
    # NOTE(review): df_combined_ref is built from the non-z-scored frames, yet the
    # weighted average is rescaled as if it were a z-score -- TODO confirm.
    res = stddev * numerator / denominator + mean
    return res[0]

# Print one rescaled user-user prediction per (team member, movie) pair.
team_names = ["MNO", "PQR", "STU"]
movies = ["Avatar", "Inception", "The Wolf of Wall Street"]
for name, movie in [(member, title) for member in team_names for title in movies]:
    print(f"Prediction for {name} for the movie {movie} is: {predict_rating(name, movie)}")
    print("------------------------------------------------------------------------------")

## Question 4 KNN Item-Item ZScore

In [None]:
df_22_ref_z_scored_item = pd.read_csv("./item_item_z_scored_ref.csv")

In [None]:
df_22_ref_z_scored_item.drop(["Unnamed: 0"], axis=1, inplace=True)

In [None]:
df_22_ref_z_scored_item = df_22_ref_z_scored_item.T
df_22_ref_z_scored_item.columns = df_21_org.columns

In [None]:
i_similarity_matrix = cosine_similarity(df_22_ref_z_scored_item.T)

In [None]:
mean = df_combined.iloc[:,movie_to_index["Avatar"]].sum()/np.where(df_combined.iloc[:, movie_to_index["Avatar"]] > 0)[0].shape[0]

In [None]:
def predict_rating(username, movie, k=10):
    """Neighbour-weighted prediction for `movie`, shifted by that movie's mean rating.

    The neighbours' ratings for `movie` are averaged with their cosine
    similarities to the user as weights (normalised by the sum of absolute
    weights), multiplied by ``stddev`` and shifted by the mean of the positive
    ratings of `movie` in ``df_combined``.

    NOTE: this definition replaces the earlier ``predict_rating`` once the cell
    has run.

    Args:
        username: key into ``name_to_index``.
        movie: movie title; a column label of ``df_combined_ref`` and a key of
            ``movie_to_index``.
        k: number of neighbours wanted (``k + 1`` candidates are taken and the
            user's own row is dropped when it is among them).

    Returns:
        The first element of the rescaled prediction array.

    Side effects:
        Prints the mean, the standard deviation and the un-rescaled weighted
        average.
    """
    user_index = name_to_index[username]
    # NOTE(review): neighbours are still chosen with the user-user matrix
    # (u_similarity_matrix_ref); i_similarity_matrix is not used here -- TODO confirm.
    indices = np.argpartition(u_similarity_matrix_ref[user_index], -(k+1))[-(k+1):]
    indices = [i for i in indices if i != user_index]
    ratings_movie = df_combined_ref.iloc[indices].T.loc[movie].values.reshape(1, -1)
    cosine_similarity_top_k_users = u_similarity_matrix_ref[user_index][indices]

    # Mean of the requested movie over the rows with a positive rating. The column
    # was previously hard-coded to "Avatar", so every movie got Avatar's mean.
    movie_ratings = df_combined.iloc[:, movie_to_index[movie]]
    mean = movie_ratings.sum() / np.where(movie_ratings > 0)[0].shape[0]

    # A boolean mask replaces the positional integer indexing of the Series and the
    # count obtained by row-indexing the DataFrame with column positions.
    user_ratings = df_22_ref.iloc[user_index]
    rated = user_ratings[user_ratings > 0]
    # NOTE(review): this measures the spread of the user's ratings around the
    # *movie* mean, with n - 1 in the denominator -- TODO confirm this is the
    # intended scale rather than the movie's own standard deviation.
    stddev = math.sqrt(sum((rated - mean) ** 2) / (len(rated) - 1))
    print(f"Mean: {mean}")
    print(f"Std Dev: {stddev}")

    numerator = np.dot(ratings_movie, cosine_similarity_top_k_users)
    denominator = sum(abs(cosine_similarity_top_k_users))
    print(f"{numerator / denominator}")
    res = stddev * numerator / denominator + mean
    return res[0]

# Print one prediction per (team member, movie) pair.
team_names = ["MNO", "PQR", "STU"]
movies = ["Avatar", "Inception", "The Wolf of Wall Street"]
for name, movie in [(member, title) for member in team_names for title in movies]:
    print(f"Prediction for {name} for the movie {movie} is: {predict_rating(name, movie)}")
    print("------------------------------------------------------------------------------")

## Test Cosine Similarity Logic Original Data User-User 

In [None]:
# Load the small test table, treat missing ratings as 0 and drop the 'user' column.
# NOTE(review): the CSV path is an empty string, so this cell cannot run as
# written -- TODO supply the test file's path.
df_test = pd.read_csv("").fillna(0).drop(columns=['user'])

In [None]:
test_similarity_matrix = cosine_similarity(df_test.T)

In [None]:
test_similarity_matrix.shape

In [None]:
test_similarity_matrix[0][1:]

In [None]:
np.dot(df_test.iloc[1, 1:].fillna(0).values, test_similarity_matrix[0][1:])/sum(test_similarity_matrix[0][1:])

In [None]:
df_test

In [None]:
np.dot(df_test.iloc[0, 1:].fillna(0).values, test_similarity_matrix[0][1:])/sum(test_similarity_matrix[0][1:])

## Test ZScore Scipy

In [None]:
df_combined["The Social Network"]

In [None]:
# NOTE(review): `df_combined_zscore` is not assigned anywhere in the visible
# notebook (the z-scored frame loaded earlier is named `df_combined_z_scored`), so
# this cell raises NameError on a fresh Restart & Run All unless the name is
# defined elsewhere -- TODO confirm.
df_combined_zscore