In [273]:
import ast
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt

## Build Movie-Attribute Graph

In [277]:
# Read the three input tables: user ratings, movie metadata and movie credits.
# Alternative movies table that also carries overview clusters:
# temp_movies_df = pd.read_csv("generated_data/movies_small_with_clusters.csv")
ratings_df, temp_movies_df, temp_credits_df = map(
    pd.read_csv,
    (
        "data/ratings_small.csv",
        "generated_data/movies_small.csv",
        "generated_data/credits_small.csv",
    ),
)

In [280]:
# Create new dataframes that hold only the processed features used to build the
# movie-attribute graph; columns that are not useful for the Adamic-Adar
# measure are left out.


def _order_of_magnitude(values):
    """Return the rounded base-10 order of magnitude of each entry as an int Series.

    Entries are parsed with ``pd.to_numeric``. Missing, unparseable and zero
    entries are replaced by 1 so that they map to magnitude 0 instead of
    yielding ``NaN``/``-inf`` (which could not be cast to int).
    """
    numeric = pd.to_numeric(values, errors="coerce").fillna(0).replace(0, 1)
    return np.log10(numeric).round(0).astype(int)


movies_df = pd.DataFrame(
    {
        "id": temp_movies_df["movieId"],
        # stringified dict / list columns; missing values become empty literals
        # so that ast.literal_eval can parse every row later on
        "collection": temp_movies_df["belongs_to_collection"].fillna("{}"),
        "budget": _order_of_magnitude(temp_movies_df["budget"]),
        "genres": temp_movies_df["genres"].fillna("[]"),
        "language": temp_movies_df["original_language"].fillna(""),
        "companies": temp_movies_df["production_companies"].fillna("[]"),
        "countries": temp_movies_df["production_countries"].fillna("[]"),
        "revenue": _order_of_magnitude(temp_movies_df["revenue"]),
        # "overview_cluster": temp_movies_df["overview_cluster"],
    }
).set_index("id")
movies_df = movies_df[~movies_df.index.duplicated(keep="first")]  # drop rows with duplicate indices

credits_df = pd.DataFrame(
    {
        "id": temp_credits_df["movieId"],
        "cast": temp_credits_df["cast"].fillna("[]"),
    }
).set_index("id")
credits_df = credits_df[~credits_df.index.duplicated(keep="first")]  # drop rows with duplicate indices

In [281]:
# Display the processed movie features, indexed by movie id.
movies_df

Unnamed: 0_level_0,collection,budget,genres,language,companies,countries,revenue
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,"{'id': 10194, 'name': 'Toy Story Collection', ...",7,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",en,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",9
2,{},8,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",en,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",8
3,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",en,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...","[{'iso_3166_1': 'US', 'name': 'United States o...",0
4,{},7,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",en,[{'name': 'Twentieth Century Fox Film Corporat...,"[{'iso_3166_1': 'US', 'name': 'United States o...",8
5,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",en,"[{'name': 'Sandollar Productions', 'id': 5842}...","[{'iso_3166_1': 'US', 'name': 'United States o...",8
...,...,...,...,...,...,...,...
161918,"{'id': 286023, 'name': 'Sharknado Collection',...",0,"[{'id': 35, 'name': 'Comedy'}, {'id': 27, 'nam...",en,"[{'name': 'The Asylum', 'id': 1311}, {'name': ...","[{'iso_3166_1': 'US', 'name': 'United States o...",0
161944,{},7,"[{'id': 18, 'name': 'Drama'}]",en,"[{'name': 'Nasser Entertainment', 'id': 35802}]","[{'iso_3166_1': 'US', 'name': 'United States o...",0
162542,{},6,"[{'id': 53, 'name': 'Thriller'}, {'id': 10749,...",hi,"[{'name': 'KriArj Entertainment', 'id': 91689}]","[{'iso_3166_1': 'IN', 'name': 'India'}]",0
162672,{},7,"[{'id': 12, 'name': 'Adventure'}, {'id': 18, '...",hi,"[{'name': 'UTV Motion Pictures', 'id': 2320}, ...","[{'iso_3166_1': 'IN', 'name': 'India'}]",7


In [282]:
# Display the stringified cast lists, indexed by movie id.
credits_df

Unnamed: 0_level_0,cast
id,Unnamed: 1_level_1
1,"[{'cast_id': 14, 'character': 'Woody (voice)',..."
2,"[{'cast_id': 1, 'character': 'Alan Parrish', '..."
3,"[{'cast_id': 2, 'character': 'Max Goldman', 'c..."
4,"[{'cast_id': 1, 'character': ""Savannah 'Vannah..."
5,"[{'cast_id': 1, 'character': 'George Banks', '..."
...,...
161918,"[{'cast_id': 0, 'character': 'Fin Shepard', 'c..."
161944,"[{'cast_id': 1, 'character': 'Henry Cobb', 'cr..."
162542,"[{'cast_id': 0, 'character': 'Rustom Pavri', '..."
162672,"[{'cast_id': 0, 'character': 'Sarman', 'credit..."


In [311]:
# Build the edge set of the movie-attribute graph. Every edge links a
# "movie_<id>" node to one attribute node (collection, budget magnitude, genre,
# language, production company, production country, revenue magnitude or cast
# member). A set is used so that repeated attributes yield a single edge.
edges = set()

for i, row in movies_df.iterrows():
    movie = f"movie_{i}"

    # The dict/list columns are stored as Python-literal strings.
    collection = ast.literal_eval(row["collection"])
    if collection:
        edges.add((movie, f"collection_{collection['id']}"))

    # NOTE(review): magnitude 0 also covers missing/zero budgets and revenues,
    # so "budget_0"/"revenue_0" group movies with unknown figures - confirm
    # that this is intended.
    edges.add((movie, f"budget_{row['budget']}"))

    for genre in ast.literal_eval(row["genres"]):
        edges.add((movie, f"genre_{genre['id']}"))

    # Missing languages were filled with ""; linking them all to one shared
    # "language_" node would make unrelated movies look similar, so skip them.
    language = row["language"]
    if language:
        edges.add((movie, f"language_{language}"))

    for company in ast.literal_eval(row["companies"]):
        edges.add((movie, f"company_{company['id']}"))

    for country in ast.literal_eval(row["countries"]):
        edges.add((movie, f"country_{country['iso_3166_1']}"))

    edges.add((movie, f"revenue_{row['revenue']}"))

#     overview_cluster = row["overview_cluster"]
#     edges.add((movie, f"overview_{overview_cluster}"))

    # Not every movie necessarily has a credits row.
    if i in credits_df.index:
        for member in ast.literal_eval(credits_df.loc[i, "cast"]):
            edges.add((movie, f"cast_{member['id']}"))

In [312]:
# Undirected graph connecting each movie node to its attribute nodes.
g = nx.Graph()
g.add_edges_from(edges)

## Predict Ratings

In [313]:
# Keep only the ratings whose movie also appears in movies_df.
has_movie_features = ratings_df["movieId"].isin(movies_df.index)
ratings_df = ratings_df.loc[has_movie_features]

In [314]:
# Display the ratings that remain after restricting to movies in movies_df.
ratings_df

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205
...,...,...,...,...
99999,671,6268,2.5,1065579370
100000,671,6269,4.0,1065149201
100001,671,6365,4.0,1070940363
100002,671,6385,2.5,1070979663


In [315]:
# Start small: take the ratings of a single user (userId 1), ordered by the
# time at which they were given.
is_sample_user = ratings_df["userId"] == 1
sample = ratings_df.loc[is_sample_user].sort_values(by="timestamp")

In [316]:
# Display user 1's ratings sorted by timestamp.
sample

Unnamed: 0,userId,movieId,rating,timestamp
16,1,2294,2.0,1260759108
17,1,2455,2.5,1260759113
19,1,3671,3.0,1260759117
8,1,1339,3.5,1260759125
9,1,1343,2.0,1260759131
10,1,1371,2.5,1260759135
13,1,2105,4.0,1260759139
0,1,31,2.5,1260759144
7,1,1293,2.0,1260759148
5,1,1263,2.0,1260759151


In [317]:
# 80-20 train-test split by position: the first 80% of the rows of `sample`
# become the training data, the remaining rows the test data.
n = int(0.80 * len(sample))
train_df, test_df = sample.iloc[:n].copy(), sample.iloc[n:].copy()

In [318]:
# Ratings from the training rows, looked up by movie id, plus the ids themselves.
previous_ratings = train_df.set_index("movieId")[["rating"]]
seen_movies = previous_ratings.index

In [319]:
def predict_rating(row):
    """Predict the rating of ``row.movieId`` as a similarity-weighted mean.

    Every movie in ``seen_movies`` contributes its rating from
    ``previous_ratings``, weighted by the Adamic-Adar index between that
    movie's node and the target movie's node in the graph ``g``.

    Parameters
    ----------
    row
        Object exposing a ``movieId`` attribute, e.g. a row handed over by
        ``DataFrame.apply(..., axis=1)``. The id is cast to ``int`` because
        such rows may carry it as a float.

    Returns
    -------
    float
        The weighted mean rating, or ``np.nan`` when no prediction can be
        made: the target movie is not in the graph, there are no seen movies,
        or none of them shares an attribute with the target (total weight 0).
    """
    target = f"movie_{int(row.movieId)}"
    if target not in g:
        return np.nan

    # Only seen movies that exist in the graph can be scored.
    rated = [
        (other_id, f"movie_{other_id}")
        for other_id in seen_movies
        if f"movie_{other_id}" in g
    ]

    # One batched call; scores are yielded in the same order as the pairs.
    scores = nx.adamic_adar_index(g, [(node, target) for _, node in rated])

    weighted_rating = 0.0
    total_adamic_adar = 0.0
    for (other_id, _), (_, _, score) in zip(rated, scores):
        weighted_rating += previous_ratings.loc[other_id, "rating"] * score
        total_adamic_adar += score

    # Avoid dividing by zero when nothing is similar to the target movie.
    if total_adamic_adar == 0:
        return np.nan
    return weighted_rating / total_adamic_adar

In [320]:
# Predict a rating for every test row and store it next to the true rating.
test_df = test_df.assign(predicted_rating=test_df.apply(predict_rating, axis=1))

In [321]:
# Display the test rows together with their predicted ratings.
test_df

Unnamed: 0,userId,movieId,rating,timestamp,predicted_rating
15,1,2193,2.0,1260759198,2.658236
18,1,2968,1.0,1260759200,2.679921
11,1,1405,1.0,1260759203,2.643282
4,1,1172,4.0,1260759205,2.517064


In [322]:
# Row labels of the test set ordered by predicted rating (ascending).
test_df.sort_values(by="predicted_rating").index

Int64Index([4, 11, 15, 18], dtype='int64')

In [323]:
# Row labels of the test set ordered by the true rating (ascending), for
# comparison with the ordering by predicted rating.
test_df.sort_values(by="rating").index

Int64Index([18, 11, 15, 4], dtype='int64')