In [1]:
import pandas as pd
import numpy as np
from typing import List
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from pyspark.ml.evaluation import RegressionEvaluator

# **Data Collection and Preprocessing** #

## **import data** ##

### MovieLens Dataset ###

In [2]:
%%bash

if [ ! -d "../data/movielens_complete" ]; 
then    
    wget https://files.grouplens.org/datasets/movielens/ml-1m.zip
    mkdir -p ../data/movielens_complete
    unzip -o ml-1m.zip -d ../data/movielens_complete;
    rm ml-1m.zip;
else
    echo "Data already downloaded";
fi

Data already downloaded


### IMDb Dataset ###

In [3]:
%%bash

if [ ! -d "../data/imbws_complete" ]; 
then    
    wget https://datasets.imdbws.com/name.basics.tsv.gz
    wget https://datasets.imdbws.com/title.akas.tsv.gz
    wget https://datasets.imdbws.com/title.basics.tsv.gz
    wget https://datasets.imdbws.com/title.crew.tsv.gz
    wget https://datasets.imdbws.com/title.episode.tsv.gz
    wget https://datasets.imdbws.com/title.principals.tsv.gz
    wget https://datasets.imdbws.com/title.ratings.tsv.gz
    mkdir -p ../data/imbws_complete
    unzip -o * -d ../data/imbws_complete;
    rm -rf *.gz;
else
    echo "Data already downloaded";
fi

Data already downloaded


## Loading Data ##

### ml-1m

In [4]:
df_rating = pd.read_csv(
    "../data/movielens_complete/ml-1m/ratings.dat",
    sep='::',
    engine="python",
    header=None,
)

df_rating.columns = ["UserId", "MovieId", "Rating", "Timestamp"]

df_rating

Unnamed: 0,UserId,MovieId,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000204,6040,1091,1,956716541
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


In [5]:
df_users = pd.read_csv(
    "../data/movielens_complete/ml-1m/users.dat",
    sep="::",
    engine="python",
    header=None,
)
df_users.columns = ["UserId", "Gender", "Age", "Occupation", "ZipCode"]
df_users.set_index("UserId", inplace=True)
df_users.head()

Unnamed: 0_level_0,Gender,Age,Occupation,ZipCode
UserId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,F,1,10,48067
2,M,56,16,70072
3,M,25,15,55117
4,M,45,7,2460
5,M,25,20,55455


In [6]:
df_items = pd.read_csv(
    "../data/movielens_complete/ml-1m/movies.dat",
    engine="python",
    sep="::",
    encoding="ISO-8859-1",
    header=None,
)
df_items.columns = ["MovieId", "Title", "Genres"]
df_items.set_index("MovieId", inplace=True)
df_items.head()

Unnamed: 0_level_0,Title,Genres
MovieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Animation|Children's|Comedy
2,Jumanji (1995),Adventure|Children's|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama
5,Father of the Bride Part II (1995),Comedy


### imbws

In [7]:
df_imbws_title_basics = pd.read_csv(
    "../data/imbws_complete/title.basics.tsv",
    engine="python",
    sep="\t",
    encoding="utf-8",
    on_bad_lines='skip'  # or use 'warn' to get warnings instead of skipping
)

df_imbws_title_basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,5,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"
...,...,...,...,...,...,...,...,...,...
10851947,tt9916848,tvEpisode,Episode #3.17,Episode #3.17,0,2009,\N,\N,"Action,Drama,Family"
10851948,tt9916850,tvEpisode,Episode #3.19,Episode #3.19,0,2010,\N,\N,"Action,Drama,Family"
10851949,tt9916852,tvEpisode,Episode #3.20,Episode #3.20,0,2010,\N,\N,"Action,Drama,Family"
10851950,tt9916856,short,The Wind,The Wind,0,2015,\N,27,Short


In [8]:
df_imbws_title_ratings = pd.read_csv(
    "../data/imbws_complete/title.ratings.tsv",
    engine="python",
    sep="\t",
    encoding="ISO-8859-1",
)
df_imbws_title_ratings

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,2058
1,tt0000002,5.7,276
2,tt0000003,6.5,2022
3,tt0000004,5.4,179
4,tt0000005,6.2,2787
...,...,...,...
1447549,tt9916730,7.0,12
1447550,tt9916766,7.1,23
1447551,tt9916778,7.2,36
1447552,tt9916840,7.2,10


In [9]:
df_imbws_title_crew = pd.read_csv(
    "../data/imbws_complete/title.crew.tsv",
    engine="python",
    sep="\t",
    encoding="ISO-8859-1",
)
df_imbws_title_crew

Unnamed: 0,tconst,directors,writers
0,tt0000001,nm0005690,\N
1,tt0000002,nm0721526,\N
2,tt0000003,nm0721526,\N
3,tt0000004,nm0721526,\N
4,tt0000005,nm0005690,\N
...,...,...,...
10206151,tt9916848,nm1485677,"nm9187127,nm1485677,nm9826385,nm9299459,nm1628284"
10206152,tt9916850,nm1485677,"nm9187127,nm1485677,nm9826385,nm1628284"
10206153,tt9916852,nm1485677,"nm9187127,nm1485677,nm9826385,nm9299459,nm1628284"
10206154,tt9916856,nm10538645,nm6951431


## Preprocessing

### Ratings each movies

In [10]:
df_matrix = df_rating.pivot(index="UserId", columns="MovieId", values="Rating")

n_users = len(df_users)
n_items = len(df_items)

In [11]:
df_matrix

MovieId,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
UserId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,2.0,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6036,,,,2.0,,3.0,,,,,...,,,,,,,,,,
6037,,,,,,,,,,,...,,,,,,,,,,
6038,,,,,,,,,,,...,,,,,,,,,,
6039,,,,,,,,,,,...,,,,,,,,,,


In [12]:
movie_scores = df_matrix.sum(axis=0, skipna=True) / df_matrix.count(axis=0)
movie_scores

MovieId
1       4.146846
2       3.201141
3       3.016736
4       2.729412
5       3.006757
          ...   
3948    3.635731
3949    4.115132
3950    3.666667
3951    3.900000
3952    3.780928
Length: 3706, dtype: float64

### Preprocess of movies

In [13]:
df_items["averageRating"] = movie_scores
df_items["numVotes"] = df_matrix.count(axis=0)
df_items[['Title', 'Year']] = df_items['Title'].str.extract(r'^(.*) \((\d{4})\)$')
df_items.head()

Unnamed: 0_level_0,Title,Genres,averageRating,numVotes,Year
MovieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,Toy Story,Animation|Children's|Comedy,4.146846,2077.0,1995
2,Jumanji,Adventure|Children's|Fantasy,3.201141,701.0,1995
3,Grumpier Old Men,Comedy|Romance,3.016736,478.0,1995
4,Waiting to Exhale,Comedy|Drama,2.729412,170.0,1995
5,Father of the Bride Part II,Comedy,3.006757,296.0,1995


### Preprocess imbw

In [14]:
df_imbws_title_basics = df_imbws_title_basics[df_imbws_title_basics['titleType'] == 'movie']
df_imbws_title_basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
8,tt0000009,movie,Miss Jerry,Miss Jerry,0,1894,\N,45,Romance
144,tt0000147,movie,The Corbett-Fitzsimmons Fight,The Corbett-Fitzsimmons Fight,0,1897,\N,100,"Documentary,News,Sport"
498,tt0000502,movie,Bohemios,Bohemios,0,1905,\N,100,\N
570,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,\N,70,"Action,Adventure,Biography"
587,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907,\N,90,Drama
...,...,...,...,...,...,...,...,...,...
10851843,tt9916622,movie,Rodolpho Teóphilo - O Legado de um Pioneiro,Rodolpho Teóphilo - O Legado de um Pioneiro,0,2015,\N,57,Documentary
10851870,tt9916680,movie,De la ilusión al desconcierto: cine colombiano...,De la ilusión al desconcierto: cine colombiano...,0,2007,\N,100,Documentary
10851882,tt9916706,movie,Dankyavar Danka,Dankyavar Danka,0,2013,\N,\N,Comedy
10851892,tt9916730,movie,6 Gunn,6 Gunn,0,2017,\N,116,Drama


In [15]:
df_imbws_title_ratings['averageRating'] = df_imbws_title_ratings['averageRating'] / 2
df_imbws_title_ratings

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,2.85,2058
1,tt0000002,2.85,276
2,tt0000003,3.25,2022
3,tt0000004,2.70,179
4,tt0000005,3.10,2787
...,...,...,...
1447549,tt9916730,3.50,12
1447550,tt9916766,3.55,23
1447551,tt9916778,3.60,36
1447552,tt9916840,3.60,10


In [16]:
df_title_ratings = pd.merge(df_imbws_title_basics, df_imbws_title_ratings, on='tconst', how='left')
df_title_ratings = df_title_ratings.rename(columns={'genres': 'imbwGenres', 'averageRating': 'imbwAverageRating', 'numVotes': 'imbwNumVotes'})
df_title_ratings = df_title_ratings.drop(['titleType', 'isAdult', 'endYear'], axis=1)
df_title_ratings

Unnamed: 0,tconst,primaryTitle,originalTitle,startYear,runtimeMinutes,imbwGenres,imbwAverageRating,imbwNumVotes
0,tt0000009,Miss Jerry,Miss Jerry,1894,45,Romance,2.70,212.0
1,tt0000147,The Corbett-Fitzsimmons Fight,The Corbett-Fitzsimmons Fight,1897,100,"Documentary,News,Sport",2.60,517.0
2,tt0000502,Bohemios,Bohemios,1905,100,\N,2.20,17.0
3,tt0000574,The Story of the Kelly Gang,The Story of the Kelly Gang,1906,70,"Action,Adventure,Biography",3.00,909.0
4,tt0000591,The Prodigal Son,L'enfant prodigue,1907,90,Drama,2.85,25.0
...,...,...,...,...,...,...,...,...
683869,tt9916622,Rodolpho Teóphilo - O Legado de um Pioneiro,Rodolpho Teóphilo - O Legado de um Pioneiro,2015,57,Documentary,,
683870,tt9916680,De la ilusión al desconcierto: cine colombiano...,De la ilusión al desconcierto: cine colombiano...,2007,100,Documentary,,
683871,tt9916706,Dankyavar Danka,Dankyavar Danka,2013,\N,Comedy,3.80,5.0
683872,tt9916730,6 Gunn,6 Gunn,2017,116,Drama,3.50,12.0


### Merging imbw data and movieLens data

In [17]:
merged = pd.merge(df_title_ratings, df_items.reset_index(), left_on="primaryTitle", right_on="Title", how="outer")
merged = merged.sort_values(by='tconst')
merged

Unnamed: 0,tconst,primaryTitle,originalTitle,startYear,runtimeMinutes,imbwGenres,imbwAverageRating,imbwNumVotes,MovieId,Title,Genres,averageRating,numVotes,Year
361911,tt0000009,Miss Jerry,Miss Jerry,1894,45,Romance,2.70,212.0,,,,,,
544312,tt0000147,The Corbett-Fitzsimmons Fight,The Corbett-Fitzsimmons Fight,1897,100,"Documentary,News,Sport",2.60,517.0,,,,,,
81934,tt0000502,Bohemios,Bohemios,1905,100,\N,2.20,17.0,,,,,,
596625,tt0000574,The Story of the Kelly Gang,The Story of the Kelly Gang,1906,70,"Action,Adventure,Biography",3.00,909.0,,,,,,
585583,tt0000591,The Prodigal Son,L'enfant prodigue,1907,90,Drama,2.85,25.0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
676304,,,,,,,,,1934.0,You Can't Take It With You,Comedy,4.012987,77.0,1938
676575,,,,,,,,,411.0,You So Crazy,Comedy,2.615385,13.0,1994
677228,,,,,,,,,117.0,"Young Poisoner's Handbook, The",Crime,3.632911,79.0,1995
679942,,,,,,,,,3223.0,"Zed & Two Noughts, A",Drama,3.413793,29.0,1985


In [18]:

# Drop index with primaryTitle and Title equal NaN
merged = merged.dropna(subset=['primaryTitle', 'Title'], how='all')

# Calculate total of votes for a movie
merged['totalVotes'] = merged['numVotes'].fillna(0) + merged['imbwNumVotes'].fillna(0)

# Calculate the average of the averages of ratings
merged['averageRating'] = ((merged['imbwAverageRating'].fillna(0) * merged['imbwNumVotes'].fillna(0) + 
                                merged['averageRating'].fillna(0) * merged['numVotes'].fillna(0)) / 
                                merged['totalVotes'].replace(0, 1))  # Replace totalVotes 0 with 1 to avoid division by zero

merged['Title'].fillna(merged['primaryTitle'], inplace=True)
merged['Year'].fillna(merged['startYear'], inplace=True)

# Drop unnecessary columns
merged.drop(['numVotes', 'imbwNumVotes', 'imbwAverageRating', 'averageRating', 'startYear', 'primaryTitle', 'originalTitle'], axis=1, inplace=True)
merged 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged['totalVotes'] = merged['numVotes'].fillna(0) + merged['imbwNumVotes'].fillna(0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged['average'] = ((merged['imbwAverageRating'].fillna(0) * merged['imbwNumVotes'].fillna(0) +
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[c

Unnamed: 0,tconst,runtimeMinutes,imbwGenres,MovieId,Title,Genres,Year,totalVotes,average
361911,tt0000009,45,Romance,,Miss Jerry,,1894,212.0,2.700000
544312,tt0000147,100,"Documentary,News,Sport",,The Corbett-Fitzsimmons Fight,,1897,517.0,2.600000
81934,tt0000502,100,\N,,Bohemios,,1905,17.0,2.200000
596625,tt0000574,70,"Action,Adventure,Biography",,The Story of the Kelly Gang,,1906,909.0,3.000000
585583,tt0000591,90,Drama,,The Prodigal Son,,1907,25.0,2.850000
...,...,...,...,...,...,...,...,...,...
676304,,,,1934.0,You Can't Take It With You,Comedy,1938,77.0,4.012987
676575,,,,411.0,You So Crazy,Comedy,1994,13.0,2.615385
677228,,,,117.0,"Young Poisoner's Handbook, The",Crime,1995,79.0,3.632911
679942,,,,3223.0,"Zed & Two Noughts, A",Drama,1985,29.0,3.413793


In [19]:
def add_imbwGenres_to_Genres(row):
    imbw_genres = str(row['imbwGenres']).split(',') if pd.notna(row['imbwGenres']) else []
    genres = str(row['Genres']).split('|') if pd.notna(row['Genres']) else []
    
    # Add genres from imbwGenres that are not already in Genres
    for genre in imbw_genres:
        if genre.strip() not in genres:
            genres.append(genre.strip())
    
    # Combine genres back into a single string separated by '|'
    return '|'.join(genres)

# Apply the function to each row in the DataFrame
merged['Genres'] = merged.apply(add_imbwGenres_to_Genres, axis=1)
merged = merged.drop(['imbwGenres'], axis=1)
merged

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged['Genres'] = merged.apply(add_imbwGenres_to_Genres, axis=1)


Unnamed: 0,tconst,runtimeMinutes,MovieId,Title,Genres,Year,totalVotes,average
361911,tt0000009,45,,Miss Jerry,Romance,1894,212.0,2.700000
544312,tt0000147,100,,The Corbett-Fitzsimmons Fight,Documentary|News|Sport,1897,517.0,2.600000
81934,tt0000502,100,,Bohemios,\N,1905,17.0,2.200000
596625,tt0000574,70,,The Story of the Kelly Gang,Action|Adventure|Biography,1906,909.0,3.000000
585583,tt0000591,90,,The Prodigal Son,Drama,1907,25.0,2.850000
...,...,...,...,...,...,...,...,...
676304,,,1934.0,You Can't Take It With You,Comedy,1938,77.0,4.012987
676575,,,411.0,You So Crazy,Comedy,1994,13.0,2.615385
677228,,,117.0,"Young Poisoner's Handbook, The",Crime,1995,79.0,3.632911
679942,,,3223.0,"Zed & Two Noughts, A",Drama,1985,29.0,3.413793


In [20]:
merged[merged['Title'] == 'Toy Story']

Unnamed: 0,tconst,runtimeMinutes,MovieId,Title,Genres,Year,totalVotes,average
621032,tt0114709,81,1.0,Toy Story,Animation|Children's|Comedy|Adventure,1995,1075947.0,4.149994


## Feature Engineering

In [21]:
features = merged[['Title', 'Genres', 'average', 'MovieId']]
features

Unnamed: 0,Title,Genres,average,MovieId
361911,Miss Jerry,Romance,2.700000,
544312,The Corbett-Fitzsimmons Fight,Documentary|News|Sport,2.600000,
81934,Bohemios,\N,2.200000,
596625,The Story of the Kelly Gang,Action|Adventure|Biography,3.000000,
585583,The Prodigal Son,Drama,2.850000,
...,...,...,...,...
676304,You Can't Take It With You,Comedy,4.012987,1934.0
676575,You So Crazy,Comedy,2.615385,411.0
677228,"Young Poisoner's Handbook, The",Crime,3.632911,117.0
679942,"Zed & Two Noughts, A",Drama,3.413793,3223.0


In [22]:
user_item_matrix = df_rating.pivot(index='UserId', columns='MovieId', values='Rating').fillna(0)
user_item_matrix

MovieId,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
UserId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6036,0.0,0.0,0.0,2.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6037,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6038,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6039,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
unique_genres = set()

# Iterate over the 'Genres' column
for genres in features['Genres']:
    if genres != '\\N':  # Check for null genre representation
        # Split the genres by '|' and add them to the set
        genre_list = genres.split('|')
        unique_genres.update(genre_list)
unique_genres

{'Action',
 'Adult',
 'Adventure',
 'Animation',
 'Biography',
 "Children's",
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Family',
 'Fantasy',
 'Film-Noir',
 'Game-Show',
 'History',
 'Horror',
 'Music',
 'Musical',
 'Mystery',
 'News',
 'Reality-TV',
 'Romance',
 'Sci-Fi',
 'Sport',
 'Talk-Show',
 'Thriller',
 'War',
 'Western',
 '\\N'}

In [24]:
toto = df_items[df_matrix[1].notna()]
toto

  toto = df_items[df_matrix[1].notna()]


Unnamed: 0_level_0,Title,Genres,averageRating,numVotes,Year
MovieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,Toy Story,Animation|Children's|Comedy,4.146846,2077.0,1995
6,Heat,Action|Crime|Thriller,3.878723,940.0,1995
8,Tom and Huck,Adventure|Children's,3.014706,68.0,1995
9,Sudden Death,Action,2.656863,102.0,1995
10,GoldenEye,Action|Adventure|Thriller,3.540541,888.0,1995
...,...,...,...,...,...
3943,Bamboozled,Comedy,3.052083,96.0,2000
3945,Digimon: The Movie,Adventure|Animation|Children's,1.488372,43.0,2000
3946,Get Carter,Action|Drama|Thriller,2.260000,100.0,2000
3948,Meet the Parents,Comedy,3.635731,862.0,2000


In [25]:
genre_counts = {}
for genres_str in toto['Genres']:
    genres = genres_str.split('|')
    for genre in genres:
        if genre in genre_counts:
            genre_counts[genre] += 1
        else:
            genre_counts[genre] = 1
genre_counts
genre_freq_df = pd.DataFrame(list(genre_counts.items()), columns=['Genre', 'Frequency'])
genre_freq_df = genre_freq_df.sort_values(by='Frequency', ascending=False).reset_index(drop=True)

print("Ranking of genres by frequency:")
print(genre_freq_df)

Ranking of genres by frequency:
          Genre  Frequency
0         Drama        552
1        Comedy        392
2       Romance        176
3        Action        171
4      Thriller        157
5        Horror        117
6     Adventure        102
7        Sci-Fi         94
8    Children's         91
9         Crime         69
10  Documentary         46
11      Musical         44
12          War         44
13    Animation         42
14      Mystery         30
15      Western         25
16      Fantasy         20
17    Film-Noir         16


In [98]:
import pandas as pd

def get_genre_rating(user_id, genre, user_ratings, movie_features):
    # 1. Filter ratings for User 1
    user_ratings = user_ratings[user_ratings['UserId'] == user_id]
    
    # 2. Filter movie features for movies rated by User 1 and in the specified genre
    rated_movies = user_ratings['MovieId'].unique()
    genre_ratings = user_ratings[user_ratings['MovieId'].isin(movie_features.index) & (movie_features[genre] == 1)]

    # 3. Calculate the average rating
    if len(genre_ratings) > 0:
        avg_rating = genre_ratings['Rating'].mean()
        return avg_rating
    else:
        return None  # Return None if there are no ratings for the specified genre

# Example usage:
user_id = 1  # Replace with the actual user ID
genre = 'Drama'  # Replace with the genre you want to get the rating for
rating = get_genre_rating(user_id, genre, df_rating, features)

if rating is not None:
    print(f"Average rating of {genre} genre for User {user_id}: {rating:.2f}")
else:
    print(f"No ratings found for {genre} genre for User {user_id}.")


Average rating of Drama genre for User 1: 4.17


  genre_ratings = user_ratings[user_ratings['MovieId'].isin(movie_features.index) & (movie_features[genre] == 1)]


In [27]:
for genre in unique_genres:
    features[genre] = 0
features

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features[genre] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features[genre] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features[genre] = 0


Unnamed: 0,Title,Genres,average,MovieId,Animation,Adventure,Biography,Horror,Mystery,Children's,...,History,Comedy,Drama,Family,Crime,Western,Action,Musical,Music,Sci-Fi
361911,Miss Jerry,Romance,2.700000,,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
544312,The Corbett-Fitzsimmons Fight,Documentary|News|Sport,2.600000,,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
81934,Bohemios,\N,2.200000,,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
596625,The Story of the Kelly Gang,Action|Adventure|Biography,3.000000,,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
585583,The Prodigal Son,Drama,2.850000,,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
676304,You Can't Take It With You,Comedy,4.012987,1934.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
676575,You So Crazy,Comedy,2.615385,411.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
677228,"Young Poisoner's Handbook, The",Crime,3.632911,117.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
679942,"Zed & Two Noughts, A",Drama,3.413793,3223.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [28]:
for index, row in features.iterrows():
    if row['Genres'] != '\\N':  # Exclude missing or null genre representation
        for genre in row['Genres'].split('|'):
            features.at[index, genre] = 1

# Drop the original 'Genres' column as it's now redundant
features.drop('Genres', axis=1, inplace=True)
features

Unnamed: 0,Title,average,MovieId,Animation,Adventure,Biography,Horror,Mystery,Children's,Romance,...,History,Comedy,Drama,Family,Crime,Western,Action,Musical,Music,Sci-Fi
361911,Miss Jerry,2.700000,,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
544312,The Corbett-Fitzsimmons Fight,2.600000,,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
81934,Bohemios,2.200000,,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
596625,The Story of the Kelly Gang,3.000000,,0,1,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
585583,The Prodigal Son,2.850000,,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
676304,You Can't Take It With You,4.012987,1934.0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
676575,You So Crazy,2.615385,411.0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
677228,"Young Poisoner's Handbook, The",3.632911,117.0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
679942,"Zed & Two Noughts, A",3.413793,3223.0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [29]:
features['MovieId'] = features.index.where(features['MovieId'].isna(),features['MovieId'])
features

Unnamed: 0,Title,average,MovieId,Animation,Adventure,Biography,Horror,Mystery,Children's,Romance,...,History,Comedy,Drama,Family,Crime,Western,Action,Musical,Music,Sci-Fi
361911,Miss Jerry,2.700000,361911.0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
544312,The Corbett-Fitzsimmons Fight,2.600000,544312.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
81934,Bohemios,2.200000,81934.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
596625,The Story of the Kelly Gang,3.000000,596625.0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
585583,The Prodigal Son,2.850000,585583.0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
676304,You Can't Take It With You,4.012987,5934.0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
676575,You So Crazy,2.615385,4411.0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
677228,"Young Poisoner's Handbook, The",3.632911,4117.0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
679942,"Zed & Two Noughts, A",3.413793,7223.0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


## Model Development

In [88]:
from surprise import Dataset, Reader

# Préparer les données pour Surprise
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df_rating[['UserId', 'MovieId', 'Rating']], reader)


In [89]:
from surprise import SVD
from surprise.model_selection import train_test_split

# Diviser les données en ensembles d'entraînement et de test
trainset, testset = train_test_split(data, test_size=0.25)

# Créer et entraîner le modèle SVD
svd_model = SVD()
svd_model.fit(trainset)


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x31efd2900>

In [90]:
# Extraire les colonnes de genres
genres_columns = features.columns[3:]  # Adaptez selon votre dataset

# Extraire les informations de genres
genres_matrix = features[genres_columns].values
movie_ids = features['MovieId'].values

# Créer un dictionnaire pour accéder aux genres par identifiant de film
movie_genres_dict = {movie_id: genres for movie_id, genres in zip(movie_ids, genres_matrix)}


In [91]:
movie_genres_dict

{361911.0: array([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0]),
 544312.0: array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0]),
 81934.0: array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0]),
 596625.0: array([0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 0, 0, 0]),
 585583.0: array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        0, 0, 0, 0, 0, 0, 0]),
 457784.0: array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        0, 0, 0, 0, 0, 0, 0]),
 7598.0: array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        0, 0, 0, 0, 0, 0, 0]),
 6820.0: array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        0, 0, 0, 0, 0, 0, 0]),
 5411.0: array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
   

In [92]:
# Exemple pour un utilisateur spécifique (UserId = 1)
user_id = 1
user_inner_id = trainset.to_inner_uid(user_id)
user_factors = svd_model.pu[user_inner_id]

# Afficher les facteurs latents de l'utilisateur
print(f"Facteurs latents pour l'utilisateur {user_id}: {user_factors}")


Facteurs latents pour l'utilisateur 1: [-0.11113723  0.19259714  0.07410131  0.04914562 -0.19531097  0.0520764
  0.07953712 -0.024191   -0.00797757 -0.02773305 -0.02382952 -0.11848542
  0.10611826  0.12719024  0.07973841 -0.10474305  0.06720871  0.17195553
 -0.03421885  0.01671937  0.11104791  0.12488924  0.05428811  0.04703449
 -0.11318042 -0.10093186  0.05266714 -0.09235964  0.21396999  0.09297469
  0.10990488  0.12535129 -0.00254018  0.13770656  0.08349985 -0.03397628
  0.00956869 -0.2300415  -0.01215072  0.07850222 -0.16628845 -0.04683317
 -0.04785337  0.06123099 -0.13639701  0.08887471 -0.00029502  0.02276967
  0.00156746 -0.13904819 -0.08598821  0.05021534 -0.0761163   0.0098778
  0.11831148  0.14035018  0.08319075  0.11663397 -0.0260649  -0.21891903
  0.01746093  0.10029859 -0.23263285  0.05244352 -0.086315    0.04713341
 -0.01306203 -0.01447563 -0.22274286  0.1349524  -0.16454006  0.0585534
  0.04487981 -0.22673838  0.16647942  0.14633913  0.02285595  0.00296587
  0.07195332 -0

In [94]:
# Filtrer les films notés par l'utilisateur
user_ratings = df_rating[df_rating['UserId'] == user_id]

# Calculer une moyenne pondérée des genres pour cet utilisateur
user_genre_preferences = user_ratings.merge(features, on='MovieId')[genres_columns].multiply(user_ratings['Rating'], axis=0).mean()

# Normaliser les préférences de genre
user_genre_preferences /= user_genre_preferences.sum()

print(f"Préférences de genre pour l'utilisateur {user_id}: {user_genre_preferences}")


Préférences de genre pour l'utilisateur 1: Animation      0.000000
Adventure      0.000000
Biography      0.012232
Horror         0.051988
Mystery        0.015291
Children's     0.000000
Romance        0.088685
Adult          0.009174
Talk-Show      0.000000
News           0.021407
War            0.051988
Fantasy        0.021407
Documentary    0.195719
\N             0.000000
Sport          0.009174
Thriller       0.051988
Film-Noir      0.000000
Reality-TV     0.000000
Game-Show      0.000000
History        0.070336
Comedy         0.085627
Drama          0.211009
Family         0.009174
Crime          0.024465
Western        0.000000
Action         0.024465
Musical        0.000000
Music          0.015291
Sci-Fi         0.030581
dtype: float64


In [96]:
import numpy as np

# Calculer une prédiction pour chaque film
predicted_ratings = []

for movie_id in features['MovieId'].values:
    try:
        inner_movie_id = trainset.to_inner_iid(movie_id)
        movie_factors = svd_model.qi[inner_movie_id]
        
        # Calculer la note prédit par la factorisation matricielle
        latent_rating = np.dot(user_factors, movie_factors)
        
        # Ajouter une composante en fonction des genres
        genre_vector = movie_genres_dict.get(movie_id, np.zeros(len(genres_columns)))
        genre_score = np.dot(user_genre_preferences, genre_vector)
        
        # Combiner les scores latents et de genres
        combined_score = latent_rating + genre_score
        
        predicted_ratings.append((movie_id, combined_score))
    except:
        # Passer les films non évalués dans l'ensemble d'entraînement
        pass

# Trier les films par score prédit
recommended_movies = sorted(predicted_ratings, key=lambda x: x[1], reverse=True)[:10]
print("Recommandations de films pour l'utilisateur {}: {}".format(user_id, recommended_movies))


Recommandations de films pour l'utilisateur 1: [(1021.0, 0.8583198549541011), (551.0, 0.7841806864479721), (62.0, 0.7435748125836078), (1265.0, 0.7331657471661177), (3624.0, 0.7042359363215234), (2565.0, 0.6917758856862042), (3203.0, 0.6839947147627977), (2723.0, 0.6825872655258272), (1049.0, 0.6737118002686686), (2067.0, 0.6732375507803161)]


In [97]:
import pandas as pd
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
import numpy as np

# Charger les données de films
movies_df = pd.read_csv('path/to/movies.csv')
# Charger les données de notations des utilisateurs
ratings_df = pd.read_csv('path/to/ratings.csv')

# Préparer les données pour Surprise
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings_df[['UserId', 'MovieId', 'Rating']], reader)

# Diviser les données en ensembles d'entraînement et de test
trainset, testset = train_test_split(data, test_size=0.25)

# Créer et entraîner le modèle SVD
svd_model = SVD()
svd_model.fit(trainset)

# Colonnes de genres
genres_columns = movies_df.columns[3:]
genres_matrix = movies_df[genres_columns].values
movie_ids = movies_df['MovieId'].values
movie_genres_dict = {movie_id: genres for movie_id, genres in zip(movie_ids, genres_matrix)}

# Recommandations pour un utilisateur spécifique (UserId = 1)
user_id = 1
user_inner_id = trainset.to_inner_uid(user_id)
user_factors = svd_model.pu[user_inner_id]

# Calculer une moyenne pondérée des genres pour cet utilisateur
user_ratings = ratings_df[ratings_df['UserId'] == user_id]
user_genre_preferences = user_ratings.merge(movies_df, on='MovieId')[genres_columns].multiply(user_ratings['Rating'], axis=0).mean()
user_genre_preferences /= user_genre_preferences.sum()

# Faire des recommandations
predicted_ratings = []

for movie_id in movies_df['MovieId'].values:
    try:
        inner_movie_id = trainset.to_inner_iid(movie_id)
        movie_factors = svd_model.qi[inner_movie_id]
        
        latent_rating = np.dot(user_factors, movie_factors)
        genre_vector = movie_genres_dict.get(movie_id, np.zeros(len(genres_columns)))
        genre_score = np.dot(user_genre_preferences, genre_vector)
        
        combined_score = latent_rating + genre_score
        predicted_ratings.append((movie_id, combined_score))
    except:
        pass

recommended_movies = sorted(predicted_ratings, key=lambda x: x[1], reverse=True)[:10]
print(f"Recommandations de films pour l'utilisateur {user_id}: {recommended_movies}")


FileNotFoundError: [Errno 2] No such file or directory: 'path/to/movies.csv'