In [76]:
import pandas as pd
import os
import matplotlib.pyplot as plt
from sklearn.ensemble import GradientBoostingRegressor

from src.experiments.utils.evaluation import relative_squared_error

# Load the data

In [77]:
base_path = "."
destination_data_h5_path = os.path.join(base_path, "..", "datasets", "MovieLens.h5")
source_path_kaggle = os.path.join(base_path, "data", "MovieLens", "kaggle_case_study")
source_data_ratings = os.path.join(source_path_kaggle, "ratings.csv")
source_data_movies = os.path.join(source_path_kaggle, "movies.csv")
source_data_users = os.path.join(source_path_kaggle, "users.csv")

In [78]:
ratings = pd.read_csv(source_data_ratings, names=['userId', 'movieId', 'rating', 'timestamp'], delimiter="::", encoding='latin1')
print("Shape:", ratings.shape)
ratings.head()

  ratings = pd.read_csv(source_data_ratings, names=['userId', 'movieId', 'rating', 'timestamp'], delimiter="::", encoding='latin1')


Shape: (1000209, 4)


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [79]:
movies = pd.read_csv(source_data_movies, names=['movieId', 'title', 'genres'], delimiter="::", encoding='latin1')
print("Shape:", movies.shape)
movies.head()

Shape: (3883, 3)


  movies = pd.read_csv(source_data_movies, names=['movieId', 'title', 'genres'], delimiter="::", encoding='latin1')


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [80]:
users = pd.read_csv(source_data_users, names=['userId', 'gender', 'age', 'occupation', 'zipcode'], delimiter="::", encoding='latin1')
print("Shape:", users.shape)
users.head()

Shape: (6040, 5)


  users = pd.read_csv(source_data_users, names=['userId', 'gender', 'age', 'occupation', 'zipcode'], delimiter="::", encoding='latin1')


Unnamed: 0,userId,gender,age,occupation,zipcode
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [81]:
df_merged = movies.merge(ratings,on='movieId',how='inner')
print("Shape:", df_merged.shape)
df_merged.head()

Shape: (1000209, 6)


Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Animation|Children's|Comedy,1,5,978824268
1,1,Toy Story (1995),Animation|Children's|Comedy,6,4,978237008
2,1,Toy Story (1995),Animation|Children's|Comedy,8,4,978233496
3,1,Toy Story (1995),Animation|Children's|Comedy,9,5,978225952
4,1,Toy Story (1995),Animation|Children's|Comedy,10,5,978226474


In [82]:
df_merged = df_merged.merge(users,on='userId',how='inner')
print("Shape:", df_merged.shape)
df_merged.head()

Shape: (1000209, 10)


Unnamed: 0,movieId,title,genres,userId,rating,timestamp,gender,age,occupation,zipcode
0,1,Toy Story (1995),Animation|Children's|Comedy,1,5,978824268,F,1,10,48067
1,1,Toy Story (1995),Animation|Children's|Comedy,6,4,978237008,F,50,9,55117
2,1,Toy Story (1995),Animation|Children's|Comedy,8,4,978233496,M,25,12,11413
3,1,Toy Story (1995),Animation|Children's|Comedy,9,5,978225952,M,25,17,61614
4,1,Toy Story (1995),Animation|Children's|Comedy,10,5,978226474,F,35,1,95370


# Feature engineering

In [83]:
genres = df_merged['genres'].str.split("|")
listgenres = set()
for genre in genres:
    listgenres = listgenres.union(set(genre))

print("There are", len(listgenres), "genres")
listgenres

There are 18 genres


{'Action',
 'Adventure',
 'Animation',
 "Children's",
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western'}

In [84]:
genresOneHot = df_merged['genres'].str.get_dummies("|")
genresOneHot.head()

Unnamed: 0,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [85]:
df_merged = pd.concat([df_merged, genresOneHot], axis=1)
df_merged.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp,gender,age,occupation,zipcode,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),Animation|Children's|Comedy,1,5,978824268,F,1,10,48067,...,0,0,0,0,0,0,0,0,0,0
1,1,Toy Story (1995),Animation|Children's|Comedy,6,4,978237008,F,50,9,55117,...,0,0,0,0,0,0,0,0,0,0
2,1,Toy Story (1995),Animation|Children's|Comedy,8,4,978233496,M,25,12,11413,...,0,0,0,0,0,0,0,0,0,0
3,1,Toy Story (1995),Animation|Children's|Comedy,9,5,978225952,M,25,17,61614,...,0,0,0,0,0,0,0,0,0,0
4,1,Toy Story (1995),Animation|Children's|Comedy,10,5,978226474,F,35,1,95370,...,0,0,0,0,0,0,0,0,0,0


In [86]:
df_merged[["title","year"]] = df_merged['title'].str.extract(r"^(.*)\s\((\d{4})\)$",expand=True)
# df_merged = df_merged.drop(columns=["title"])
df_merged.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp,gender,age,occupation,zipcode,...,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,year
0,1,Toy Story,Animation|Children's|Comedy,1,5,978824268,F,1,10,48067,...,0,0,0,0,0,0,0,0,0,1995
1,1,Toy Story,Animation|Children's|Comedy,6,4,978237008,F,50,9,55117,...,0,0,0,0,0,0,0,0,0,1995
2,1,Toy Story,Animation|Children's|Comedy,8,4,978233496,M,25,12,11413,...,0,0,0,0,0,0,0,0,0,1995
3,1,Toy Story,Animation|Children's|Comedy,9,5,978225952,M,25,17,61614,...,0,0,0,0,0,0,0,0,0,1995
4,1,Toy Story,Animation|Children's|Comedy,10,5,978226474,F,35,1,95370,...,0,0,0,0,0,0,0,0,0,1995


In [87]:
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000209 entries, 0 to 1000208
Data columns (total 29 columns):
 #   Column       Non-Null Count    Dtype 
---  ------       --------------    ----- 
 0   movieId      1000209 non-null  int64 
 1   title        1000209 non-null  object
 2   genres       1000209 non-null  object
 3   userId       1000209 non-null  int64 
 4   rating       1000209 non-null  int64 
 5   timestamp    1000209 non-null  int64 
 6   gender       1000209 non-null  object
 7   age          1000209 non-null  int64 
 8   occupation   1000209 non-null  int64 
 9   zipcode      1000209 non-null  object
 10  Action       1000209 non-null  int64 
 11  Adventure    1000209 non-null  int64 
 12  Animation    1000209 non-null  int64 
 13  Children's   1000209 non-null  int64 
 14  Comedy       1000209 non-null  int64 
 15  Crime        1000209 non-null  int64 
 16  Documentary  1000209 non-null  int64 
 17  Drama        1000209 non-null  int64 
 18  Fantasy      1000209 n

In [88]:
# df_merged['year'] = df_merged['year'].astype(int)
# df_merged['movie_age'] = 2024 - df_merged.Year
# df_merged.head()

In [89]:
df_merged['gender'] = df_merged['gender'].replace('F','1')
df_merged['gender'] = df_merged['gender'].replace('M','0')
df_merged['gender'] = df_merged['gender'].astype(int)


In [90]:
print("Columns:", df_merged.columns)

Columns: Index(['movieId', 'title', 'genres', 'userId', 'rating', 'timestamp', 'gender',
       'age', 'occupation', 'zipcode', 'Action', 'Adventure', 'Animation',
       'Children's', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
       'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi',
       'Thriller', 'War', 'Western', 'year'],
      dtype='object')


In [91]:
X = df_merged.drop(columns=['genres','title','year','timestamp'])
# X = df_merged.drop(columns=['genres', 'title', 'timestamp'])
y = X.pop('rating')

# Build and test model

In [92]:
first_1000 = X.head(1000)
y_first_1000 = y.head(1000)

In [94]:
first_1000 = first_1000[['movieId','age','occupation']].values

In [95]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(first_1000,y_first_1000,test_size=0.33,random_state=0)

In [96]:
from sklearn.tree import DecisionTreeClassifier

# DecisionTreeClassifier
decision_tree = DecisionTreeClassifier()
decision_tree.fit(x_train, y_train)
Y_pred = decision_tree.predict(x_test)
acc_decision_tree = round(decision_tree.score(x_train, y_train) * 100, 2)
acc_decision_tree

53.58

In [None]:

from sklearn.ensemble import GradientBoostingRegressor

# DecisionTreeClassifier
decision_tree = GradientBoostingRegressor()
decision_tree.fit(x_train, y_train)
Y_pred = decision_tree.predict(x_test)
# acc_decision_tree = round(decision_tree.score(x_train, y_train) * 100, 2)
# acc_decision_tree


In [None]:
from src.experiments.utils.evaluation import symmetric_mean_absolute_percentage_error, relative_squared_error
from sklearn.metrics import mean_absolute_percentage_error
rse = relative_squared_error(y_test, Y_pred)
mape = mean_absolute_percentage_error(y_test, Y_pred)
smape = symmetric_mean_absolute_percentage_error(y_test, Y_pred)
rse, mape, smape