In [110]:
import pandas as pd

from math import sqrt


### First, we get familiar with the MovieLens dataset

In [84]:
# Load the movie catalogue (columns seen in the preview: movieId, title, genres).
movies_data = pd.read_csv(filepath_or_buffer=r"E:\MAID\ADSA_pro\movies.csv")
# Print (n_rows, n_columns) as a quick sanity check on the load.
print(movies_data.shape)
# Display the first seven rows as the cell output.
movies_data.head(n=7)

(62423, 3)


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance


In [85]:
# Load the user ratings (columns seen in the preview: userId, movieId, rating, timestamp).
ratings_data = pd.read_csv(filepath_or_buffer=r"E:\MAID\ADSA_pro\ratings.csv")
# Print (n_rows, n_columns) as a quick sanity check on the load.
print(ratings_data.shape)
# Display the first five rows as the cell output.
ratings_data.head(n=5)

(25000095, 4)


Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [86]:
# Load the free-text tags users attached to movies
# (columns seen in the preview: userId, movieId, tag, timestamp).
tags = pd.read_csv(filepath_or_buffer=r"E:\MAID\ADSA_pro\tags.csv")
# Print (n_rows, n_columns) as a quick sanity check on the load.
print(tags.shape)
# Display the first seven rows as the cell output.
tags.head(n=7)

(1093360, 4)


Unnamed: 0,userId,movieId,tag,timestamp
0,3,260,classic,1439472355
1,3,260,sci-fi,1439472256
2,4,1732,dark comedy,1573943598
3,4,1732,great dialogue,1573943604
4,4,7569,so bad it's good,1573943455
5,4,44665,unreliable narrators,1573943619
6,4,115569,tense,1573943077


In [87]:
# Load the external-identifier table (columns seen in the preview: movieId, imdbId, tmdbId).
links = pd.read_csv(filepath_or_buffer=r"E:\MAID\ADSA_pro\links.csv")
# Print (n_rows, n_columns) as a quick sanity check on the load.
print(links.shape)
# Display the first seven rows as the cell output.
links.head(n=7)

(62423, 3)


Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0
5,6,113277,949.0
6,7,114319,11860.0


In [88]:
# Load the tag-genome vocabulary (columns seen in the preview: tagId, tag).
genome_tags = pd.read_csv(filepath_or_buffer=r"E:\MAID\ADSA_pro\genome-tags.csv")
# Print (n_rows, n_columns) as a quick sanity check on the load.
print(genome_tags.shape)
# Display the first seven rows as the cell output.
genome_tags.head(n=7)

(1128, 2)


Unnamed: 0,tagId,tag
0,1,007
1,2,007 (series)
2,3,18th century
3,4,1920s
4,5,1930s
5,6,1950s
6,7,1960s


In [89]:
# Load the tag-genome relevance scores (columns seen in the preview: movieId, tagId, relevance).
genome_scores = pd.read_csv(filepath_or_buffer=r"E:\MAID\ADSA_pro\genome-scores.csv")
# Print (n_rows, n_columns) as a quick sanity check on the load.
print(genome_scores.shape)
# Display the first seven rows as the cell output.
genome_scores.head(n=7)

(15584448, 3)


Unnamed: 0,movieId,tagId,relevance
0,1,1,0.02875
1,1,2,0.02375
2,1,3,0.0625
3,1,4,0.07575
4,1,5,0.14075
5,1,6,0.14675
6,1,7,0.0635


### Performing data preprocessing  

Prepare the data so that it is easy to access and the required tasks are faster to compute:

In [90]:
# Title-sorted copy of the movie catalogue, with movieId stored as a categorical column.
# movies_data itself is left untouched: sort_values and assign both return new frames.
movies_list = (
    movies_data
    .sort_values(by='title')
    .assign(movieId=lambda catalogue: catalogue['movieId'].astype('category'))
)
# Display the first five rows of the sorted catalogue.
movies_list.head(n=5)

Unnamed: 0,movieId,title,genres
62216,208297,"""BLOW THE NIGHT!"" Let's Spend the Night Togeth...",Documentary|Drama
11420,51372,"""Great Performances"" Cats (1998)",Musical
30582,136604,#1 Cheerleader Camp (2010),Comedy|Drama
51787,183901,#Captured (2017),Horror
57250,195955,#Female Pleasure (2018),Documentary


In [91]:
# Attach per-movie rating statistics to the movie catalogue.
#
# The ratings table is grouped by movieId once, producing both the mean rating and the
# number of ratings in a single pass, and the columns are given their final names
# (average_rating, ratings_count) before the join.
#
# pd.merge defaults to an inner join, so movies with no ratings at all do not appear
# in movies_stats. The right-hand frame is indexed by movieId; `on="movieId"` matches
# that index level against the movieId column of movies_data.
movies_stats = pd.merge(
    movies_data,
    ratings_data
    .groupby('movieId')
    .agg({'rating': 'mean', 'userId': 'count'})
    .rename(columns={'rating': 'average_rating', 'userId': 'ratings_count'}),
    on="movieId",
)

In [92]:
# Preview the first five rows of the merged statistics table.
movies_stats.head(n=5)

Unnamed: 0,movieId,title,genres,average_rating,ratings_count
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.893708,57309
1,2,Jumanji (1995),Adventure|Children|Fantasy,3.251527,24228
2,3,Grumpier Old Men (1995),Comedy|Romance,3.142028,11804
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,2.853547,2523
4,5,Father of the Bride Part II (1995),Comedy,3.058434,11714


In [93]:
# Print (n_rows, n_columns) of the merged statistics table.
# NOTE(review): the row count can be smaller than movies_data's; presumably movies
# without any rating are dropped by the (default inner) merge — confirm.
print(movies_stats.shape)

(59047, 5)


In [94]:
# Laplace-smoothed mean rating: 5 + 1 is added to each movie's rating sum
# (average_rating * ratings_count) and 2 is added to its rating count, which pulls
# movies with few ratings toward 3. The raw average_rating column is removed once
# the smoothed `average` column exists.
movies_stats = (
    movies_stats
    .assign(
        average=lambda stats: (
            (stats['average_rating'] * stats['ratings_count']) + 5 + 1
        ) / (stats['ratings_count'] + 2)
    )
    .drop(columns=['average_rating'])
)


In [95]:
# Preview the first five rows now that the smoothed `average` column replaces average_rating.
movies_stats.head(n=5)

Unnamed: 0,movieId,title,genres,ratings_count,average
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,57309,3.893677
1,2,Jumanji (1995),Adventure|Children|Fantasy,24228,3.251506
2,3,Grumpier Old Men (1995),Comedy|Romance,11804,3.142004
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,2523,2.853663
4,5,Father of the Bride Part II (1995),Comedy,11714,3.058424


In [96]:
# Load the ratings and discard the timestamp column.
# The path is a raw string: written as a plain literal it would contain the invalid
# escape sequences \M, \A, \c and \R, which newer Python versions warn about. The
# resulting path is character-for-character the same.
# NOTE(review): if this file holds the same data as the ratings.csv already loaded into
# ratings_data, that frame could be reused instead of reading the CSV again — confirm.
movies_ratings = pd.read_csv(r'E:\MAID\ADSA_pro\code\Ratings.csv').drop(columns='timestamp')

# Keep only the rows whose rating is 4 or higher.
popular_movies = movies_ratings[movies_ratings['rating'] >= 4]




In [109]:
# Print the (n_rows, n_columns) shape of each preprocessed table, one per line,
# in this order: movies_list, popular_movies, movies_stats.
print(movies_list.shape, popular_movies.shape, movies_stats.shape, sep="\n")

(62423, 3)
(12452811, 3)
(59047, 5)


##### Saving the final tables to CSV files for use in data creation

In [None]:
# movies_stats.to_csv('E:\MAID\ADSA_pro\code\movies_stats.csv', encoding='utf-8', index=False)
# popular_movies.to_csv('E:\MAID\ADSA_pro\code\popular_movies.csv', encoding='utf-8', index=False)
# movies_list.to_csv('E:\MAID\ADSA_pro\code\movies_list.csv', encoding='utf-8', index=False)
