# Analysis of MovieLens dataset (Beginner'sAnalysis)
https://www.kaggle.com/jneupane12/analysis-of-movielens-dataset-beginner-sanalysis

In [1]:
import sys
print(sys.executable)

E:\ProgramData\Anaconda3\python.exe


## 1. First we import necessary Libaries

In [2]:
import pandas as pd # pandas is a data manipulation library
import numpy as np #provides numerical arrays and functions to manipulate the arrays efficiently
import random
import matplotlib.pyplot as plt # data visualization library
import wordcloud #used to generate world cloud
import time
import datetime

## 2.Reading and Exploring the Data

### Load Movies Data

In [3]:
movies = pd.read_csv("datasets/movies.csv")

In [4]:
movies.sample(5)

Unnamed: 0,movieId,title,genres
4156,5984,"Story of O, The (Histoire d'O) (1975)",Drama|Romance
6717,58839,Leatherheads (2008),Comedy|Drama|Romance
1473,1998,Exorcist II: The Heretic (1977),Horror
9249,155659,Florence Foster Jenkins (2016),Comedy|Drama
6286,47810,"Wicker Man, The (2006)",Horror|Mystery|Thriller


In [5]:
movies.shape, movies.info(), len(movies['movieId'].unique().tolist())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB


((9742, 3), None, 9742)

### Load Ratings Data

In [6]:
ratings = pd.read_csv("datasets/ratings.csv")
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [7]:
ratings.sample(5)

Unnamed: 0,userId,movieId,rating,timestamp
5039,33,34,3.0,939646940
82204,521,852,3.0,852713356
8638,59,647,4.0,953610077
93211,599,1732,5.0,1498456250
48269,313,380,5.0,1030556462


In [8]:
ratings.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,100836.0,100836.0,100836.0,100836.0
mean,326.127564,19435.295718,3.501557,1205946000.0
std,182.618491,35530.987199,1.042529,216261000.0
min,1.0,1.0,0.5,828124600.0
25%,177.0,1199.0,3.0,1019124000.0
50%,325.0,2991.0,3.5,1186087000.0
75%,477.0,8122.0,4.0,1435994000.0
max,610.0,193609.0,5.0,1537799000.0


In [9]:
ratings.shape, len(ratings['movieId'].unique().tolist())

((100836, 4), 9724)

### Load Tags Data

In [10]:
tags = pd.read_csv("datasets/tags.csv")

In [11]:
ratings.shape, len(ratings['movieId'].unique().tolist())

((100836, 4), 9724)

In [12]:
tags.sample(5)

Unnamed: 0,userId,movieId,tag,timestamp
1282,474,1084,1920s,1138137844
2090,474,6228,In Netflix queue,1137201613
992,474,21,Hollywood,1137206178
2574,477,3527,Arnold Schwarzenegger,1269832592
2452,474,34437,In Netflix queue,1137179720


## 3.Cleaning of data

In [13]:
movies.isnull().any()

movieId    False
title      False
genres     False
dtype: bool

In [14]:
ratings.isnull().any()

userId       False
movieId      False
rating       False
timestamp    False
dtype: bool

In [15]:
tags.isnull().any()

userId       False
movieId      False
tag          False
timestamp    False
dtype: bool

In [16]:
# # lets drop null rows
# tags=tags.dropna()

## 4.Data Analysis

In [17]:
# https://stackoverflow.com/a/62968313/2049763
def timestamp_to_date_converter(t):
    # https://realpython.com/python-time-module/
    return datetime.datetime.fromtimestamp(t).strftime("%A, %B %d, %Y %I:%M:%S")

In [18]:
ratings['date'] = ratings['timestamp'].apply(timestamp_to_date_converter)

In [19]:
ratings.sort_values(by=['timestamp'], ascending=True).tail(5)

Unnamed: 0,userId,movieId,rating,timestamp,date
81475,514,187031,2.5,1537674927,"Saturday, September 22, 2018 10:55:27"
81477,514,187595,3.0,1537674946,"Saturday, September 22, 2018 10:55:46"
81336,514,5247,2.5,1537757040,"Sunday, September 23, 2018 09:44:00"
81335,514,5246,1.5,1537757059,"Sunday, September 23, 2018 09:44:19"
81092,514,162,4.0,1537799250,"Monday, September 24, 2018 09:27:30"


In [20]:
s = "01/01/2018"
t = time.mktime(datetime.datetime.strptime(s, "%m/%d/%Y").timetuple())

In [21]:
ratings.loc[ratings.timestamp>t].sort_values(by=['timestamp'], ascending=True).head(5)

Unnamed: 0,userId,movieId,rating,timestamp,date
7181,50,2420,2.5,1514842717,"Monday, January 01, 2018 03:38:37"
7357,50,117529,1.5,1514842744,"Monday, January 01, 2018 03:39:04"
7360,50,122904,1.5,1514891080,"Tuesday, January 02, 2018 05:04:40"
7299,50,72998,2.0,1514891142,"Tuesday, January 02, 2018 05:05:42"
7291,50,63312,2.0,1514891165,"Tuesday, January 02, 2018 05:06:05"


In [22]:
ratings.loc[ratings.timestamp>t].sort_values(by=['timestamp'], ascending=True).shape

(6413, 5)

In [23]:
drama_movies = movies['genres'].str.contains('Drama')
movies[drama_movies].head()

Unnamed: 0,movieId,title,genres
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
10,11,"American President, The (1995)",Comedy|Drama|Romance
13,14,Nixon (1995),Drama
15,16,Casino (1995),Crime|Drama
16,17,Sense and Sensibility (1995),Drama|Romance


In [24]:
drama_movies.shape

(9742,)

In [25]:
tag_search = tags['tag'].str.contains('Dark', case=False)
len(tag_search)

3683

In [26]:
tags[tag_search].sample()

Unnamed: 0,userId,movieId,tag,timestamp
3130,567,57502,dark,1525285567


In [27]:
movies_ratings_data=movies.merge(ratings, on = 'movieId', how = 'inner')
movies_ratings_data.sample(3)

Unnamed: 0,movieId,title,genres,userId,rating,timestamp,date
85021,53125,Pirates of the Caribbean: At World's End (2007),Action|Adventure|Comedy|Fantasy,68,4.5,1198454572,"Sunday, December 23, 2007 06:02:52"
16445,593,"Silence of the Lambs, The (1991)",Crime|Horror|Thriller,477,5.0,1200939621,"Monday, January 21, 2008 12:20:21"
20947,920,Gone with the Wind (1939),Drama|Romance|War,372,3.0,874417022,"Tuesday, September 16, 1997 08:37:02"


In [28]:
#top 25 most rated movies
most_rated = movies_ratings_data.groupby('title').size().sort_values(ascending=False)
most_rated.head(10)

title
Forrest Gump (1994)                          329
Shawshank Redemption, The (1994)             317
Pulp Fiction (1994)                          307
Silence of the Lambs, The (1991)             279
Matrix, The (1999)                           278
Star Wars: Episode IV - A New Hope (1977)    251
Jurassic Park (1993)                         238
Braveheart (1995)                            237
Terminator 2: Judgment Day (1991)            224
Schindler's List (1993)                      220
dtype: int64

In [29]:
movies_ratings_data.loc[movies_ratings_data.userId == 229].sort_values(by='rating', ascending=True)

Unnamed: 0,movieId,title,genres,userId,rating,timestamp,date
6713,253,Interview with the Vampire: The Vampire Chroni...,Drama|Horror,229,3.0,836941791,"Tuesday, July 09, 1996 02:49:51"
9928,353,"Crow, The (1994)",Action|Crime|Fantasy|Thriller,229,3.0,838144000,"Tuesday, July 23, 1996 12:46:40"
11631,410,Addams Family Values (1993),Children|Comedy|Fantasy,229,3.0,836942019,"Tuesday, July 09, 1996 02:53:39"
9701,348,Bullets Over Broadway (1994),Comedy,229,3.0,838143659,"Tuesday, July 23, 1996 12:40:59"
12066,434,Cliffhanger (1993),Action|Adventure|Thriller,229,3.0,836942675,"Tuesday, July 09, 1996 03:04:35"
...,...,...,...,...,...,...,...
4132,150,Apollo 13 (1995),Adventure|Drama|IMAX,229,5.0,836941256,"Tuesday, July 09, 1996 02:40:56"
3475,110,Braveheart (1995),Action|Drama|War,229,5.0,836942019,"Tuesday, July 09, 1996 02:53:39"
15732,589,Terminator 2: Judgment Day (1991),Action|Sci-Fi,229,5.0,836942292,"Tuesday, July 09, 1996 02:58:12"
1726,34,Babe (1995),Children|Drama,229,5.0,836942064,"Tuesday, July 09, 1996 02:54:24"


### Count and Mean of ratings
https://stackoverflow.com/a/41040158

In [30]:
# userId movieId rating
temp_df = ratings.groupby('movieId').agg({'userId':'count', 'rating':'mean'}).rename(
    columns={'userId':'Lens#Rating','rating':'LensAvgRatings'}).reset_index()
temp_df

Unnamed: 0,movieId,Lens#Rating,LensAvgRatings
0,1,215,3.920930
1,2,110,3.431818
2,3,52,3.259615
3,4,7,2.357143
4,5,49,3.071429
...,...,...,...
9719,193581,1,4.000000
9720,193583,1,3.500000
9721,193585,1,3.500000
9722,193587,1,3.500000


In [31]:
movies_avg_ratings = movies.merge(temp_df, on = 'movieId', how = 'inner')
movies_avg_ratings.sample(3)

Unnamed: 0,movieId,title,genres,Lens#Rating,LensAvgRatings
2215,2946,Help! (1965),Comedy|Musical,7,3.357143
878,1173,"Cook the Thief His Wife & Her Lover, The (1989)",Comedy|Drama,11,3.136364
3743,5221,Harrison's Flowers (2000),Drama,1,3.0


### Distribution

In [32]:
import scipy.stats as st

In [35]:
# https://stackoverflow.com/a/51742444/2049763
def get_best_distribution(data):
    dist_names = ["norm", "exponweib", "weibull_max", "weibull_min", "pareto", "genextreme", 'gamma', 'beta', 'rayleigh']
    
    # https://stackoverflow.com/a/16651955/2049763
    # 'alpha', 'anglit', 'arcsine', 'beta', 'betaprime', 'bradford', 'burr', 'cauchy', 'chi', 'chi2', 'cosine', 'dgamma', 
    # 'dweibull', 'erlang', 'expon', 'exponweib', 'exponpow', 'f', 'fatiguelife', 'fisk', 'foldcauchy', 'foldnorm', 
    # 'frechet_r', 'frechet_l', 'genlogistic', 'genpareto', 'genexpon', 'genextreme', 'gausshyper', 'gamma', 'gengamma', 
    # 'genhalflogistic', 'gilbrat', 'gompertz', 'gumbel_r', 'gumbel_l', 'halfcauchy', 'halflogistic', 'halfnorm', 'hypsecant', 'invgamma', 'invgauss', 'invweibull', 'johnsonsb', 'johnsonsu', 'ksone', 'kstwobign', 'laplace', 'logistic', 'loggamma', 'loglaplace', 'lognorm', 'lomax', 'maxwell', 'mielke', 'nakagami', 'ncx2', 'ncf', 'nct', 'norm', 'pareto', 'pearson3', 'powerlaw', 'powerlognorm', 'powernorm', 'rdist', 'reciprocal', 'rayleigh', 'rice', 'recipinvgauss', 'semicircular', 't', 'triang', 'truncexpon', 'truncnorm', 'tukeylambda', 'uniform', 'vonmises', 'wald', 'weibull_min', 'weibull_max', 'wrapcauchy'
    dist_results = []
    params = {}
    for dist_name in dist_names:
        dist = getattr(st, dist_name)
        param = dist.fit(data)

        params[dist_name] = param
        # Applying the Kolmogorov-Smirnov test
        D, p = st.kstest(data, dist_name, args=param)
        print("p value for "+dist_name+" = "+str(p))
        dist_results.append((dist_name, p))

    # select the best fitted distribution
    best_dist, best_p = (max(dist_results, key=lambda item: item[1]))
    # store the name of the best fit and its p value

    print("Best fitting distribution: "+str(best_dist))
    print("Best p value: "+ str(best_p))
    print("Parameters for the best fit: "+ str(params[best_dist]))

    return best_dist, best_p, params[best_dist]

In [36]:
get_best_distribution(movies_ratings_data['rating'])

p value for norm = 0.0
p value for exponweib = 0.0
p value for weibull_max = 0.0
p value for weibull_min = 0.0
p value for pareto = 0.0
p value for genextreme = 0.0
p value for gamma = 0.0


  sk = 2*(b-a)*np.sqrt(a + b + 1) / (a + b + 2) / np.sqrt(a*b)


p value for beta = 0.0
p value for rayleigh = 0.0
Best fitting distribution: norm
Best p value: 0.0
Parameters for the best fit: (3.501556983616962, 1.0425240696180562)


('norm', 0.0, (3.501556983616962, 1.0425240696180562))

## 5. Data Visualization

In [None]:
#define a function that counts the number of times each genre appear:
def count_word(df, ref_col, liste):
    keyword_count = dict()
    for s in liste: keyword_count[s] = 0
    for liste_keywords in df[ref_col].str.split('|'):
        if type(liste_keywords) == float and pd.isnull(liste_keywords): continue
        for s in liste_keywords: 
            if pd.notnull(s): keyword_count[s] += 1
    # convert the dictionary in a list to sort the keywords  by frequency
    keyword_occurences = []
    for k,v in keyword_count.items():
        keyword_occurences.append([k,v])
    keyword_occurences.sort(key = lambda x:x[1], reverse = True)
    return keyword_occurences, keyword_count

In [None]:
def random_color_func(word=None, font_size=None, position=None,
                      orientation=None, font_path=None, random_state=None):
    h = int(360.0 * tone / 255.0)
    s = int(100.0 * 255.0 / 255.0)
    l = int(100.0 * float(random_state.randint(70, 120)) / 255.0)
    return "hsl({}, {}%, {}%)".format(h, s, l)

In [None]:
#here we  make census of the genres:
genre_labels = set()
for s in movies['genres'].str.split('|').values:
    genre_labels = genre_labels.union(set(s))

In [None]:
#counting how many times each of genres occur:
keyword_occurences, dum = count_word(movies, 'genres', genre_labels)
keyword_occurences

In [None]:
#Finally, the result is shown as a wordcloud:
words = dict()
trunc_occurences = keyword_occurences[0:50]
for s in trunc_occurences:
    words[s[0]] = s[1]
tone = 100 # define the color of the words
f, ax = plt.subplots(figsize=(14, 6))
wordcloud = wordcloud.WordCloud(width=550,height=300, background_color='black', 
                      max_words=1628,relative_scaling=0.7,
                      color_func = random_color_func,
                      normalize_plurals=False)
wordcloud.generate_from_frequencies(words)
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()

In [None]:
# lets display the same result in the histogram
fig = plt.figure(1, figsize=(18,13))
ax2 = fig.add_subplot(2,1,2)
y_axis = [i[1] for i in trunc_occurences]
x_axis = [k for k,i in enumerate(trunc_occurences)]
x_label = [i[0] for i in trunc_occurences]

plt.xticks(rotation=85, fontsize = 15)
plt.yticks(fontsize = 15)
plt.xticks(x_axis, x_label)
plt.ylabel("No. of occurences", fontsize = 24, labelpad = 0)

ax2.bar(x_axis, y_axis, align = 'center', color='r')
plt.title("Popularity of Genres",bbox={'facecolor':'k', 'pad':5},color='w',fontsize = 30)

plt.show()

## 6. Regression 

In [None]:
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression

In [None]:
#added some parameters
# https://stackoverflow.com/a/45116022
kf5 = KFold(n_splits = 5, shuffle = True, random_state = 2)

In [None]:
movies_avg_ratings.sample(5)

In [None]:
for train_index, test_index in kf5.split(movies_avg_ratings):
    train_x = np.array(movies_avg_ratings.iloc[train_index]['Lens#Rating']).reshape(-1, 1)
    test_x  = np.array(movies_avg_ratings.iloc[test_index]['Lens#Rating']).reshape(-1, 1)
    
    train_y =  movies_avg_ratings.iloc[train_index]['LensAvgRatings']
    test_y  =  movies_avg_ratings.iloc[test_index]['LensAvgRatings']
    
    reg = LinearRegression().fit(train_x, train_y)
    print(reg.score(train_x, train_y), reg.coef_, reg.intercept_)
    
    pred_y = reg.predict(test_x)
    
    print(reg.score(test_x, test_y))

## 7. Classification 

In [None]:
from sklearn.preprocessing import OneHotEncoder, MultiLabelBinarizer

from sklearn.model_selection import KFold
from sklearn import svm

In [None]:
movies_vote = movies_ratings_data.copy()

movies_vote = movies_vote.drop(['timestamp'], axis=1)
movies_vote.sample(5)

In [None]:
movies_vote['vote'] = movies_vote.apply(lambda row: 1 if row.rating >= 3 else 0, axis=1)
movies_vote.sample(5)

In [None]:
movies_vote = movies_vote.drop(['rating'], axis=1)
movies_vote.sample(5)

In [None]:
genres_set = set()
global_list = []
for rows in movies_vote.itertuples():
    genres_list = rows.genres.split('|')
    global_list.append(genres_list)
    genres_set.update(genres_list)
print(list(genres_set)) 

In [None]:
# Creating MultiLabelBinarizer object
one_hot = MultiLabelBinarizer()
one_hot.fit(global_list)

one_hot.classes_

In [None]:
movies_vote['genres_list'] = movies_vote.apply(lambda row: row.genres.split('|'), axis=1)
movies_vote.sample(5)

In [None]:
movies_vote['genres_one_hot'] = movies_vote.apply(lambda row: one_hot.transform(row.genres_list)[0], axis=1)
movies_vote.sample(5)

In [None]:
movies_vote = movies_vote.drop(['genres', 'genres_list'], axis=1)
movies_vote.sample(5)

### SVM

In [None]:
kf5 = KFold(n_splits = 5, shuffle = True, random_state = 2)

In [None]:
movies_vote.dtypes

In [None]:
movies_vote['feature'] = movies_vote.apply(lambda row: np.append(row.genres_one_hot, row.userId), axis=1)
movies_vote.sample(5)

In [None]:
for rows in movies_vote.itertuples():
    if len(row.feature) != 21:
        print(row)
        break

In [None]:
for train_index, test_index in kf5.split(movies_vote):
    train_x = np.array(movies_vote.iloc[train_index]['feature'])
    test_x  = np.array(movies_vote.iloc[test_index]['feature'])
    
    train_y =  movies_vote.iloc[train_index]['vote']
    test_y  =  movies_vote.iloc[test_index]['vote']
    
    clf = svm.SVC()
    # ValueError: setting an array element with a sequence. 
    # while using SVM in scikit-learn https://stackoverflow.com/a/53303302/2049763
    clf.fit(list(train_x), train_y)    
    pred_y = clf.predict(list(test_x))
    
    print("Accuracy:", round( sum(pred_y == test_y) * 100 / len(test_y) , 3))