# Movie genres, titles and user ratings Association Rule Mining

## Preparation
Inserting data and importing all necessary libraries.

In [9]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
import nltk
from nltk.corpus import stopwords

nltk.data.path.append('D:\\Environment\\nltk_data')

In [10]:
movies = pd.read_csv("movies.csv")
ratings = pd.read_csv("ratings.csv")

In [11]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [12]:
ratings.drop('timestamp', axis=1, inplace=True)
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


## First test - are movies genres associated?

### Preparation of data

In [13]:
binarizer = MultiLabelBinarizer()
genres = [i.split('|') for i in movies.genres]
genres_bin = binarizer.fit_transform(genres)
genres_pd = pd.DataFrame(genres_bin, columns = binarizer.classes_)
genres_pd = genres_pd.applymap(lambda x: True if x == 1 else False)

  genres_pd = genres_pd.applymap(lambda x: True if x == 1 else False)


### Output

In [14]:
frequent_itemsets = apriori(genres_pd, min_support=0.005, use_colnames=True)
association_rules(frequent_itemsets, metric='confidence', min_threshold=0.7)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(War),(Drama),0.039212,0.447649,0.030589,0.780105,1.742669,0.013036,2.51188,0.44356
1,"(Action, Children)",(Adventure),0.007904,0.129645,0.005543,0.701299,5.409384,0.004518,2.913798,0.82163
2,"(Adventure, Thriller)",(Action),0.020427,0.187641,0.016116,0.788945,4.20454,0.012283,3.849034,0.778055
3,"(Action, War)",(Drama),0.013139,0.447649,0.009341,0.710938,1.588157,0.003459,1.910835,0.37527
4,"(Sci-Fi, IMAX)",(Action),0.006364,0.187641,0.005235,0.822581,4.383797,0.004041,4.57875,0.776831
5,"(Action, Mystery)",(Thriller),0.008417,0.194416,0.005954,0.707317,3.638164,0.004317,2.752412,0.731291
6,"(Musical, Animation)",(Children),0.007083,0.068158,0.005748,0.811594,11.907456,0.005266,4.945928,0.922553
7,"(Mystery, Horror)",(Thriller),0.013652,0.194416,0.010675,0.781955,4.022072,0.008021,3.694575,0.761772
8,"(Adventure, Animation, Comedy)",(Children),0.011907,0.068158,0.00893,0.75,11.003765,0.008119,3.727366,0.920078
9,"(Crime, Drama, Mystery)",(Thriller),0.00893,0.194416,0.006467,0.724138,3.724684,0.004731,2.920242,0.738113


## Second test - are there associations between words of the title and the genres of a movie?

### Preparation of data

In [15]:
def transformTokens(token):
    if not token[-1::].isalpha():
        return token[:-1]
    return token

stop_words = stopwords.words('english')
print(stop_words)

titles = movies.title
titles = titles.map(lambda x: [transformTokens(i.strip()).lower() for i in x.strip().split(' ')[:-1]])
for index, title in enumerate(titles):
    titles[index] = [word for word in title if word not in stop_words]
titlesTokenized = pd.DataFrame(binarizer.fit_transform(titles), columns=['empty_char' if i == '' else i for i in binarizer.classes_])
titles

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

0                            [toy, story]
1                               [jumanji]
2                    [grumpier, old, men]
3                       [waiting, exhale]
4               [father, bride, part, ii]
                      ...                
9737      [black, butler, book, atlantic]
9738                   [game, life, zero]
9739                              [flint]
9740    [bungo, stray, dogs, dead, apple]
9741    [andrew, dice, clay, dice, rules]
Name: title, Length: 9742, dtype: object

In [16]:
titlesMergedGenres = pd.concat([genres_pd, titlesTokenized], axis=1)
titlesMergedGenres

Unnamed: 0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,...,åmål,æbler,è,é,échappé,époque,étage,étrange,été,über
0,False,False,True,True,True,True,False,False,False,True,...,0,0,0,0,0,0,0,0,0,0
1,False,False,True,False,True,False,False,False,False,True,...,0,0,0,0,0,0,0,0,0,0
2,False,False,False,False,False,True,False,False,False,False,...,0,0,0,0,0,0,0,0,0,0
3,False,False,False,False,False,True,False,False,True,False,...,0,0,0,0,0,0,0,0,0,0
4,False,False,False,False,False,True,False,False,False,False,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9737,False,True,False,True,False,True,False,False,False,True,...,0,0,0,0,0,0,0,0,0,0
9738,False,False,False,True,False,True,False,False,False,True,...,0,0,0,0,0,0,0,0,0,0
9739,False,False,False,False,False,False,False,False,True,False,...,0,0,0,0,0,0,0,0,0,0
9740,False,True,False,True,False,False,False,False,False,False,...,0,0,0,0,0,0,0,0,0,0


### Output and display option change

In [17]:
frequent_itemsets = apriori(titlesMergedGenres, min_support=0.004, use_colnames=True)
rules = association_rules(frequent_itemsets, metric='confidence', min_threshold=0.4)
rules



Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(Adventure),(Action),0.129645,0.187641,0.062615,0.482977,2.573940,0.038289,1.571224,0.702576
1,(IMAX),(Action),0.016218,0.187641,0.009546,0.588608,3.136879,0.006503,1.974657,0.692442
2,(Sci-Fi),(Action),0.100595,0.187641,0.046294,0.460204,2.452576,0.027419,1.504937,0.658508
3,(ii),(Action),0.010470,0.187641,0.004209,0.401961,2.142178,0.002244,1.358371,0.538827
4,(Animation),(Adventure),0.062718,0.129645,0.025354,0.404255,3.118175,0.017223,1.460953,0.724755
...,...,...,...,...,...,...,...,...,...,...
118,"(Fantasy, Children, Animation)",(Comedy),0.009444,0.385547,0.004003,0.423913,1.099510,0.000362,1.066597,0.091367
119,"(Fantasy, Children, Comedy)",(Animation),0.009546,0.062718,0.004003,0.419355,6.686342,0.003405,1.614208,0.858638
120,"(Fantasy, Animation, Comedy)",(Children),0.006672,0.068158,0.004003,0.600000,8.803012,0.003549,2.329604,0.892356
121,"(Crime, Drama, Mystery)",(Thriller),0.008930,0.194416,0.006467,0.724138,3.724684,0.004731,2.920242,0.738113


### Filtered output

In [18]:
connected_list = []
for row in titles:
    for word in row:
        connected_list.append(word)

In [19]:
mask = rules['antecedents'].apply(lambda x: any(item in connected_list for item in x)) | \
       rules['consequents'].apply(lambda x: any(item in connected_list for item in x))

filtered_rules = rules[mask]

filtered_rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
3,(ii),(Action),0.01047,0.187641,0.004209,0.401961,2.142178,0.002244,1.358371,0.538827
11,(movie),(Animation),0.009546,0.062718,0.004927,0.516129,8.229344,0.004328,1.937049,0.886951
16,(2),(Comedy),0.008622,0.385547,0.004003,0.464286,1.204226,0.000679,1.146979,0.171066
17,(love),(Comedy),0.010675,0.385547,0.006569,0.615385,1.596133,0.002454,1.597577,0.377516
18,(man),(Comedy),0.01745,0.385547,0.007185,0.411765,1.068001,0.000458,1.04457,0.064802
19,(movie),(Comedy),0.009546,0.385547,0.005338,0.55914,1.45025,0.001657,1.393759,0.313456
28,((a.k.a),(Drama),0.009649,0.447649,0.004311,0.446809,0.998122,-8e-06,0.99848,-0.001897
29,(de),(Drama),0.006775,0.447649,0.004824,0.712121,1.590801,0.001792,1.918692,0.373919
30,(la),(Drama),0.007288,0.447649,0.004619,0.633803,1.415847,0.001357,1.508342,0.295865
31,(love),(Drama),0.010675,0.447649,0.00503,0.471154,1.052506,0.000251,1.044445,0.050425


### Display option change to default

In [20]:
pd.set_option('display.max_rows', 10)

## Third test - do genres influence the rating of a movie?

### Preparation of data

In [21]:
df = pd.DataFrame()
min_ratings = [2.0, 3.0, 3.5, 4.0]
for min_rating in min_ratings:
    movieAverageRatingPd = ratings.groupby('movieId').mean().drop(['userId'], axis=1)
    movieAverageRatingPd
    moviesWithGenres = pd.concat([movies.movieId, genres_pd], axis=1)
    genresWithRatings = pd.merge(how='right', on='movieId', left=moviesWithGenres, right=movieAverageRatingPd).drop(['movieId'], axis=1)
    genresWithRatings[f'Average rating better than {min_rating}'] = genresWithRatings['rating'].map(lambda x: x > min_rating)
    genresWithRatings[f'Average rating worse than {min_rating}'] = genresWithRatings['rating'].map(lambda x: x <= min_rating)
    genresWithRatings.drop(['rating'], axis=1, inplace=True)
    frequent_itemsets = apriori(genresWithRatings, min_support=0.1, use_colnames=True)
    rules = association_rules(frequent_itemsets, metric='confidence', min_threshold=0.7)
    df = pd.concat([df, rules])

df.reset_index(drop=True, inplace=True)
pd.set_option('display.max_rows', None)
df

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(Action),(Average rating better than 2.0),0.187988,0.889037,0.163924,0.871991,0.980826,-0.003204,0.866836,-0.023508
1,(Adventure),(Average rating better than 2.0),0.129782,0.889037,0.117339,0.90412,1.016966,0.001958,1.157312,0.019171
2,(Comedy),(Average rating better than 2.0),0.385952,0.889037,0.338955,0.878231,0.987845,-0.004171,0.911253,-0.019646
3,(Crime),(Average rating better than 2.0),0.122995,0.889037,0.111888,0.909699,1.02324,0.002541,1.228808,0.025898
4,(Drama),(Average rating better than 2.0),0.447244,0.889037,0.414439,0.92665,1.042307,0.016822,1.512778,0.073431
5,(Romance),(Average rating better than 2.0),0.163616,0.889037,0.152304,0.930861,1.047044,0.006843,1.604922,0.053719
6,(Thriller),(Average rating better than 2.0),0.194262,0.889037,0.171946,0.885124,0.995599,-0.00076,0.965937,-0.005457
7,(Drama),(Average rating better than 3.0),0.447244,0.622069,0.317565,0.710048,1.14143,0.039348,1.303427,0.22416
8,(Action),(Average rating worse than 3.5),0.187988,0.612505,0.132044,0.702407,1.146777,0.016901,1.302097,0.157622
9,(Action),(Average rating worse than 4.0),0.187988,0.870424,0.172974,0.920131,1.057107,0.009344,1.622366,0.066529


## Fourth test - are there associations between genres liked by users?

### Preparation of data

In [22]:
like_movie_threshold = 4.5
like_genre_threshold = 0.2
didUserLikeMovie = pd.DataFrame(ratings)
didUserLikeMovie['like'] = didUserLikeMovie['rating'].map(lambda x: x>=like_movie_threshold)
didUserLikeMovie.drop(['rating'], axis=1, inplace=True)
didUserLikeMovie = didUserLikeMovie[didUserLikeMovie['like'] == True].drop('like', axis=1)
userLikesGenre = pd.merge(left=didUserLikeMovie, right=moviesWithGenres, how='left', on='movieId').drop(['movieId', '(no genres listed)'], axis=1)
userLikesGenre = userLikesGenre.groupby(by='userId').mean().applymap(lambda x: x > like_genre_threshold)


  userLikesGenre = userLikesGenre.groupby(by='userId').mean().applymap(lambda x: x > like_genre_threshold)


### Output

In [23]:
frequent_itemsets = apriori(userLikesGenre, min_support=0.35, use_colnames=True)
rules = association_rules(frequent_itemsets, metric='confidence', min_threshold=0.75)
pd.set_option('display.max_rows', None)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(Adventure),(Action),0.528428,0.610368,0.431438,0.816456,1.337645,0.108903,2.122823,0.535269
1,(Action),(Drama),0.610368,0.913043,0.543478,0.890411,0.975212,-0.013814,0.793478,-0.061241
2,(Action),(Thriller),0.610368,0.628763,0.464883,0.761644,1.211338,0.081106,1.55749,0.447772
3,(Adventure),(Drama),0.528428,0.913043,0.464883,0.879747,0.963532,-0.017595,0.723112,-0.074296
4,(Comedy),(Drama),0.69398,0.913043,0.625418,0.901205,0.987034,-0.008216,0.88017,-0.04116
5,(Crime),(Drama),0.4699,0.913043,0.453177,0.964413,1.056262,0.024138,2.443478,0.100481
6,(Crime),(Thriller),0.4699,0.628763,0.382943,0.814947,1.296112,0.087488,2.00611,0.430978
7,(Thriller),(Drama),0.628763,0.913043,0.576923,0.917553,1.004939,0.002836,1.054698,0.013239
8,"(Action, Adventure)",(Drama),0.431438,0.913043,0.377926,0.875969,0.959395,-0.015995,0.701087,-0.069283
9,"(Adventure, Drama)",(Action),0.464883,0.610368,0.377926,0.81295,1.331901,0.094177,2.083033,0.46568


## Creators

* Kajetan Sulwiński (ekohachi22)
* Mikołaj Marmurowicz (Mickeyo0o)