# Association rules - preprocessing

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

# Read the preprocessed titles table from the shared preprocessing directory.
TITLES_PREPROCESSED_CSV = '../../preprocessing/titles_preprocessed.csv'
titles = pd.read_csv(TITLES_PREPROCESSED_CSV)

In [2]:
# The `genres` column holds each title's genres as text that looks like a
# Python list (the parsing below assumes the form "['action', 'drama']").
# Remove the list punctuation and split on ", " so every title gets a list of
# genre names. A value with no genres (e.g. "[]") becomes [''], i.e. a single
# empty-string genre.
genre_lists = (
    titles["genres"]
    .str.replace("[", "", regex=False)
    .str.replace("]", "", regex=False)
    .str.replace("'", "", regex=False)
    .str.split(", ")
)

# One column per list position: genre_0, genre_1, ... Titles with fewer genres
# than the longest list get missing values in the trailing columns.
genre_df = pd.DataFrame(genre_lists.tolist(), index=titles.index).add_prefix("genre_")

# Replace the raw text column with the expanded genre_* columns.
titles = titles.drop(columns=["genres"]).join(genre_df)

In [3]:
# Keep only the primary genre (genre_0). The secondary genre columns are
# selected by name pattern instead of being listed by hand, so this cell keeps
# working when the longest genre list in the data does not have exactly nine
# entries (a fixed list would raise KeyError or leave columns behind).
secondary_genre_cols = titles.filter(regex=r"^genre_[1-9]\d*$").columns
titles = titles.drop(columns=secondary_genre_cols)

In [4]:
# Display the table to inspect it after the genre columns were reduced to genre_0.
titles

Unnamed: 0,id,title,type,description,release_year,runtime,production_countries,seasons,imdb_id,imdb_score,imdb_votes,tmdb_popularity,genre_0
0,tm19248,The General,MOVIE,"During America’s Civil War, Union spies steal ...",1926,78,['US'],0.0,tt0017925,8.200000,89766.000000,8.647,action
1,tm83884,His Girl Friday,MOVIE,"Hildy, the journalist former wife of newspaper...",1940,92,['US'],0.0,tt0032599,7.800000,57835.000000,11.270,comedy
2,tm19424,Detour,MOVIE,"The life of Al Roberts, a pianist in a New Yor...",1945,66,['US'],0.0,tt0037638,7.300000,17233.000000,7.757,thriller
3,tm112005,Marihuana,MOVIE,A young girl named Burma attends a beach party...,1936,57,['US'],0.0,tt0026683,4.000000,864.000000,3.748,crime
4,tm22806,Intolerance: Love's Struggle Throughout the Ages,MOVIE,"The story of a poor young woman, separated by ...",1916,197,['US'],0.0,tt0006864,7.700000,15242.000000,9.412,history
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2652,tm975981,Cher & the Loneliest Elephant,MOVIE,"""The World's Loneliest Elephant"" Kaavan will f...",2021,46,"['KH', 'PK', 'GB', 'US']",0.0,tt14303400,7.600000,237.000000,1.758,documentation
2653,ts296262,Searching for Secrets,SHOW,Journey back in time to uncover buried tales i...,2021,44,"['GB', 'US']",1.0,tt18259488,6.046369,22272.758145,0.600,documentation
2654,tm1106415,Destination Porto: The Unimaginable Journey,MOVIE,Follow soccer journalist Guillem Belagué as he...,2021,85,[],0.0,tt15430722,7.700000,8.000000,0.600,documentation
2655,ts305329,House Calls with Dr. Phil,SHOW,The talk show legend travels across the countr...,2021,43,['US'],1.0,tt15176234,5.400000,35.000000,0.600,reality


In [5]:
# Sanity check: show any titles whose primary genre (genre_0) is missing.
missing_primary_genre = titles["genre_0"].isna()
titles[missing_primary_genre]

Unnamed: 0,id,title,type,description,release_year,runtime,production_countries,seasons,imdb_id,imdb_score,imdb_votes,tmdb_popularity,genre_0


In [6]:
# Columns that are not used in the remainder of this notebook.
unused_columns = [
    "description",
    "production_countries",
    "imdb_id",
    "imdb_votes",
    "tmdb_popularity",
    "id",
]
titles = titles.drop(columns=unused_columns)

In [7]:
# Discard every row that has a missing value in any of the remaining columns,
# then preview the first five rows.
titles = titles.dropna(axis="index", how="any")
titles.head(5)

Unnamed: 0,title,type,release_year,runtime,seasons,imdb_score,genre_0
0,The General,MOVIE,1926,78,0.0,8.2,action
1,His Girl Friday,MOVIE,1940,92,0.0,7.8,comedy
2,Detour,MOVIE,1945,66,0.0,7.3,thriller
3,Marihuana,MOVIE,1936,57,0.0,4.0,crime
4,Intolerance: Love's Struggle Throughout the Ages,MOVIE,1916,197,0.0,7.7,history


In [8]:
# Give the primary-genre column a plain name. The result is re-bound instead
# of renaming in place: `inplace=True` brings no performance benefit, prevents
# method chaining, and mutates the existing frame object.
titles = titles.rename(columns={'genre_0': 'genre'})

In [9]:
# Encode the `type` and `genre` text columns as integer codes, stored in new
# `encoded_type` and `encoded_genre` columns.
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

# The same encoder object is re-fitted for each column in turn, so after this
# cell it holds the classes of the last column processed (`genre`).
for source_column in ("type", "genre"):
    titles["encoded_" + source_column] = label_encoder.fit_transform(titles[source_column])

titles.head()

Unnamed: 0,title,type,release_year,runtime,seasons,imdb_score,genre,encoded_type,encoded_genre
0,The General,MOVIE,1926,78,0.0,8.2,action,0,1
1,His Girl Friday,MOVIE,1940,92,0.0,7.8,comedy,0,3
2,Detour,MOVIE,1945,66,0.0,7.3,thriller,0,16
3,Marihuana,MOVIE,1936,57,0.0,4.0,crime,0,4
4,Intolerance: Love's Struggle Throughout the Ages,MOVIE,1916,197,0.0,7.7,history,0,9


In [10]:
# Number of distinct genre codes present in the table.
n = len(titles["encoded_genre"].unique())

# Drop everything except the genre columns so the name/code mapping can be
# inspected on its own.
non_genre_columns = [
    "title",
    "type",
    "release_year",
    "runtime",
    "seasons",
    "imdb_score",
    "encoded_type",
]
titles_genres = titles.drop(columns=non_genre_columns)

In [11]:
# Print which genre name(s) each integer code stands for.
for code in range(n):
    has_code = titles_genres["encoded_genre"] == code
    genre_names = titles_genres.loc[has_code, "genre"].unique()
    print("Genre: ", genre_names, ", Encoded genre: ", code)

Genre:  [''] , Encoded genre:  0
Genre:  ['action'] , Encoded genre:  1
Genre:  ['animation'] , Encoded genre:  2
Genre:  ['comedy'] , Encoded genre:  3
Genre:  ['crime'] , Encoded genre:  4
Genre:  ['documentation'] , Encoded genre:  5
Genre:  ['drama'] , Encoded genre:  6
Genre:  ['family'] , Encoded genre:  7
Genre:  ['fantasy'] , Encoded genre:  8
Genre:  ['history'] , Encoded genre:  9
Genre:  ['horror'] , Encoded genre:  10
Genre:  ['music'] , Encoded genre:  11
Genre:  ['reality'] , Encoded genre:  12
Genre:  ['romance'] , Encoded genre:  13
Genre:  ['scifi'] , Encoded genre:  14
Genre:  ['sport'] , Encoded genre:  15
Genre:  ['thriller'] , Encoded genre:  16
Genre:  ['war'] , Encoded genre:  17
Genre:  ['western'] , Encoded genre:  18


In [12]:
# Drop the titles whose genre is the empty string (no genre was listed).
# The filter tests the genre text itself instead of `encoded_genre != 0`:
# code 0 only means "empty genre" when an empty string is present in the data,
# otherwise code 0 is a real genre and would be removed by mistake.
titles = titles[titles["genre"] != ""]
titles

Unnamed: 0,title,type,release_year,runtime,seasons,imdb_score,genre,encoded_type,encoded_genre
0,The General,MOVIE,1926,78,0.0,8.200000,action,0,1
1,His Girl Friday,MOVIE,1940,92,0.0,7.800000,comedy,0,3
2,Detour,MOVIE,1945,66,0.0,7.300000,thriller,0,16
3,Marihuana,MOVIE,1936,57,0.0,4.000000,crime,0,4
4,Intolerance: Love's Struggle Throughout the Ages,MOVIE,1916,197,0.0,7.700000,history,0,9
...,...,...,...,...,...,...,...,...,...
2652,Cher & the Loneliest Elephant,MOVIE,2021,46,0.0,7.600000,documentation,0,5
2653,Searching for Secrets,SHOW,2021,44,1.0,6.046369,documentation,1,5
2654,Destination Porto: The Unimaginable Journey,MOVIE,2021,85,0.0,7.700000,documentation,0,5
2655,House Calls with Dr. Phil,SHOW,2021,43,1.0,5.400000,reality,1,12


In [13]:
# The text columns are now represented by their encoded counterparts, and
# seasons/runtime are not kept; remove all of them from the table.
columns_to_discard = ["genre", "type", "title", "seasons", "runtime"]
titles = titles.drop(columns=columns_to_discard)

In [14]:
# Preview the first five rows of the final table.
titles.head(n=5)

Unnamed: 0,release_year,imdb_score,encoded_type,encoded_genre
0,1926,8.2,0,1
1,1940,7.8,0,3
2,1945,7.3,0,16
3,1936,4.0,0,4
4,1916,7.7,0,9


In [15]:
# Write the final table to titles.csv, leaving out the DataFrame index.
titles.to_csv(path_or_buf='titles.csv', index=False)