# Association rules - preprocessing

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

# Load the preprocessed titles and credits tables (paths are relative to this notebook).
titles_path = '../../preprocessing/titles_preprocessed.csv'
credits_path = '../../preprocessing/credits_preprocessed.csv'

titles = pd.read_csv(titles_path)
credits = pd.read_csv(credits_path)

In [2]:
# The `genres` column holds stringified lists such as "['drama', 'crime']".
# Strip the list punctuation and split on ", " to get one list of genre names per title.
# (Vectorised .str operations replace the former row-wise apply; regex=False makes
# the bracket/quote characters literal on every pandas version.)
genre_lists = (
    titles["genres"]
    .str.replace("[", "", regex=False)
    .str.replace("]", "", regex=False)
    .str.replace("'", "", regex=False)
    .str.split(", ")
)

# Spread the lists into positional columns genre_0, genre_1, ...; shorter lists are
# padded with missing values by the DataFrame constructor.
genre_df = (
    pd.DataFrame(genre_lists.tolist(), index=titles.index)
    .add_prefix("genre_")
    # An empty list "[]" parses to [''] -- turn that empty string into NaN so the
    # later isna() check and dropna() treat "no genre" as missing instead of
    # carrying '' forward as if it were a genre.
    .replace("", np.nan)
)

# Swap the raw `genres` column for the positional genre columns.
titles = titles.drop(columns=["genres"]).join(genre_df)



In [3]:
# Keep only the primary (first-listed) genre. The secondary columns are selected by
# pattern (genre_1, genre_2, ...) rather than by a hard-coded list, so the cell works
# regardless of how many genre columns the split produced and is safe to re-run.
secondary_genre_columns = titles.filter(regex=r"^genre_[1-9]\d*$").columns
titles = titles.drop(columns=secondary_genre_columns)

In [4]:
# Display the titles table as it stands after the secondary genre columns were dropped.
titles

Unnamed: 0,id,title,type,description,release_year,runtime,production_countries,seasons,imdb_id,imdb_score,imdb_votes,tmdb_popularity,genre_0
0,tm19248,The General,MOVIE,"During America’s Civil War, Union spies steal ...",1926,78,['US'],0.0,tt0017925,8.200000,89766.000000,8.647,action
1,tm83884,His Girl Friday,MOVIE,"Hildy, the journalist former wife of newspaper...",1940,92,['US'],0.0,tt0032599,7.800000,57835.000000,11.270,comedy
2,tm19424,Detour,MOVIE,"The life of Al Roberts, a pianist in a New Yor...",1945,66,['US'],0.0,tt0037638,7.300000,17233.000000,7.757,thriller
3,tm112005,Marihuana,MOVIE,A young girl named Burma attends a beach party...,1936,57,['US'],0.0,tt0026683,4.000000,864.000000,3.748,crime
4,tm22806,Intolerance: Love's Struggle Throughout the Ages,MOVIE,"The story of a poor young woman, separated by ...",1916,197,['US'],0.0,tt0006864,7.700000,15242.000000,9.412,history
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2652,tm975981,Cher & the Loneliest Elephant,MOVIE,"""The World's Loneliest Elephant"" Kaavan will f...",2021,46,"['KH', 'PK', 'GB', 'US']",0.0,tt14303400,7.600000,237.000000,1.758,documentation
2653,ts296262,Searching for Secrets,SHOW,Journey back in time to uncover buried tales i...,2021,44,"['GB', 'US']",1.0,tt18259488,6.046369,22272.758145,0.600,documentation
2654,tm1106415,Destination Porto: The Unimaginable Journey,MOVIE,Follow soccer journalist Guillem Belagué as he...,2021,85,[],0.0,tt15430722,7.700000,8.000000,0.600,documentation
2655,ts305329,House Calls with Dr. Phil,SHOW,The talk show legend travels across the countr...,2021,43,['US'],1.0,tt15176234,5.400000,35.000000,0.600,reality


In [5]:
# Show the titles whose primary genre is NaN; an empty frame means genre_0 has no NaN.
missing_primary_genre = titles["genre_0"].isna()
titles[missing_primary_genre]

Unnamed: 0,id,title,type,description,release_year,runtime,production_countries,seasons,imdb_id,imdb_score,imdb_votes,tmdb_popularity,genre_0


In [6]:
# Attach title information to every credit row. A left join keeps all credits;
# credits whose id has no match in `titles` get NaN in the title columns.
merged_df = credits.merge(titles, how='left', on='id')

merged_df.head()

Unnamed: 0,person_id,id,name,character,role,title,type,description,release_year,runtime,production_countries,seasons,imdb_id,imdb_score,imdb_votes,tmdb_popularity,genre_0
0,21174,tm19248,Buster Keaton,Johnny Gray,ACTOR,The General,MOVIE,"During America’s Civil War, Union spies steal ...",1926.0,78.0,['US'],0.0,tt0017925,8.2,89766.0,8.647,action
1,28713,tm19248,Marion Mack,Annabelle Lee,ACTOR,The General,MOVIE,"During America’s Civil War, Union spies steal ...",1926.0,78.0,['US'],0.0,tt0017925,8.2,89766.0,8.647,action
2,28714,tm19248,Glen Cavender,Captain Anderson,ACTOR,The General,MOVIE,"During America’s Civil War, Union spies steal ...",1926.0,78.0,['US'],0.0,tt0017925,8.2,89766.0,8.647,action
3,28715,tm19248,Jim Farley,General Thatcher,ACTOR,The General,MOVIE,"During America’s Civil War, Union spies steal ...",1926.0,78.0,['US'],0.0,tt0017925,8.2,89766.0,8.647,action
4,27348,tm19248,Frederick Vroom,A Southern General,ACTOR,The General,MOVIE,"During America’s Civil War, Union spies steal ...",1926.0,78.0,['US'],0.0,tt0017925,8.2,89766.0,8.647,action


In [7]:
# Remove the columns that are not carried forward; after this only
# name, type, imdb_score and genre_0 remain.
unused_columns = [
    "description",
    "production_countries",
    "imdb_id",
    "imdb_votes",
    "tmdb_popularity",
    "runtime",
    "seasons",
    "id",
    "person_id",
    "release_year",
    "character",
    "role",
    "title",
]
merged_df = merged_df.drop(columns=unused_columns)

In [8]:
# Discard every row that has a missing value in any of the remaining columns.
merged_df = merged_df.dropna(axis=0, how="any")
merged_df.head()

Unnamed: 0,name,type,imdb_score,genre_0
0,Buster Keaton,MOVIE,8.2,action
1,Marion Mack,MOVIE,8.2,action
2,Glen Cavender,MOVIE,8.2,action
3,Jim Farley,MOVIE,8.2,action
4,Frederick Vroom,MOVIE,8.2,action


In [10]:
# Only the primary genre is left, so give its column the plain name `genre`.
merged_df = merged_df.rename(columns={'genre_0': 'genre'})

In [12]:
# Encode the categorical `type` and `genre` columns as integer labels.
from sklearn.preprocessing import LabelEncoder

# Use one encoder per column: refitting a single shared encoder on `genre` would
# overwrite the classes_ learned for `type`, so the type codes could no longer be
# mapped back to their labels (e.g. with inverse_transform).
type_encoder = LabelEncoder()
merged_df['encoded_type'] = type_encoder.fit_transform(merged_df['type'])

genre_encoder = LabelEncoder()
merged_df['encoded_genre'] = genre_encoder.fit_transform(merged_df['genre'])

merged_df.head()

Unnamed: 0,name,type,imdb_score,genre,encoded_type,encoded_genre
0,Buster Keaton,MOVIE,8.2,action,0,1
1,Marion Mack,MOVIE,8.2,action,0,1
2,Glen Cavender,MOVIE,8.2,action,0,1
3,Jim Farley,MOVIE,8.2,action,0,1
4,Frederick Vroom,MOVIE,8.2,action,0,1


In [14]:
# Write the final table to merged.csv in the working directory, without the index column.
output_path = 'merged.csv'
merged_df.to_csv(output_path, index=False)