In [103]:
import pandas as pd
import numpy as np
from fuzzywuzzy import fuzz
import matplotlib.pyplot as plt
%matplotlib inline
from collections import Counter

In [44]:
movies = pd.read_csv("movie_data.csv")

Dataset column 'genres' is a string of what once appeared to be a list. genre_cleaner transforms this string into an actual list.

In [45]:
def genre_cleaner(row):
    genre_string = row['genres']
    genre_string = genre_string.replace(',','').replace('"','')
    #We lower just in case there is variability in capatalization convention
    genre_list = genre_string.strip("[]").lower().split()
    return genre_list

#Here we reassign genre column. We change the original dataset as there is no reason to keep old format
movies['genres'] = movies.apply(lambda row: genre_cleaner(row), axis = 1)

In [46]:
print(movies.isnull().sum())
print("...",len(movies))

id                        0
title                     0
release_date           2618
box_office_revenue    34617
runtime                6624
genres                    0
summary                   0
dtype: int64
... 42204


Here we see that we have an enormous amount of genres, many of which are for all intents and purposes the same genre just spelled slightly differently. E.g. - "Educational" and "Education", or "Children's" and "Children's/Family".

We must find a way to aggregate genres appropriately. Though it is important to note we will still have a large number of genres which I believe is justified.

In [92]:
genres_list = [genre for genres in movies['genres'] for genre in genres]

unique_genres = list(set(genres_list))
len(unique_genres)

386

We now explore the frequency of each genre and basic statistcs of the counts

In [118]:
genre_counts = Counter(genres_list)
genre_counts_bar = sum(genre_counts.values())/len(genre_counts)

sum_squares = 0
for genre, count in genre_counts.items():
    sum_squares += (count - genre_counts_bar)**2
std_dev = np.sqrt(sum_squares/len(genre_counts))

print("Standard Deviation: ", std_dev)
print("Mean: ", genre_counts_bar)
print("Max: ", max(genre_counts.values()))
print("Min: ", min(genre_counts.values()))

# genre_counts.most_common(100)

Standard Deviation:  2228.19194216
Mean:  560.6813471502591
Max:  24641
Min:  1


We will use unique_genres to create a mapping that will be used to combine like-genres into single categories.

In [83]:
fuzz.partial_ratio("family", "family-oriented")

100

In [119]:
genre_mapper = {genre : [] for genre in unique_genres}

for i in range(len(unique_genres)):
    for j in range(len(unique_genres)):
        if i != j:
            ratio_score = fuzz.ratio(unique_genres[i],unique_genres[j])
            partial_score = fuzz.partial_ratio(unique_genres[i],unique_genres[j])
            if ratio_score >= 85 or partial_score == 100:
                genre_mapper[unique_genres[i]].append(unique_genres[j])
genre_mapper

{'&': [],
 '95': [],
 '[feature': ['[feature]', 'feature'],
 '[feature]': ['[feature', 'feature'],
 '\\u00e0': [],
 'about': ['out'],
 'absurdism': [],
 'acid': [],
 'action': ['action/adventure'],
 'action/adventure': ['action', 'adventure'],
 'adaptation': [],
 'addiction': [],
 'adult': [],
 'adventure': ['action/adventure'],
 'age': ['marriage', 'tragedy', 'language', 'coming-of-age', 'backstage'],
 'airplanes': [],
 'airports': [],
 'albino': ['in'],
 'alien': [],
 'americana': [],
 'and': ['stand-up',
  'sandal',
  'candid',
  'propaganda',
  'black-and-white',
  'gland',
  'netherlands'],
 'animal': ['animals'],
 'animals': ['animal'],
 'animated': [],
 'animation': [],
 'anime': [],
 'anthology': ['anthropology'],
 'anthropology': ['anthology'],
 'anti-war': ['war'],
 'apocalyptic': ['post-apocalyptic'],
 'applied': [],
 'archaeology': [],
 'archives': [],
 'art': ['party', 'arts', 'cartoon', 'martial'],
 'arts': ['art'],
 'auto': [],
 'avant-garde': [],
 'b-movie': ['movie', '

In [72]:
movies['genres'][0]

['space',
 'western',
 'horror',
 'supernatural',
 'thriller',
 'science',
 'fiction',
 'action',
 'adventure']

In [60]:
genre_dict = {}
for i in range(len(movies)):
    current_movie = movies.iloc[i]
    title = current_movie['title']
    revenue = current_movie['box_office_revenue']
    summary = current_movie['summary']
    genres = current_movie['genres']
    
    for genre in genres:
        if len(genre_dict[genre]) < 1:
            genre_dict[genre] = []
            genre_dict[genre].append((title, revenue, summary))
        else:
            genre_dict[genre].append((title, revenue, summary))
    

KeyError: 'space'

In [47]:
movies

Unnamed: 0,id,title,release_date,box_office_revenue,runtime,genres,summary
0,0,Ghosts of Mars,2001-08-24,14010832.0,98.0,"[space, western, horror, supernatural, thrille...","Set in the second half of the 22nd century, th..."
1,1,White Of The Eye,1987,,110.0,"[erotic, thriller, psychological, thriller, th...",A series of murders of rich young women throug...
2,2,A Woman in Flames,1983,,106.0,[drama],"Eva, an upper class housewife, becomes frustra..."
3,3,The Sorcerer's Apprentice,2002,,86.0,"[adventure, fantasy, world, cinema, family, film]","Every hundred years, the evil Morgana returns..."
4,4,Little city,1997-04-04,,93.0,"[romance, film, ensemble, film, comedy-drama, ...","Adam, a San Francisco-based artist who works a..."
5,5,Henry V,1989-11-08,10161099.0,137.0,"[epic, war, film, costume, drama, drama, perio...",{{Plot|dateAct 1Act 2Act 3Act 4Act 5 Finally n...
6,6,Aaah Belinda,1986,,,[comedy],"Serap, a young actress with a strong, lively p..."
7,7,The Mechanical Monsters,,,,"[short, film, science, fiction, family, film, ...",The story starts as one of the robots flies i...
8,8,Mary Poppins,1964-08-27,102272727.0,139.0,"[children's/family, children's, fantasy, comed...",The film opens with Mary Poppins perched in a...
9,9,Die Fahne von Kriwoj Rog,1967,,108.0,[],"Otto Brosowski, a communist miner, writes to t..."


In [5]:
movies

Unnamed: 0,id,title,release_date,box_office_revenue,runtime,genres,summary
0,0,Ghosts of Mars,2001-08-24,14010832.0,98.0,"[""Space western"", ""Horror"", ""Supernatural"", ""T...","Set in the second half of the 22nd century, th..."
1,1,White Of The Eye,1987,,110.0,"[""Erotic thriller"", ""Psychological thriller"", ...",A series of murders of rich young women throug...
2,2,A Woman in Flames,1983,,106.0,"[""Drama""]","Eva, an upper class housewife, becomes frustra..."
3,3,The Sorcerer's Apprentice,2002,,86.0,"[""Adventure"", ""Fantasy"", ""World cinema"", ""Fami...","Every hundred years, the evil Morgana returns..."
4,4,Little city,1997-04-04,,93.0,"[""Romance Film"", ""Ensemble Film"", ""Comedy-dram...","Adam, a San Francisco-based artist who works a..."
5,5,Henry V,1989-11-08,10161099.0,137.0,"[""Epic"", ""War film"", ""Costume drama"", ""Drama"",...",{{Plot|dateAct 1Act 2Act 3Act 4Act 5 Finally n...
6,6,Aaah Belinda,1986,,,"[""Comedy""]","Serap, a young actress with a strong, lively p..."
7,7,The Mechanical Monsters,,,,"[""Short Film"", ""Science Fiction"", ""Family Film...",The story starts as one of the robots flies i...
8,8,Mary Poppins,1964-08-27,102272727.0,139.0,"[""Children's/Family"", ""Children's Fantasy"", ""C...",The film opens with Mary Poppins perched in a...
9,9,Die Fahne von Kriwoj Rog,1967,,108.0,[],"Otto Brosowski, a communist miner, writes to t..."
