In [1]:
import os
import pandas as pd

In [2]:
# Load the raw movie-plot dataset from the relative "Resources" directory.
df = pd.read_csv(os.path.join("Resources", "wiki_movie_plots.csv"))

In [3]:
# Remove the columns that are not referenced anywhere else in this notebook.
df = df.drop(columns=['Unnamed: 8', 'Origin/Ethnicity', 'Director', 'Cast'])

In [4]:
# Discard every row whose genre is the literal string 'unknown'.
# The explicit .copy() makes the filtered result an independent DataFrame, so the
# column assignments in later cells do not raise SettingWithCopyWarning.
df = df[df['Genre'] != 'unknown'].copy()

In [5]:
# Rewrite the two multi-word labels before the genres are split on separators:
# reduce_genre keeps only the first word, so these must be single tokens by then.
# Only values that match a key exactly are replaced.
df['Genre'] = df['Genre'].replace(
    to_replace={
        'science fiction': 'science_fiction',
        'film noir': 'crime',
    }
)

In [6]:
# Clean up genres - treat ',', '-', '/' and whitespace as separators, return 1st word only
def reduce_genre(string):
    """Return the first genre word of a raw, possibly multi-genre label.

    Commas, hyphens and slashes count as separators whether or not they are
    surrounded by spaces; any run of whitespace is a separator as well.

    Args:
        string: Raw genre value. It is converted with ``str()`` first, so
            non-string values are accepted.

    Returns:
        The first non-empty word of the label, or ``''`` if there is none.
    """
    text = str(string)
    for separator in (',', '-', '/'):
        text = text.replace(separator, ' ')
    # split() without an argument collapses whitespace runs and ignores leading
    # whitespace, so 'comedy,drama' gives 'comedy' and ' drama' gives 'drama'
    # (instead of 'comedy,drama' and '' respectively).
    words = text.split()
    return words[0] if words else ''

In [7]:
# Collapse every raw 'Genre' label to its leading word, element by element.
df['Genre'] = df['Genre'].map(reduce_genre)

In [8]:
# Fold sub-genres into their parent genre (hand-maintained list).
# The table is written as parent -> sub-genres and inverted into the
# {sub-genre: parent} form that Series.replace expects.
df['Genre'] = df['Genre'].replace({
    subgenre: parent
    for parent, subgenres in {
        'action': ('martial', 'superhero', 'spy'),
        'animation': ('animated', 'anime'),
        'biography': ('biographical', 'biopic'),
        'documentary': ('mockumentary',),
        'drama': ('historical', 'melodrama', 'sport'),
        'mystery': ('suspense',),
        'romance': ('romantic',),
        'science_fiction': ('sci', 'science'),
    }.items()
    for subgenre in subgenres
})

In [9]:
# Annotate each row with the size of its genre group (number of non-null titles
# sharing that genre), then discard every genre that has 300 or fewer rows.
df['genre_count'] = df.groupby('Genre')['Title'].transform('count')
df = df.drop(index=df.loc[df['genre_count'] <= 300].index)

In [10]:
# Show how many rows remain for each genre after the cleaning steps above.
df['Genre'].value_counts()

drama              7139
comedy             5637
action             2303
romance            1944
crime              1623
horror             1391
thriller           1061
western             924
animation           839
science_fiction     786
musical             708
adventure           610
mystery             481
war                 408
family              402
fantasy             326
biography           323
Name: Genre, dtype: int64

In [11]:
# Save the cleaned data into the same "Resources" directory as the raw file.
# to_csv is called with its defaults, so the DataFrame index is written as the
# first column, and the helper 'genre_count' column is included as well.
df.to_csv(os.path.join("Resources", "wiki_movie_plots_CLEANED.csv"))