# Imports

In [1]:
import pandas as pd
import re

import unicodedata
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

import wrangle as w
import explore as e
import modeling as m

# Acquire

* Data aquired from [Kaggle](https://www.kaggle.com/satpreetmakhija/netflix-movies-and-tv-shows-2021) on 2/04/2022
* Each observation represents on movie or film series 
* Because the distinction between a single movie an a multi-part series is not relevant to this study I will be referring to each as a movie for the duration of this study
* The original data set had 5967 rows and 13 columns

In [2]:
# read the csv into pandas
df = pd.read_csv('netflixdata.csv')
df.shape

(5967, 13)

# Prepare

In [None]:
df = prep_movie_info(df)
df.head()

In [3]:
df.columns

Index(['Show Id', 'Title', 'Description', 'Director', 'Genres', 'Cast',
       'Production Country', 'Release Date', 'Rating', 'Duration',
       'Imdb Score', 'Content Type', 'Date Added'],
      dtype='object')

In [4]:
df = df[['Description','Genres']]
df.head()

Unnamed: 0,Description,Genres
0,This docuseries takes a deep dive into the luc...,Reality TV
1,"As a grisly virus rampages a city, a lone man ...","Horror Movies, International Movies, Thrillers"
2,"Through her diary, Anne Frank's story is retol...","Documentaries, International Movies"
3,Kenya Barris and his family navigate relations...,TV Comedies
4,This pawesome documentary explores how our fel...,"Documentaries, International Movies"


In [5]:
# Lowercase all of the letters in both columns
df['Description'] = df['Description'].apply(lambda value: value.lower())
df['Genres'] = df['Genres'].apply(lambda value: value.lower())
df.head()

Unnamed: 0,Description,Genres
0,this docuseries takes a deep dive into the luc...,reality tv
1,"as a grisly virus rampages a city, a lone man ...","horror movies, international movies, thrillers"
2,"through her diary, anne frank's story is retol...","documentaries, international movies"
3,kenya barris and his family navigate relations...,tv comedies
4,this pawesome documentary explores how our fel...,"documentaries, international movies"


In [6]:
# rename columns
df.rename(columns={'Description':'description', 'Genres':'genre'}, inplace = True)
df.head()

Unnamed: 0,description,genre
0,this docuseries takes a deep dive into the luc...,reality tv
1,"as a grisly virus rampages a city, a lone man ...","horror movies, international movies, thrillers"
2,"through her diary, anne frank's story is retol...","documentaries, international movies"
3,kenya barris and his family navigate relations...,tv comedies
4,this pawesome documentary explores how our fel...,"documentaries, international movies"


In [7]:
# remove non-ascii characters from description text 
df['description'] = df['description'].apply(lambda value: unicodedata.normalize('NFKD', value)\
                                                                     .encode('ascii', 'ignore')\
                                                                     .decode('utf-8', 'ignore'))
df.head()

Unnamed: 0,description,genre
0,this docuseries takes a deep dive into the luc...,reality tv
1,"as a grisly virus rampages a city, a lone man ...","horror movies, international movies, thrillers"
2,"through her diary, anne frank's story is retol...","documentaries, international movies"
3,kenya barris and his family navigate relations...,tv comedies
4,this pawesome documentary explores how our fel...,"documentaries, international movies"


In [8]:
# remove special characters from description text
df['description'] = df['description'].apply(lambda value: re.sub(r"[^a-z0-9\s]", '', value))

df.head()

Unnamed: 0,description,genre
0,this docuseries takes a deep dive into the luc...,reality tv
1,as a grisly virus rampages a city a lone man s...,"horror movies, international movies, thrillers"
2,through her diary anne franks story is retold ...,"documentaries, international movies"
3,kenya barris and his family navigate relations...,tv comedies
4,this pawesome documentary explores how our fel...,"documentaries, international movies"


In [9]:
# create tokenizer object
tokenizer = nltk.tokenize.ToktokTokenizer()

# tokenize text in description
df['description'] = df['description'].apply(lambda value: tokenizer.tokenize(value, return_str=True))

df.head()

Unnamed: 0,description,genre
0,this docuseries takes a deep dive into the luc...,reality tv
1,as a grisly virus rampages a city a lone man s...,"horror movies, international movies, thrillers"
2,through her diary anne franks story is retold ...,"documentaries, international movies"
3,kenya barris and his family navigate relations...,tv comedies
4,this pawesome documentary explores how our fel...,"documentaries, international movies"


In [10]:
wnl = nltk.stem.WordNetLemmatizer()

def lemmatizer(value):
    '''Takes in a value from a pandas column and returns the value lemmatized'''
    
    # get list of lemmatized words in value
    value_lemmas = [wnl.lemmatize(word) for word in value.split()]
    
    # turn list or words back into a string and return value
    return ' '.join(value_lemmas)

In [11]:
# lemmatize the text in description
df['description'] = df['description'].apply(lambda value: lemmatizer(value))

df.head()

Unnamed: 0,description,genre
0,this docuseries take a deep dive into the lucr...,reality tv
1,a a grisly virus rampage a city a lone man sta...,"horror movies, international movies, thrillers"
2,through her diary anne frank story is retold a...,"documentaries, international movies"
3,kenya barris and his family navigate relations...,tv comedies
4,this pawesome documentary explores how our fel...,"documentaries, international movies"


In [12]:
def remove_stopwords(value):
    
    # get list english language stopwords list from nlt
    stopword_list = stopwords.words('english')
    
    # split words in pandas value into a list and remove words from the list that are in stopwords
    value_words = value.split()
    filtered_list = [word for word in value_words if word not in stopword_list]
    
    # convert list back into string and return value
    return ' '.join(filtered_list)

In [13]:
# remove stopwords from text in description
df['description'] = df['description'].apply(lambda value: remove_stopwords(value))

df.head()

Unnamed: 0,description,genre
0,docuseries take deep dive lucrative wellness i...,reality tv
1,grisly virus rampage city lone man stay locked...,"horror movies, international movies, thrillers"
2,diary anne frank story retold alongside five h...,"documentaries, international movies"
3,kenya barris family navigate relationship race...,tv comedies
4,pawesome documentary explores feline friend be...,"documentaries, international movies"


In [14]:
def remove_cinima_type(value):
    '''Take in genre text from a pandas column
       Remove text indicating standalone movie or series
       return remainder of the text'''
     
    value = value.replace(' tv','') 
    value = value.replace('tv ','')
    value = value.replace(' shows','')
    value = value.replace(' movies','')
    value = value.replace(' series','')
    value = value.replace(' features','')
    
    value = [genre.strip() for genre in value.split(',')]
    
    return ','.join(value)

In [15]:
df['genre'] = df['genre'].apply(lambda value: remove_cinima_type(value))

df.head()

Unnamed: 0,description,genre
0,docuseries take deep dive lucrative wellness i...,reality
1,grisly virus rampage city lone man stay locked...,"horror,international,thrillers"
2,diary anne frank story retold alongside five h...,"documentaries,international"
3,kenya barris family navigate relationship race...,comedies
4,pawesome documentary explores feline friend be...,"documentaries,international"


In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5967 entries, 0 to 5966
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   description  5967 non-null   object
 1   genre        5967 non-null   object
dtypes: object(2)
memory usage: 93.4+ KB


In [17]:
master_list = []

for text in set(df.genre.to_list()):
    
    value_genres = text.split(', ')
    
    for item in value_genres:
        
        master_list.append(item)
        
print(set(master_list), len(set(master_list)))

{'crime,docuseries,mysteries', 'stand-up comedy & talk,comedies', 'dramas,sci-fi & fantasy,teen', 'dramas,teen', 'dramas,lgbtq', 'comedies,international', 'horror,independent', 'international,korean,dramas', 'children & family', 'action & adventure,comedies,sports', 'children & family,comedies,sports', 'anime,documentaries', 'international,korean,stand-up comedy & talk', "british,kids',comedies", 'crime,action & adventure,thrillers', 'action & adventure,anime,horror', 'horror,lgbtq,music & musicals', 'international,thrillers', 'children & family,dramas,sports', 'comedies,romantic', 'comedies,music & musicals,sports', 'comedies,music & musicals', 'comedies,international,sci-fi & fantasy', 'children & family,comedies,music & musicals', 'crime,action & adventure', 'dramas,sci-fi & fantasy', 'action & adventure,dramas,independent', 'comedies,dramas,lgbtq', 'dramas,faith & spirituality,romantic', 'children & family,dramas,independent', "crime,kids'", 'dramas,faith & spirituality,internation

In [18]:
def get_genre_list(value):
    '''takes in string list of genes from a pandas column
       creates a python list of those genres
       returns a python list of those geres that are not in the cut list'''
    
    cut_list = ['international', 
                'teen', 
                'korean', 
                'anime', 
                'classic & cult', 
                "kids'", 
                'cult', 
                'movies', 
                'shows',
                'spanish-language', 
                'british', 
                'children & family', 
                'anime series',
                'lgbtq',
                'classic', 
                'international',
                'independent',
                'sci-fi & fantasy',
                'sports',
                'faith & spirituality',
                'stand-up comedy & talk',
                'mysteries']
    
    genre_list = value.split(',')
    genre_list = [genre.strip() for genre in genre_list]
    
    return [genre for genre in genre_list if genre not in cut_list]

In [19]:
df['genre_list'] = df['genre'].apply(lambda value: get_genre_list(value))

df.head()

Unnamed: 0,description,genre,genre_list
0,docuseries take deep dive lucrative wellness i...,reality,[reality]
1,grisly virus rampage city lone man stay locked...,"horror,international,thrillers","[horror, thrillers]"
2,diary anne frank story retold alongside five h...,"documentaries,international",[documentaries]
3,kenya barris family navigate relationship race...,comedies,[comedies]
4,pawesome documentary explores feline friend be...,"documentaries,international",[documentaries]


In [20]:
# drop rows with empty lists looses 469 rows
print(df.shape)
df = df[df['genre_list'].map(lambda d: len(d)) > 0]
print(df.shape)

(5967, 3)
(5472, 3)


In [21]:
def merge_genres(value, merge_list, replacement):
    '''Take in a pandas value that is a list of genres
       a merge list and a replacement string
       If one of the genres is in merge_list 
       return list containing replacement string
       otherwise return original list'''

    # builds list of genres from value matching merge_list
    check_list = [genre for genre in value if genre in merge_list]

    # if check_list is not empty return list with just documentary
    if len(check_list) > 0:
        
        return [replacement]
    
    # otherwise return original list
    else:
        
        return value

In [22]:
merge_list = ['docuseries', 'documentary', 'documentaries']
df['genre_list'] = df['genre_list'].apply(lambda value: merge_genres(value,merge_list,'documentaries'))

df.head()

Unnamed: 0,description,genre,genre_list
0,docuseries take deep dive lucrative wellness i...,reality,[reality]
1,grisly virus rampage city lone man stay locked...,"horror,international,thrillers","[horror, thrillers]"
2,diary anne frank story retold alongside five h...,"documentaries,international",[documentaries]
3,kenya barris family navigate relationship race...,comedies,[comedies]
4,pawesome documentary explores feline friend be...,"documentaries,international",[documentaries]


In [23]:
merge_list = ['reality']
df['genre_list'] = df['genre_list'].apply(lambda value: merge_genres(value,merge_list,'reality'))

df.genre_list.value_counts().head(30)

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas/_libs/hashtable_class_helper.pxi", line 4588, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


[dramas]                                  903
[documentaries]                           740
[comedies]                                656
[comedies, dramas]                        391
[stand-up comedy]                         320
[action & adventure]                      286
[dramas, romantic]                        198
[dramas, thrillers]                       180
[comedies, romantic]                      169
[action & adventure, dramas]              165
[reality]                                 161
[crime, dramas]                           128
[thrillers]                               123
[romantic]                                117
[horror]                                  113
[crime]                                   109
[action & adventure, comedies]             99
[romantic, comedies]                       81
[romantic, dramas]                         80
[horror, thrillers]                        65
[comedies, music & musicals]               63
[dramas, music & musicals]        

In [24]:
merge_list = ['music & musicals']
df['genre_list'] = df['genre_list'].apply(lambda value: merge_genres(value,merge_list,'music & musicals'))

df.genre_list.value_counts().head(30)

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas/_libs/hashtable_class_helper.pxi", line 4588, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


[dramas]                                  903
[documentaries]                           740
[comedies]                                656
[comedies, dramas]                        391
[stand-up comedy]                         320
[action & adventure]                      286
[dramas, romantic]                        198
[music & musicals]                        181
[dramas, thrillers]                       180
[comedies, romantic]                      169
[action & adventure, dramas]              165
[reality]                                 161
[crime, dramas]                           128
[thrillers]                               123
[romantic]                                117
[horror]                                  113
[crime]                                   109
[action & adventure, comedies]             99
[romantic, comedies]                       81
[romantic, dramas]                         80
[horror, thrillers]                        65
[comedies, horror]                

In [25]:
merge_list = ['crime']
df['genre_list'] = df['genre_list'].apply(lambda value: merge_genres(value,merge_list,'crime'))

df.genre_list.value_counts().head(30)

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas/_libs/hashtable_class_helper.pxi", line 4588, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


[dramas]                                  903
[documentaries]                           740
[comedies]                                656
[comedies, dramas]                        391
[stand-up comedy]                         320
[crime]                                   302
[action & adventure]                      286
[dramas, romantic]                        198
[music & musicals]                        181
[dramas, thrillers]                       180
[comedies, romantic]                      169
[action & adventure, dramas]              165
[reality]                                 161
[thrillers]                               123
[romantic]                                117
[horror]                                  113
[action & adventure, comedies]             99
[romantic, comedies]                       81
[romantic, dramas]                         80
[horror, thrillers]                        65
[comedies, horror]                         27
[action & adventure, comedies, dra

In [26]:
def fuse_genre(value, fuse_list, replacement):
    
    # builds list of genres from value matching fuse_list
    check_list = [genre for genre in value if genre in fuse_list]

    # if the length of check_list is equal to the length of value return replacement 
    if (len(check_list) == len(value)) and (len(value) == 2):
        
        return [replacement]
    
    # otherwise return original list
    else:
        
        return value

In [27]:
df['genre_list'] = df['genre_list'].apply(lambda value: fuse_genre(value, ['romantic', 'comedies'], 'romantic comedies'))

df.genre_list.value_counts().head(30)

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas/_libs/hashtable_class_helper.pxi", line 4588, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


[dramas]                                  903
[documentaries]                           740
[comedies]                                656
[comedies, dramas]                        391
[stand-up comedy]                         320
[crime]                                   302
[action & adventure]                      286
[romantic comedies]                       250
[dramas, romantic]                        198
[music & musicals]                        181
[dramas, thrillers]                       180
[action & adventure, dramas]              165
[reality]                                 161
[thrillers]                               123
[romantic]                                117
[horror]                                  113
[action & adventure, comedies]             99
[romantic, dramas]                         80
[horror, thrillers]                        65
[comedies, horror]                         27
[action & adventure, comedies, dramas]     22
[dramas, horror]                  

In [28]:
df['genre_list'] = df['genre_list'].apply(lambda value: fuse_genre(value, ['comedies', 'dramas'], 'dramatic comedies'))

df.genre_list.value_counts().head(30)

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas/_libs/hashtable_class_helper.pxi", line 4588, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


[dramas]                                  903
[documentaries]                           740
[comedies]                                656
[dramatic comedies]                       391
[stand-up comedy]                         320
[crime]                                   302
[action & adventure]                      286
[romantic comedies]                       250
[dramas, romantic]                        198
[music & musicals]                        181
[dramas, thrillers]                       180
[action & adventure, dramas]              165
[reality]                                 161
[thrillers]                               123
[romantic]                                117
[horror]                                  113
[action & adventure, comedies]             99
[romantic, dramas]                         80
[horror, thrillers]                        65
[comedies, horror]                         27
[action & adventure, comedies, dramas]     22
[dramas, horror]                  

In [29]:
df['genre_list'] = df['genre_list'].apply(lambda value: fuse_genre(value, ['dramas', 'romantic'], 'romantic'))

df.genre_list.value_counts().head(30)

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas/_libs/hashtable_class_helper.pxi", line 4588, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


[dramas]                                  903
[documentaries]                           740
[comedies]                                656
[romantic]                                395
[dramatic comedies]                       391
[stand-up comedy]                         320
[crime]                                   302
[action & adventure]                      286
[romantic comedies]                       250
[music & musicals]                        181
[dramas, thrillers]                       180
[action & adventure, dramas]              165
[reality]                                 161
[thrillers]                               123
[horror]                                  113
[action & adventure, comedies]             99
[horror, thrillers]                        65
[comedies, horror]                         27
[action & adventure, comedies, dramas]     22
[dramas, horror]                           21
[action & adventure, horror]               16
[romantic, comedies, dramas]      

In [30]:
df['genre_list'] = df['genre_list'].apply(lambda value: fuse_genre(value, ['dramas', 'thrillers']  , 'thrillers'))

df.genre_list.value_counts().head(30)

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas/_libs/hashtable_class_helper.pxi", line 4588, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


[dramas]                                    903
[documentaries]                             740
[comedies]                                  656
[romantic]                                  395
[dramatic comedies]                         391
[stand-up comedy]                           320
[thrillers]                                 303
[crime]                                     302
[action & adventure]                        286
[romantic comedies]                         250
[music & musicals]                          181
[action & adventure, dramas]                165
[reality]                                   161
[horror]                                    113
[action & adventure, comedies]               99
[horror, thrillers]                          65
[comedies, horror]                           27
[action & adventure, comedies, dramas]       22
[dramas, horror]                             21
[action & adventure, horror]                 16
[romantic, comedies, dramas]            

In [31]:
df['genre_list'] = df['genre_list'].apply(lambda value: fuse_genre(value, ['action & adventure', 'dramas']  , 'thrillers'))

df.genre_list.value_counts().head(30)

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas/_libs/hashtable_class_helper.pxi", line 4588, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


[dramas]                                  903
[documentaries]                           740
[comedies]                                656
[thrillers]                               468
[romantic]                                395
[dramatic comedies]                       391
[stand-up comedy]                         320
[crime]                                   302
[action & adventure]                      286
[romantic comedies]                       250
[music & musicals]                        181
[reality]                                 161
[horror]                                  113
[action & adventure, comedies]             99
[horror, thrillers]                        65
[comedies, horror]                         27
[action & adventure, comedies, dramas]     22
[dramas, horror]                           21
[action & adventure, horror]               16
[romantic, comedies, dramas]               12
[comedies, dramas, romantic]                8
[action & adventure, romantic]    

In [32]:
df['genre_list'] = df['genre_list'].apply(lambda value: fuse_genre(value, ['dramas', 'horror']  , 'horror'))

df.genre_list.value_counts().head(30)

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas/_libs/hashtable_class_helper.pxi", line 4588, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


[dramas]                                    903
[documentaries]                             740
[comedies]                                  656
[thrillers]                                 468
[romantic]                                  395
[dramatic comedies]                         391
[stand-up comedy]                           320
[crime]                                     302
[action & adventure]                        286
[romantic comedies]                         250
[music & musicals]                          181
[reality]                                   161
[horror]                                    134
[action & adventure, comedies]               99
[horror, thrillers]                          65
[comedies, horror]                           27
[action & adventure, comedies, dramas]       22
[action & adventure, horror]                 16
[romantic, comedies, dramas]                 12
[comedies, dramas, romantic]                  8
[action & adventure, romantic]          

In [33]:
def remove_genre(value, genre, val_len):
    
    if (len(value) >= val_len) and (genre in value):
        
        value.remove(genre)
        
    return value

In [34]:
df['genre_list'] = df['genre_list'].apply(lambda value: remove_genre(value,'dramas', 3))

df.genre_list.value_counts().head(30)

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas/_libs/hashtable_class_helper.pxi", line 4588, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


[dramas]                                    903
[documentaries]                             740
[comedies]                                  656
[thrillers]                                 468
[romantic]                                  395
[dramatic comedies]                         391
[stand-up comedy]                           320
[crime]                                     302
[action & adventure]                        286
[romantic comedies]                         250
[music & musicals]                          181
[reality]                                   161
[horror]                                    134
[action & adventure, comedies]              121
[horror, thrillers]                          65
[comedies, horror]                           28
[action & adventure, horror]                 19
[romantic, comedies]                         12
[action & adventure, romantic]                8
[comedies, romantic]                          8
[romantic, thrillers]                   

In [35]:
df['genre_list'] = df['genre_list'].apply(lambda value: fuse_genre(value, ['romantic', 'comedies'], 'romantic comedies'))

df.genre_list.value_counts().head(30)

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas/_libs/hashtable_class_helper.pxi", line 4588, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


[dramas]                                    903
[documentaries]                             740
[comedies]                                  656
[thrillers]                                 468
[romantic]                                  395
[dramatic comedies]                         391
[stand-up comedy]                           320
[crime]                                     302
[action & adventure]                        286
[romantic comedies]                         270
[music & musicals]                          181
[reality]                                   161
[horror]                                    134
[action & adventure, comedies]              121
[horror, thrillers]                          65
[comedies, horror]                           28
[action & adventure, horror]                 19
[action & adventure, romantic]                8
[romantic, action & adventure]                4
[comedies, thrillers]                         4
[romantic, thrillers]                   

In [36]:
merge_list = ['horror']
df['genre_list'] = df['genre_list'].apply(lambda value: merge_genres(value,merge_list,'horror'))

df.genre_list.value_counts().head(99)

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas/_libs/hashtable_class_helper.pxi", line 4588, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


[dramas]                                    903
[documentaries]                             740
[comedies]                                  656
[thrillers]                                 468
[romantic]                                  395
[dramatic comedies]                         391
[stand-up comedy]                           320
[crime]                                     302
[action & adventure]                        286
[romantic comedies]                         270
[horror]                                    253
[music & musicals]                          181
[reality]                                   161
[action & adventure, comedies]              121
[action & adventure, romantic]                8
[romantic, thrillers]                         4
[romantic, action & adventure]                4
[comedies, thrillers]                         4
[action & adventure, thrillers]               3
[action & adventure, comedies, romantic]      2
Name: genre_list, dtype: int64

In [37]:
df['genre_list'] = df['genre_list'].apply(lambda value: fuse_genre(value, ['action & adventure', 'comedies'], 'action & adventure comedies'))

df.genre_list.value_counts().head(99)

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas/_libs/hashtable_class_helper.pxi", line 4588, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


[dramas]                                    903
[documentaries]                             740
[comedies]                                  656
[thrillers]                                 468
[romantic]                                  395
[dramatic comedies]                         391
[stand-up comedy]                           320
[crime]                                     302
[action & adventure]                        286
[romantic comedies]                         270
[horror]                                    253
[music & musicals]                          181
[reality]                                   161
[action & adventure comedies]               121
[action & adventure, romantic]                8
[romantic, thrillers]                         4
[romantic, action & adventure]                4
[comedies, thrillers]                         4
[action & adventure, thrillers]               3
[action & adventure, comedies, romantic]      2
Name: genre_list, dtype: int64

In [38]:
print(df.shape)
df = df[df['genre_list'].map(lambda d: len(d)) == 1]
print(df.shape)

(5472, 3)
(5447, 3)


In [40]:
df.head()

Unnamed: 0,description,genre,genre_list
0,docuseries take deep dive lucrative wellness i...,reality,[reality]
1,grisly virus rampage city lone man stay locked...,"horror,international,thrillers",[horror]
2,diary anne frank story retold alongside five h...,"documentaries,international",[documentaries]
3,kenya barris family navigate relationship race...,comedies,[comedies]
4,pawesome documentary explores feline friend be...,"documentaries,international",[documentaries]


In [42]:
df['genre'] = df.genre_list.apply(lambda value: value[0])
df.head()

Unnamed: 0,description,genre,genre_list
0,docuseries take deep dive lucrative wellness i...,reality,[reality]
1,grisly virus rampage city lone man stay locked...,horror,[horror]
2,diary anne frank story retold alongside five h...,documentaries,[documentaries]
3,kenya barris family navigate relationship race...,comedies,[comedies]
4,pawesome documentary explores feline friend be...,documentaries,[documentaries]


In [43]:
df = df.drop(columns = 'genre_list')

df.head()

Unnamed: 0,description,genre
0,docuseries take deep dive lucrative wellness i...,reality
1,grisly virus rampage city lone man stay locked...,horror
2,diary anne frank story retold alongside five h...,documentaries
3,kenya barris family navigate relationship race...,comedies
4,pawesome documentary explores feline friend be...,documentaries


In [44]:
set(df.genre.to_list())

{'action & adventure',
 'action & adventure comedies',
 'comedies',
 'crime',
 'documentaries',
 'dramas',
 'dramatic comedies',
 'horror',
 'music & musicals',
 'reality',
 'romantic',
 'romantic comedies',
 'stand-up comedy',
 'thrillers'}