# Imports

In [1]:
import pandas as pd
import re

import warnings
warnings.filterwarnings('ignore')

import unicodedata
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

import wrangle as w
import explore as e
import modeling as m

# Acquire

* Data acquired from [Kaggle](https://www.kaggle.com/satpreetmakhija/netflix-movies-and-tv-shows-2021) on 2/04/2022
* Each observation represents one movie or film series
* Because the distinction between a single movie and a multi-part series is not relevant to this study, I will refer to each as a movie for the duration of this study
* The original data set had 5967 rows and 13 columns

# Prepare

* Data was prepared using the following steps:
    * Drop all columns other than 'description' and 'genre'
    * Lowercased text in both columns
    * Lowercased column names
    * Prepared text in 'description' for exploration using the following steps:
      * Removed all non-ascii and special characters from the text
      * Tokenized the words in the text
      * Lemmatized the words in the text
      * Removed stopwords
    * Prepared genre text using the following steps:
      * removed text indicating if a film is a standalone movie or series
      * created new column of curated list of genres       
      * Some genres were removed because they referred to how the film was made or its intended audience rather than the theme of the film (the full list is `cut_list` in `get_genre_list` below)
           

In [2]:
# load the movie data through the project's wrangle module; the head() output
# below shows it arrives with 'description' and 'genre' columns
df = w.get_my_movie_data()
df.head()

Unnamed: 0,description,genre
0,docuseries take deep dive lucrative wellness i...,reality
1,grisly virus rampage city lone man stay locked...,horror
2,diary anne frank story retold alongside five h...,documentaries
3,kenya barris family navigate relationship race...,comedies
4,pawesome documentary explores feline friend be...,documentaries


In [3]:
# number of rows carrying each genre label
df['genre'].value_counts()

dramas                         903
documentaries                  740
comedies                       656
action & adventure             451
romantic                       395
dramatic comedies              391
stand-up comedy                320
thrillers                      303
crime                          302
romantic comedies              270
horror                         253
music & musicals               181
reality                        161
action & adventure comedies    121
Name: genre, dtype: int64

In [4]:
# NOTE(review): the frame loaded above already exposes lowercase 'description'
# and 'genre' columns (see its head() output), so selecting the raw
# 'Description'/'Genres' names raises the KeyError shown below. The cells that
# follow presumably expect the raw Kaggle export -- confirm which loader
# should feed them.
df = df[['Description','Genres']]
df.head()

KeyError: "None of [Index(['Description', 'Genres'], dtype='object')] are in the [columns]"

In [None]:
# Lowercase the text held in each of the two raw columns
for column in ('Description', 'Genres'):
    df[column] = df[column].apply(lambda text: text.lower())
df.head()

In [None]:
# switch the raw column names to the lowercase names used by the later cells
lowercase_names = {'Description': 'description', 'Genres': 'genre'}
df.rename(columns=lowercase_names, inplace=True)
df.head()

In [None]:
# remove non-ascii characters from description text
def _strip_non_ascii(text):
    '''Decompose accented characters (NFKD) and drop anything that is not ascii'''
    decomposed = unicodedata.normalize('NFKD', text)
    return decomposed.encode('ascii', 'ignore').decode('utf-8', 'ignore')


df['description'] = df['description'].apply(_strip_non_ascii)
df.head()

In [None]:
# drop every character that is not a lowercase letter, digit or whitespace
# (the text was lowercased in an earlier cell)
special_characters = re.compile(r"[^a-z0-9\s]")
df['description'] = df['description'].apply(lambda text: special_characters.sub('', text))

df.head()

In [None]:
# build the tokenizer once, using the class imported at the top of the file
tokenizer = ToktokTokenizer()


def _tokenize(text):
    '''Tokenize text and hand it back as a single string'''
    return tokenizer.tokenize(text, return_str=True)


# tokenize text in description
df['description'] = df['description'].apply(_tokenize)

df.head()

In [None]:


def lemmatizer(value):
    '''Lemmatize every whitespace-separated word in a string

       value: text from one cell of a pandas column
       returns: the lemmatized words joined back together with single spaces'''

    wnl = nltk.stem.WordNetLemmatizer()

    # split on whitespace, lemmatize word by word, then rebuild the string
    return ' '.join(wnl.lemmatize(token) for token in value.split())

In [None]:
# replace each description with its lemmatized form
df['description'] = df['description'].apply(lemmatizer)

df.head()

In [None]:
def remove_stopwords(value):
    '''Take in text from a pandas column
       remove the english stopwords listed by nltk
       return the remaining words joined with single spaces'''

    # a set gives constant-time membership checks; the nltk list would be
    # scanned from the start for every word in the text
    stopword_set = set(stopwords.words('english'))

    # split on whitespace and keep only the words that are not stopwords
    kept_words = [word for word in value.split() if word not in stopword_set]

    # convert list back into string and return value
    return ' '.join(kept_words)

In [None]:
# strip stopwords out of every description
df['description'] = df['description'].apply(remove_stopwords)

df.head()

In [None]:
def remove_cinima_type(value):
    '''Strip the format markers out of a comma separated genre string

       value: genre text from one cell of a pandas column
       returns: the same genres with the markers removed, each one stripped
                of surrounding whitespace, joined with ',' (no space)'''

    # the order matters: ' tv' is removed before 'tv ' and ' shows'
    for marker in (' tv', 'tv ', ' shows', ' movies', ' series', ' features'):
        value = value.replace(marker, '')

    return ','.join(part.strip() for part in value.split(','))

In [None]:
# strip the movie / series markers from every genre string
df['genre'] = df['genre'].apply(remove_cinima_type)

df.head()

In [None]:
# summary of the frame's columns, dtypes and non-null counts
df.info()

In [None]:
# collect every individual genre label that appears in the 'genre' column;
# remove_cinima_type joins labels with ',' (no space), so split on ',' and
# strip each piece -- splitting on ', ' would leave multi-genre strings whole
master_list = [
    item.strip()
    for text in set(df.genre.to_list())
    for item in text.split(',')
]

print(set(master_list), len(set(master_list)))

In [None]:
def get_genre_list(value):
    '''takes in a comma separated string of genres from a pandas column
       creates a python list of those genres, each stripped of whitespace
       removes genres in the cut list and any empty entries
       returns created list (original order kept)'''

    # a set gives constant-time lookups and makes duplicate entries harmless
    cut_list = {'international',
                'teen',
                'korean',
                'anime',
                'classic & cult',
                "kids'",
                'cult',
                'movies',
                'shows',
                'spanish-language',
                'british',
                'children & family',
                'anime series',
                'lgbtq',
                'classic',
                'independent',
                'sci-fi & fantasy',
                'sports',
                'faith & spirituality',
                'stand-up comedy & talk',
                'mysteries'}

    genre_list = [genre.strip() for genre in value.split(',')]

    # an empty string or a trailing comma produces '' entries; they are not
    # genres, and keeping them would inflate the length of the returned list
    return [genre for genre in genre_list if genre and genre not in cut_list]

In [None]:
# new column holding the curated python list of genres for each row
df['genre_list'] = df['genre'].apply(get_genre_list)

df.head()

In [None]:
# drop rows whose genre list is empty (loses 469 rows)
print(df.shape)
# .copy() makes the filtered frame independent, so the column assignments in
# later cells do not go through pandas' SettingWithCopy path
df = df[df['genre_list'].map(len) > 0].copy()
print(df.shape)

In [None]:
def merge_genres(value, merge_list, replacement):
    '''Collapse a genre list to one label when it contains a listed genre

       value: list of genres from one cell of a pandas column
       merge_list: genres that trigger the collapse
       replacement: label to use for the collapsed list

       returns [replacement] if at least one genre in value is in merge_list
       otherwise returns value itself, untouched'''

    # one hit is enough to collapse the whole list
    if any(genre in merge_list for genre in value):
        return [replacement]

    return value

In [None]:
# any row tagged with one of these labels becomes a pure 'documentaries' row
merge_list = ['docuseries', 'documentary', 'documentaries']
df['genre_list'] = df['genre_list'].apply(merge_genres, args=(merge_list, 'documentaries'))

df.head()

In [None]:
# any row that includes 'reality' keeps only that label
merge_list = ['reality']
df['genre_list'] = df['genre_list'].apply(merge_genres, args=(merge_list, 'reality'))

df.genre_list.value_counts().head(30)

In [None]:
# any row that includes 'music & musicals' keeps only that label
merge_list = ['music & musicals']
df['genre_list'] = df['genre_list'].apply(merge_genres, args=(merge_list, 'music & musicals'))

df.genre_list.value_counts().head(30)

In [None]:
# any row that includes 'crime' keeps only that label
merge_list = ['crime']
df['genre_list'] = df['genre_list'].apply(merge_genres, args=(merge_list, 'crime'))

df.genre_list.value_counts().head(30)

In [None]:
def fuse_genre(value, fuse_list, replacement):
    '''Replace a two-genre list with one combined label

       value: list of genres from one cell of a pandas column
       fuse_list: genres that may be fused together
       replacement: label to use for the fused list

       returns [replacement] when value holds exactly two genres
       and both of them are in fuse_list
       otherwise returns value itself, untouched'''

    holds_pair = len(value) == 2

    # both genres of the pair must come from fuse_list
    if holds_pair and all(genre in fuse_list for genre in value):
        return [replacement]

    return value

In [None]:
# rows tagged with just 'romantic' and 'comedies' become 'romantic comedies'
df['genre_list'] = df['genre_list'].apply(fuse_genre, args=(['romantic', 'comedies'], 'romantic comedies'))

df.genre_list.value_counts().head(30)

In [None]:
# rows tagged with just 'comedies' and 'dramas' become 'dramatic comedies'
df['genre_list'] = df['genre_list'].apply(fuse_genre, args=(['comedies', 'dramas'], 'dramatic comedies'))

df.genre_list.value_counts().head(30)

In [None]:
# rows tagged with just 'dramas' and 'romantic' are kept as 'romantic'
df['genre_list'] = df['genre_list'].apply(fuse_genre, args=(['dramas', 'romantic'], 'romantic'))

df.genre_list.value_counts().head(30)

In [None]:
# rows tagged with just 'dramas' and 'thrillers' are kept as 'thrillers'
df['genre_list'] = df['genre_list'].apply(fuse_genre, args=(['dramas', 'thrillers'], 'thrillers'))

df.genre_list.value_counts().head(30)

In [None]:
# rows tagged with just 'action & adventure' and 'dramas' are relabelled 'thrillers'
# NOTE(review): the neighbouring drama fusions keep the non-'dramas' label,
# but here the replacement is neither of the two fused genres -- confirm
# 'thrillers' is intended and not a copy-paste slip for 'action & adventure'
df['genre_list'] = df['genre_list'].apply(fuse_genre, args=(['action & adventure', 'dramas'], 'thrillers'))

df.genre_list.value_counts().head(30)

In [None]:
# rows tagged with just 'dramas' and 'horror' are kept as 'horror'
df['genre_list'] = df['genre_list'].apply(fuse_genre, args=(['dramas', 'horror'], 'horror'))

df.genre_list.value_counts().head(30)

In [None]:
def remove_genre(value, genre, val_len):
    '''Drop one genre from a genre list when the list is long enough

       value: list of genres from one cell of a pandas column
       genre: the genre to drop
       val_len: minimum length value must have before anything is dropped

       returns a new list; when value has at least val_len entries and
       contains genre, the first occurrence of genre is left out
       otherwise the new list has the same entries as value
       value itself is never modified'''

    # work on a copy so the list object passed in by the caller (for example
    # one stored in another frame or column) is not changed behind its back
    result = list(value)

    if (len(result) >= val_len) and (genre in result):

        result.remove(genre)

    return result

In [None]:
# in rows with three or more genres, drop the 'dramas' label
df['genre_list'] = df['genre_list'].apply(remove_genre, args=('dramas', 3))

df.genre_list.value_counts().head(30)

In [None]:
# fuse again: dropping 'dramas' above can leave new ['romantic', 'comedies'] pairs
df['genre_list'] = df['genre_list'].apply(fuse_genre, args=(['romantic', 'comedies'], 'romantic comedies'))

df.genre_list.value_counts().head(30)

In [None]:
# any row that includes 'horror' keeps only that label
merge_list = ['horror']
df['genre_list'] = df['genre_list'].apply(merge_genres, args=(merge_list, 'horror'))

df.genre_list.value_counts().head(99)

In [None]:
# rows tagged with just 'action & adventure' and 'comedies' become 'action & adventure comedies'
df['genre_list'] = df['genre_list'].apply(fuse_genre, args=(['action & adventure', 'comedies'], 'action & adventure comedies'))

df.genre_list.value_counts().head(99)

In [None]:
# keep only the rows that ended up with exactly one genre
print(df.shape)
# .copy() makes the filtered frame independent, so the column assignment in
# a later cell does not go through pandas' SettingWithCopy path
df = df[df['genre_list'].map(len) == 1].copy()
print(df.shape)

In [None]:
# check the frame after filtering down to single-genre rows
df.head()

In [None]:
# overwrite 'genre' with the first entry of each row's genre list
df['genre'] = df['genre_list'].map(lambda genres: genres[0])
df.head()

In [None]:
# the helper column is no longer needed once 'genre' holds the label
df = df.drop(columns=['genre_list'])

df.head()

In [None]:
# distinct genre labels left in the frame
set(df['genre'].to_list())