In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import os
import re

import unicodedata
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

from sklearn.model_selection import train_test_split

import sklearn.preprocessing
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

import warnings
warnings.filterwarnings("ignore")

import wrangle as w

In [2]:
# Load the prepared show data through the project's wrangle module.
df = w.get_show_data()

# Peek at ten randomly drawn rows; no random_state is passed, so the
# selection is not pinned between runs.
df.sample(n=10)

Unnamed: 0,description,genres,war,music,history,european,horror,romance,reality,scifi,...,documentation,drama,family,animation,western,action,thriller,crime,fantasy,sport
5545,partner swap experiment take turn four friend ...,"[romance, drama, comedy]",False,False,False,False,False,True,False,False,...,False,True,False,False,False,False,False,False,False,False
1914,sister follows story three woman discover sist...,"[drama, comedy]",False,False,False,False,False,False,False,False,...,False,True,False,False,False,False,False,False,False,False
3483,city girl gabriela spontaneously enters contes...,"[romance, comedy]",False,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
124,following death father young danny madigan tak...,"[action, fantasy, comedy, family]",False,False,False,False,False,False,False,False,...,False,False,True,False,False,True,False,False,True,False
1194,rebellious mickey goodnatured gu navigate thri...,"[comedy, drama, romance]",False,False,False,False,False,True,False,False,...,False,True,False,False,False,False,False,False,False,False
4250,body powerful businesswoman go missing morgue ...,"[thriller, comedy]",False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
223,liz lemon head writer latenight tv variety sho...,"[comedy, sport]",False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
844,year future earth ha abandoned due radioactivi...,"[scifi, drama, action]",False,False,False,False,False,False,False,True,...,False,True,False,False,False,True,False,False,False,False
5346,georgia town football rule winning paramount h...,"[reality, sport]",False,False,False,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,True
1496,acp yashvardhan team raw agent kk bring master...,"[action, crime, thriller]",False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,True,True,False,False


In [3]:
def str_to_list(df):
    '''Turn the string-encoded `genres` column into real Python lists.

    Each entry of df['genres'] is expected to be text such as
    "['drama', 'comedy']". Every '[', ']', "'" and space character is
    deleted, then the remainder is split on commas, giving
    ['drama', 'comedy'].

    The column is overwritten in place and the same DataFrame is returned.
    An entry of "[]" becomes [''] (a list holding one empty string).
    '''
    # A single pass with an explicit regex flag: the default value of `regex`
    # differs between pandas versions, and a bare '[' is not a valid pattern,
    # so the character class is spelled out and escaped here.
    stripped = df['genres'].str.replace(r"[\[\]' ]", '', regex=True)

    df['genres'] = stripped.str.split(',')

    return df

In [4]:
def get_genre_columns(df):
    '''Add one boolean column per genre found in df['genres'].

    df['genres'] must hold a list of genre names for every row. For each
    distinct genre a column named after it is added, True where the row's
    list contains that genre and False otherwise.

    Columns are added in sorted genre order so the resulting layout is the
    same on every run. The DataFrame is modified in place and returned.
    '''
    # collect the distinct genres across all rows
    exploded = df[['genres']].explode('genres')

    # Iterating a raw set of strings yields a different order from one
    # interpreter session to the next; sorting pins the column order.
    unique_genres = sorted(set(exploded['genres'].to_list()))

    # for each unique genre add a column showing whether the show is in that genre
    for genre in unique_genres:

        df[str(genre)] = df['genres'].apply(lambda genre_list: genre in genre_list)

    return df

In [5]:
def prep_description(df):
    ''' Prepare film description text for exploration.

    Rewrites df['description'] so that every entry is a cleaned,
    space-separated string, then returns the DataFrame. Steps, in order:

      1. lowercase the text (missing descriptions become empty strings)
      2. delete digits and every character that is neither a word
         character nor whitespace
      3. NFKD-normalize and drop anything that cannot be encoded as ASCII
      4. tokenize with get_disc_tokens
      5. remove stopwords with remove_stopwords
      6. lemmatize with lemmatizer
    '''

    # Fill missing values first: str(NaN) would otherwise put the literal
    # word "nan" into the cleaned text.
    text = df['description'].fillna('').astype(str).str.lower()

    # remove special characters and digits from description text
    text = text.apply(lambda value: re.sub(r'[^\w\s]|[\d]', '', value))

    # remove non-ascii characters from description text
    text = text.apply(lambda value: unicodedata.normalize('NFKD', value)
                                               .encode('ascii', 'ignore')
                                               .decode('utf-8', 'ignore'))
    df['description'] = text

    # tokenizes text in description
    df = get_disc_tokens(df)

    # Remove stopwords BEFORE lemmatizing. Lemmatizing first can rewrite a
    # stopword into a form the stopword list no longer matches
    # (e.g. "has" -> "ha"), and that fragment then survives in the output.
    df['description'] = df['description'].apply(remove_stopwords)

    # lemmatize the remaining words in description
    df['description'] = df['description'].apply(lemmatizer)

    return df

In [6]:
def get_disc_tokens(df):
    '''Tokenize every description with NLTK's ToktokTokenizer.

    df['description'] is overwritten in place: each entry is replaced by
    whatever the tokenizer returns for it when called with return_str=True.
    The same DataFrame is returned.
    '''
    toktok = nltk.tokenize.ToktokTokenizer()

    def _tokenize(text):
        # return_str=True asks the tokenizer for a string, not a token list
        return toktok.tokenize(text, return_str=True)

    df['description'] = df['description'].apply(_tokenize)
    return df

In [7]:
def get_disc_tokens(df):
    '''Run NLTK's ToktokTokenizer over the description column.

    Replaces each entry of df['description'] with the tokenizer's
    return_str=True result and returns the same DataFrame.

    NOTE: a function with this same name is also defined in the preceding
    cell; whichever cell ran last is the definition in effect.
    '''
    tok = nltk.tokenize.ToktokTokenizer()

    # tokenize text in description, one row at a time
    tokenized = df['description'].map(lambda text: tok.tokenize(text, return_str=True))
    df['description'] = tokenized

    return df


def lemmatizer(value):
    '''Lemmatize each whitespace-separated word of a string.

    value: a string (one cell of a pandas column).
    Returns the lemmatized words joined back together with single spaces.
    '''
    # bound method of a fresh WordNet lemmatizer
    lemmatize = nltk.stem.WordNetLemmatizer().lemmatize

    # split on whitespace, lemmatize word by word, re-join into one string
    return ' '.join(lemmatize(token) for token in value.split())


def remove_stopwords(value):
    ''' remove stopwords from text

    value: a string (one cell of a pandas column).
    Returns the whitespace-separated words of `value` that are not in
    NLTK's English stopword list, joined with single spaces.
    '''

    stopword_set = _english_stopwords()

    # split on whitespace, keep only non-stopwords, re-join into one string
    return ' '.join(word for word in value.split() if word not in stopword_set)


# Per-process cache for the English stopword list; filled on first use.
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    '''Return NLTK's English stopwords as a frozenset, loading them once.'''
    global _ENGLISH_STOPWORDS

    # This function is applied to every row, so the list is fetched a single
    # time and kept as a set for constant-time membership tests.
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))

    return _ENGLISH_STOPWORDS

In [8]:
def get_show_data(path='titles.csv'):
    '''Load the titles CSV and return it prepared for exploration.

    path: location of the CSV file; it must contain `description` and
          `genres` columns. Defaults to 'titles.csv'.

    Returns a DataFrame holding the cleaned `description`, `genres` as a
    list per row, and one boolean column per genre. Rows whose genres
    field is the string '[]' are dropped; the original row index is kept.
    '''

    # get dataframe of descriptions and genres
    raw = pd.read_csv(path)

    # Drop rows with empty genres. The explicit .copy() gives the steps
    # below their own frame to assign columns on, instead of a slice of
    # the frame that was read from disk.
    df = raw.loc[raw['genres'] != '[]', ['description', 'genres']].copy()

    # convert strings in genres to actual lists
    df = str_to_list(df)

    # for each unique genre add a column showing whether the show is in that genre
    df = get_genre_columns(df)

    # clean description column
    df = prep_description(df)

    return df

In [9]:
# Reload the prepared data and preview ten randomly drawn rows.
# NOTE(review): this calls get_show_data from the wrangle module, not the
# function of the same name defined in the cell above - confirm that is intended.
df = w.get_show_data()

df.sample(n=10)
    


Unnamed: 0,description,genres,war,music,history,european,horror,romance,reality,scifi,...,documentation,drama,family,animation,western,action,thriller,crime,fantasy,sport
232,murder espionage terrorism stolen submarine te...,"[crime, action, thriller, drama, comedy]",False,False,False,False,False,False,False,False,...,False,True,False,False,False,True,True,True,False,False
4215,rich nasty woman return hometown evict everyon...,"[comedy, music, family]",False,True,False,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,False
2560,group people meet party form four different re...,"[comedy, romance]",False,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
5666,adaption taiwanese comic series brave animated...,"[action, animation]",False,False,False,False,False,False,False,False,...,False,False,False,True,False,True,False,False,False,False
5265,comedy special jon stewart john mulaney chelse...,"[documentation, comedy]",False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
2800,bright field prep mistletoe ball broken orname...,"[drama, comedy, family]",False,False,False,False,False,False,False,False,...,False,True,True,False,False,False,False,False,False,False
4655,good bandit introduces new narcocomedy genre f...,"[comedy, crime, drama, family]",False,False,False,False,False,False,False,False,...,False,True,True,False,False,False,False,True,False,False
5307,poland satisfied result murder investigation y...,"[crime, drama, thriller]",False,False,False,False,False,False,False,False,...,False,True,False,False,False,False,True,True,False,False
977,feature film birth reborn portrays serious obs...,"[documentation, family, history]",False,False,True,False,False,False,False,False,...,True,False,True,False,False,False,False,False,False,False
4628,revisiting life goal set letter written teen f...,[comedy],False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [10]:
# Scratch check of the punctuation-stripping pattern; relies on the
# `import re` in the notebook's first cell.
text = "This, is an ex,ampl,e str,ing with $p,ecial  , chara,,,cters a,nd spaces."

# Delete every character that is neither a word character nor whitespace.
clean_text = re.compile(r'[^\w\s]').sub("", text)

print(clean_text)

This is an example string with pecial   characters and spaces
