# Imports

In [1]:
import pandas as pd
import re

import unicodedata
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

import wrangle
import explore
import modeling

# Acquire

* Data aquired from [Kaggle](https://www.kaggle.com/satpreetmakhija/netflix-movies-and-tv-shows-2021) on 2/04/2022
* Each observation represents on movie or film series 
* Because the distinction between a single movie an a multi-part series is not relevant to this study I will be referring to each as a movie for the duration of this study
* The original data set had 5967 rows and 13 columns

In [2]:
# read the csv into pandas
df = pd.read_csv('netflixdata.csv')
df.shape

(5967, 13)

# Prepare

In [3]:
df.columns

Index(['Show Id', 'Title', 'Description', 'Director', 'Genres', 'Cast',
       'Production Country', 'Release Date', 'Rating', 'Duration',
       'Imdb Score', 'Content Type', 'Date Added'],
      dtype='object')

In [4]:
df = df[['Description','Genres']]
df.head()

Unnamed: 0,Description,Genres
0,This docuseries takes a deep dive into the luc...,Reality TV
1,"As a grisly virus rampages a city, a lone man ...","Horror Movies, International Movies, Thrillers"
2,"Through her diary, Anne Frank's story is retol...","Documentaries, International Movies"
3,Kenya Barris and his family navigate relations...,TV Comedies
4,This pawesome documentary explores how our fel...,"Documentaries, International Movies"


In [5]:
# Lowercase all of the letters in both columns
df['Description'] = df['Description'].apply(lambda value: value.lower())
df['Genres'] = df['Genres'].apply(lambda value: value.lower())
df.head()

Unnamed: 0,Description,Genres
0,this docuseries takes a deep dive into the luc...,reality tv
1,"as a grisly virus rampages a city, a lone man ...","horror movies, international movies, thrillers"
2,"through her diary, anne frank's story is retol...","documentaries, international movies"
3,kenya barris and his family navigate relations...,tv comedies
4,this pawesome documentary explores how our fel...,"documentaries, international movies"


In [6]:
# rename columns
df.rename(columns={'Description':'description', 'Genres':'genre'}, inplace = True)
df.head()

Unnamed: 0,description,genre
0,this docuseries takes a deep dive into the luc...,reality tv
1,"as a grisly virus rampages a city, a lone man ...","horror movies, international movies, thrillers"
2,"through her diary, anne frank's story is retol...","documentaries, international movies"
3,kenya barris and his family navigate relations...,tv comedies
4,this pawesome documentary explores how our fel...,"documentaries, international movies"


In [7]:
# remove non-ascii characters from description text 
df['description'] = df['description'].apply(lambda value: unicodedata.normalize('NFKD', value)\
                                                                     .encode('ascii', 'ignore')\
                                                                     .decode('utf-8', 'ignore'))
df.head()

Unnamed: 0,description,genre
0,this docuseries takes a deep dive into the luc...,reality tv
1,"as a grisly virus rampages a city, a lone man ...","horror movies, international movies, thrillers"
2,"through her diary, anne frank's story is retol...","documentaries, international movies"
3,kenya barris and his family navigate relations...,tv comedies
4,this pawesome documentary explores how our fel...,"documentaries, international movies"


In [8]:
# remove special characters from description text
df['description'] = df['description'].apply(lambda value: re.sub(r"[^a-z0-9'\s]", '', value))

df.head()

Unnamed: 0,description,genre
0,this docuseries takes a deep dive into the luc...,reality tv
1,as a grisly virus rampages a city a lone man s...,"horror movies, international movies, thrillers"
2,through her diary anne frank's story is retold...,"documentaries, international movies"
3,kenya barris and his family navigate relations...,tv comedies
4,this pawesome documentary explores how our fel...,"documentaries, international movies"


In [9]:
# create tokenizer object
tokenizer = nltk.tokenize.ToktokTokenizer()

# tokenize text in description
df['description'] = df['description'].apply(lambda value: tokenizer.tokenize(value, return_str=True))

df.head()

Unnamed: 0,description,genre
0,this docuseries takes a deep dive into the luc...,reality tv
1,as a grisly virus rampages a city a lone man s...,"horror movies, international movies, thrillers"
2,through her diary anne frank ' s story is reto...,"documentaries, international movies"
3,kenya barris and his family navigate relations...,tv comedies
4,this pawesome documentary explores how our fel...,"documentaries, international movies"


In [10]:
wnl = nltk.stem.WordNetLemmatizer()


for word in 'study studies'.split():
    print('stem:', ps.stem(word), '-- lemma:', wnl.lemmatize(word))

NameError: name 'original' is not defined