# Netflix Movies and TV Shows Data Analysis

### Importing Libraries and Loading Data

In [56]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re
from wordcloud import WordCloud
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import wordnet
from sklearn.feature_extraction.text import TfidfVectorizer


In [136]:
# Load the data
data = pd.read_csv('data/netflix_titles.csv')
data.head()


Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


## Data Cleaning

In [137]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB


In [138]:
data.isnull().sum()

show_id            0
type               0
title              0
director        2634
cast             825
country          831
date_added        10
release_year       0
rating             4
duration           3
listed_in          0
description        0
dtype: int64

In [139]:
data.dropna(inplace=True)

In [140]:
data.isnull().sum()

show_id         0
type            0
title           0
director        0
cast            0
country         0
date_added      0
release_year    0
rating          0
duration        0
listed_in       0
description     0
dtype: int64

In [141]:
data['listed_in'].value_counts()

listed_in
Dramas, International Movies                        336
Stand-Up Comedy                                     286
Comedies, Dramas, International Movies              257
Dramas, Independent Movies, International Movies    243
Children & Family Movies, Comedies                  179
                                                   ... 
Anime Series, Crime TV Shows, TV Horror               1
Romantic TV Shows, TV Comedies, TV Dramas             1
Anime Series, Romantic TV Shows, Teen TV Shows        1
TV Dramas                                             1
Documentaries, Dramas                                 1
Name: count, Length: 335, dtype: int64

In [142]:
# listed_in column is a comma separated string, split the string and get all the genres
# Split the 'listed_in' column on commas to create a list of genres
data['listed_in'] = data['listed_in'].apply(lambda x: x.split(', '))

# Explode the list into separate rows
data_exploded = data.explode('listed_in')

In [143]:
data_exploded

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
7,s8,Movie,Sankofa,Haile Gerima,"Kofi Ghanaba, Oyafunmike Ogunlano, Alexandra D...","United States, Ghana, Burkina Faso, United Kin...","September 24, 2021",1993,TV-MA,125 min,Dramas,"On a photo shoot in Ghana, an American model s..."
7,s8,Movie,Sankofa,Haile Gerima,"Kofi Ghanaba, Oyafunmike Ogunlano, Alexandra D...","United States, Ghana, Burkina Faso, United Kin...","September 24, 2021",1993,TV-MA,125 min,Independent Movies,"On a photo shoot in Ghana, an American model s..."
7,s8,Movie,Sankofa,Haile Gerima,"Kofi Ghanaba, Oyafunmike Ogunlano, Alexandra D...","United States, Ghana, Burkina Faso, United Kin...","September 24, 2021",1993,TV-MA,125 min,International Movies,"On a photo shoot in Ghana, an American model s..."
8,s9,TV Show,The Great British Baking Show,Andy Devonshire,"Mel Giedroyc, Sue Perkins, Mary Berry, Paul Ho...",United Kingdom,"September 24, 2021",2021,TV-14,9 Seasons,British TV Shows,A talented batch of amateur bakers face off in...
8,s9,TV Show,The Great British Baking Show,Andy Devonshire,"Mel Giedroyc, Sue Perkins, Mary Berry, Paul Ho...",United Kingdom,"September 24, 2021",2021,TV-14,9 Seasons,Reality TV,A talented batch of amateur bakers face off in...
...,...,...,...,...,...,...,...,...,...,...,...,...
8805,s8806,Movie,Zoom,Peter Hewitt,"Tim Allen, Courteney Cox, Chevy Chase, Kate Ma...",United States,"January 11, 2020",2006,PG,88 min,Children & Family Movies,"Dragged from civilian life, a former superhero..."
8805,s8806,Movie,Zoom,Peter Hewitt,"Tim Allen, Courteney Cox, Chevy Chase, Kate Ma...",United States,"January 11, 2020",2006,PG,88 min,Comedies,"Dragged from civilian life, a former superhero..."
8806,s8807,Movie,Zubaan,Mozez Singh,"Vicky Kaushal, Sarah-Jane Dias, Raaghav Chanan...",India,"March 2, 2019",2015,TV-14,111 min,Dramas,A scrappy but poor boy worms his way into a ty...
8806,s8807,Movie,Zubaan,Mozez Singh,"Vicky Kaushal, Sarah-Jane Dias, Raaghav Chanan...",India,"March 2, 2019",2015,TV-14,111 min,International Movies,A scrappy but poor boy worms his way into a ty...


In [144]:
data['country'].value_counts()

country
United States                                    1846
India                                             875
United Kingdom                                    183
Canada                                            107
Spain                                              91
                                                 ... 
United States, Sweden, Norway                       1
United States, Germany, United Kingdom, Italy       1
United States, New Zealand, United Kingdom          1
Finland, United States                              1
Spain, France, Uruguay                              1
Name: count, Length: 604, dtype: int64

In [145]:
# confusion Matrix for title and the listed_in
confusion_matrix = data.groupby(['title', 'listed_in']).size().unstack(fill_value=0)


TypeError: unhashable type: 'list'

## Text Processing on Description

In [130]:
import nltk
from nltk.tokenize import word_tokenize

# Ensure stopwords are downloaded
nltk.download('stopwords')
nltk.download('punkt')

# Tokenization function
def tokenize_text(text):
    return word_tokenize(text)

# Assuming `data` is a pandas DataFrame
mined_data = {'Tokens': [], 'listed_in': []}

# Iterating over the DataFrame
for index, row in data_exploded.iterrows():
    # Tokenize the description
    words = tokenize_text(row['description'])
    
    # lowercase 
    words = [word.lower() for word in words]
    
    # Stopword removal
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    
    # remove punctuation
    import re
    words = [re.sub(r'\W+', '', word) for word in words if word.isalpha()]
    
    # Store the tokens and corresponding genre
    mined_data['Tokens'].append(words)
    mined_data['listed_in'].append(row['listed_in'])

# Create a DataFrame with the results
df = pd.DataFrame(mined_data)

# Display the DataFrame
df


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mahshid/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/mahshid/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,Tokens,listed_in
0,"[photo, shoot, ghana, american, model, slips, ...",Dramas
1,"[photo, shoot, ghana, american, model, slips, ...",Independent Movies
2,"[photo, shoot, ghana, american, model, slips, ...",International Movies
3,"[talented, batch, amateur, bakers, face, compe...",British TV Shows
4,"[talented, batch, amateur, bakers, face, compe...",Reality TV
...,...,...
11853,"[dragged, civilian, life, former, superhero, m...",Children & Family Movies
11854,"[dragged, civilian, life, former, superhero, m...",Comedies
11855,"[scrappy, poor, boy, worms, way, tycoon, dysfu...",Dramas
11856,"[scrappy, poor, boy, worms, way, tycoon, dysfu...",International Movies


In [132]:
df['listed_in'].value_counts()

listed_in
International Movies            2369
Dramas                          2293
Comedies                        1553
Action & Adventure               806
Independent Movies               740
Romantic Movies                  579
Thrillers                        547
Children & Family Movies         503
Documentaries                    391
Horror Movies                    336
Stand-Up Comedy                  294
Music & Musicals                 292
Sci-Fi & Fantasy                 236
Sports Movies                    156
Classic Movies                   108
International TV Shows            87
LGBTQ Movies                      80
Cult Movies                       69
Anime Features                    61
Faith & Spirituality              58
TV Dramas                         52
Crime TV Shows                    38
TV Comedies                       30
British TV Shows                  21
Romantic TV Shows                 21
Movies                            20
Docuseries                  

### Word Frequency

In [120]:
# frequency of words in each genre

def word_genre_freq(df):
    genre_word_freq = {}

    for index, row in df.iterrows():
        genre = row['Genre']
        tokens = row['Tokens']

        if genre not in genre_word_freq:
            genre_word_freq[genre] = Counter()

        genre_word_freq[genre].update(tokens)
    
    return genre_word_freq


In [121]:
genre_word_freq = word_genre_freq(df)

In [122]:
genre_word_freq['Dramas'].most_common(5)

[('young', 335), ('life', 232), ('man', 228), ('family', 202), ('woman', 194)]

In [125]:
genre_word_freq['Action & Adventure'].most_common(10)

[('young', 71),
 ('must', 67),
 ('man', 63),
 ('two', 61),
 ('life', 48),
 ('mission', 47),
 ('agent', 46),
 ('group', 44),
 ('new', 42),
 ('world', 42)]

### Word Cloud Visualization

In [85]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

def plot_word_cloud(genre_word_freq, genre):  
    for genre, freq in genre_word_freq.items():
        wordcloud = WordCloud(width = 800, height = 800, 
                        background_color ='white', 
                        stopwords = set(stopwords.words('english')), 
                        min_font_size = 10).generate_from_frequencies(freq)

        plt.figure(figsize = (8, 8), facecolor = None) 
        plt.imshow(wordcloud) 
        plt.axis("off") 
        plt.tight_layout(pad = 0) 
        plt.title(genre)
        plt.show()

In [None]:
plot_word_cloud(genre_word_freq)

### Language Modeling