In [1]:
import ast

import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder

import plotly.express as px

In [2]:
# Load the raw IMDb table from a CSV path relative to the notebook's working directory
imdb_df = pd.read_csv('data/final_imdb.csv')
# Empty frame that later cells fill with the encoded / model-ready columns
imdb_clean = pd.DataFrame()

In [3]:
# (rows, columns) of the raw table
imdb_df.shape

(312521, 14)

In [4]:
# Preview the first rows of the raw table
imdb_df.head()

Unnamed: 0,actors,director,duration,genre,imdb_rating,link,synopsis,title,votes,page_url,page_url_cleaned,release_start,release_month,tv_series
0,"['John Dall', 'Peggy Cummins', 'Berry Kroeger'...",Joseph H. Lewis,87.0,"Crime, Drama, Film-Noir",7.6,https://www.imdb.com/title/tt0042530/,Two disturbed young people release their fasci...,Gun Crazy,13192.0,https://www.imdb.com/search/title/?release_dat...,https://www.imdb.com/search/title/?release_dat...,1950.0,1,0
1,"['Raoul Walsh', 'Errol Flynn', 'Alexis Smith',...",Ray Enright,76.0,Western,6.1,https://www.imdb.com/title/tt0042744/,An Australian sheepman comes to Montana lookin...,Montana,1215.0,https://www.imdb.com/search/title/?release_dat...,https://www.imdb.com/search/title/?release_dat...,1950.0,1,0
2,"['Randolph Scott', 'Dorothy Malone', 'Forrest ...",Gordon Douglas,81.0,Western,6.3,https://www.imdb.com/title/tt0042782/,A mysterious stranger crosses paths with an ou...,The Nevadan,919.0,https://www.imdb.com/search/title/?release_dat...,https://www.imdb.com/search/title/?release_dat...,1950.0,1,0
3,"['Gene Tierney', 'Richard Conte', 'José Ferrer...",Otto Preminger,98.0,"Crime, Drama, Film-Noir",6.7,https://www.imdb.com/title/tt0042039/,A woman suffering from kleptomania is hypnotiz...,Whirlpool,4206.0,https://www.imdb.com/search/title/?release_dat...,https://www.imdb.com/search/title/?release_dat...,1950.0,1,0
4,"['Robert Preston', 'Robert Sterling', 'Chill W...",George Templeton,83.0,Western,5.6,https://www.imdb.com/title/tt0043013/,Brother is pitted against brother in this tale...,The Sundowners,480.0,https://www.imdb.com/search/title/?release_dat...,https://www.imdb.com/search/title/?release_dat...,1950.0,1,0


# Encoding

## Actors

In [5]:
# Actors
print('Unique elements', (imdb_df['actors'].nunique()))
# Calculating the number of top cast.
# Each 'actors' cell holds the string repr of a Python list. It is parsed with
# ast.literal_eval, which only accepts Python literals, instead of eval, which
# would execute any code embedded in the CSV. The column itself is left as
# strings here; only the list lengths are kept.
n_actors = imdb_df['actors'].map(lambda cell: len(ast.literal_eval(cell))).tolist()
imdb_df['n_actors'] = n_actors
px.histogram(imdb_df, 'n_actors')

Unique elements 287807


In [6]:
# Some records have an implausibly long actors list - presumably because other
# credits (e.g. directors) were tagged as actors during scraping; TODO confirm.
# Keep only films that list between 1 and 4 actors: rows with more than 4 actors
# and rows with no actors at all are dropped. The shape is printed before and
# after so the number of removed rows is visible.
print(imdb_df.shape)
imdb_df = imdb_df[(imdb_df.n_actors <= 4) & (imdb_df.n_actors != 0)].reset_index(drop=True)
print(imdb_df.shape)

(312521, 15)
(277967, 15)


In [7]:
# Distribution of the actor count after the filtering step
px.histogram(imdb_df, 'n_actors')

In [8]:
# Turn each stringified actor list into a real Python list.
# ast.literal_eval only accepts literals, so cell contents coming from the CSV are
# never executed as code (unlike eval). Values that are already parsed are passed
# through unchanged, which keeps this cell safe to re-run.
imdb_df['actors'] = [ast.literal_eval(i) if isinstance(i, str) else i for i in imdb_df['actors']]
# Spread the lists over four positional columns. Lists shorter than four entries
# leave the trailing columns empty (missing values).
imdb_clean[['actor1', 'actor2', 'actor3', 'actor4']] = pd.DataFrame(imdb_df['actors'].tolist())

In [9]:
# Replace the actor names in each positional column with integer codes.
# A fresh LabelEncoder is fitted per column, so the codes are independent between
# columns: the same actor can receive different codes in actor1 and actor2.
# NOTE(review): rows with fewer than four actors presumably hold missing values in
# the trailing columns; how LabelEncoder treats those depends on the installed
# scikit-learn version - verify.
actor_columns = ['actor1', 'actor2', 'actor3', 'actor4']
for column in actor_columns:
    labelencoder = LabelEncoder()
    imdb_clean[column] = labelencoder.fit_transform(imdb_clean[column])

In [10]:
# Summary statistics of the encoded actor columns
imdb_clean.describe()

Unnamed: 0,actor1,actor2,actor3,actor4
count,277967.0,277967.0,277967.0,277967.0
mean,61195.458,69066.077829,74389.588401,72229.946803
std,34987.187583,39413.263328,42085.399311,36240.416365
min,0.0,0.0,0.0,0.0
25%,31003.5,35177.0,38217.0,40539.5
50%,61124.0,69016.0,74661.0,80436.0
75%,91846.0,103377.0,111569.5,109379.0
max,121412.0,131974.0,137173.0,109379.0


## Director

In [11]:
# Encode the director names as integer codes and store them in the clean frame.
# The source column is read from imdb_df; the result is written to imdb_clean.
director_names = imdb_df['director']
labelencoder = LabelEncoder()
imdb_clean['director'] = labelencoder.fit_transform(director_names)

In [12]:
# Display the encoded frame built so far (the notebook shows a truncated view)
imdb_clean

Unnamed: 0,actor1,actor2,actor3,actor4,director
0,55016,97085,13731,73544,49418
1,93259,31026,39521,31712,33799
2,37704,103090,64231,16239,73749
3,96700,104953,21570,15711,32329
4,47936,29996,59786,86615,9312
...,...,...,...,...,...
277962,37786,107107,5397,2560,23853
277963,91134,113540,103311,106341,77751
277964,47563,15714,8961,9933,12494
277965,75617,35822,109649,109379,88617


## Genre

In [13]:
# Split the comma-separated genre string into a list of labels. Rows without a
# genre get the single placeholder label 'Unknown'. pd.notna covers both NaN and
# None, where the previous `i == i` test would crash on None.
genres = [i.split(', ') if pd.notna(i) else ['Unknown'] for i in imdb_df['genre']]
mlb = MultiLabelBinarizer()
# Fit explicitly before touching mlb.classes_: the attribute only exists after
# fitting, so doing this in a separate step avoids depending on the evaluation
# order of a single assignment statement.
genre_matrix = mlb.fit_transform(genres)
# One 0/1 indicator column per genre label
imdb_clean[mlb.classes_] = pd.DataFrame(genre_matrix, columns=mlb.classes_)


In [14]:
# List the source columns to pick from when assembling the final frame
imdb_df.columns

Index(['actors', 'director', 'duration', 'genre', 'imdb_rating', 'link',
       'synopsis', 'title', 'votes', 'page_url', 'page_url_cleaned',
       'release_start', 'release_month', 'tv_series', 'n_actors'],
      dtype='object')

## Creating final df

In [15]:
# Columns taken over from the source frame without any encoding
passthrough_columns = [
    'duration', 'imdb_rating', 'votes', 'release_start',
    'release_month', 'tv_series', 'title', 'synopsis',
]
imdb_clean[passthrough_columns] = imdb_df[passthrough_columns]

In [16]:
# Number of missing values per column of the assembled frame
imdb_clean.isna().sum()

actor1               0
actor2               0
actor3               0
actor4               0
director             0
Action               0
Adult                0
Adventure            0
Animation            0
Biography            0
Comedy               0
Crime                0
Documentary          0
Drama                0
Family               0
Fantasy              0
Film-Noir            0
Game-Show            0
History              0
Horror               0
Music                0
Musical              0
Mystery              0
News                 0
Reality-TV           0
Romance              0
Sci-Fi               0
Short                0
Sport                0
Talk-Show            0
Thriller             0
Unknown              0
War                  0
Western              0
duration         49807
imdb_rating      69794
votes            69794
release_start      121
release_month        0
tv_series            0
title                1
synopsis             5
dtype: int64

In [21]:
# Persist the assembled frame (without the index) next to the input data.
# NOTE(review): this cell's execution count is out of order relative to its position,
# so it may have been run after the synopsis filtering / cleaning cells further down.
# The file on disk can therefore differ from what a top-to-bottom run writes here -
# confirm which version is intended.
imdb_clean.to_csv('data/imdb_clean.csv', index=False)

# Feature Engineering

## Topic modeling

In [17]:
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

import gensim
from gensim.utils import simple_preprocess
import gensim.corpora as corpora

import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
# NOTE(review): presumably enables inline rendering of pyLDAvis output in the notebook - confirm against the pyLDAvis docs
pyLDAvis.enable_notebook()

import re
from pprint import pprint

In [18]:
# Keep only rows that carry a real synopsis: the text must be present (not NaN)
# and must differ from the literal 'Add a Plot'. The index is renumbered afterwards.
has_real_synopsis = imdb_clean['synopsis'].ne('Add a Plot') & imdb_clean['synopsis'].notna()
imdb_clean = imdb_clean[has_real_synopsis].reset_index(drop=True)

In [19]:
# (rows, columns) remaining after dropping rows without a synopsis
imdb_clean.shape

(196712, 42)

In [20]:
# Removing punctuation (the ASCII punctuation set).
# '-', '[', '\' and ']' are escaped inside the character class. Previously the
# sequence `\\]` was parsed as an escaped ']', so backslashes were never removed.
# Vectorized .str methods replace the per-row lambdas.
imdb_clean['synopsis'] = imdb_clean['synopsis'].str.replace(
    r'[!"#$%&\'()*+,\-./:;<=>?@\[\\\]^_`{|}~]', '', regex=True)
# Converting the text to lowercase
imdb_clean['synopsis'] = imdb_clean['synopsis'].str.lower()
# Removing the trailing 'see full summary »' marker. It is matched after lowercasing,
# and \s* accepts the non-breaking space as well as any other (or no) whitespace
# before the '»'.
imdb_clean['synopsis'] = imdb_clean['synopsis'].str.replace(
    r'see full summary\s*»', '', regex=True)
# Deleting unnecessary leading / trailing spaces
imdb_clean['synopsis'] = imdb_clean['synopsis'].str.strip()

In [21]:
# Removing stopwords
try:
    stop_words = stopwords.words('english')
except LookupError:
    # The stopword corpus is a separate NLTK resource; fetch it once if it is
    # missing so the notebook also runs on a fresh environment.
    nltk.download('stopwords')
    stop_words = stopwords.words('english')

def sent_to_words(sentences):
    """Lazily tokenize each sentence with gensim's simple_preprocess.

    Every item is converted to ``str`` first; ``deacc=True`` strips accent marks.
    Yields one list of tokens per input sentence.
    """
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))

def remove_stopwords(texts):
    """Return a copy of ``texts`` with all stop words removed.

    ``texts`` is an iterable of documents. A document that is already a list of
    tokens is filtered directly; previously it was stringified and re-tokenized,
    which gave the same tokens for ``sent_to_words`` output at a much higher cost.
    A raw string document is still tokenized with ``simple_preprocess`` first.
    """
    # Built per call so later edits to the module-level stop_words list are honoured;
    # a set gives constant-time membership tests.
    stop_set = set(stop_words)
    filtered = []
    for doc in texts:
        tokens = simple_preprocess(doc) if isinstance(doc, str) else doc
        filtered.append([word for word in tokens if word not in stop_set])
    return filtered

data = imdb_clean['synopsis'].values.tolist()
data_words = list(sent_to_words(data))
# remove stop words
data_words = remove_stopwords(data_words)

In [22]:
# Dictionary: maps every distinct token to an integer id
id2word = corpora.Dictionary(data_words)
# The tokenized documents serve as the corpus texts
texts = data_words
# Bag-of-words form: each document becomes a list of (token id, count) pairs
corpus = list(map(id2word.doc2bow, texts))
# Peek at (up to) the first 30 pairs of the first document
print(corpus[0][:30])

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1)]


In [25]:
# number of topics
num_topics = 5
# Build LDA model.
# random_state seeds the model so the topics are comparable between runs.
# NOTE(review): with the multicore trainer the result may still vary slightly
# from run to run - verify if exact reproducibility matters.
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics,
                                       random_state=42)
# Print the Keyword in the num_topics
pprint(lda_model.print_topics())
# Topic distribution of every document in the corpus
doc_lda = lda_model[corpus]

[(0,
  '0.006*"young" + 0.005*"video" + 0.005*"music" + 0.005*"love" + 0.005*"life" '
  '+ 0.004*"one" + 0.004*"father" + 0.004*"two" + 0.004*"story" + '
  '0.004*"woman"'),
 (1,
  '0.011*"family" + 0.011*"life" + 0.010*"love" + 0.009*"two" + 0.006*"story" '
  '+ 0.005*"years" + 0.005*"man" + 0.005*"young" + 0.004*"one" + 0.004*"film"'),
 (2,
  '0.007*"new" + 0.007*"show" + 0.006*"series" + 0.005*"two" + 0.005*"world" + '
  '0.004*"one" + 0.004*"group" + 0.003*"young" + 0.003*"school" + '
  '0.003*"life"'),
 (3,
  '0.012*"young" + 0.008*"man" + 0.007*"life" + 0.007*"one" + 0.007*"woman" + '
  '0.004*"find" + 0.004*"get" + 0.004*"friends" + 0.004*"girl" + '
  '0.003*"finds"'),
 (4,
  '0.006*"story" + 0.006*"young" + 0.006*"new" + 0.005*"police" + '
  '0.005*"lives" + 0.004*"life" + 0.004*"two" + 0.003*"wife" + 0.003*"man" + '
  '0.003*"one"')]


In [26]:
# feed the LDA model into the pyLDAvis instance
# (prepare() receives the fitted model, the bag-of-words corpus and the dictionary)
lda_viz = gensimvis.prepare(lda_model, corpus, id2word)
# Bare last expression so the notebook displays the prepared object
lda_viz


In a future version of pandas all arguments of DataFrame.drop except for the argument 'labels' will be keyword-only

  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload


## BERT

In [18]:
from sentence_transformers import SentenceTransformer

# Where the computed sentence embeddings are cached between runs
EMBEDDINGS_PATH = 'data/synopsis_embeddings.npy'

data = imdb_clean['synopsis']
# Hand the encoder a plain list of strings rather than a pandas Series, so any
# positional indexing inside the encoder cannot be confused with index labels.
sentences = data.tolist()
model = SentenceTransformer('distilbert-base-nli-mean-tokens')

# Encoding the full corpus takes hours on CPU, so reuse a cached result when one
# exists and has one row per synopsis; otherwise compute and store it.
try:
    embeddings = np.load(EMBEDDINGS_PATH)
except FileNotFoundError:
    embeddings = None
if embeddings is None or len(embeddings) != len(sentences):
    embeddings = model.encode(sentences, show_progress_bar=True)
    np.save(EMBEDDINGS_PATH, embeddings)

Downloading: 100%|██████████| 690/690 [00:00<00:00, 205kB/s]
Downloading: 100%|██████████| 190/190 [00:00<00:00, 193kB/s]
Downloading: 100%|██████████| 3.99k/3.99k [00:00<00:00, 1.05MB/s]
Downloading: 100%|██████████| 550/550 [00:00<00:00, 146kB/s]
Downloading: 100%|██████████| 122/122 [00:00<00:00, 68.4kB/s]
Downloading: 100%|██████████| 229/229 [00:00<00:00, 60.8kB/s]
Downloading: 100%|██████████| 265M/265M [00:32<00:00, 8.27MB/s] 
Downloading: 100%|██████████| 53.0/53.0 [00:00<00:00, 9.18kB/s]
Downloading: 100%|██████████| 112/112 [00:00<00:00, 39.1kB/s]
Downloading: 100%|██████████| 466k/466k [00:00<00:00, 500kB/s] 
Downloading: 100%|██████████| 450/450 [00:00<00:00, 107kB/s]
Downloading: 100%|██████████| 232k/232k [00:00<00:00, 280kB/s]  
Batches:   9%|▊         | 745/8687 [18:01<4:38:21,  2.10s/it]