In [6]:
import pandas as pd
import numpy as np

import plotly.express as px

In [7]:
# Load the IMDb table from a CSV path that is relative to the notebook's
# working directory.
imdb_df = pd.read_csv('data/imdb_encoded.csv')

In [8]:
# Dimensions of the freshly loaded frame, as (rows, columns).
imdb_df.shape

(183967, 43)

In [9]:
# Preview the first rows to check which columns were loaded.
imdb_df.head()

Unnamed: 0,actor1,actor2,actor3,actor4,director,action,adult,adventure,animation,biography,...,western,duration,imdb_rating,votes,release_start,release_month,tv_series,title,synopsis,actors
0,37241,68568,10147,59050,Joseph H. Lewis,0,0,0,0,0,...,0,87.0,7.6,13192.0,1950.0,1,0,Gun Crazy,Two disturbed young people release their fasci...,"['John Dall', 'Peggy Cummins', 'Berry Kroeger'..."
1,63173,22342,29027,25837,Gordon Douglas,0,0,0,0,0,...,1,81.0,6.3,919.0,1950.0,1,0,The Nevadan,A mysterious stranger crosses paths with an ou...,"['Randolph Scott', 'Dorothy Malone', 'Forrest ..."
2,25854,72711,46756,13293,Otto Preminger,0,0,0,0,0,...,0,98.0,6.7,4206.0,1950.0,1,0,Whirlpool,A woman suffering from kleptomania is hypnotiz...,"['Gene Tierney', 'Richard Conte', 'José Ferrer..."
3,65473,74085,15914,12858,George Templeton,0,0,0,0,0,...,1,83.0,5.6,480.0,1950.0,1,0,The Sundowners,Brother is pitted against brother in this tale...,"['Robert Preston', 'Robert Sterling', 'Chill W..."
4,32322,21601,43406,69447,Basil Dearden,0,0,0,0,0,...,0,84.0,6.8,2113.0,1950.0,1,0,The Blue Lamp,The daily routine of two London Policemen is i...,"['Jack Warner', 'Dirk Bogarde', 'Jimmy Hanley'..."


# Feature Engineering

## Topic modeling

In [10]:
# !pip install bertopic

In [11]:
import re
import nltk
from nltk.corpus import stopwords

import gensim
from gensim.utils import simple_preprocess

from bertopic import BERTopic

In [12]:
# deleting rows with no synopsis
# Keep only rows that carry a synopsis: discard the 'Add a Plot' entries and
# missing values, then renumber the index from 0.
has_plot = imdb_df['synopsis'] != 'Add a Plot'
imdb_df = (
    imdb_df.loc[has_plot]
    .dropna(subset=['synopsis'])
    .reset_index(drop=True)
)

In [13]:
# Dimensions after the rows without a synopsis were removed.
imdb_df.shape

(161602, 43)

In [14]:

# Patterns are compiled once and applied to every synopsis in a single pass.
_punctuation_re = re.compile('[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]')
_summary_link_re = re.compile('see full summary\xa0»')


def _normalise_synopsis(text):
    """Return `text` without punctuation, lowercased, without the trailing
    'see full summary' marker, and stripped of surrounding whitespace.

    The order matters: the marker pattern is lowercase, so it can only match
    after the text has been lowercased.
    """
    without_punctuation = _punctuation_re.sub('', text)
    lowered = without_punctuation.lower()
    without_link = _summary_link_re.sub('', lowered)
    return without_link.strip()


imdb_df['synopsis'] = imdb_df['synopsis'].map(_normalise_synopsis)

In [15]:
# Removing stopwords
# The NLTK stop-word corpus has to exist locally; fetch it on first use so the
# notebook also runs in a fresh environment instead of raising LookupError.
try:
    stop_words = stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
    stop_words = stopwords.words('english')


def sent_to_words(sentences):
    """Lazily tokenise every item of `sentences`.

    Each item is converted with `str()` and passed to gensim's
    `simple_preprocess` with `deacc=True`; one list of tokens is yielded per
    input item.
    """
    for sentence in sentences:
        yield simple_preprocess(str(sentence), deacc=True)


def remove_stopwords(texts):
    """Return one token list per document in `texts`, without stop words.

    Documents that are already token sequences (list/tuple) are filtered
    directly; anything else is converted with `str()` and tokenised with
    `simple_preprocess` first. Previously a token list was turned into its
    Python repr and tokenised a second time, which was wasted work.
    """
    # Built at call time so later changes to the global list are honoured,
    # while membership tests no longer scan the whole list for every token.
    stop_set = set(stop_words)
    cleaned = []
    for doc in texts:
        if isinstance(doc, (list, tuple)):
            tokens = doc
        else:
            tokens = simple_preprocess(str(doc))
        cleaned.append([word for word in tokens if word not in stop_set])
    return cleaned


data = imdb_df['synopsis'].values.tolist()
data_words = list(sent_to_words(data))
# remove stop words
data_words = remove_stopwords(data_words)

## BERTopic

In [16]:
# Topic model configuration: n-grams from one to three words, a minimum topic
# size of 30, and progress logging switched on.
# NOTE(review): no seed is passed here, so topic ids may differ between runs;
# confirm whether reproducibility matters for the downstream features.
# NOTE(review): the stored get_topic_freq() output further down lists topics
# with Count 3, which looks at odds with min_topic_size=30; it may come from a
# run with other settings. Confirm by re-running.
model = BERTopic(
    verbose=True,
    n_gram_range=(1, 3),
    min_topic_size=30,
)

In [17]:
# Join each token list back into one space-separated string per synopsis;
# the resulting array is what gets passed to the topic model.
docs = np.array([" ".join(tokens) for tokens in data_words])

In [18]:
# Fit the topic model on the joined synopses and unpack the two values it
# returns as `labels` and `probs`.
labels, probs = model.fit_transform(docs)

In [None]:
# Store the fitted labels as a new 'topic' column.
# NOTE(review): this relies on `docs` being in the same row order as imdb_df;
# no rows appear to be dropped between the two in the visible cells. Confirm.
imdb_df['topic'] = labels

In [None]:
# imdb_df.to_csv('/content//drive/MyDrive/Masters AUA/Spring 2022/Machine Learning/Project/imdb_df.csv')

In [None]:
# imdb_df.to_csv('/content//drive/MyDrive/Masters AUA/Spring 2022/Machine Learning/Project/imdb_df.csv', index=False)

In [None]:
# import pickle
# filename = '/content//drive/MyDrive/Masters AUA/Spring 2022/Machine Learning/Project/topic_modeling_model.sav'
# pickle.dump(model, open(filename, 'wb'))


Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.



In [None]:
# Show the Topic/Count table of the fitted model. In the saved output topic -1
# holds 115959 rows (presumably BERTopic's outlier bucket; confirm).
model.get_topic_freq()

Unnamed: 0,Topic,Count
0,-1,115959
1,0,1013
2,1,763
3,2,561
4,3,556
...,...,...
6673,7710,3
6672,7711,3
6671,7712,3
6670,7703,3


In [None]:
# Render the fitted model's built-in topic visualisation.
model.visualize_topics()

In [27]:
# topics.visualize_topics()

In [28]:
# topics.visualize_barchart(top_n_topics=12)