In [2]:
# from google.colab import drive
# drive.mount('/content/drive')

In [3]:
# %cd /content/drive/MyDrive/Masters AUA/Spring 2022/Machine Learning/imdb

In [1]:
import pandas as pd
import numpy as np

import plotly.express as px

In [2]:
# Load data/imdb_encoded.csv (path relative to the working directory) into a DataFrame.
# NOTE(review): the preview further down shows an "Unnamed: 0" column, which looks
# like a saved index being read back as data — confirm whether index_col=0 is wanted.
imdb_df = pd.read_csv('data/imdb_encoded.csv')

In [3]:
# Row and column counts of the freshly loaded frame.
imdb_df.shape

(183967, 43)

In [4]:
# Preview the first rows to inspect the available columns and their values.
imdb_df.head()

Unnamed: 0,actor1,actor2,actor3,actor4,director,action,adult,adventure,animation,biography,...,western,duration,imdb_rating,votes,release_start,release_month,tv_series,title,synopsis,actors
0,37241,68568,10147,59050,Joseph H. Lewis,0,0,0,0,0,...,0,87.0,7.6,13192.0,1950.0,1,0,Gun Crazy,Two disturbed young people release their fasci...,"['John Dall', 'Peggy Cummins', 'Berry Kroeger'..."
1,63173,22342,29027,25837,Gordon Douglas,0,0,0,0,0,...,1,81.0,6.3,919.0,1950.0,1,0,The Nevadan,A mysterious stranger crosses paths with an ou...,"['Randolph Scott', 'Dorothy Malone', 'Forrest ..."
2,25854,72711,46756,13293,Otto Preminger,0,0,0,0,0,...,0,98.0,6.7,4206.0,1950.0,1,0,Whirlpool,A woman suffering from kleptomania is hypnotiz...,"['Gene Tierney', 'Richard Conte', 'José Ferrer..."
3,65473,74085,15914,12858,George Templeton,0,0,0,0,0,...,1,83.0,5.6,480.0,1950.0,1,0,The Sundowners,Brother is pitted against brother in this tale...,"['Robert Preston', 'Robert Sterling', 'Chill W..."
4,32322,21601,43406,69447,Basil Dearden,0,0,0,0,0,...,0,84.0,6.8,2113.0,1950.0,1,0,The Blue Lamp,The daily routine of two London Policemen is i...,"['Jack Warner', 'Dirk Bogarde', 'Jimmy Hanley'..."


# Feature Engineering

## Topic modeling

In [5]:
# !pip install bertopic

In [6]:
import re
import nltk
from nltk.corpus import stopwords

import gensim
from gensim.utils import simple_preprocess

from bertopic import BERTopic

In [7]:
# Keep only rows that carry a real synopsis: discard the 'Add a Plot' placeholder
# and missing values, then renumber the index from zero.
is_placeholder = imdb_df['synopsis'] == 'Add a Plot'
imdb_df = (
    imdb_df.loc[~is_placeholder]
    .dropna(subset=['synopsis'])
    .reset_index(drop=True)
)

In [8]:
# Row and column counts after dropping the rows without a synopsis.
imdb_df.shape

(161602, 43)

In [9]:

def _clean_synopsis(text):
    """Normalize one synopsis string.

    Steps, in order: delete ASCII punctuation, lowercase the text, and delete
    the 'see full summary\xa0»' marker.
    """
    without_punct = re.sub('[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]', '', text)
    lowered = without_punct.lower()
    return re.sub('see full summary\xa0»', '', lowered)


# Clean every synopsis in a single pass, then trim surrounding whitespace.
imdb_df['synopsis'] = imdb_df['synopsis'].map(_clean_synopsis).str.strip()

In [11]:
# Lemmatization
# Download the 'wordnet' NLTK package (the recorded output shows it being unzipped
# into the local nltk_data directory).
nltk.download('wordnet')
# Module-level tokenizer and lemmatizer instances; lemmatize_text reads both.
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize_text(text):
    """Split ``text`` with the module-level tokenizer and lemmatize each token.

    Returns a list holding one lemmatized string per token, in the original order.
    """
    tokens = w_tokenizer.tokenize(text)
    lemmas = []
    for token in tokens:
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

# Store each synopsis as a list of lemmatized tokens in a new column.
imdb_df['synopsis_lemmatized'] = imdb_df['synopsis'].apply(lemmatize_text)


[nltk_data] Downloading package wordnet to /home/hem/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [12]:
# Removing stopwords
# nltk.download('stopwords')
# English stop-word list from the NLTK corpus. The download line above is commented
# out, so the 'stopwords' package must already be installed for this to succeed.
stop_words = stopwords.words('english')

def sent_to_words(sentences):
    """Lazily tokenize each item with gensim's ``simple_preprocess``.

    Parameters
    ----------
    sentences : iterable
        Each item is either a raw string or an already tokenized list/tuple of
        strings (e.g. the output of ``lemmatize_text``).

    Yields
    ------
    list of str
        The tokens returned by ``simple_preprocess(text, deacc=True)`` for
        that item.
    """
    for sentence in sentences:
        if isinstance(sentence, (list, tuple)):
            # Re-join tokens with spaces. Calling str() on a list would hand the
            # tokenizer the list's repr (brackets, quotes, and backslash escapes
            # for non-printable characters), which can leak spurious tokens.
            text = " ".join(map(str, sentence))
        else:
            text = str(sentence)
        yield gensim.utils.simple_preprocess(text, deacc=True)

def remove_stopwords(texts):
    """Tokenize every document and drop tokens found in ``stop_words``.

    Parameters
    ----------
    texts : iterable
        Each item is either a raw string or a list/tuple of tokens.

    Returns
    -------
    list of list of str
        One token list per input document, in input order, with stop words
        removed.
    """
    # Set membership is constant-time; the module-level stop_words list itself
    # is left untouched so other code can keep treating it as a list.
    stop_set = set(stop_words)
    cleaned = []
    for doc in texts:
        if isinstance(doc, (list, tuple)):
            # Join tokens rather than str(doc): the repr of a list carries
            # brackets, quotes and escape sequences into the tokenizer.
            text = " ".join(map(str, doc))
        else:
            text = str(doc)
        cleaned.append([word for word in simple_preprocess(text)
                        if word not in stop_set])
    return cleaned
             
# Pull the lemmatized token lists out of the frame, tokenize them again with
# sent_to_words, and strip the stop words in one go.
data = imdb_df['synopsis_lemmatized'].values.tolist()
data_words = remove_stopwords(list(sent_to_words(data)))

## BERTopic

In [13]:
# Build the topic model with min_topic_size=50, n_gram_range=(1, 3) and verbose logging.
# NOTE(review): no seed or custom dimensionality-reduction model is passed here, so
# topic assignments are presumably not reproducible across runs — confirm and pin if needed.
model = BERTopic(min_topic_size=50, n_gram_range=(1,3), verbose=True)

In [14]:
# Join each token list back into one space-separated string per document.
docs = np.array([" ".join(tokens) for tokens in data_words])

In [15]:
# Fit the topic model on all documents and keep the two returned values as labels and
# probs. Long-running: the recorded log shows roughly 16 minutes for the embedding step.
labels, probs = model.fit_transform(docs)

Batches: 100%|██████████| 5051/5051 [16:25<00:00,  5.13it/s]
2022-05-12 17:38:24,632 - BERTopic - Transformed documents to Embeddings


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [17]:
# Attach the topic label of each document to its row. docs was built from the
# 'synopsis_lemmatized' column in row order, so labels line up with imdb_df.
imdb_df['topic'] = labels

In [18]:
import pickle

# Serialize the fitted topic model to disk so it can be reloaded later.
# The context manager ensures the file is flushed and closed even if pickling
# raises, instead of leaving the handle from a bare open() dangling.
filename = 'topic_modeling_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

In [19]:
# Number of documents per topic. In the recorded output topic -1 holds 97,934 rows
# (presumably BERTopic's outlier bucket — confirm).
model.get_topic_freq()

Unnamed: 0,Topic,Count
0,-1,97934
1,0,4351
2,1,3811
3,2,2440
4,3,2255
...,...,...
142,141,54
143,142,53
144,143,52
145,144,52


In [20]:
# Write the frame, now including the 'topic' column, to the working directory
# without the index column.
imdb_df.to_csv('imdb_with_topics.csv', index=False)

In [21]:
# NOTE(review): hard-coded absolute Google Drive path — this only works in a Colab
# session with Drive mounted (the mount cell at the top of the notebook is commented out).
%cd /content/drive/MyDrive/Masters AUA/Spring 2022/Machine Learning/imdb

/content/drive/MyDrive/Masters AUA/Spring 2022/Machine Learning/imdb


In [22]:
import pickle

# Reload the previously saved topic model from the working directory.
# SECURITY: pickle.load can execute arbitrary code embedded in the file — only
# load model files that this notebook (or another trusted source) produced.
filename = 'topic_modeling_model.sav'
with open(filename, 'rb') as model_file:
    # The context manager closes the handle instead of leaking a bare open().
    model = pickle.load(model_file)

In [26]:
# Render the model's bar-chart visualization, limited to the top 12 topics.
model.visualize_barchart(top_n_topics=12)

In [27]:
# Topic sizes from the reloaded model; the recorded table matches the one shown
# before the model was saved.
model.get_topic_freq()

Unnamed: 0,Topic,Count
0,-1,97934
1,0,4351
2,1,3811
3,2,2440
4,3,2255
...,...,...
142,141,54
143,142,53
144,143,52
145,144,52


In [28]:
# Look up topics for the search term 'santa'; the recorded output is a list of
# topic ids paired with a list of scores (highest first).
model.find_topics('santa')

([14, 11, 9, 101, 6],
 [0.8368458016998195,
  0.5456008266233818,
  0.4975201430573063,
  0.4970034316063777,
  0.4946339519691568])