In [1]:
import pandas as pd
import numpy as np

import plotly.express as px

In [9]:
import re
import nltk
from nltk.corpus import stopwords

import gensim
from gensim.utils import simple_preprocess

from bertopic import BERTopic

In [5]:
# Load the CSV into `imdb_df`; the path is relative to the current working directory.
imdb_df = pd.read_csv('../data/imdb_encoded.csv')

In [6]:
# Dimensions of the loaded frame as (rows, columns).
imdb_df.shape

(183967, 46)

In [7]:
# Preview the first rows of the frame.
imdb_df.head()

Unnamed: 0,actor1,actor2,actor3,actor4,director_enc,action,adult,adventure,animation,biography,...,duration,imdb_rating,votes,release_start,release_month,tv_series,title,synopsis,director,actors
0,37241,68568,10147,59050,33767,0,0,0,0,0,...,87.0,7.6,13192.0,1950.0,1,0,Gun Crazy,Two disturbed young people release their fasci...,Joseph H. Lewis,"['John Dall', 'Peggy Cummins', 'Berry Kroeger'..."
1,63173,22342,29027,25837,23232,0,0,0,0,0,...,81.0,6.3,919.0,1950.0,1,0,The Nevadan,A mysterious stranger crosses paths with an ou...,Gordon Douglas,"['Randolph Scott', 'Dorothy Malone', 'Forrest ..."
2,25854,72711,46756,13293,50165,0,0,0,0,0,...,98.0,6.7,4206.0,1950.0,1,0,Whirlpool,A woman suffering from kleptomania is hypnotiz...,Otto Preminger,"['Gene Tierney', 'Richard Conte', 'José Ferrer..."
3,65473,74085,15914,12858,22254,0,0,0,0,0,...,83.0,5.6,480.0,1950.0,1,0,The Sundowners,Brother is pitted against brother in this tale...,George Templeton,"['Robert Preston', 'Robert Sterling', 'Chill W..."
4,32322,21601,43406,69447,6356,0,0,0,0,0,...,84.0,6.8,2113.0,1950.0,1,0,The Blue Lamp,The daily routine of two London Policemen is i...,Basil Dearden,"['Jack Warner', 'Dirk Bogarde', 'Jimmy Hanley'..."


In [8]:
# Full list of column names (the preview above elides the middle columns).
imdb_df.columns

Index(['actor1', 'actor2', 'actor3', 'actor4', 'director_enc', 'action',
       'adult', 'adventure', 'animation', 'biography', 'comedy', 'crime',
       'documentary', 'drama', 'family', 'fantasy', 'film-noir', 'game-show',
       'history', 'horror', 'music', 'musical', 'mystery', 'news',
       'reality-tv', 'romance', 'sci-fi', 'short', 'sport', 'talk-show',
       'thriller', 'unknown', 'war', 'western', 'link', 'genre', 'duration',
       'imdb_rating', 'votes', 'release_start', 'release_month', 'tv_series',
       'title', 'synopsis', 'director', 'actors'],
      dtype='object')

## Data cleaning

In [15]:
# Keep only rows that have a usable synopsis: drop missing values and rows whose
# synopsis is the literal 'Add a Plot', then renumber the index from 0.
imdb_df = (
    imdb_df
    .dropna(subset=['synopsis'])
    .loc[lambda frame: frame['synopsis'] != 'Add a Plot']
    .reset_index(drop=True)
)

In [16]:
# Dimensions of the frame after the synopsis filter above.
imdb_df.shape

(161602, 47)

In [17]:

# Removing punctuation
# Every ASCII punctuation character, written as a raw pattern with '-', '[',
# '\' and ']' escaped explicitly. The previous non-raw pattern contained
# '[\\]', which reaches the regex engine as an escaped ']' only, so the
# backslash itself was never part of the class and was never stripped.
PUNCTUATION_RE = re.compile(r"""[!"#$%&'()*+,\-./:;<=>?@\[\\\]^_`{|}~]""")


def clean_synopsis(text):
    """Normalise one synopsis string.

    Steps, in order:
      1. delete every ASCII punctuation character (including the backslash);
      2. convert the text to lowercase;
      3. delete the literal 'see full summary\xa0»' marker (its dots are
         already gone after step 1, and it is lowercase after step 2);
      4. strip leading/trailing whitespace.

    Parameters
    ----------
    text : str
        Synopsis text. Rows with a missing synopsis are expected to have been
        dropped before this is applied.

    Returns
    -------
    str
        The cleaned synopsis.
    """
    text = PUNCTUATION_RE.sub('', text)
    text = text.lower()
    # The marker contains no regex metacharacters, so a literal replace is
    # equivalent to the regex substitution used previously.
    text = text.replace('see full summary\xa0»', '')
    return text.strip()


# One pass over the column instead of four separate passes.
imdb_df['synopsis'] = imdb_df['synopsis'].map(clean_synopsis)

In [18]:
# Lemmatization
# nltk.download('wordnet')
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()


def lemmatize_text(text):
    """Split `text` on whitespace and return the list of lemmatized tokens.

    Each token is passed to `lemmatizer.lemmatize` with its default arguments.
    """
    lemmas = []
    for token in w_tokenizer.tokenize(text):
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas


# Store the per-row token lists in a new column; 'synopsis' itself is left as is.
imdb_df['synopsis_lemmatized'] = imdb_df['synopsis'].apply(lemmatize_text)


In [19]:
# Removing stopwords
# nltk.download('stopwords')
stop_words = stopwords.words('english')


def _doc_to_text(doc):
    """Return `doc` as a single string suitable for `simple_preprocess`.

    Lists/tuples of tokens are joined with single spaces; any other value is
    converted with `str()`, as before. Joining avoids handing the tokenizer the
    *repr* of a list (brackets, quotes and escape sequences for non-printable
    characters), whose escape text could otherwise surface as spurious tokens.
    """
    if isinstance(doc, (list, tuple)):
        return " ".join(str(token) for token in doc)
    return str(doc)


def sent_to_words(sentences):
    """Yield one list of gensim-preprocessed tokens per input document.

    Parameters
    ----------
    sentences : iterable
        Documents, each either a string or a list/tuple of tokens.

    Yields
    ------
    list of str
        Tokens produced by `simple_preprocess(..., deacc=True)`.
    """
    for sentence in sentences:
        yield simple_preprocess(_doc_to_text(sentence), deacc=True)


def remove_stopwords(texts):
    """Return `texts` re-tokenized with English stop words filtered out.

    Parameters
    ----------
    texts : iterable
        Documents, each either a string or a list/tuple of tokens.

    Returns
    -------
    list of list of str
        One token list per document, without any token found in `stop_words`.
    """
    # Built per call (not at definition time) so that later edits to the
    # module-level `stop_words` list are honoured; a set makes each membership
    # test O(1) instead of a scan of the whole list for every token.
    stop_set = set(stop_words)
    return [
        [word for word in simple_preprocess(_doc_to_text(doc)) if word not in stop_set]
        for doc in texts
    ]


data = imdb_df['synopsis_lemmatized'].values.tolist()
data_words = list(sent_to_words(data))
# remove stop words
data_words = remove_stopwords(data_words)

## BERTopic

In [20]:
# Configure the topic model; it is fitted in a later cell.
# NOTE(review): no random seed is set here, so topic assignments may differ
# between runs — confirm whether reproducibility is required.
model = BERTopic(
    min_topic_size=50,
    n_gram_range=(1, 3),
    verbose=True,
)

In [21]:
# Re-join each document's token list into one space-separated string and
# collect the strings in a NumPy array, one entry per document.
docs = np.array([" ".join(tokens) for tokens in data_words])

In [22]:
# Fit the topic model on the documents and keep the two values it returns
# (bound here as `labels` and `probs`).
labels, probs = model.fit_transform(docs)

In [48]:
# Attach the fitted labels as a new 'topic' column.
# Relies on `labels` being in the same order as the rows of `imdb_df`
# (docs were built from imdb_df['synopsis_lemmatized'] without reordering).
imdb_df['topic'] = labels

In [51]:
# Persist the frame, including the new 'topic' column, without the index.
# NOTE(review): the input was read from '../data/' and the model is saved under
# '../models/', but this writes to 'data/' relative to the working directory —
# confirm the target directory is intentional.
imdb_df.to_csv('data/ded_with_topics.csv', index=False)

In [52]:
# Render the model's bar-chart visualisation, limited to 12 topics.
model.visualize_barchart(top_n_topics=12)

In [55]:
# Save the fitted topic model to disk; save_embedding_model=False is passed,
# so the embedding model is not requested to be stored with it.
model.save("../models/topic_model", save_embedding_model=False)