In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%cd /content/drive/MyDrive/Masters AUA/Spring 2022/Machine Learning/training

/content/drive/MyDrive/Masters AUA/Spring 2022/Machine Learning/training


In [3]:
import pandas as pd
import numpy as np

import plotly.express as px

In [4]:
imdb_df = pd.read_csv('data/imdb_encoded.csv')

In [5]:
imdb_df.shape

(183967, 46)

In [6]:
imdb_df.head()

Unnamed: 0,actor1,actor2,actor3,actor4,director_enc,action,adult,adventure,animation,biography,...,duration,imdb_rating,votes,release_start,release_month,tv_series,title,synopsis,director,actors
0,37241,68568,10147,59050,33767,0,0,0,0,0,...,87.0,7.6,13192.0,1950.0,1,0,Gun Crazy,Two disturbed young people release their fasci...,Joseph H. Lewis,"['John Dall', 'Peggy Cummins', 'Berry Kroeger'..."
1,63173,22342,29027,25837,23232,0,0,0,0,0,...,81.0,6.3,919.0,1950.0,1,0,The Nevadan,A mysterious stranger crosses paths with an ou...,Gordon Douglas,"['Randolph Scott', 'Dorothy Malone', 'Forrest ..."
2,25854,72711,46756,13293,50165,0,0,0,0,0,...,98.0,6.7,4206.0,1950.0,1,0,Whirlpool,A woman suffering from kleptomania is hypnotiz...,Otto Preminger,"['Gene Tierney', 'Richard Conte', 'José Ferrer..."
3,65473,74085,15914,12858,22254,0,0,0,0,0,...,83.0,5.6,480.0,1950.0,1,0,The Sundowners,Brother is pitted against brother in this tale...,George Templeton,"['Robert Preston', 'Robert Sterling', 'Chill W..."
4,32322,21601,43406,69447,6356,0,0,0,0,0,...,84.0,6.8,2113.0,1950.0,1,0,The Blue Lamp,The daily routine of two London Policemen is i...,Basil Dearden,"['Jack Warner', 'Dirk Bogarde', 'Jimmy Hanley'..."


In [7]:
imdb_df.columns

Index(['actor1', 'actor2', 'actor3', 'actor4', 'director_enc', 'action',
       'adult', 'adventure', 'animation', 'biography', 'comedy', 'crime',
       'documentary', 'drama', 'family', 'fantasy', 'film-noir', 'game-show',
       'history', 'horror', 'music', 'musical', 'mystery', 'news',
       'reality-tv', 'romance', 'sci-fi', 'short', 'sport', 'talk-show',
       'thriller', 'unknown', 'war', 'western', 'link', 'genre', 'duration',
       'imdb_rating', 'votes', 'release_start', 'release_month', 'tv_series',
       'title', 'synopsis', 'director', 'actors'],
      dtype='object')

# Feature Engineering

## Topic modeling

In [8]:
!pip install bertopic

Collecting bertopic
  Downloading bertopic-0.10.0-py2.py3-none-any.whl (58 kB)
[?25l[K     |█████▋                          | 10 kB 17.2 MB/s eta 0:00:01[K     |███████████▏                    | 20 kB 11.5 MB/s eta 0:00:01[K     |████████████████▊               | 30 kB 9.3 MB/s eta 0:00:01[K     |██████████████████████▍         | 40 kB 8.5 MB/s eta 0:00:01[K     |████████████████████████████    | 51 kB 5.2 MB/s eta 0:00:01[K     |████████████████████████████████| 58 kB 2.9 MB/s 
Collecting umap-learn>=0.5.0
  Downloading umap-learn-0.5.3.tar.gz (88 kB)
[K     |████████████████████████████████| 88 kB 4.3 MB/s 
[?25hCollecting sentence-transformers>=0.4.1
  Downloading sentence-transformers-2.2.0.tar.gz (79 kB)
[K     |████████████████████████████████| 79 kB 6.0 MB/s 
[?25hCollecting hdbscan>=0.8.28
  Downloading hdbscan-0.8.28.tar.gz (5.2 MB)
[K     |████████████████████████████████| 5.2 MB 32.2 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting

In [9]:
import re
import nltk
from nltk.corpus import stopwords

import gensim
from gensim.utils import simple_preprocess

from bertopic import BERTopic

In [10]:
# deleting rows with no synopsis
imdb_df = imdb_df[imdb_df['synopsis'] != 'Add a Plot'].dropna(subset=['synopsis']).reset_index(drop=True)

In [11]:
imdb_df.shape

(161602, 46)

In [12]:

# Removing punctuation
imdb_df['synopsis'] = imdb_df['synopsis'].map(lambda x: re.sub('[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]', '', x))
# Converting the text to lowercase
imdb_df['synopsis'] = imdb_df['synopsis'].map(lambda x: x.lower())
# Removing 'see full summary'
imdb_df['synopsis'] = imdb_df['synopsis'].map(lambda x: re.sub('see full summary\xa0»', '', x))
# Deleting unnecessary spaces
imdb_df['synopsis'] = imdb_df['synopsis'].str.strip()

In [13]:
# Lemmatization
nltk.download('wordnet')
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize_text(text):
    return [lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text)]

imdb_df['synopsis_lemmatized'] = imdb_df['synopsis'].apply(lemmatize_text)


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [14]:
# Removing stopwords
nltk.download('stopwords')
stop_words = stopwords.words('english')

def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))

def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) 
             if word not in stop_words] for doc in texts]
             
data = imdb_df['synopsis_lemmatized'].values.tolist()
data_words = list(sent_to_words(data))
# remove stop words
data_words = remove_stopwords(data_words)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


## BERTopic

In [45]:
model = BERTopic(min_topic_size=50, n_gram_range=(1,3), verbose=True)

In [46]:
docs = data_words
docs = np.array([(" ").join(i) for i in docs])
# topics = model.fit(docs)

In [47]:
labels, probs = model.fit_transform(docs)

Batches:   0%|          | 0/5051 [00:00<?, ?it/s]

2022-05-14 10:51:24,090 - BERTopic - Transformed documents to Embeddings
2022-05-14 10:56:05,413 - BERTopic - Reduced dimensionality
2022-05-14 10:56:36,476 - BERTopic - Clustered reduced embeddings


In [48]:
imdb_df['topic'] = labels

In [49]:
# import pickle
# filename = 'topic_modeling_model.sav'
# pickle.dump(model, open(filename, 'wb'))

In [50]:
model.get_topic_freq()

Unnamed: 0,Topic,Count
0,-1,95882
1,0,4835
2,1,4671
3,2,3811
4,3,3112
...,...,...
137,136,53
138,137,53
139,138,52
140,139,51


In [51]:
imdb_df.to_csv('data/imdb_encoded_with_topics.csv', index=False)

In [52]:
model.visualize_barchart(top_n_topics=12)

In [53]:
model.embedding_model

<bertopic.backend._sentencetransformers.SentenceTransformerBackend at 0x7fea7aa82d10>

In [54]:
model.find_topics('computer')

([115, 67, 106, 83, 26],
 [0.7771494604087915,
  0.637647628800309,
  0.6227392594915524,
  0.5440665893491932,
  0.5284670549243317])

In [55]:
model.save("my_model_without_embedding", save_embedding_model=False)

In [56]:
# import pickle
# filename = 'topic_modeling_model.sav'
# pickle.dump(model, open(filename, 'wb'))

In [57]:
model.topic_names[0]

'0_martial_hong_japanese_japan'

In [58]:
imdb_df[imdb_df.topic == 0]

Unnamed: 0,actor1,actor2,actor3,actor4,director_enc,action,adult,adventure,animation,biography,...,votes,release_start,release_month,tv_series,title,synopsis,director,actors,synopsis_lemmatized,topic
50,67235,79751,25033,29984,59942,0,0,0,0,0,...,47.0,1950.0,1,0,Akatsuki no dasso,mikami a japanese soldier serving in china is ...,Senkichi Taniguchi,"['Ryô Ikebe', 'Shirley Yamaguchi', 'Eitarô Oza...","[mikami, a, japanese, soldier, serving, in, ch...",0
84,30440,40313,97157,32290,26035,0,0,0,0,0,...,95.0,1949.0,1,0,Ai le zhongnian,in precommunism era of china a widowed man mus...,Hu Sang,"['Hui Shi', 'Jiachen Zhu', 'Yang Shen', 'Huanq...","[in, precommunism, era, of, china, a, widowed,...",0
135,81795,92766,57622,48306,39945,0,0,0,0,0,...,19.0,1949.0,1,0,Li ren xing,three women in japaneseoccupied shanghai 1941 ...,Liting Chen,"['Zhongshi Gao', 'Zongying Huang', 'Ma Lan', '...","[three, woman, in, japaneseoccupied, shanghai,...",0
248,80936,23432,71085,1326,63750,0,0,0,0,0,...,168.0,1950.0,3,0,Mata au hi made,the hero saburo okada eiji is obstructed by a ...,Tadashi Imai,"['Yoshiko Kuga', 'Eiji Okada', 'Osamu Takizawa...","[the, hero, saburo, okada, eiji, is, obstructe...",0
570,76848,53845,61887,78635,1103,0,0,0,0,0,...,165215.0,1950.0,8,0,Rashômon,the rape of a bride and the murder of her samu...,Akira Kurosawa,"['Toshirô Mifune', 'Machiko Kyô', 'Masayuki Mo...","[the, rape, of, a, bride, and, the, murder, of...",0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
161524,40415,33805,89444,87466,26836,1,0,1,1,0,...,89.0,2022.0,4,0,Shokei Shoujo no Virgin Road,the lost ones are wanderers who come here from...,Iori Saeki,"['Kahara Moe', 'Hisako Kanemoto', 'Tamie Kubota']","[the, lost, one, are, wanderer, who, come, her...",0
161540,47710,93092,99348,87466,69620,0,0,0,0,0,...,74.0,2022.0,4,0,Qie Shi Tian Xia,the story of the feng lan xi and feng xi yun j...,Yang Yang,['Lusi Zhao'],"[the, story, of, the, feng, lan, xi, and, feng...",0
161562,43515,73271,83096,87466,49360,0,0,0,1,0,...,32.0,2022.0,4,0,Deaimon,nagomu irino returns to his kyoto home for the...,Nobunaga Shimazaki,"['Kozue Yuuki', 'Rikiya Koyama', 'Sayaka Ôhara']","[nagomu, irino, return, to, his, kyoto, home, ...",0
161578,68175,72405,26129,71865,26759,0,0,0,0,0,...,10.0,2022.0,4,0,DJS the Movie: Biarkan Aku Menari,in wulans life she only wanted to be a dancer ...,Indrayanto Kurniawan,"['Sandrinna Michelle', 'Rey Bong', 'Emiliano F...","[in, wulans, life, she, only, wanted, to, be, ...",0


In [59]:
imdb_df.columns

Index(['actor1', 'actor2', 'actor3', 'actor4', 'director_enc', 'action',
       'adult', 'adventure', 'animation', 'biography', 'comedy', 'crime',
       'documentary', 'drama', 'family', 'fantasy', 'film-noir', 'game-show',
       'history', 'horror', 'music', 'musical', 'mystery', 'news',
       'reality-tv', 'romance', 'sci-fi', 'short', 'sport', 'talk-show',
       'thriller', 'unknown', 'war', 'western', 'link', 'genre', 'duration',
       'imdb_rating', 'votes', 'release_start', 'release_month', 'tv_series',
       'title', 'synopsis', 'director', 'actors', 'synopsis_lemmatized',
       'topic'],
      dtype='object')