In [39]:
import sys
import dotenv
import os
dotenv.load_dotenv()
sys.path.append(os.getenv('MAINDIR'))
from main import MoviesDatabase

In [40]:
synopsis = [movie.description for movie in MoviesDatabase.query.all()]
genres = [movie.genre for movie in MoviesDatabase.query.all()]
titles = [movie.title for movie in MoviesDatabase.query.all()]
collection = [movie.collection for movie in MoviesDatabase.query.all()]
critics = [movie.computed_critic_score for movie in MoviesDatabase.query.all()]
studio = [movie.studio for movie in MoviesDatabase.query.all()]

In [41]:
synopsis[0]

"Imprisoned on the other side of the universe, the mighty Thor finds himself in a deadly gladiatorial contest that pits him against the Hulk, his former ally and fellow Avenger. Thor's quest for survival leads him in a race against time to prevent the all-powerful Hela from destroying his home world and the Asgardian civilization."

In [42]:
from transformers import AutoTokenizer, AutoModel
from bert_transformer import BertTransformer

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = AutoModel.from_pretrained("bert-base-uncased").to('cuda:0')

bert_transformer = BertTransformer(tokenizer, model, 500)

In [43]:
vectorized = bert_transformer.fit_transform([synopsis])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


TypeError: TextEncodeInput must be Union[TextInputSequence, Tuple[InputSequence, InputSequence]]

In [None]:
from sklearn.cluster import KMeans
kmeans = KMeans(50).fit([vectorized])

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 2 dimensions. The detected shape was (2, 9887) + inhomogeneous part.

In [None]:
kmeans.labels_

array([ 5, 45, 16, ..., 12,  6, 32], dtype=int32)

In [None]:
import numpy as np

np.asarray(synopsis).shape

(9887,)

In [None]:
import pandas as pd

data = {'Cluster': kmeans.labels_, 'Synopsis': synopsis, 'Genres': genres, 'Title': titles, 'Collection': collection, 'Critics': critics, 'Studio': studio}
df = pd.DataFrame(data)

In [None]:
df.Studio.value_counts()

Studio
                                                                    2779
Paramount Pictures                                                   405
Warner Bros. Pictures                                                391
Universal Pictures                                                   377
20th Century Fox                                                     373
                                                                    ... 
Warner Bros. Pictures, MGM/UA Home Entertainment Inc.                  1
Roadshow Home Video [au], Metro-Goldwyn-Mayer Distributing Corp.       1
Brandon Films Inc., Cowboy Pictures                                    1
Cinelou Films                                                          1
Paramount+                                                             1
Name: count, Length: 1077, dtype: int64

In [None]:
df[df['Cluster'] == 1].Synopsis

360     Aaron answers an online ad and drives to a str...
506     A unique and darkly engaging spiritual thrille...
522     Being stranded on a deserted island leaves you...
606     After a gentle alien becomes stranded on Earth...
701     The Mayan kingdom is at the height of its opul...
                              ...                        
9322    Father Merrin (Stellan Skarsgard) is haunted b...
9324    After his true love (Isabel Glasser) falls in ...
9507    A less-than-successful exercise equipment sale...
9795    After his ship sinks, Andrew Braddock (Michael...
9829    Upon arriving at a remote cabin in the redwood...
Name: Synopsis, Length: 162, dtype: object

In [None]:
n = 18
len(df[df['Cluster'] == n]), dict(df[df['Cluster'] == n].Genres.value_counts()[:5]), dict(df[df['Cluster'] == n].Collection.value_counts()[:5]), df[df['Cluster'] == n]['Title'].to_numpy()

(81,
 {'Documentary': 53,
  'Documentary, Music': 10,
  'Biography': 4,
  'Drama': 4,
  'Comedy, Stand-up': 2},
 {'': 81},
 array(['Hannah Gadsby: Nanette', 'Dolemite Is My Name',
        'Hot Girls Wanted', '22 July', 'Andy Irons: Kissed by God',
        'Reversing Roe', "Won't You Be My Neighbor?", 'Sexy Baby',
        'A Poem Is a Naked Person', 'A Most Beautiful Thing',
        'Rolling Thunder Revue: A Bob Dylan Story by Martin Scorsese',
        'Taylor Swift: Miss Americana', 'The Kingmaker',
        'Rumble: The Indians Who Rocked the World',
        'This Changes Everything', 'Grindhouse',
        '3 1/2 Minutes, Ten Bullets', '13TH',
        'The Man Nobody Knew: In Search of My Father, CIA Spymaster William Colby',
        'This Is Elvis', 'The Act of Killing', 'A Quiet Passion',
        'Bowling for Columbine', 'Jerrod Carmichael: Rothaniel', 'Mishima',
        'The Grizzlies', 'Girl 27', 'The Plot Against the President',
        'The Hurricane', 'The Trip to Greece', 'The 