In [5]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [14]:
# Location of the preprocessed movie dataset, relative to the notebook's
# working directory.
DATASET_PATH = '../data/processed/preprocessed_dataset.csv'

df = pd.read_csv(DATASET_PATH)

# Preview the first rows to sanity-check the loaded columns.
df.head()

Unnamed: 0,image_link,imdb_id,rating_count,rating,genre,keywords,trailer_link,actors,director,title,year,plot,processed_plot
0,https://m.media-amazon.com/images/M/MV5BMDliMm...,tt0172495,1610899,8.5,"[Action, Adventure, Drama]","[roman empire, gladiator, slavery, combat, bat...",https://www.imdb.com/video/vi2628367897/,"[Russell Crowe, Joaquin Phoenix, Connie Nielsen]",[Ridley Scott],Gladiator,2000,"Shouting ""Roma Invicta!"" as his forces attack,...",shouting rom invicta force attack general maxi...
1,https://m.media-amazon.com/images/M/MV5BZTcyNj...,tt0209144,1316937,8.4,"[Mystery, Thriller]","[memory, short term memory loss, hidden truth,...",https://www.imdb.com/video/vi3220356889/,"[Guy Pearce, Carrie-Anne Moss, Joe Pantoliano]",[Christopher Nolan],Memento,2000,"Leonard Shelby (Guy Pearce), a man whose short...",leonard shelby guy pearce man whose short term...
2,https://m.media-amazon.com/images/M/MV5BMTA2ND...,tt0208092,906115,8.2,"[Comedy, Crime]","[diamond, boxer, narrated by character, cockne...",https://www.imdb.com/video/vi1558577433/,"[Jason Statham, Brad Pitt, Stephen Graham]",[Guy Ritchie],Snatch,2000,After stealing an 86-carat (17 g) diamond in a...,stealing carat g diamond heist antwerp franki...
3,https://m.media-amazon.com/images/M/MV5BZTM2ZG...,tt0144084,709394,7.6,"[Crime, Drama, Horror]","[narcissism, materialism, serial murder, socio...",https://www.imdb.com/video/vi4060743449/,"[Christian Bale, Justin Theroux, Josh Lucas]",[Mary Harron],American Psycho,2000,A white background. Red drops begin to fall pa...,white background red drop begin fall past open...
4,https://m.media-amazon.com/images/M/MV5BNjUxYz...,tt0187393,292416,7.2,"[Action, Drama, History]","[american revolution, 18th century, british ar...",https://www.imdb.com/video/vi100139289/,"[Mel Gibson, Heath Ledger, Joely Richardson]",[Roland Emmerich],The Patriot,2000,"Benjamin Martin (Mel Gibson), a veteran of the...",benjamin martin mel gibson veteran french ind...


In [16]:
# Drop every row that has a missing value in any column, then renumber the
# index 0..n-1. KMeans reports cluster labels by row *position*, so keeping the
# index contiguous makes label-based lookups (e.g. df['plot'][0]) and
# positional lookups refer to the same rows.
# Rebinding `df` (instead of inplace=True) avoids mutating the frame that the
# loading cell displayed.
df = df.dropna().reset_index(drop=True)

In [17]:
# Clustering hyper-parameters, named so they are easy to find and tune.
N_CLUSTERS = 10
RANDOM_STATE = 42

# Two alternative text featurizers, both with the library's default settings.
tf_idf_vectorizer = TfidfVectorizer()
bow_vectorizer = CountVectorizer()

# Unfitted KMeans estimator; train_model picks it up as a notebook-level global.
kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=RANDOM_STATE)

In [18]:
def train_model(data, vectorizer, model=None):
    """Vectorize ``data`` and fit a fresh clustering model on the result.

    Parameters
    ----------
    data
        Collection of documents, passed unchanged to
        ``vectorizer.fit_transform``.
    vectorizer
        Text vectorizer exposing ``fit_transform``. It is fitted in place, so
        the same object can afterwards transform new text into the feature
        space the model was trained on.
    model : estimator, optional
        Unfitted template estimator to copy and fit. Defaults to the
        notebook-level ``kmeans``.

    Returns
    -------
    estimator
        A newly fitted copy of the template. The template itself is left
        untouched.
    """
    # Function-scope import so the cell does not depend on an edit to the
    # notebook's import cell.
    from sklearn.base import clone

    template = kmeans if model is None else model
    features = vectorizer.fit_transform(data)

    # Fit a clone rather than the template: fitting the shared global directly
    # would make every call return the *same* object, so training a second
    # model (e.g. Bag of Words after TF-IDF) would silently overwrite the first.
    return clone(template).fit(features)

In [27]:
def find_similar_plot(input_plot, vectorizer, model, data=None):
    """Return the plots of all movies in the cluster(s) predicted for a query.

    Parameters
    ----------
    input_plot : str or iterable of str
        Query text. A single string is treated as one document. With several
        documents, movies from any of their predicted clusters are returned.
    vectorizer
        The vectorizer that was fitted when ``model`` was trained; only its
        ``transform`` method is used.
    model
        Fitted clustering model exposing ``predict`` and ``labels_``.
    data : pandas.DataFrame, optional
        Frame whose rows are in the same order as the documents ``model`` was
        fitted on and which has a ``'plot'`` column. Defaults to the
        notebook-level ``df``.

    Returns
    -------
    pandas.DataFrame
        Single-column (``'plot'``) frame with the matching rows, in the order
        they appear in ``data``. Rows are NOT ranked by similarity to the
        query.

    Raises
    ------
    ValueError
        If ``data`` does not have exactly one row per training document.
    """
    if data is None:
        data = df

    # Cluster labels are matched to rows purely by position, so a frame of a
    # different length would silently return the wrong movies.
    if len(model.labels_) != len(data):
        raise ValueError(
            "data has %d rows but the model was fitted on %d documents; "
            "re-train the model on the current data"
            % (len(data), len(model.labels_))
        )

    # The vectorizer expects an iterable of documents; wrap a bare string so it
    # is treated as one document.
    if isinstance(input_plot, str):
        input_plot = [input_plot]

    new_plot_vectorized = vectorizer.transform(input_plot)
    predicted_clusters = model.predict(new_plot_vectorized)

    # Membership test instead of `labels_ == predicted`, which only works when
    # exactly one document was predicted.
    in_predicted_cluster = pd.Series(model.labels_).isin(predicted_clusters).to_numpy()

    return data.iloc[in_predicted_cluster][['plot']]

## Using TF-IDF

In [20]:
# Fit the TF-IDF vectorizer on the preprocessed plots and cluster the
# resulting vectors.
model = train_model(data=df['processed_plot'], vectorizer=tf_idf_vectorizer)



In [51]:
# Print the full raw plot stored under index label 0, as a reference for the
# query used below.
print(df.loc[0, 'plot'])

Shouting "Roma Invicta!" as his forces attack, General Maximus Decimus Meridius (Russell Crowe) leads the Roman Army to victory against Germanic barbarians in the year 180 A.D., ending a prolonged war and earning the esteem of elderly Emperor Marcus Aurelius. Although the dying Aurelius has a son, Commodus (Joaquin Phoenix), he decides to appoint temporary leadership to the morally-upstanding Maximus, with a desire to eventually return power to the Roman Senate. Aurelius informs Maximus and offers him time to consider before informing Commodus, who, in a bout of jealousy, murders his father.Declaring himself the emperor, Commodus asks Maximus for his loyalty, which Maximus, realizing Commodus' involvement in the Emperor's death, refuses. Commodus orders Maximus' execution and dispatches Praetorian Guards to murder Maximus' wife and son. Maximus narrowly escapes his execution and races home only to discover his family's charred and crucified bodies in the smoldering ruins of his villa. 

In [61]:
# Query phrase, wrapped in a list because the vectorizers transform a
# collection of documents rather than a single string.
# NOTE(review): the models are trained on `processed_plot`, while this query is
# raw text - confirm whether it should go through the same preprocessing.
query_plot = "victory against Germanic barbarians"
txt = [query_plot]

In [62]:
# Show the first plot among the movies sharing the query's predicted cluster.
# Results come back in dataset order, so "first" is not necessarily "closest".
find_similar_plot(txt, tf_idf_vectorizer, model)['plot'].iloc[0]

'Shouting "Roma Invicta!" as his forces attack, General Maximus Decimus Meridius (Russell Crowe) leads the Roman Army to victory against Germanic barbarians in the year 180 A.D., ending a prolonged war and earning the esteem of elderly Emperor Marcus Aurelius. Although the dying Aurelius has a son, Commodus (Joaquin Phoenix), he decides to appoint temporary leadership to the morally-upstanding Maximus, with a desire to eventually return power to the Roman Senate. Aurelius informs Maximus and offers him time to consider before informing Commodus, who, in a bout of jealousy, murders his father.Declaring himself the emperor, Commodus asks Maximus for his loyalty, which Maximus, realizing Commodus\' involvement in the Emperor\'s death, refuses. Commodus orders Maximus\' execution and dispatches Praetorian Guards to murder Maximus\' wife and son. Maximus narrowly escapes his execution and races home only to discover his family\'s charred and crucified bodies in the smoldering ruins of his v

## Using Bag of Words

In [56]:
# Fit the Bag-of-Words vectorizer on the preprocessed plots and cluster the
# resulting count vectors. This rebinds `model`, replacing the TF-IDF one.
model = train_model(data=df['processed_plot'], vectorizer=bow_vectorizer)



In [63]:
# Same query as before, now against the Bag-of-Words clustering. Results come
# back in dataset order, so the first row is not necessarily the closest match.
find_similar_plot(txt, bow_vectorizer, model)['plot'].iloc[0]

'Shouting "Roma Invicta!" as his forces attack, General Maximus Decimus Meridius (Russell Crowe) leads the Roman Army to victory against Germanic barbarians in the year 180 A.D., ending a prolonged war and earning the esteem of elderly Emperor Marcus Aurelius. Although the dying Aurelius has a son, Commodus (Joaquin Phoenix), he decides to appoint temporary leadership to the morally-upstanding Maximus, with a desire to eventually return power to the Roman Senate. Aurelius informs Maximus and offers him time to consider before informing Commodus, who, in a bout of jealousy, murders his father.Declaring himself the emperor, Commodus asks Maximus for his loyalty, which Maximus, realizing Commodus\' involvement in the Emperor\'s death, refuses. Commodus orders Maximus\' execution and dispatches Praetorian Guards to murder Maximus\' wife and son. Maximus narrowly escapes his execution and races home only to discover his family\'s charred and crucified bodies in the smoldering ruins of his v