In [1]:

import pandas as pd
import numpy as np

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt
import re

nltk.download('stopwords')
nltk.download('wordnet')

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.ensemble import RandomForestClassifier
from scipy.sparse import hstack

pd.set_option('display.max_columns', None)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\morga\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\morga\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
df = pd.read_parquet("machine_learning_v2.parquet")

In [3]:
df[df.isna().any(axis=1)]
df.dropna(inplace=True)

In [4]:
df

Unnamed: 0,titre_id,titre_str,titre_genres,rating_avg,rating_votes,actors,directors,overview
0,tt0035423,Kate & Leopold,"[Comedy, Fantasy, Romance]",6.4,87897,"[Meg Ryan, Hugh Jackman, Liev Schreiber, Breck...",[James Mangold],When her scientist ex-boyfriend discovers a po...
1,tt0052832,The Fugitive Kind,"[Drama, Romance]",7.1,6914,"[Anna Magnani, Marlon Brando, Joanne Woodward,...",[Sidney Lumet],"Val Xavier, a drifter of obscure origins, arri..."
2,tt0052997,The League of Gentlemen,"[Comedy, Crime, Drama]",7.2,5105,"[Jack Hawkins, Roger Livesey, Nigel Patrick, R...",[Basil Dearden],Involuntarily-retired Colonel Hyde recruits se...
3,tt0053459,Eyes Without a Face,"[Drama, Horror]",7.6,34278,"[Pierre Brasseur, Alexandre Rignault, Alida Va...",[Georges Franju],Dr. Génessier is riddled with guilt after an a...
4,tt0053579,Late Autumn,"[Comedy, Drama]",7.9,5854,"[Setsuko Hara, Keiji Sada, Mariko Okada, Yôko ...",[Yasujirô Ozu],A woman and her daughter are each forced to co...
...,...,...,...,...,...,...,...,...
7411,tt9873892,They Cloned Tyrone,"[Comedy, Mystery, Sci-Fi]",6.6,33197,"[Kiefer Sutherland, Jamie Foxx, John Boyega, T...",[Juel Taylor],A series of eerie events thrusts an unlikely t...
7412,tt9883996,Dream Horse,"[Biography, Comedy, Drama]",6.9,6589,"[Alan David, Lynda Baron, Owen Teale, Toni Col...",[Euros Lyn],"The inspiring true story of Dream Alliance, an..."
7413,tt9893250,I Care a Lot,"[Comedy, Crime, Drama]",6.4,141372,"[Dianne Wiest, Rosamund Pike, Peter Dinklage, ...",[J Blakeson],A court-appointed legal guardian defrauds her ...
7414,tt9900782,Kaithi,"[Action, Adventure, Crime]",8.5,39546,"[Narain, Karthi, George Maryan, Arjun Das]",[Lokesh Kanagaraj],"Dilli, a convicted criminal, is out on parole ..."


In [5]:
tt = (
    ("actors", "actors"),
    ("titre_genres", "titre_genres"),
    ("directors", "directors"),
)
for t in tt:
    df[t[0]] = df[t[1]].apply(lambda x: ", ".join(map(str, x))).replace(" ", "")

In [6]:
def clean_overview(
    text: str
):
    text = text.lower()
    text = re.sub(r'[^a-z]', ' ', text)
    words = text.split()
    words = [word for word in words if word not in stopwords.words('english')]
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)

df['overview'] = df['overview'].astype(str).apply(clean_overview)

In [7]:
def full_lower(
    text: str
):
    return text.replace(" ", "")

tt = (
    ("actors", "actors"),
    ("titre_genres", "titre_genres"),
    ("directors", "directors"),
)
for t in tt:
    df[t[0]] = df[t[1]].apply(full_lower)


In [8]:
df

Unnamed: 0,titre_id,titre_str,titre_genres,rating_avg,rating_votes,actors,directors,overview
0,tt0035423,Kate & Leopold,"Comedy,Fantasy,Romance",6.4,87897,"MegRyan,HughJackman,LievSchreiber,BreckinMeyer",JamesMangold,scientist ex boyfriend discovers portal travel...
1,tt0052832,The Fugitive Kind,"Drama,Romance",7.1,6914,"AnnaMagnani,MarlonBrando,JoanneWoodward,Mauree...",SidneyLumet,val xavier drifter obscure origin arrives smal...
2,tt0052997,The League of Gentlemen,"Comedy,Crime,Drama",7.2,5105,"JackHawkins,RogerLivesey,NigelPatrick,RichardA...",BasilDearden,involuntarily retired colonel hyde recruit sev...
3,tt0053459,Eyes Without a Face,"Drama,Horror",7.6,34278,"PierreBrasseur,AlexandreRignault,AlidaValli,Ju...",GeorgesFranju,dr g nessier riddled guilt accident caused dis...
4,tt0053579,Late Autumn,"Comedy,Drama",7.9,5854,"SetsukoHara,KeijiSada,MarikoOkada,YôkoTsukasa",YasujirôOzu,woman daughter forced contend increasing press...
...,...,...,...,...,...,...,...,...
7411,tt9873892,They Cloned Tyrone,"Comedy,Mystery,Sci-Fi",6.6,33197,"KieferSutherland,JamieFoxx,JohnBoyega,TeyonahP...",JuelTaylor,series eerie event thrust unlikely trio onto t...
7412,tt9883996,Dream Horse,"Biography,Comedy,Drama",6.9,6589,"AlanDavid,LyndaBaron,OwenTeale,ToniCollette",EurosLyn,inspiring true story dream alliance unlikely r...
7413,tt9893250,I Care a Lot,"Comedy,Crime,Drama",6.4,141372,"DianneWiest,RosamundPike,PeterDinklage,EizaGon...",JBlakeson,court appointed legal guardian defrauds older ...
7414,tt9900782,Kaithi,"Action,Adventure,Crime",8.5,39546,"Narain,Karthi,GeorgeMaryan,ArjunDas",LokeshKanagaraj,dilli convicted criminal parole meet daughter ...


In [9]:
df.to_parquet("machine_learning_nltk_list.parquet")

In [10]:
print(df[df["titre_str"].str.contains("The Avengers")].to_markdown())


|      | titre_id   | titre_str    | titre_genres   |   rating_avg |   rating_votes | actors                                                    | directors   | overview                                                                                                                                                                                                                |
|-----:|:-----------|:-------------|:---------------|-------------:|---------------:|:----------------------------------------------------------|:------------|:------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| 4194 | tt0848228  | The Avengers | Action,Sci-Fi  |            8 |        1437952 | RobertDowneyJr.,ScarlettJohansson,ChrisEvans,JeremyRenner | JossWhedon  | unexpected enemy emerges threatens global safety security nick fury director interna

In [11]:
print(df[df["titre_id"].str.contains("tt2395417")].to_markdown())


|      | titre_id   | titre_str   | titre_genres   |   rating_avg |   rating_votes | actors                                             | directors      | overview                                           |
|-----:|:-----------|:------------|:---------------|-------------:|---------------:|:---------------------------------------------------|:---------------|:---------------------------------------------------|
| 5953 | tt2395417  | Still Life  | Drama          |          7.4 |           8708 | KarenDrury,EddieMarsan,JoanneFroggatt,AndrewBuchan | UbertoPasolini | council case worker look relative found dead alone |


In [12]:
movie = "Oppenheimer"

In [13]:
# feat = [
#     'titre_str',
#     'titre_genres',
#     # 'rating_avg',
#     # 'rating_votes',
#     "actors",
#     'directors',
# ]
# # df.columns.to_list()
# feat

# for f in feat:
#     df[f] = df[f].replace(" ", "")

In [14]:
# def combine(r):
#     return r["titre_str"]+" "+r["titre_genres"]+" "+r["directors"]+" "+r["overview"]+" "+r["actors"]

# df["one_for_all"] = df.apply(combine, axis=1)

In [15]:
def title_index(titre):
    return df[df.titre_str == titre].index[0]

def idx_title(idx):
    return df[df.index == idx]["titre_str"].values[0]

In [16]:
# tidf = TfidfVectorizer()
# matrix = tidf.fit_transform(
#     df["titre_str"]+" "+df["actors"]+" "+df["titre_genres"]+" "+df["directors"]+" "+df["overview"]
# )
# combined_matrix = tidf.fit_transform(
#     df["one_for_all"]
# )

In [17]:
poids_ = {
    "overview":     0.2,
    "titre_str":    0.2,
    "actors":       0.2,
    "titre_genres": 0.2,
    "directors":    0.2,
}

full_matrix = []
for col, poids in poids_.items():
    tfidf_ = TfidfVectorizer()
    matrix_ = tfidf_.fit_transform(df[col]) #* poids
    full_matrix.append(matrix_)

combined_matrix = hstack(full_matrix)

cosine = cosine_similarity(combined_matrix)

movies = movie
mov_idx = title_index(movies)
similar = cosine[mov_idx]
similar1 = list(enumerate(cosine[mov_idx]))
sim_scores = sorted(similar1, key=lambda x: x[1], reverse=True)

sim_mov_idx = similar.argsort()[::-1][1:11]
same_movies = df.loc[sim_mov_idx, "titre_str"]
ttconst = df.loc[sim_mov_idx, "titre_id"]
sim_scores[1:11]
score = [i[1] for i in sim_scores]
for movies, tt,  score in zip(same_movies, ttconst, score):
    print(f"Movie : {movies} | id : {tt} | score : {np.round(score, 4)}")


# random_model = RandomForestClassifier().fit(matrix, df["titre_genres"])
# for idx, movies in zip(sim_mov_idx, same_movies):
#     mo = matrix[idx].toarray()
#     prediction = random_model.predict(mo)[0]
#     confidence = np.max(random_model.predict_proba(mo))

#     print("movie :", movies)
#     print("confidence :", confidence)

Movie : Chor Nikal Ke Bhaga | id : tt22297828 | score : 1.0
Movie : The Crossing Guard | id : tt0112744 | score : 0.309
Movie : Terrified | id : tt7549892 | score : 0.2634
Movie : The Matador | id : tt0365485 | score : 0.2525
Movie : Wheels | id : tt2170667 | score : 0.2185
Movie : Belle | id : tt2404181 | score : 0.2173
Movie : 2 Days in the Valley | id : tt0115438 | score : 0.215
Movie : Crash | id : tt0375679 | score : 0.2137
Movie : The Dream Team | id : tt0097235 | score : 0.2099
Movie : A Shock to the System | id : tt0100602 | score : 0.2097


In [18]:
# poids_ = {
#     "titre_str": 0.1,
#     "titre_genres": 0.1,
#     "directors": 0.2,
#     "overview": 0.2,
#     "actor_1": 0.15,
#     "actor_2": 0.1,
    # "actor_3": 0.025,
#     "actor_4": 0.025,
# }

# full_matrix = []
# for col, poids in poids_.items():
#     tfidf_ = TfidfVectorizer()
#     matrix_ = tfidf_.fit_transform(df[col]) * poids
#     full_matrix.append(matrix_)

# combined_matrix = hstack(full_matrix)

# cosine = cosine_similarity(combined_matrix)

# movies = movie
# mov_idx = title_index(movies)
# similar = cosine[mov_idx]
# similar1 = list(enumerate(cosine[mov_idx]))
# sim_scores = sorted(similar1, key=lambda x: x[1], reverse=True)

# sim_mov_idx = similar.argsort()[::-1][1:11]
# same_movies = df.loc[sim_mov_idx, "titre_str"]
# sim_scores[1:11]
# score = [i[1] for i in sim_scores]
# for movies, score in zip(same_movies, score):
#     print(movies, score)


# # random_model = RandomForestClassifier().fit(matrix, df["titre_genres"])
# # for idx, movies in zip(sim_mov_idx, same_movies):
# #     mo = matrix[idx].toarray()
# #     prediction = random_model.predict(mo)[0]
# #     confidence = np.max(random_model.predict_proba(mo))

# #     print("movie :", movies)
# #     print("confidence :", confidence)