In [1]:
import pandas as pd
import numpy as np
import hjson

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt
import re

nltk.download('stopwords')
nltk.download('wordnet')

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.ensemble import RandomForestClassifier
from get_dataframes import GetDataframes
from tools import import_config, import_datasets
from scipy.sparse import hstack

pd.set_option('display.max_columns', None)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\morga\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\morga\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
config = import_config()
datas = GetDataframes(config)
df = datas.get_machine_learning_dataframe(
    "machine_learning",
    # modify=True
)

2023-11-10 18:21:48 INFO     Creating machine_learning dataframe...
2023-11-10 18:21:48 INFO     Parquet loaded ! Importing tmdb_updated...
2023-11-10 18:21:49 INFO     Parquet loaded ! Importing actors_movies...
2023-11-10 18:21:49 INFO     Parquet loaded ! Importing directors_movies...
2023-11-10 18:21:49 INFO     Parquet loaded ! Importing movies_cleaned...
2023-11-10 18:21:49 INFO     Creating machine_learning dataframe...
2023-11-10 18:21:49 INFO     Merging machine_learning dataframe...
2023-11-10 18:21:49 INFO     Droping NaN machine_learning dataframe...
2023-11-10 18:21:49 INFO     Process Overview...
2023-11-10 18:23:33 INFO     Writing machine_learning dataframe...
2023-11-10 18:23:33 INFO     Dataframe machine_learning ready to use!


In [3]:
def full_lower(
    text: str
):
    return text.replace(" ", "").replace("-", "")

tt = (
    # ("titre_genres", "titre_genres"),
    ("actors", "actors"),
    ("titre_genres", "titre_genres"),
    ("directors", "directors"),
)
for t in tt:
    df[t[0]] = df[t[1]].apply(full_lower)


# df["overview"] = df["overview"].apply(lambda x : x.replace(" ", ","))


In [5]:
print(df[df["titre_str"].str.contains("The Avengers")].to_markdown())

|      |   index | titre_id   | titre_str    | titre_genres   |   rating_avg |   rating_votes | actors                                                    | directors   | overview                                                                                                                                                                                                                |
|-----:|--------:|:-----------|:-------------|:---------------|-------------:|---------------:|:----------------------------------------------------------|:------------|:------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| 4330 |    4330 | tt0848228  | The Avengers | Action,SciFi   |            8 |        1437952 | RobertDowneyJr.,ScarlettJohansson,ChrisEvans,JeremyRenner | JossWhedon  | unexpected enemy emerges threatens global safety secur

In [6]:
print(df[df["titre_id"].str.contains("tt2395417")].to_markdown())


|      |   index | titre_id   | titre_str   | titre_genres   |   rating_avg |   rating_votes | actors                                             | directors      | overview                                           |
|-----:|--------:|:-----------|:------------|:---------------|-------------:|---------------:|:---------------------------------------------------|:---------------|:---------------------------------------------------|
| 6147 |    6148 | tt2395417  | Still Life  | Drama          |          7.4 |           8708 | KarenDrury,EddieMarsan,JoanneFroggatt,AndrewBuchan | UbertoPasolini | council case worker look relative found dead alone |


In [8]:
def combine(r):
    return r["titre_genres"]+" "+r["directors"]+" "+r["overview"]+" "+r["actors"] #r["titre_str"]+" "+

df["one_for_all"] = df.apply(combine, axis=1)

In [10]:
def titre_index(titre: str):
    return df[df.titre_str == titre].index[0]

def director_index(director: str):
    return df[df.directors.str.contains(director)].index[0]

def actor_index(actor: str):
    return df[df.actors.str.contains(actor)].index[0]

def idx_titre(idx: int):
    return df[df.index == idx]["titre_str"].values[0]

def idx_actor(idx: int):
    return df[df.index == idx]["actors"].values[0]

def idx_titre_id(idx: int):
    return df[df.index == idx]["titre_id"].values[0]

In [11]:
# tidf = TfidfVectorizer()
# combined_matrix = tidf.fit_transform(
#     df["titre_str"]+" "+df["actors"]+" "+df["titre_genres"]+" "+df["directors"]+" "+df["overview"]
# )
# tidf = TfidfVectorizer()
# combined_matrix = tidf.fit_transform(
#     df["one_for_all"]+" "+df["titre_str"]
# )
top = 5

poids_ = {
    # "titre_str":    2,
    "titre_genres": 2,
    "actors":       1.5,
    "directors":    1.5,
    "overview":     2.5,
    # "rating_avg":   0.2,
    # "rating_votes": 0.2,
}

full_matrix = []
for col, poids in poids_.items():
    tfidf_ = TfidfVectorizer()
    matrix_ = tfidf_.fit_transform(df[col]) * poids
    full_matrix.append(matrix_)

combined_matrix = hstack(full_matrix)

cosine = cosine_similarity(combined_matrix)

movies = "The Avengers"
mov_idx = titre_index(movies)

similar = cosine[mov_idx]
similar1 = list(enumerate(cosine[mov_idx]))

sim_scores = sorted(similar1, key=lambda x: x[1], reverse=True)
sim_mov_idx = similar.argsort()[::-1][1:top+1]

same_movies = df.loc[sim_mov_idx, "titre_str"]
ttconst = df.loc[sim_mov_idx, "titre_id"]

sim_scores[1:top+1]
score = [i[1] for i in sim_scores]

print(f"Top 10 similar movies to {movies} are :\n")
for movies, tt,  score in zip(same_movies, ttconst, score):
    print(f"Movie : {movies} | id : {tt} | score : {np.round(score, 4)}")


Top 10 similar movies to The Avengers are :

Movie : Avengers: Age of Ultron | id : tt2395427 | score : 1.0
Movie : Serenity | id : tt0379786 | score : 0.5193
Movie : Captain America: Civil War | id : tt3498820 | score : 0.3935
Movie : Snowpiercer | id : tt1706620 | score : 0.3811
Movie : Captain America: The Winter Soldier | id : tt1843866 | score : 0.3373


In [16]:
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import CountVectorizer

movies = "Fight Club"

cv = CountVectorizer()
count_matrix = cv.fit_transform(df['one_for_all'])
movie_user_likes = movies
movie_index = titre_index(movie_user_likes)

knn_model = NearestNeighbors(metric='cosine', algorithm='brute')
knn_model.fit(count_matrix)
dist, indices = knn_model.kneighbors(count_matrix[movie_index], n_neighbors=15)

print(f"Top 10 similar movies to {movie_user_likes} are :\n")
for index, dis in zip(indices.flatten()[1:], dist.flatten()[1:]):
    print(f"Movie : {idx_titre(index)} | id : {idx_titre_id(index)} | score : {np.round(dis, 4)}")


Top 10 similar movies to Fight Club are :

Movie : Breakfast at Tiffany's | id : tt0054698 | score : 0.8499
Movie : The Naked Kiss | id : tt0058390 | score : 0.853
Movie : Kafka | id : tt0102181 | score : 0.8561
Movie : After Lucia | id : tt2368749 | score : 0.8629
Movie : Project Almanac | id : tt2436386 | score : 0.8629
Movie : Racing with the Moon | id : tt0087968 | score : 0.8645
Movie : First Reformed | id : tt6053438 | score : 0.8685
Movie : Never Back Down | id : tt1023111 | score : 0.8744
Movie : Blazing Saddles | id : tt0071230 | score : 0.8757
Movie : Coming to America | id : tt0094898 | score : 0.8757
Movie : The Angels' Share | id : tt1924394 | score : 0.8779
Movie : The Curious Case of Benjamin Button | id : tt0421715 | score : 0.8794
Movie : Mermaids | id : tt0100140 | score : 0.8795
Movie : Vikram Vedha | id : tt6148156 | score : 0.8799


In [13]:

# random_model = RandomForestClassifier().fit(combined_matrix, df["titre_genres"])
# for idx, movies in zip(sim_mov_idx, same_movies):
#     mo = combined_matrix[idx].toarray()
#     prediction = random_model.predict(mo)[0]
#     confidence = np.max(random_model.predict_proba(mo))

#     print("movie :", movies)
#     print("confidence :", confidence)

In [14]:
# Movie : Avengers: Age of Ultron | id : tt2395427 | score : 1.0
# Movie : The Gray Man | id : tt1649418 | score : 0.2522
# Movie : Avengers: Endgame | id : tt4154796 | score : 0.1226
# Movie : Snowpiercer | id : tt1706620 | score : 0.1209
# Movie : Kingsman: The Secret Service | id : tt2802144 | score : 0.1117
# print(f"Top 10 similar movies to {movie_user_likes} are :\n")
#

In [15]:
# poids_ = {
#     "titre_str": 0.1,
#     "titre_genres": 0.1,
#     "directors": 0.2,
#     "overview": 0.2,
#     "actor_1": 0.15,
#     "actor_2": 0.1,
    # "actor_3": 0.025,
#     "actor_4": 0.025,
# }

# full_matrix = []
# for col, poids in poids_.items():
#     tfidf_ = TfidfVectorizer()
#     matrix_ = tfidf_.fit_transform(df[col]) * poids
#     full_matrix.append(matrix_)

# combined_matrix = hstack(full_matrix)

# cosine = cosine_similarity(combined_matrix)

# movies = movie
# mov_idx = title_index(movies)
# similar = cosine[mov_idx]
# similar1 = list(enumerate(cosine[mov_idx]))
# sim_scores = sorted(similar1, key=lambda x: x[1], reverse=True)

# sim_mov_idx = similar.argsort()[::-1][1:11]
# same_movies = df.loc[sim_mov_idx, "titre_str"]
# sim_scores[1:11]
# score = [i[1] for i in sim_scores]
# for movies, score in zip(same_movies, score):
#     print(movies, score)


# # random_model = RandomForestClassifier().fit(matrix, df["titre_genres"])
# # for idx, movies in zip(sim_mov_idx, same_movies):
# #     mo = matrix[idx].toarray()
# #     prediction = random_model.predict(mo)[0]
# #     confidence = np.max(random_model.predict_proba(mo))

# #     print("movie :", movies)
# #     print("confidence :", confidence)