In [50]:
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

import pandas as pd
import numpy as np
import hjson
import re
import matplotlib.pyplot as plt
from fuzzywuzzy import process
from unicodedata import normalize, combining
from datetime import datetime, timedelta

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import NearestNeighbors
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

from get_dataframes import GetDataframes
from tools import import_config, import_datasets, check_titre, color
from scipy.sparse import hstack

pd.set_option("display.max_columns", None)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [51]:
# Load the project configuration through the `import_config` helper imported from `tools`.
config = import_config()

In [52]:
# Relative path of the cleaned dataset; it is read as parquet by `import_datasets`.
# The resulting `df` is the module-level frame that the lookup helpers below read.
name = "clean_datasets/machine_learning_final.parquet"
df = import_datasets(name, "parquet")

2023-11-15 08:33:35 INFO     Parquet loaded ! Importing machine_learning_final...


In [54]:
def titre_index(titre: str):
    """Return the index label of the first row of the global ``df`` whose
    ``titre_str`` equals ``titre`` exactly.

    Raises IndexError when no row matches.
    """
    exact_match = df["titre_str"] == titre
    return df.loc[exact_match].index[0]


def director_index(director: str):
    """Return the index label of the first row of the global ``df`` whose
    ``director`` value contains ``director``.

    ``director`` is interpreted by ``Series.str.contains`` (a regular
    expression by default). Rows with a missing ``director`` value are treated
    as non-matching so that they cannot poison the boolean mask.

    Raises IndexError when no row matches.
    """
    # The column is named "director" (singular), as used by the other cells.
    matches = df["director"].str.contains(director, na=False)
    return df[matches].index[0]


def actor_index(actor: str):
    """Return the index label of the first row of the global ``df`` whose
    ``actors`` value contains ``actor``.

    ``actor`` is interpreted by ``Series.str.contains`` (a regular expression
    by default). Rows with a missing ``actors`` value are treated as
    non-matching; without ``na=False`` a single missing value would put NaN in
    the mask and make the boolean indexing fail.

    Raises IndexError when no row matches.
    """
    matches = df["actors"].str.contains(actor, na=False)
    return df[matches].index[0]


def idx_titre(idx: int):
    """Return the ``titre_str`` value of the first row of the global ``df``
    whose index label equals ``idx``.

    Raises IndexError when no row carries that label.
    """
    row_mask = df.index == idx
    return df.loc[row_mask, "titre_str"].values[0]


def idx_actor(idx: int):
    """Return the ``actors`` value of the first row of the global ``df``
    whose index label equals ``idx``.

    Raises IndexError when no row carries that label.
    """
    row_mask = df.index == idx
    return df.loc[row_mask, "actors"].values[0]


def idx_titre_id(idx: int):
    """Return the ``titre_id`` value of the first row of the global ``df``
    whose index label equals ``idx``.

    Raises IndexError when no row carries that label.
    """
    row_mask = df.index == idx
    return df.loc[row_mask, "titre_id"].values[0]


def idx_popularity(idx: int):
    """Return the ``popularity`` value of the first row of the global ``df``
    whose index label equals ``idx``.

    Raises IndexError when no row carries that label.
    """
    row_mask = df.index == idx
    return df.loc[row_mask, "popularity"].values[0]


def idx_keywords(idx: int):
    """Return the ``keywords`` value of the first row of the global ``df``
    whose index label equals ``idx``.

    Raises IndexError when no row carries that label.
    """
    row_mask = df.index == idx
    return df.loc[row_mask, "keywords"].values[0]


def idx_image(idx: int):
    """Return the ``image`` value of the first row of the global ``df``
    whose index label equals ``idx``.

    Raises IndexError when no row carries that label.
    """
    row_mask = df.index == idx
    return df.loc[row_mask, "image"].values[0]


def idx_youtube(idx: int):
    """Return the ``youtube`` value of the first row of the global ``df``
    whose index label equals ``idx``.

    Raises IndexError when no row carries that label.
    """
    row_mask = df.index == idx
    return df.loc[row_mask, "youtube"].values[0]


def idx_url(idx: int):
    """Return the ``url`` value of the first row of the global ``df``
    whose index label equals ``idx``.

    Raises IndexError when no row carries that label.
    """
    row_mask = df.index == idx
    return df.loc[row_mask, "url"].values[0]


def check_titre_str(d: pd.DataFrame, movie: str):
    """Return the rows of ``d`` whose ``titre_str`` contains ``movie``.

    The search is performed on the frame passed in as ``d`` (not on the
    module-level ``df``), so the helper works on any frame that has a
    ``titre_str`` column. ``movie`` is interpreted by ``Series.str.contains``
    (a regular expression by default); rows with a missing title never match.
    """
    matches = d["titre_str"].str.contains(movie, na=False)
    return d[matches]

In [57]:
# Sanity check: query the `check_titre` helper (imported from `tools`) for "ring"
# and display what it returns.
check_titre(df, "ring")

Unnamed: 0,titre_id,titre_str,titre_genres,actors,director,keywords,tmdb_id,overview,popularity,date,rating_avg,rating_vote,url,image,youtube,titre_clean
783,tt0178868,Ring,"horreur,thriller","nanakomatsushima,hiroyukisanada,rikiyaotaka",hideonakata,"urbanlegend,videotape",2671,soir seules a maison deux lyceennes font peur ...,19.06,1998,7.05,1280,https://www.imdb.com/title/tt0178868,https://image.tmdb.org/t/p/w500/yyGZFjVLSBvTqU...,https://www.youtube.com/watch?v=Sd26Z-pF1Ok,ring


In [58]:
def combine(r):
    """Build the text "soup" for one row.

    Joins the ``keywords``, ``actors``, ``director`` and ``titre_genres``
    fields of ``r`` (in that order) with single spaces. Other fields such as
    the overview, date, popularity or ratings are deliberately left out.
    """
    soup_fields = ("keywords", "actors", "director", "titre_genres")
    return " ".join([r[field] for field in soup_fields])


# Add a "one_for_all" column holding, for every row, the text built by `combine`.
# Note: this mutates the module-level `df` in place.
df["one_for_all"] = df.apply(combine, axis=1)

In [59]:
def get_best_match_index_tfidf(movies: str, df: pd.DataFrame):
    """
    Resolve a title query to a row label using FuzzyWuzzy and TfidfVectorizer.

    The ten closest ``titre_clean`` values are shortlisted with
    ``process.extract``. A TF-IDF vectorizer fitted on that shortlist then
    scores every candidate against the query by cosine similarity, and the
    index label of the first row of ``df`` whose ``titre_clean`` equals the
    best-scoring candidate is returned. Diagnostic information is printed
    along the way.
    """
    fuzzy_hits = process.extract(movies, df["titre_clean"].values, limit=10)
    shortlist = [hit[0] for hit in fuzzy_hits]
    print("best_matches", fuzzy_hits)
    print("best_candidate", shortlist)
    print("movies", movies)

    preview_columns = ["titre_id", "titre_str", "titre_genres", "keywords"]
    shortlisted_rows = df[df["titre_clean"].isin(shortlist)]
    print(shortlisted_rows[preview_columns].to_markdown())
    print()

    # Fit on the shortlist only, then project the query into the same space.
    vectorizer = TfidfVectorizer()
    candidate_vectors = vectorizer.fit_transform(shortlist)
    query_vector = vectorizer.transform([movies])

    scores = cosine_similarity(query_vector, candidate_vectors).flatten()

    winner_position = scores.argmax()
    winner_title = shortlist[winner_position]
    print("best_match_idx", winner_position)
    print("best_match_titre", winner_title)
    return df[df["titre_clean"] == winner_title].index[0]

In [60]:
def get_best_match_index_knn(movies: str, df: pd.DataFrame):
    """
    Resolve a title query to a row label with a single FuzzyWuzzy lookup.

    Despite its name, this matcher applies no nearest-neighbour refinement:
    it asks ``process.extractOne`` for the closest ``titre_clean`` value and
    returns the index label of the first row of ``df`` carrying that value.

    Raises:
        ValueError: if ``process.extractOne`` yields no candidate (for
            example because ``df`` has no titles to compare against).
    """
    best_match = process.extractOne(movies, df["titre_clean"].values)
    if best_match is None:
        raise ValueError(f"No title candidate found for {movies!r}")

    # extractOne returns a (choice, score) pair; only the choice is needed.
    best_title = best_match[0]
    return df[df["titre_clean"] == best_title].index[0]

In [61]:
def get_best_match_index_rf(movies: str, df: pd.DataFrame):
    """
    Placeholder for a title matcher based on FuzzyWuzzy and
    RandomForestClassifier.

    The matcher is not implemented yet and always raises.

    Raises:
        NotImplementedError: on every call.
    """
    # Author's note: the approach is not understood well enough yet and needs
    # more research. The abandoned draft fitted a RandomForestClassifier on
    # TF-IDF vectors of `titre_clean` with label-encoded titles as targets.
    # A future implementation must keep the fitted LabelEncoder and call
    # `inverse_transform` on it (not on the encoded array) to decode the
    # prediction.
    raise NotImplementedError(
        "get_best_match_index_rf is not implemented yet"
    )

In [62]:
def pick_algo(movies: str, df: pd.DataFrame, algo: str = "tfidf"):
    """
    Normalise a title query and resolve it with the selected matcher.

    The query has spaces, hyphens, apostrophes and colons removed and is
    lower-cased before being handed to the matcher.

    Args:
        movies: free-text movie title typed by the user.
        df: frame forwarded to the matcher.
        algo: ``"tfidf"``, ``"knn"`` or ``"rf"``.

    Returns:
        Whatever the selected ``get_best_match_index_*`` function returns.

    Raises:
        ValueError: if ``algo`` is not one of the supported names. Previously
            an unknown name silently returned ``None``, which callers then
            used as a matrix index.
    """
    # Drop the separator characters in one pass, then lower-case.
    movies = movies.translate(str.maketrans("", "", " -':")).lower()

    if algo == "tfidf":
        return get_best_match_index_tfidf(movies, df)
    if algo == "knn":
        return get_best_match_index_knn(movies, df)
    if algo == "rf":
        return get_best_match_index_rf(movies, df)
    raise ValueError(
        f"Unknown algo {algo!r}; expected 'tfidf', 'knn' or 'rf'"
    )

In [63]:
def tfidf_algo(
    df: pd.DataFrame, movies: str, top: int = 10, algo: str = "tfidf"
):
    """
    Print the ``top`` movies most similar to ``movies`` using weighted TF-IDF.

    Each of the columns ``titre_genres``, ``actors``, ``director``,
    ``overview`` and ``keywords`` is vectorised with its own
    ``TfidfVectorizer``, scaled by a fixed weight and stacked side by side.
    Movies are then ranked by cosine similarity to the query movie.

    Args:
        df: frame holding the feature columns above plus ``titre_str``,
            ``popularity``, ``url``, ``image`` and ``youtube``.
        movies: free-text title, resolved to a row through ``pick_algo``.
        top: number of recommendations to print.
        algo: title matcher forwarded to ``pick_algo``.

    Returns:
        None. The result is printed.
    """
    poids_ = {
        "titre_genres": 0.2,
        "actors": 0.15,
        "director": 0.15,
        "overview": 0.2,
        "keywords": 0.3,
    }

    weighted_blocks = [
        TfidfVectorizer().fit_transform(df[col]) * poids
        for col, poids in poids_.items()
    ]
    # CSR output so that a single row can be sliced out below.
    combined_matrix = hstack(weighted_blocks, format="csr")

    # pick_algo returns an index *label*; matrix rows are addressed by
    # *position*, so translate explicitly instead of assuming a RangeIndex.
    mov_idx = pick_algo(movies, df, algo)
    mov_pos = int(np.flatnonzero(df.index == mov_idx)[0])

    # Only the query row is needed, not the full n x n similarity matrix.
    scores = cosine_similarity(
        combined_matrix[[mov_pos]], combined_matrix
    ).ravel()
    ranking = scores.argsort()[::-1]
    # Drop the query itself explicitly rather than assuming it ranks first.
    sim_positions = ranking[ranking != mov_pos][:top]

    def field(pos, col):
        # Read from the frame passed in, by position.
        return df[col].iloc[pos]

    # imdb_url = "https://www.imdb.com/title/"
    # tmdb_image = "https://image.tmdb.org/t/p/w500"
    poster = f"Poster : {field(mov_pos, 'image')}\n"

    print(color("~" * len(poster), "red"))
    print(f"Top {top} similar movies to {field(mov_pos, 'titre_str')} are :")
    print(f"Popularity {field(mov_pos, 'popularity')}")
    print(f"IMdb link : {field(mov_pos, 'url')}")
    print(f"Poster : {field(mov_pos, 'image')}")
    print(f"Youtube : {field(mov_pos, 'youtube')}")
    print(color("~" * len(poster), "red"))
    print()
    for pos in sim_positions:
        cmt = (
            f"Movie : {field(pos, 'titre_str')} | popularity {field(pos, 'popularity')}\n"
            + f"IMdb link : {field(pos, 'url')}\n"
            + f"Poster : {field(pos, 'image')}\n"
            + f"Youtube : {field(pos, 'youtube')}\n"
        )
        line = cmt.split("\n")
        # Underline as wide as the longest line of the entry.
        print(cmt + color("-" * len(max(line, key=len)), "green"))

In [64]:
def cv_algo(df: pd.DataFrame, movies: str, top: int = 5, algo: str = "knn"):
    """
    Print the ``top`` movies most similar to ``movies`` using bag-of-words
    counts of the ``one_for_all`` column and cosine similarity.

    Args:
        df: frame holding ``one_for_all`` plus ``titre_str``, ``popularity``,
            ``url``, ``image`` and ``youtube``.
        movies: free-text title, resolved to a row through ``pick_algo``.
        top: number of recommendations to print.
        algo: title matcher forwarded to ``pick_algo``.

    Returns:
        None. The result is printed.
    """
    count_matrix = CountVectorizer().fit_transform(df["one_for_all"])

    # pick_algo returns an index *label*; matrix rows are addressed by
    # *position*, so translate explicitly instead of assuming a RangeIndex.
    mov_idx = pick_algo(movies, df, algo)
    mov_pos = int(np.flatnonzero(df.index == mov_idx)[0])

    # Only the query row is needed, not the full n x n similarity matrix.
    scores = cosine_similarity(count_matrix[[mov_pos]], count_matrix).ravel()
    # Stable descending order: ties keep their original row order.
    ranking = (-scores).argsort(kind="stable")
    # Drop the query itself explicitly rather than assuming it ranks first.
    movie_positions = ranking[ranking != mov_pos][:top]

    def field(pos, col):
        # Read from the frame passed in, by position.
        return df[col].iloc[pos]

    # imdb_url = "https://www.imdb.com/title/"
    # tmdb_image = "https://image.tmdb.org/t/p/w500"
    poster = f"Poster : {field(mov_pos, 'image')}\n"

    print(color("~" * len(poster), "red"))
    print(f"Top {top} similar movies to {field(mov_pos, 'titre_str')} are :")
    print(f"Popularity {field(mov_pos, 'popularity')}")
    print(f"IMdb link : {field(mov_pos, 'url')}")
    print(f"Poster : {field(mov_pos, 'image')}")
    print(f"Youtube : {field(mov_pos, 'youtube')}")
    print(color("~" * len(poster), "red"))
    print()
    for pos in movie_positions:
        cmt = (
            f"Movie : {field(pos, 'titre_str')} | popularity {field(pos, 'popularity')}\n"
            + f"IMdb link : {field(pos, 'url')}\n"
            + f"Poster : {field(pos, 'image')}\n"
            + f"Youtube : {field(pos, 'youtube')}\n"
        )
        line = cmt.split("\n")
        # Underline as wide as the longest line of the entry.
        print(cmt + color("-" * len(max(line, key=len)), "green"))

In [65]:
def knn_algo(df: pd.DataFrame, movies: str, top: int = 5, algo: str = "knn"):
    """
    Print the ``top`` movies most similar to ``movies`` using a brute-force
    cosine ``NearestNeighbors`` search over bag-of-words counts of the
    ``one_for_all`` column.

    Args:
        df: frame holding ``one_for_all`` plus ``titre_str``, ``popularity``,
            ``url``, ``image`` and ``youtube``.
        movies: free-text title, resolved to a row through ``pick_algo``.
        top: number of recommendations to print.
        algo: title matcher forwarded to ``pick_algo``.

    Returns:
        None. The result is printed.
    """
    count_matrix = CountVectorizer().fit_transform(df["one_for_all"])

    # pick_algo returns an index *label*; matrix rows are addressed by
    # *position*, so translate explicitly instead of assuming a RangeIndex.
    mov_idx = pick_algo(movies, df, algo)
    mov_pos = int(np.flatnonzero(df.index == mov_idx)[0])

    knn_model = NearestNeighbors(metric="cosine", algorithm="brute").fit(
        count_matrix
    )
    # One extra neighbour to make room for the query itself; never ask for
    # more neighbours than there are rows.
    n_neighbors = min(top + 1, count_matrix.shape[0])
    _, indices = knn_model.kneighbors(
        count_matrix[[mov_pos]], n_neighbors=n_neighbors
    )
    neighbours = indices.ravel()
    # Drop the query itself explicitly rather than assuming it comes first.
    sim_positions = neighbours[neighbours != mov_pos][:top]

    def field(pos, col):
        # Read from the frame passed in, by position.
        return df[col].iloc[pos]

    poster = f"Poster : {field(mov_pos, 'image')}\n"
    print(color("~" * len(poster), "red"))
    print(f"Top {top} similar movies to {field(mov_pos, 'titre_str')} are :")
    print(f"Popularity {field(mov_pos, 'popularity')}")
    print(f"IMdb link : {field(mov_pos, 'url')}")
    print(f"Poster : {field(mov_pos, 'image')}")
    print(f"Youtube : {field(mov_pos, 'youtube')}")
    print(color("~" * len(poster), "red"))
    print()
    for pos in sim_positions:
        cmt = (
            f"Movie : {field(pos, 'titre_str')} | popularity {field(pos, 'popularity')}\n"
            + f"IMdb link : {field(pos, 'url')}\n"
            + f"Poster : {field(pos, 'image')}\n"
            + f"Youtube : {field(pos, 'youtube')}\n"
        )
        line = cmt.split("\n")
        # Underline as wide as the longest line of the entry.
        print(cmt + color("-" * len(max(line, key=len)), "green"))

AUTOPLAY (YouTube embed URL that starts playback automatically, muted and with controls hidden):
https://www.youtube.com/embed/MJ3Up7By5cw?autoplay=1&autohide=2&border=0&wmode=opaque&enablejsapi=1&modestbranding=1&controls=0&showinfo=1&mute=1

In [66]:
# Free-text title used as the query for the three recommender calls below.
movies = "fight club"

In [67]:
# Weighted TF-IDF recommender; the title is resolved with the "knn" matcher, 5 results requested.
tfidf_algo(df, movies, algo="knn", top=5)

[38;5;1m~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~[0m
Top 10 similar movies to Fight Club are :
Popularity 78.346
IMdb link : https://www.imdb.com/title/tt0137523
Poster : https://image.tmdb.org/t/p/w500/t1i10ptOivG4hV7erkX3tmKpiqm.jpg
Youtube : https://www.youtube.com/watch?v=tZpXdiB_pg0
[38;5;1m~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~[0m

Movie : The Social Network | popularity 42.296
IMdb link : https://www.imdb.com/title/tt1285016
Poster : https://image.tmdb.org/t/p/w500/cvUfwhoAReL4e5eegFCHM73rIda.jpg
Youtube : https://www.youtube.com/watch?v=SbQ19-ePch8
[38;5;2m------------------------------------------------------------------------[0m
Movie : Cake | popularity 18.407
IMdb link : https://www.imdb.com/title/tt3442006
Poster : https://image.tmdb.org/t/p/w500/dAo1FQVCav0kxUoMpAltIvQwaEj.jpg
Youtube : https://www.youtube.com/watch?v=Dv0QEJq6cLw
[38;5;2m--------------------------------------------------------------

In [68]:
# NearestNeighbors recommender on the "one_for_all" counts; same query, 5 results requested.
knn_algo(df, movies, algo="knn", top=5)

[38;5;1m~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~[0m
Top 10 similar movies to Fight Club are :
Popularity 78.346
IMdb link : https://www.imdb.com/title/tt0137523
Poster : https://image.tmdb.org/t/p/w500/t1i10ptOivG4hV7erkX3tmKpiqm.jpg
Youtube : https://www.youtube.com/watch?v=tZpXdiB_pg0
[38;5;1m~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~[0m

Movie : Seven | popularity 62.046
IMdb link : https://www.imdb.com/title/tt0114369
Poster : https://image.tmdb.org/t/p/w500/h4X4tJFxJobAImCMekVZXUpJVJC.jpg
Youtube : https://www.youtube.com/watch?v=ObOexjW0dgY
[38;5;2m------------------------------------------------------------------------[0m
Movie : Brooklyn Affairs | popularity 19.256
IMdb link : https://www.imdb.com/title/tt0385887
Poster : https://image.tmdb.org/t/p/w500/lI0pdnSsyQvWcJa1k1mFMFZNbMY.jpg
Youtube : https://www.youtube.com/watch?v=yeAItVQa6ok
[38;5;2m---------------------------------------------------------------

In [69]:
# CountVectorizer + cosine-similarity recommender on "one_for_all"; same query, 5 results requested.
cv_algo(df, movies, algo="knn", top=5)

[38;5;1m~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~[0m
Top 10 similar movies to Fight Club are :
Popularity 78.346
IMdb link : https://www.imdb.com/title/tt0137523
Poster : https://image.tmdb.org/t/p/w500/t1i10ptOivG4hV7erkX3tmKpiqm.jpg
Youtube : https://www.youtube.com/watch?v=tZpXdiB_pg0
[38;5;1m~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~[0m

Movie : Seven | popularity 62.046
IMdb link : https://www.imdb.com/title/tt0114369
Poster : https://image.tmdb.org/t/p/w500/h4X4tJFxJobAImCMekVZXUpJVJC.jpg
Youtube : https://www.youtube.com/watch?v=ObOexjW0dgY
[38;5;2m------------------------------------------------------------------------[0m
Movie : Brooklyn Affairs | popularity 19.256
IMdb link : https://www.imdb.com/title/tt0385887
Poster : https://image.tmdb.org/t/p/w500/lI0pdnSsyQvWcJa1k1mFMFZNbMY.jpg
Youtube : https://www.youtube.com/watch?v=yeAItVQa6ok
[38;5;2m---------------------------------------------------------------

In [70]:
# Disabled call: no `random_forest_algo` function is defined in the cells shown above.
# random_forest_algo(df, movies, top=5)