In [None]:

%load_ext autoreload
%autoreload 2
%reload_ext autoreload

import pandas as pd
import numpy as np
import hjson
import re
import matplotlib.pyplot as plt
from fuzzywuzzy import process
from unicodedata import normalize, combining
from datetime import datetime

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import NearestNeighbors
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

from get_dataframes import GetDataframes
from tools import import_config, import_datasets, check_titre
from scipy.sparse import hstack

pd.set_option('display.max_columns', None)

In [None]:
# tmd = import_datasets("clean_datasets/tmdb_updated.parquet", "parquet")

In [None]:
# Load the project configuration, then ask the dataframe factory for the
# cleaned "machine_learning" dataset used throughout this notebook.
config = import_config()
datas = GetDataframes(config)
df = datas.get_dataframes("machine_learning", cleaned=True)

In [None]:
# Display the column labels of the freshly loaded frame.
df.columns


In [None]:
# Parse release_date into pandas datetimes so the year can be extracted next.
df["release_date"] = pd.to_datetime(df["release_date"])

In [None]:
# Keep only the four-digit year (as a string). The vectorized .dt accessor
# leaves missing dates (NaT) as NaN, whereas calling NaT.strftime() row by row
# raises and aborts the whole cell.
df["release_date"] = df["release_date"].dt.strftime("%Y")

In [None]:
# tmd = import_datasets("clean_datasets/tmdb_updated.parquet", "parquet")

In [None]:
# titre_clean is a lower-cased copy of titre_str; titre_str itself is left untouched.
df["titre_clean"] = df["titre_str"].apply(lambda titre: titre.lower())

In [None]:
# Lazily-filled cache so the stop-word list and the lemmatizer are built once
# for the whole notebook instead of once per word / once per row.
_OVERVIEW_TOOLS = {}


def _overview_tools():
    """Return the cached ``(stop-word frozenset, lemmatizer)`` pair."""
    if not _OVERVIEW_TOOLS:
        _OVERVIEW_TOOLS["stopwords"] = frozenset(stopwords.words('english'))
        _OVERVIEW_TOOLS["lemmatizer"] = WordNetLemmatizer()
    return _OVERVIEW_TOOLS["stopwords"], _OVERVIEW_TOOLS["lemmatizer"]


def clean_overview(
    text: str
) -> str:
    """Normalize a free-text overview for vectorization.

    The text is lower-cased, every character outside ``a-z`` is replaced by a
    space, English stop words are dropped and the remaining words are
    lemmatized with WordNet.

    Args:
        text: Raw overview text.

    Returns:
        The cleaned words joined by single spaces (possibly an empty string).
    """
    english_stopwords, lemmatizer = _overview_tools()
    text = text.lower()
    text = re.sub(r'[^a-z]', ' ', text)
    # Set membership is O(1); the original reloaded and scanned the full
    # stop-word list for every single word.
    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in english_stopwords
    )


In [None]:
def supprimer_accents(texte):
    """Strip accents from ``texte``.

    The string is decomposed with Unicode NFKD normalization and every
    combining character (the detached accent marks) is dropped.
    """
    decomposed = normalize('NFKD', texte)
    kept = (char for char in decomposed if combining(char) == 0)
    return "".join(kept)

# Text columns whose accents are stripped in place.
tt = [
    "actors",
    "titre_genres",
    "directors",
    "titre_clean",
]
# Each column is cast to str first, so non-string values are converted to
# their string form before the accent removal runs.
for t in tt:
    df[t] = df[t].astype(str).apply(supprimer_accents)

In [None]:
# Run clean_overview over the overview column (values are cast to str first).
print("cleaning overview")
df['overview'] = df['overview'].astype(str).apply(clean_overview)
# Disabled: the same cleaning applied to titre_clean.
# print("cleaning titre_clean")
# df['titre_clean'] = df['titre_clean'].astype(str).apply(clean_overview)

In [None]:
def full_lower(text: str):
    """Return ``text`` lower-cased with every space and hyphen removed."""
    without_separators = text.translate(str.maketrans("", "", " -"))
    return without_separators.lower()

tt = [
    "actors",
    "titre_genres",
    "directors",
    "titre_clean",
]
# Iterate over a snapshot of `tt`: the loop removes "titre_clean" from the
# list, and removing from a list while iterating over that same list silently
# skips the element following the removed one (it only worked here because
# "titre_clean" happens to be last).
for t in list(tt):
    print(f"lowering everything in {t}")
    if "titre_clean" in t:
        # Titles keep their spaces; only ":" is replaced by a space.
        df[t] = df[t].apply(lambda x : x.replace(":", " "))
        tt.remove(t)
    else:
        # Other columns are collapsed: no spaces, no hyphens, lower case.
        df[t] = df[t].apply(full_lower)
In [None]:
# Snapshot the cleaned frame with a fresh 0..n-1 index and write it to CSV.
# reset_index already returns a new frame, so no explicit copy or inplace
# mutation is needed; `drop` is a boolean flag (the former drop='index' only
# worked because a non-empty string is truthy).
dfff = df.reset_index(drop=True)
name = "machine_learning.csv"
dfff.to_csv(name, index=False)

In [None]:
# Reload the CSV snapshot; the helper functions below read this global `dff`.
# NOTE(review): read_csv parses empty fields and the literal "nan" as NaN by
# default, so text columns may contain missing values after the round trip —
# confirm against the data.
name = "machine_learning.csv"
dff = pd.read_csv(name)

In [None]:
def titre_index(titre: str):
    """Return the index label of the first ``dff`` row whose ``titre_str`` equals ``titre``.

    Raises:
        IndexError: if no row matches.
    """
    exact_match = dff["titre_str"] == titre
    return dff[exact_match].index[0]

def director_index(director: str):
    """Return the index label of the first ``dff`` row whose ``directors`` contains ``director``.

    ``director`` is interpreted as a regular expression (pandas default for
    ``str.contains``) and the match is case-sensitive.

    Raises:
        IndexError: if no row matches.
    """
    # na=False: rows with a missing value count as "no match". Without it the
    # mask contains NaN and boolean indexing raises.
    matches = dff["directors"].str.contains(director, na=False)
    return dff[matches].index[0]

def actor_index(actor: str):
    """Return the index label of the first ``dff`` row whose ``actors`` contains ``actor``.

    ``actor`` is interpreted as a regular expression (pandas default for
    ``str.contains``) and the match is case-sensitive.

    Raises:
        IndexError: if no row matches.
    """
    # na=False: rows with a missing value count as "no match". Without it the
    # mask contains NaN and boolean indexing raises.
    matches = dff["actors"].str.contains(actor, na=False)
    return dff[matches].index[0]

def idx_titre(idx: int):
    """Return the ``titre_str`` value of the ``dff`` row whose index label is ``idx``."""
    row_mask = dff.index == idx
    return dff.loc[row_mask, "titre_str"].values[0]

def idx_actor(idx: int):
    """Return the ``actors`` value of the ``dff`` row whose index label is ``idx``."""
    row_mask = dff.index == idx
    return dff.loc[row_mask, "actors"].values[0]

def idx_titre_id(idx: int):
    """Return the ``titre_id`` value of the ``dff`` row whose index label is ``idx``."""
    row_mask = dff.index == idx
    return dff.loc[row_mask, "titre_id"].values[0]

def idx_poster_path(idx: int):
    """Return the ``poster_path`` value of the ``dff`` row whose index label is ``idx``."""
    row_mask = dff.index == idx
    return dff.loc[row_mask, "poster_path"].values[0]

def idx_popularity(idx: int):
    """Return the ``popularity`` value of the ``dff`` row whose index label is ``idx``."""
    row_mask = dff.index == idx
    return dff.loc[row_mask, "popularity"].values[0]

def check_titre_str(d: pd.DataFrame, movie: str):
    """Return the rows of ``d`` whose ``titre_str`` contains ``movie``.

    ``movie`` is interpreted as a regular expression (pandas default for
    ``str.contains``) and the match is case-sensitive. Rows with a missing
    title never match.

    Args:
        d: Frame to search; must have a ``titre_str`` column.
        movie: Pattern to look for.

    Returns:
        The matching sub-frame (empty when nothing matches).
    """
    # Search the frame that was passed in; the original ignored `d` and always
    # read the global `dff`.
    return d[d["titre_str"].str.contains(movie, na=False)]

In [None]:
# Spot check using check_titre from the project `tools` module.
# NOTE(review): presumably looks up titles matching the query in `dff` — confirm in tools.py.
check_titre(dff, "oppenheimer")

In [None]:
def combine(r):
    """Build the single text document describing one movie row.

    Concatenates, separated by single spaces: genres, directors, overview,
    actors, release year, popularity and revenue.

    Args:
        r: A row (Series or mapping) with the keys ``titre_genres``,
            ``directors``, ``overview``, ``actors``, ``release_date``,
            ``popularity`` and ``revenue``.

    Returns:
        The combined string.
    """
    def _text(value):
        # A missing text field contributes an empty string. Concatenating a
        # float NaN with " " would otherwise raise TypeError.
        return "" if pd.isna(value) else str(value)

    text_part = " ".join(
        _text(r[field])
        for field in ("titre_genres", "directors", "overview", "actors")
    )
    numeric_part = " ".join(
        str(r[field])
        for field in ("release_date", "popularity", "revenue")
        # rating_avg / rating_votes are intentionally left out for now.
    )
    return text_part + " " + numeric_part

# Row-wise: store the combined text document of every movie in `one_for_all`.
dff["one_for_all"] = dff.apply(combine, axis=1)

In [None]:
def get_best_match_index_tfidf(movies: str, df: pd.DataFrame):
    """
    Resolve a free-text title to a row of ``df`` with FuzzyWuzzy and TF-IDF.

    FuzzyWuzzy shortlists the ten ``titre_clean`` values closest to
    ``movies``; the shortlist is then ranked by TF-IDF cosine similarity with
    the query. Intermediate results are printed. Returns the index label of
    the first row whose ``titre_clean`` equals the winning title.
    """
    # movies = movies.lower().replace(" ", "")
    fuzzy_hits = process.extract(movies, df['titre_clean'].values, limit=10)
    shortlist = [hit[0] for hit in fuzzy_hits]
    print("best_matches", fuzzy_hits)
    print("best_candidate", shortlist)
    print("movies", movies)

    # The vocabulary is learnt on the shortlist only, then applied to the query.
    vectorizer = TfidfVectorizer()
    shortlist_vectors = vectorizer.fit_transform(shortlist)
    query_vector = vectorizer.transform([movies])

    scores = cosine_similarity(query_vector, shortlist_vectors).flatten()

    winner_pos = scores.argmax()
    winner_title = shortlist[winner_pos]
    print("best_match_idx", winner_pos)
    print("best_match_titre", winner_title)

    same_title = df['titre_clean'] == winner_title
    return df[same_title].index[0]


In [None]:
def get_best_match_index_knn(movies: str, df: pd.DataFrame):
    """
    Resolve a free-text title to a row of ``df`` with FuzzyWuzzy and nearest neighbours.

    FuzzyWuzzy shortlists the ten ``titre_clean`` values closest to
    ``movies``; the rows carrying those titles are TF-IDF vectorized and a
    1-nearest-neighbour search picks the row closest to the query.
    Intermediate results are printed. Returns the index label of the first
    shortlisted row whose ``titre_clean`` equals the winning title.
    """
    fuzzy_hits = process.extract(movies, df['titre_clean'].values, limit=10)
    shortlist = [hit[0] for hit in fuzzy_hits]
    print("best_matches :\n", fuzzy_hits)
    print("best_candidate :\n", shortlist)
    print()

    # Keep every row whose cleaned title is in the shortlist.
    in_shortlist = df['titre_clean'].isin(shortlist)
    small_df = df[in_shortlist]
    print(small_df[["titre_id", "titre_str", "titre_genres"]].to_markdown())
    print()

    vectorizer = TfidfVectorizer()
    title_vectors = vectorizer.fit_transform(small_df['titre_clean'].values)

    nearest = NearestNeighbors(n_neighbors=1)
    nearest.fit(title_vectors)
    query_vector = vectorizer.transform([movies])

    print("query_vector\n", query_vector)

    _distances, neighbour_pos = nearest.kneighbors(query_vector, return_distance=True)

    winner_pos = neighbour_pos[0][0]
    winner_title = small_df['titre_clean'].iloc[winner_pos]
    print("best_match_idx :", winner_pos)
    print("best_match_titre :", winner_title)
    print()

    same_title = small_df['titre_clean'] == winner_title
    return small_df[same_title].index[0]

In [None]:
def get_best_match_index_rf(movies: str, df: pd.DataFrame):
    """
    Resolve a free-text title with a RandomForestClassifier (not implemented yet).

    Raises:
        NotImplementedError: always; the approach below is an unfinished draft.
    """
    # Author's note: the approach is not understood well enough yet and needs
    # more research, so the function is disabled on purpose.
    raise NotImplementedError

    # --- Unreachable draft, kept for reference -----------------------------
    tfidf = TfidfVectorizer()
    X = tfidf.fit_transform(df['titre_clean'].values)
    y = df['titre_clean'].values

    # Keep the fitted encoder: inverse_transform lives on the encoder, not on
    # the encoded array.
    encoder = LabelEncoder()
    y_encoded = encoder.fit_transform(y)
    rf = RandomForestClassifier().fit(X, y_encoded)

    vector = tfidf.transform([movies])
    prediction = rf.predict(vector)

    predict = encoder.inverse_transform(prediction)[0]
    return df[df['titre_clean'] == predict].index[0]

In [None]:
def pick_algo(movies: str, df: pd.DataFrame, algo: str = "tfidf"):
    """Resolve ``movies`` to a row index of ``df`` with the chosen matcher.

    Args:
        movies: Free-text movie title to look up.
        df: Frame passed through to the matcher.
        algo: ``"tfidf"``, ``"knn"`` or ``"rf"``.

    Returns:
        Whatever the selected ``get_best_match_index_*`` function returns.

    Raises:
        ValueError: if ``algo`` is not one of the supported names (previously
            the function silently returned ``None``).
    """
    matchers = {
        "tfidf": get_best_match_index_tfidf,
        "knn": get_best_match_index_knn,
        "rf": get_best_match_index_rf,
    }
    if algo not in matchers:
        raise ValueError(
            f"Unknown algo {algo!r}; expected one of {sorted(matchers)}"
        )
    return matchers[algo](movies, df)

In [None]:
# Display the working frame (now including the one_for_all column).
dff

In [None]:
def tfidf_algo(df:pd.DataFrame, movies: str, top: int = 10, algo: str = "tfidf"):
    """Print the ``top`` movies most similar to ``movies`` using weighted TF-IDF.

    Each text column is TF-IDF vectorized separately, scaled by its weight,
    and the blocks are stacked side by side; similarity is the cosine between
    the query movie's row and every other row.

    Args:
        df: Frame with the columns ``titre_genres``, ``actors``,
            ``directors``, ``overview``, ``titre_str``, ``titre_id`` and
            ``titre_clean`` (the latter is used by the title matcher).
        movies: Free-text title of the reference movie.
        top: Number of recommendations to print.
        algo: Title matcher passed to ``pick_algo``.

    Returns:
        None; the results are printed.
    """
    poids_ = {
        "titre_genres": 2,
        "actors":       1.5,
        "directors":    1.5,
        "overview":     2.5,
    }

    # Missing text is treated as an empty document; TfidfVectorizer rejects NaN.
    weighted_blocks = [
        TfidfVectorizer().fit_transform(df[col].fillna("").astype(str)) * poids
        for col, poids in poids_.items()
    ]
    # CSR format so a single row can be sliced out below.
    combined_matrix = hstack(weighted_blocks, format="csr")

    mov_idx = pick_algo(movies, df, algo)
    # pick_algo returns an index *label*; matrix rows are addressed by position.
    mov_pos = df.index.get_loc(mov_idx)

    # Only the query row is compared with the catalogue; the full N x N
    # similarity matrix is never needed.
    similar = cosine_similarity(combined_matrix[mov_pos], combined_matrix).ravel()

    # Rank by decreasing similarity and drop the query movie itself by
    # position, rather than assuming it is always ranked first.
    ranked = similar.argsort()[::-1]
    ranked = ranked[ranked != mov_pos][:top]

    best_match = df["titre_str"].iloc[mov_pos]
    mov_id = df["titre_id"].iloc[mov_pos]

    print()
    print(f"Top {top} similar movies to {best_match} idx {mov_id} are :\n")
    for pos in ranked:
        movies_ = df["titre_str"].iloc[pos]
        ttconst = df["titre_id"].iloc[pos]
        # Score taken at the same position as the title, so the two always
        # belong to the same movie.
        print(f"Movie : {movies_} | id : {ttconst} | score : {np.round(similar[pos], 4)}")

In [None]:
def knn_algo(df: pd.DataFrame, movies: str, top: int = 5, algo: str = "knn"):
    """Print the ``top`` nearest neighbours of ``movies`` in bag-of-words space.

    The ``one_for_all`` column is count-vectorized and a brute-force cosine
    nearest-neighbour search is run from the query movie's row.

    Args:
        df: Frame with a ``one_for_all`` text column (and ``titre_clean`` for
            the title matcher).
        movies: Free-text title of the reference movie.
        top: Number of recommendations to print.
        algo: Title matcher passed to ``pick_algo``.

    Returns:
        None; the results are printed. Titles, ids, posters and popularity
        are read through the ``idx_*`` helpers.
    """
    cv = CountVectorizer()
    count_matrix = cv.fit_transform(df['one_for_all'])
    mov_idx = pick_algo(movies, df, algo)
    print("movie_index =", mov_idx)

    knn_model = NearestNeighbors(metric='cosine', algorithm='brute').fit(count_matrix)
    # One extra neighbour is requested because the query movie is normally
    # returned as well; never ask for more neighbours than there are rows.
    n_neighbors = min(top + 1, count_matrix.shape[0])
    dist, indices = knn_model.kneighbors(count_matrix[mov_idx], n_neighbors=n_neighbors)

    # Drop the query movie by identity, not by assuming it comes first
    # (exact duplicates tie at distance 0).
    neighbours = [
        (index, dis)
        for index, dis in zip(indices.flatten(), dist.flatten())
        if index != mov_idx
    ][:top]

    print()
    print(f"Top {top} similar movies to {idx_titre(mov_idx)} are :")
    print(f"Popularity {idx_popularity(mov_idx)}")
    print(f"IMdb link : https://www.imdb.com/title/{idx_titre_id(mov_idx)}")
    poster = f"Poster : https://image.tmdb.org/t/p/w500{idx_poster_path(mov_idx)}\n"
    print(poster+"*"*len(poster)+"\n\n")
    for index, dis in neighbours:
        # "score" is the cosine distance: smaller means more similar.
        cmt = (
            f"Movie : {idx_titre(index)} | popularity {idx_popularity(index)} | score : {np.round(dis, 4)}\n" +
            f"IMdb link : https://www.imdb.com/title/{idx_titre_id(index)}\n"
            f"Poster : https://image.tmdb.org/t/p/w500{idx_poster_path(index)}\n")
        line = cmt.split('\n')
        # Separator as wide as the poster line (third line of the block).
        print(cmt+"-"*len(line[2]))


In [None]:
# movies = "platform"
# tfidf_algo(dff, movies, algo="knn", top=5)

In [None]:
# Example run: five nearest neighbours for the query "platform", with the
# title resolved by the KNN matcher.
movies = "platform"
knn_algo(dff, movies, algo="knn", top=5)

In [None]:
def random_forest_algo(df: pd.DataFrame, movies: str, top: int = 5, algo = "knn"):
    """Print up to ``top`` random movies sharing the genre predicted for ``movies``.

    A RandomForestClassifier is trained to predict the encoded
    ``titre_genres`` label from the count-vectorized ``one_for_all`` text; the
    genre predicted for the query movie selects the candidate pool, from which
    recommendations are drawn at random without replacement.

    Args:
        df: Frame with ``one_for_all`` and ``titre_genres`` columns (and
            ``titre_clean`` for the title matcher).
        movies: Free-text title of the reference movie.
        top: Maximum number of recommendations to print.
        algo: Title matcher passed to ``pick_algo``.

    Returns:
        None; the results are printed. The draw is not seeded, so results
        differ between runs.
    """
    mov_idx = pick_algo(movies, df, algo)
    print("movie_index =", mov_idx)

    cv = CountVectorizer()
    count_matrix = cv.fit_transform(df['one_for_all'])

    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(df['titre_genres'])

    rf_model = RandomForestClassifier()
    rf_model.fit(count_matrix, y)

    predicted_genre = rf_model.predict(count_matrix[mov_idx])

    same_genre_idx = np.where(y == predicted_genre[0])[0]
    # Never recommend the query movie itself.
    same_genre_idx = same_genre_idx[same_genre_idx != mov_idx]

    # Sampling without replacement raises when asked for more items than
    # available, so cap the sample size at the pool size.
    sample_size = min(top, len(same_genre_idx))
    if sample_size:
        recommended_indices = np.random.choice(same_genre_idx, size=sample_size, replace=False)
    else:
        recommended_indices = []

    print()
    print(f"Top {top} similar movies to {idx_titre(mov_idx)} are :")
    print(f"Popularity {idx_popularity(mov_idx)}")
    print(f"IMdb link : https://www.imdb.com/title/{idx_titre_id(mov_idx)}")
    poster = f"Poster : https://image.tmdb.org/t/p/w500{idx_poster_path(mov_idx)}\n"
    print(poster+"*"*len(poster)+"\n\n")
    for mov_id in recommended_indices:
        # The newline after the popularity was missing: it glued the IMDb link
        # to the first line and left the separator below empty.
        cmt = (
            f"Movie : {idx_titre(mov_id)} | popularity {idx_popularity(mov_id)}\n" +
            f"IMdb link : https://www.imdb.com/title/{idx_titre_id(mov_id)}\n"
            f"Poster : https://image.tmdb.org/t/p/w500{idx_poster_path(mov_id)}\n")
        line = cmt.split('\n')
        # Separator as wide as the poster line (third line of the block).
        print(cmt+"-"*len(line[2]))

In [None]:
# movies = "platform"
# random_forest_algo(dff, movies, top=5)