In [73]:

%load_ext autoreload
%autoreload 2
%reload_ext autoreload

import pandas as pd
import numpy as np
import hjson
import re
import matplotlib.pyplot as plt
from fuzzywuzzy import process
from unicodedata import normalize, combining
from datetime import datetime

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import NearestNeighbors
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

from get_dataframes import GetDataframes
from tools import import_config, import_datasets, check_titre
from scipy.sparse import hstack

pd.set_option('display.max_columns', None)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [33]:
config = import_config()


In [54]:
df = import_datasets("clean_datasets/tmdb_updated.parquet", "parquet")

2023-11-12 14:24:58 INFO     Parquet loaded ! Importing tmdb_updated...


In [55]:
col = [
    'imdb_id',
    'title',
    # 'adult',
    # 'backdrop_path',
    # 'budget',
    'genres',
    'actors',
    'director',
    'keywords',
    # 'id',
    # 'original_language',
    # 'original_title',
    'overview',
    'popularity',
    # 'production_countries',
    'release_date',
    # 'revenue',
    # 'runtime',
    # 'spoken_languages',
    # 'status',
    # 'tagline',
    # 'video',
    'vote_average',
    'vote_count',
    'poster_path',
    # 'production_companies_name',
]

In [56]:
df = df[col]

In [57]:
col_rename = {
    "imdb_id": "titre_id",
    "title": "titre_str",
    'genres': "titre_genres",
    'actors': 'actors',
    'director': "director",
    'keywords': "keywords",
    'overview': "overview",
    'popularity': "popularity",
    'release_date': "date",
    'vote_average': "rating_avg",
    'vote_count': "rating_vote",
    'poster_path': "image"
}

In [58]:
df.rename(columns=col_rename, inplace=True)

In [65]:
df.sort_values(by="date", inplace=True)

In [None]:
tt = [
    "actors",
    "titre_genres",
    "director",
    "keywords",
]
for t in tt:
    df[t] = df[t].apply(
        lambda x: ", ".join(map(str, x))
    ).replace(" ", "")

In [69]:
df["titre_clean"] = df["titre_str"]
df["titre_clean"] = df["titre_clean"].apply(lambda x : x.lower())

In [70]:
def clean_overview(
    text: str
) -> str:
    text = text.lower()
    text = re.sub(r'[^a-z]', ' ', text)
    words = text.split()
    words = [w for w in words if w not in stopwords.words('french')]
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)


In [71]:
def supprimer_accents(texte):
    texte_clean = normalize('NFKD', texte)
    return "".join(
        [c for c in texte_clean if not combining(c)]
    )

tt = [
    "actors",
    "titre_genres",
    "keywords",
    "director",
    "titre_clean",
]
for t in tt:
    df[t] = df[t].astype(str).apply(supprimer_accents)

In [72]:
print("cleaning overview")
df['overview'] = df['overview'].astype(str).apply(clean_overview)
# print("cleaning titre_clean")
# df['titre_clean'] = df['titre_clean'].astype(str).apply(clean_overview)

cleaning overview


In [74]:
df.to_csv("new_one.csv", index=False)

In [76]:
def full_lower(text: str):
    return text.replace(" ", "").replace("-", "").lower()

tt = [
    "actors",
    "titre_genres",
    "director",
    "keywords",
    "titre_clean",
]
for t in tt:
    print(f"lowering everything in {t}")
    if "titre_clean" in t:
        df[t] = df[t].apply(lambda x : x.replace(":", " "))
        tt.remove(t)
    else:
        df[t] = df[t].apply(full_lower)

lowering everything in actors
lowering everything in titre_genres
lowering everything in director
lowering everything in keywords
lowering everything in titre_clean


In [97]:
dfff = df.copy()
dfff.reset_index(drop='index', inplace=True)
name = "machine_learning.parquet"
dfff.to_parquet(name)

In [98]:
dff = pd.read_parquet(name)

In [127]:
def titre_index(titre: str):
    return dff[dff.titre_str == titre].index[0]

def director_index(director: str):
    return dff[dff.directors.str.contains(director)].index[0]

def actor_index(actor: str):
    return dff[dff.actors.str.contains(actor)].index[0]

def idx_titre(idx: int):
    return dff[dff.index == idx]["titre_str"].values[0]

def idx_actor(idx: int):
    return dff[dff.index == idx]["actors"].values[0]

def idx_titre_id(idx: int):
    return dff[dff.index == idx]["titre_id"].values[0]

def idx_poster_path(idx: int):
    return dff[dff.index == idx]["image"].values[0]

def idx_popularity(idx: int):
    return dff[dff.index == idx]["popularity"].values[0]

def idx_keywords(idx: int):
    return dff[dff.index == idx]["keywords"].values[0]



def check_titre_str(d: pd.DataFrame, movie: str):
    return dff[dff["titre_str"].str.contains(movie)]

In [128]:
check_titre(dff, "oppenheimer")

Unnamed: 0,titre_id,titre_str,titre_genres,actors,director,keywords,overview,popularity,date,rating_avg,rating_vote,image,titre_clean,one_for_all
3701,tt15398776,Oppenheimer,"drame,histoire","cillianmurphy,emilyblunt,mattdamon,robertdowne...",christophernolan,"basedonnovelorbook,husbandwiferelationship,new...",convaincus allemagne nazie train velopper arme...,986.29,2023-07-19,8.25,4225,/boAUuJBeID7VNp4L7LNMQs8mfQS.jpg,oppenheimer,"drame,histoire christophernolan convaincus all..."


In [129]:
dff.dtypes

titre_id                object
titre_str               object
titre_genres            object
actors                  object
director                object
keywords                object
overview                object
popularity             float64
date            datetime64[ns]
rating_avg             float64
rating_vote              int64
image                   object
titre_clean             object
one_for_all             object
dtype: object

In [177]:
def combine(r):
    return (
        r["titre_genres"]+" "+
        r["director"]+" "+
        r["overview"]+" "+
        r["actors"]+" "+
        # str(r["date"])+" "+
        str(r["popularity"])+" "+
        r["keywords"]+" "+
        str(r["rating_avg"])+ " "+
        str(r["rating_vote"])
    )

dff["one_for_all"] = dff.apply(combine, axis=1)

In [178]:
def get_best_match_index_tfidf(movies: str, df: pd.DataFrame):
    """
    Utilisation de FuzzyWuzzy et TfidfVectorizer
    """
    # movies = movies.lower().replace(" ", "")
    best_match = process.extract(movies, df['titre_clean'].values, limit=10)
    best_candidate = [match[0] for match in best_match]
    print("best_matches",best_match)
    print("best_candidate",best_candidate)
    print("movies",movies)

    tfidf = TfidfVectorizer()
    matrix = tfidf.fit_transform(best_candidate)
    tfidf_ = tfidf.transform([movies])

    cosine_similarities = cosine_similarity(tfidf_, matrix).flatten()

    best_match_idx = cosine_similarities.argmax()
    best_match_titre = best_candidate[best_match_idx]
    print("best_match_idx",best_match_idx)
    print("best_match_titre",best_match_titre)
    return df[df['titre_clean'] == best_match_titre].index[0]


In [179]:
def get_best_match_index_knn(movies: str, df: pd.DataFrame):
    """
    Utilisation de FuzzyWuzzy et Nearest Neighbors
    """
    best_match = process.extract(movies, df['titre_clean'].values, limit=10)
    best_candidate = [match[0] for match in best_match]
    print("best_matches :\n",best_match)
    print("best_candidate :\n",best_candidate)
    print()

    small_df = df[df['titre_clean'].isin(best_candidate)]
    print(small_df[["titre_id", "titre_str", "titre_genres", "keywords"]].to_markdown())
    print()

    tfidf = TfidfVectorizer()
    matrix = tfidf.fit_transform(small_df['titre_clean'].values)

    knn = NearestNeighbors(n_neighbors=1).fit(matrix)
    vector = tfidf.transform([movies])

    print("query_vector\n",vector)

    dist, idx = knn.kneighbors(vector, return_distance=True)

    best_match_idx = idx[0][0]
    best_match_titre = small_df['titre_clean'].iloc[best_match_idx]
    print("best_match_idx :",best_match_idx)
    print("best_match_titre :",best_match_titre)
    print()
    return small_df[small_df['titre_clean'] == best_match_titre].index[0]

In [180]:
def get_best_match_index_rf(movies: str, df: pd.DataFrame):
    # Je capte pas le fonctionnement, j'ai besoin de plus de recherche
    raise NotImplementedError
    """
    Utilisation de FuzzyWuzzy et RandomForestClassifier
    """

    tfidf = TfidfVectorizer()
    X = tfidf.fit_transform(df['titre_clean'].values)
    y = df['titre_clean'].values

    y_encoded = LabelEncoder().fit_transform(y)
    rf = RandomForestClassifier().fit(X, y_encoded)

    vector = tfidf.transform([movies])
    prediction = rf.predict(vector)

    predict = y_encoded.inverse_transform(prediction)[0]
    return df[df['titre_clean'] == predict].index[0]

In [181]:
def pick_algo(movies: str, df: pd.DataFrame, algo: str = "tfidf"):
    if algo == "tfidf":
        return get_best_match_index_tfidf(movies, df)
    elif algo == "knn":
        return get_best_match_index_knn(movies, df)
    elif algo == "rf":
        return get_best_match_index_rf(movies, df)

In [182]:
def tfidf_algo(df:pd.DataFrame, movies: str, top: int = 10, algo: str = "tfidf"):
    poids_ = {
        "titre_genres": 2,
        "actors":       1.5,
        "directors":    1.5,
        "overview":     2.5,
    }

    full_matrix = []
    for col, poids in poids_.items():
        tfidf_ = TfidfVectorizer()
        matrix_ = tfidf_.fit_transform(df[col]) * poids
        full_matrix.append(matrix_)

    combined_matrix = hstack(full_matrix)
    cosine = cosine_similarity(combined_matrix)

    mov_idx = pick_algo(movies, df, algo)
    best_match = idx_titre(mov_idx)
    mov_id = idx_titre_id(mov_idx)

    similar = cosine[mov_idx]
    similar1 = list(enumerate(cosine[mov_idx]))

    sim_scores = sorted(similar1, key=lambda x: x[1], reverse=True)
    sim_mov_idx = similar.argsort()[::-1][1:top+1]

    same_movies = df.loc[sim_mov_idx, "titre_str"]
    ttconst = df.loc[sim_mov_idx, "titre_id"]

    sim_scores[1:top+1]
    score = [i[1] for i in sim_scores]
    print()
    print(f"Top 10 similar movies to {best_match} idx {mov_id} are :\n")
    for movies_, tt,  score in zip(same_movies, ttconst, score):
        print(f"Movie : {movies_} | id : {tt} | score : {np.round(score, 4)}")

In [183]:
def knn_algo(df: pd.DataFrame, movies: str, top: int = 5, algo: str = "knn"):
    cv = CountVectorizer()
    count_matrix = cv.fit_transform(df['one_for_all'])
    mov_idx = pick_algo(movies, df, algo)
    print("movie_index =", mov_idx)

    knn_model = NearestNeighbors(metric='cosine', algorithm='brute').fit(count_matrix)
    dist, indices = knn_model.kneighbors(count_matrix[mov_idx], n_neighbors=top+1)
    print()
    print(f"Top 10 similar movies to {idx_titre(mov_idx)} are :")
    print(f"Popularity {idx_popularity(mov_idx)}")
    print(f"IMdb link : https://www.imdb.com/title/{idx_titre_id(mov_idx)}")
    poster = f"Poster : https://image.tmdb.org/t/p/w500{idx_poster_path(mov_idx)}\n"
    print(poster+"*"*len(poster)+"\n\n")
    for index, dis in zip(indices.flatten()[1:], dist.flatten()[1:]):
        cmt = (
            f"Movie : {idx_titre(index)} | popularity {idx_popularity(index)} | score : {np.round(dis, 4)}\n" +
            f"IMdb link : https://www.imdb.com/title/{idx_titre_id(index)}\n"
            f"Poster : https://image.tmdb.org/t/p/w500{idx_poster_path(index)}\n")
        line = cmt.split('\n')
        print(cmt+"-"*len(line[2]))


In [184]:
# movies = "platform"
# tfidf_algo(dff, movies, algo="knn", top=5)

In [None]:
# basedonnovelorbook,supportgroup,dualidentity,nihilism,fight

In [189]:
movies = "La ligne verte"
knn_algo(dff, movies, algo="knn", top=5)

best_matches :
 [('la ligne verte', 100), ('la melodie du bonheur', 86), ('le bon, la brute et le truand', 86), ('le livre de la jungle', 86), ('la planete des singes', 86), ('la nuit des morts-vivants', 86), ('le secret de la planete des singes', 86), ('charlie et la chocolaterie', 86), ('massacre a la tronconneuse', 86), ('josey wales   hors-la-loi', 86)]
best_candidate :
 ['la ligne verte', 'la melodie du bonheur', 'le bon, la brute et le truand', 'le livre de la jungle', 'la planete des singes', 'la nuit des morts-vivants', 'le secret de la planete des singes', 'charlie et la chocolaterie', 'massacre a la tronconneuse', 'josey wales   hors-la-loi']

|      | titre_id   | titre_str                          | titre_genres                            | keywords                                                                          |
|-----:|:-----------|:-----------------------------------|:----------------------------------------|:----------------------------------------------------

In [186]:
def random_forest_algo(df: pd.DataFrame, movies: str, top: int = 5, algo = "knn"):
    mov_idx = pick_algo(movies, df, algo)
    print("movie_index =", mov_idx)

    cv = CountVectorizer()
    count_matrix = cv.fit_transform(df['one_for_all'])

    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(df['titre_genres'])

    rf_model = RandomForestClassifier()
    rf_model.fit(count_matrix, y)

    predicted_genre = rf_model.predict(count_matrix[mov_idx])

    same_genre_idx = np.where(y == predicted_genre[0])[0]

    recommended_indices = np.random.choice(same_genre_idx, size=top, replace=False)

    print()
    print(f"Top 10 similar movies to {idx_titre(mov_idx)} are :")
    print(f"Popularity {idx_popularity(mov_idx)}")
    print(f"IMdb link : https://www.imdb.com/title/{idx_titre_id(mov_idx)}")
    poster = f"Poster : https://image.tmdb.org/t/p/w500{idx_poster_path(mov_idx)}\n"
    print(poster+"*"*len(poster)+"\n\n")
    for mov_id in recommended_indices:
        cmt = (
            f"Movie : {idx_titre(mov_id)} | popularity {idx_keywords(mov_id)}" +
            f"IMdb link : https://www.imdb.com/title/{idx_titre_id(mov_id)}\n"
            f"Poster : https://image.tmdb.org/t/p/w500{idx_poster_path(mov_id)}\n")
        line = cmt.split('\n')
        print(cmt+"-"*len(line[2]))

In [187]:
# movies = "platform"
# random_forest_algo(dff, movies, top=5)