In [1]:

%load_ext autoreload
%autoreload 2
%reload_ext autoreload

import pandas as pd
import numpy as np
import hjson
import re
import matplotlib.pyplot as plt
from fuzzywuzzy import process
from unicodedata import normalize, combining
from datetime import datetime

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import NearestNeighbors
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

from get_dataframes import GetDataframes
from tools import import_config, import_datasets, check_titre, color
from scipy.sparse import hstack

pd.set_option('display.max_columns', None)

In [2]:
config = import_config()


In [3]:
df = import_datasets("clean_datasets/tmdb_updated_append.parquet", "parquet")

2023-11-12 20:23:19 INFO     Parquet loaded ! Importing tmdb_updated_append...


In [4]:
df.describe()

Unnamed: 0,budget,id,popularity,release_date,revenue,runtime,vote_average,vote_count
count,3411.0,3411.0,3411.0,3411,3411.0,3411.0,3411.0,3411.0
mean,36939698.51,173607.83,39.86,2006-05-29 22:08:32.928759808,134325866.12,112.65,7.1,3682.87
min,0.0,11.0,0.6,1960-02-05 00:00:00,0.0,6.0,6.3,750.0
25%,3500000.0,6537.5,17.95,1999-03-01 12:00:00,9323216.5,98.0,6.68,1178.0
50%,18000000.0,24238.0,25.12,2009-08-26 00:00:00,48453605.0,110.0,7.04,2009.0
75%,48000000.0,337615.0,39.52,2016-12-13 00:00:00,154363068.0,124.0,7.45,4281.5
max,460000000.0,1151534.0,2820.3,2023-10-25 00:00:00,2923706026.0,233.0,8.71,34719.0
std,51026037.38,229185.35,90.32,,232934354.89,20.87,0.51,4163.29


In [9]:
col = [
    'imdb_id',
    'title',
    # 'adult',
    # 'backdrop_path',
    # 'budget',
    'genres',
    'actors',
    'director',
    'keywords',
    # 'id',
    # 'original_language',
    # 'original_title',
    'overview',
    'popularity',
    # 'production_countries',
    'release_date',
    # 'revenue',
    # 'runtime',
    # 'spoken_languages',
    # 'status',
    # 'tagline',
    # 'video',
    'vote_average',
    'vote_count',
    'url',
    "image",
    "youtube",
    # 'poster_path',
    # 'production_companies_name',
]

In [11]:
df = df[col]

In [12]:
col_rename = {
    "imdb_id": "titre_id",
    "title": "titre_str",
    'genres': "titre_genres",
    'actors': 'actors',
    'director': "director",
    'keywords': "keywords",
    'overview': "overview",
    'popularity': "popularity",
    'release_date': "date",
    'vote_average': "rating_avg",
    'vote_count': "rating_vote",
    'poster_path': "image"
}

In [13]:
df.rename(columns=col_rename, inplace=True)

In [14]:
df.sort_values(by="date", inplace=True)

In [15]:
tt = [
    "actors",
    "titre_genres",
    "director",
    "keywords",
]
for t in tt:
    df[t] = df[t].apply(
        lambda x: ", ".join(map(str, x))
    ).replace(" ", "")

In [16]:
df["titre_clean"] = df["titre_str"]
df["titre_clean"] = df["titre_clean"].apply(lambda x : x.lower())

In [17]:
def clean_overview(
    text: str
) -> str:
    text = text.lower()
    text = re.sub(r'[^a-z]', ' ', text)
    words = text.split()
    words = [w for w in words if w not in stopwords.words('french')]
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)


In [18]:
def supprimer_accents(texte):
    texte_clean = normalize('NFKD', texte)
    return "".join(
        [c for c in texte_clean if not combining(c)]
    )

tt = [
    "actors",
    "titre_genres",
    "keywords",
    "director",
    "titre_clean",
    "overview"
]
for t in tt:
    df[t] = df[t].astype(str).apply(supprimer_accents)

In [19]:
t = df[df["actors"].str.contains("Funes")]
print(t.to_markdown())

|      | titre_id   | titre_str                    | titre_genres             | actors                                         | director     | keywords                                               | overview                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     |   popularity | date                |   rating_avg |   rating_vote | url                                  | image                

In [20]:
print("cleaning overview")
df['overview'] = df['overview'].astype(str).apply(clean_overview)

cleaning overview


In [21]:
t = df[df["actors"].str.contains("funes")]
print(t.to_markdown())

| titre_id   | titre_str   | titre_genres   | actors   | director   | keywords   | overview   | popularity   | date   | rating_avg   | rating_vote   | url   | image   | youtube   | titre_clean   |
|------------|-------------|----------------|----------|------------|------------|------------|--------------|--------|--------------|---------------|-------|---------|-----------|---------------|


In [22]:
def full_lower(text: str):
    # return text.replace(" ", "").replace("-", "").lower()
    return text.replace(" ", "").replace("-", "").replace("'", "").replace(":", "").lower()


tt = [
    "actors",
    "titre_genres",
    "director",
    "keywords",
    "titre_clean",
]
for t in tt:
    print(f"lowering everything in {t}")
    # if "titre_clean" in t:
    #     df[t] = df[t].apply(lambda x : x.replace(":", " "))
    #     tt.remove(t)
    # else:
    df[t] = df[t].apply(full_lower)

lowering everything in actors
lowering everything in titre_genres
lowering everything in director
lowering everything in keywords
lowering everything in titre_clean


In [3]:
name = "machine_learning.parquet"


In [81]:
dfff = df.copy()
dfff.reset_index(drop='index', inplace=True)
dfff.to_parquet(name)

In [5]:
dff = pd.read_parquet(name)

In [6]:
def titre_index(titre: str):
    return dff[dff.titre_str == titre].index[0]

def director_index(director: str):
    return dff[dff.directors.str.contains(director)].index[0]

def actor_index(actor: str):
    return dff[dff.actors.str.contains(actor)].index[0]

def idx_titre(idx: int):
    return dff[dff.index == idx]["titre_str"].values[0]

def idx_actor(idx: int):
    return dff[dff.index == idx]["actors"].values[0]

def idx_titre_id(idx: int):
    return dff[dff.index == idx]["titre_id"].values[0]

def idx_popularity(idx: int):
    return dff[dff.index == idx]["popularity"].values[0]

def idx_keywords(idx: int):
    return dff[dff.index == idx]["keywords"].values[0]

def idx_image(idx: int):
    return dff[dff.index == idx]["image"].values[0]

def idx_youtube(idx: int):
    return dff[dff.index == idx]["youtube"].values[0]

def idx_url(idx: int):
    return dff[dff.index == idx]["url"].values[0]



def check_titre_str(d: pd.DataFrame, movie: str):
    return dff[dff["titre_str"].str.contains(movie)]

In [7]:
check_titre(dff, "ring")

Unnamed: 0,titre_id,titre_str,titre_genres,actors,director,keywords,overview,popularity,date,rating_avg,rating_vote,url,image,youtube,titre_clean
783,tt0178868,Ring,"horreur,thriller","nanakomatsushima,hiroyukisanada,rikiyaotaka",hideonakata,"urbanlegend,videotape",soir seules a maison deux lyceennes font peur ...,20.82,1998-01-30,7.05,1277,https://www.imdb.com/title/tt0178868,https://image.tmdb.org/t/p/w500/yyGZFjVLSBvTqU...,https://www.youtube.com/watch?v=Sd26Z-pF1Ok,ring


In [8]:
def combine(r):
    return (
        r["keywords"]+ " "+
        r["actors"]+" "+
        r["director"]+" "+
        r["overview"]+" "+
        r["titre_genres"]
        # str(r["date"])+" "+
        # str(r["popularity"])+" "+
        # str(r["rating_avg"])+ " "+
        # str(r["rating_vote"])
    )

dff["one_for_all"] = dff.apply(combine, axis=1)

In [9]:
def get_best_match_index_tfidf(movies: str, df: pd.DataFrame):
    """
    Utilisation de FuzzyWuzzy et TfidfVectorizer
    """
    best_match = process.extract(movies, df['titre_clean'].values, limit=10)
    best_candidate = [match[0] for match in best_match]
    print("best_matches",best_match)
    print("best_candidate",best_candidate)
    print("movies",movies)

    small_df = df[df['titre_clean'].isin(best_candidate)]
    print(small_df[["titre_id", "titre_str", "titre_genres", "keywords"]].to_markdown())
    print()

    tfidf = TfidfVectorizer()
    matrix = tfidf.fit_transform(best_candidate)
    tfidf_ = tfidf.transform([movies])

    cosine_similarities = cosine_similarity(tfidf_, matrix).flatten()

    best_match_idx = cosine_similarities.argmax()
    best_match_titre = best_candidate[best_match_idx]
    print("best_match_idx",best_match_idx)
    print("best_match_titre",best_match_titre)
    return df[df['titre_clean'] == best_match_titre].index[0]


In [10]:
def get_best_match_index_knn(movies: str, df: pd.DataFrame):
    """
    Utilisation de FuzzyWuzzy et Nearest Neighbors
    """
    # best_match = process.extract(movies, df['titre_clean'].values, limit=10)
    best_match = process.extractOne(movies, df['titre_clean'].values)
    # best_candidate = [match[0] for match in best_match]
    # print("best_matches :\n",best_match[0])
    # print("best_candidate :\n",best_candidate)
    small_df = df[df["titre_clean"] == best_match[0]]
    # print(small_df[["titre_id", "titre_str", "titre_genres", "keywords"]].to_markdown())
    # small_df = df[df['titre_clean'].isin(best_candidate)]
    return df[df['titre_clean'] == best_match[0]].index[0]
    print(small_df[["titre_id", "titre_str", "titre_genres", "keywords"]].to_markdown())
    print()

    tfidf = TfidfVectorizer()
    matrix = tfidf.fit_transform(small_df['titre_clean'].values)

    knn = NearestNeighbors(n_neighbors=1).fit(matrix)
    vector = tfidf.transform([movies])

    print("query_vector\n",vector)

    dist, idx = knn.kneighbors(vector, return_distance=True)

    best_match_idx = idx[0][0]
    best_match_titre = small_df['titre_clean'].iloc[best_match_idx]
    print("best_match_idx :",best_match_idx)
    print("best_match_titre :",best_match_titre)
    print()
    return small_df[small_df['titre_clean'] == best_match_titre].index[0]

In [11]:
def get_best_match_index_rf(movies: str, df: pd.DataFrame):
    # Je capte pas le fonctionnement, j'ai besoin de plus de recherche
    raise NotImplementedError
    """
    Utilisation de FuzzyWuzzy et RandomForestClassifier
    """

    tfidf = TfidfVectorizer()
    X = tfidf.fit_transform(df['titre_clean'].values)
    y = df['titre_clean'].values

    y_encoded = LabelEncoder().fit_transform(y)
    rf = RandomForestClassifier().fit(X, y_encoded)

    vector = tfidf.transform([movies])
    prediction = rf.predict(vector)

    predict = y_encoded.inverse_transform(prediction)[0]
    return df[df['titre_clean'] == predict].index[0]

In [12]:
def pick_algo(movies: str, df: pd.DataFrame, algo: str = "tfidf"):
    movies = movies.replace(" ", "").replace("-", "").replace("'", "").replace(":", "").lower()
    if algo == "tfidf":
        return get_best_match_index_tfidf(movies, df)
    elif algo == "knn":
        return get_best_match_index_knn(movies, df)
    elif algo == "rf":
        return get_best_match_index_rf(movies, df)

In [13]:
def tfidf_algo(df:pd.DataFrame, movies: str, top: int = 10, algo: str = "tfidf"):
    poids_ = {
        "titre_genres": 0.2,
        "actors":       0.15,
        "director":     0.15,
        # "overview":     0.2,
        "keywords":     0.3,
    }

    full_matrix = []
    for col, poids in poids_.items():
        tfidf_ = TfidfVectorizer()
        matrix_ = tfidf_.fit_transform(df[col]) * poids
        full_matrix.append(matrix_)

    combined_matrix = hstack(full_matrix)
    cosine = cosine_similarity(combined_matrix)
    mov_idx = pick_algo(movies, df, algo)

    similar = cosine[mov_idx]
    similar1 = list(enumerate(cosine[mov_idx]))

    sim_scores = sorted(similar1, key=lambda x: x[1], reverse=True)
    sim_mov_idx = similar.argsort()[::-1][1:top+1]

    same_movies = df.loc[sim_mov_idx, "titre_str"]

    sim_scores[1:top+1]
    score = [i[1] for i in sim_scores]

    # imdb_url = "https://www.imdb.com/title/"
    # tmdb_image = "https://image.tmdb.org/t/p/w500"
    poster = f"Poster : {idx_image(mov_idx)}\n"

    print(color("~"*len(poster), "red"))
    print(f"Top 10 similar movies to {idx_titre(mov_idx)} are :")
    print(f"Popularity {idx_popularity(mov_idx)}")
    print(f"IMdb link : {idx_url(mov_idx)}")
    print(f"Poster : {idx_image(mov_idx)}")
    print(f"Youtube : {idx_youtube(mov_idx)}")
    print(color("~"*len(poster), "red"))
    print()
    for movies_, idx in zip(same_movies, sim_mov_idx):
        cmt = (
            f"Movie : {idx_titre(idx)} | popularity {idx_popularity(idx)}\n" +
            f"IMdb link : {idx_url(idx)}\n" +
            f"Poster : {idx_image(idx)}\n" +
            f"Youtube : {idx_youtube(idx)}\n")
        line = cmt.split('\n')
        print(cmt+color("-"*len(max(line, key=len)), "green"))


In [14]:
def cv_algo(df: pd.DataFrame, movies: str, top: int = 5, algo: str = "knn"):
    cv = CountVectorizer()
    count_matrix = cv.fit_transform(df['one_for_all'])
    cosine_sim = cosine_similarity(count_matrix, count_matrix)
    mov_idx = pick_algo(movies, df, algo)

    sim_scores = list(enumerate(cosine_sim[mov_idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:top+1]
    movie_indices = [i[0] for i in sim_scores]

    # imdb_url = "https://www.imdb.com/title/"
    # tmdb_image = "https://image.tmdb.org/t/p/w500"
    poster = f"Poster : {idx_image(mov_idx)}\n"

    print(color("~"*len(poster), "red"))
    print(f"Top 10 similar movies to {idx_titre(mov_idx)} are :")
    print(f"Popularity {idx_popularity(mov_idx)}")
    print(f"IMdb link : {idx_url(mov_idx)}")
    print(f"Poster : {idx_image(mov_idx)}")
    print(f"Youtube : {idx_youtube(mov_idx)}")
    print(color("~"*len(poster), "red"))
    print()
    for idx in movie_indices:
        cmt = (
            f"Movie : {idx_titre(idx)} | popularity {idx_popularity(idx)}\n" +
            f"IMdb link : {idx_url(idx)}\n" +
            f"Poster : {idx_image(idx)}\n" +
            f"Youtube : {idx_youtube(idx)}\n")
        line = cmt.split('\n')
        print(cmt+color("-"*len(max(line, key=len)), "green"))

In [15]:
def knn_algo(df: pd.DataFrame, movies: str, top: int = 5, algo: str = "knn"):
    cv = CountVectorizer()
    count_matrix = cv.fit_transform(df['one_for_all'])
    mov_idx = pick_algo(movies, df, algo)

    knn_model = NearestNeighbors(
        metric='cosine', algorithm='brute').fit(count_matrix
    )
    dist, indices = knn_model.kneighbors(
        count_matrix[mov_idx], n_neighbors=top+1
    )

    # imdb_url = "https://www.imdb.com/title/"
    # tmdb_image = "https://image.tmdb.org/t/p/w500"
    poster = f"Poster : {idx_image(mov_idx)}\n"

    print(color("~"*len(poster), "red"))
    print(f"Top 10 similar movies to {idx_titre(mov_idx)} are :")
    print(f"Popularity {idx_popularity(mov_idx)}")
    print(f"IMdb link : {idx_url(mov_idx)}")
    print(f"Poster : {idx_image(mov_idx)}")
    print(f"Youtube : {idx_youtube(mov_idx)}")
    print(color("~"*len(poster), "red"))
    print()
    for idx, dis in zip(indices.flatten()[1:], dist.flatten()[1:]):
        cmt = (
            f"Movie : {idx_titre(idx)} | popularity {idx_popularity(idx)}\n" +
            f"IMdb link : {idx_url(idx)}\n" +
            f"Poster : {idx_image(idx)}\n" +
            f"Youtube : {idx_youtube(idx)}\n")
        line = cmt.split('\n')
        print(cmt+color("-"*len(max(line, key=len)), "green"))

AUTOPLAY : 
https://www.youtube.com/embed/MJ3Up7By5cw?autoplay=1&autohide=2&border=0&wmode=opaque&enablejsapi=1&modestbranding=1&controls=0&showinfo=1&mute=1

In [16]:
movies = "avatar"

In [17]:
tfidf_algo(dff, movies, algo="knn", top=5)

[38;5;1m~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~[0m
Top 10 similar movies to Avatar are :
Popularity 127.285
IMdb link : https://www.imdb.com/title/tt0499549
Poster : https://image.tmdb.org/t/p/w500/3npygfmEhqnmNTmDWhHLz1LPcbA.jpg
Youtube : https://www.youtube.com/watch?v=MJ3Up7By5cw
[38;5;1m~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~[0m

Movie : Avatar : La Voie de l'eau | popularity 282.732
IMdb link : https://www.imdb.com/title/tt1630029
Poster : https://image.tmdb.org/t/p/w500/hYeB9GpFaT7ysabBoGG5rbo9mF4.jpg
Youtube : https://www.youtube.com/watch?v=Lhnu_kY765M
[38;5;2m------------------------------------------------------------------------[0m
Movie : Un Monde entre Nous | popularity 17.706
IMdb link : https://www.imdb.com/title/tt3922818
Poster : https://image.tmdb.org/t/p/w500/oppCLptTWEzpqFoqlGRbDlcf3HT.jpg
Youtube : 
[38;5;2m------------------------------------------------------------------------[0m
Movie : 

In [18]:
knn_algo(dff, movies, algo="knn", top=5)

[38;5;1m~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~[0m
Top 10 similar movies to Avatar are :
Popularity 127.285
IMdb link : https://www.imdb.com/title/tt0499549
Poster : https://image.tmdb.org/t/p/w500/3npygfmEhqnmNTmDWhHLz1LPcbA.jpg
Youtube : https://www.youtube.com/watch?v=MJ3Up7By5cw
[38;5;1m~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~[0m

Movie : Avatar : La Voie de l'eau | popularity 282.732
IMdb link : https://www.imdb.com/title/tt1630029
Poster : https://image.tmdb.org/t/p/w500/hYeB9GpFaT7ysabBoGG5rbo9mF4.jpg
Youtube : https://www.youtube.com/watch?v=Lhnu_kY765M
[38;5;2m------------------------------------------------------------------------[0m
Movie : Les Gardiens de la Galaxie : Volume 3 | popularity 284.529
IMdb link : https://www.imdb.com/title/tt6791350
Poster : https://image.tmdb.org/t/p/w500/aaGDsYYjltMxrwgs4qnHse4qlGX.jpg
Youtube : https://www.youtube.com/watch?v=WxA-eZ72FsQ
[38;5;2m-----------------------

In [19]:
cv_algo(dff, movies, algo="knn", top=5)

[38;5;1m~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~[0m
Top 10 similar movies to Avatar are :
Popularity 127.285
IMdb link : https://www.imdb.com/title/tt0499549
Poster : https://image.tmdb.org/t/p/w500/3npygfmEhqnmNTmDWhHLz1LPcbA.jpg
Youtube : https://www.youtube.com/watch?v=MJ3Up7By5cw
[38;5;1m~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~[0m

Movie : Avatar : La Voie de l'eau | popularity 282.732
IMdb link : https://www.imdb.com/title/tt1630029
Poster : https://image.tmdb.org/t/p/w500/hYeB9GpFaT7ysabBoGG5rbo9mF4.jpg
Youtube : https://www.youtube.com/watch?v=Lhnu_kY765M
[38;5;2m------------------------------------------------------------------------[0m
Movie : Les Gardiens de la Galaxie : Volume 3 | popularity 284.529
IMdb link : https://www.imdb.com/title/tt6791350
Poster : https://image.tmdb.org/t/p/w500/aaGDsYYjltMxrwgs4qnHse4qlGX.jpg
Youtube : https://www.youtube.com/watch?v=WxA-eZ72FsQ
[38;5;2m-----------------------

In [20]:
# random_forest_algo(dff, movies, top=5)