In [3]:
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

import pandas as pd
import numpy as np
import hjson
import re
import matplotlib.pyplot as plt
from fuzzywuzzy import process
from unicodedata import normalize, combining
from datetime import datetime, timedelta

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import NearestNeighbors
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

from get_dataframes import GetDataframes
from tools import import_config, import_datasets, check_titre, color
from scipy.sparse import hstack

pd.set_option("display.max_columns", None)



In [2]:
config = import_config()

In [3]:
df = import_datasets("clean_datasets/tmdb_updated_append.parquet", "parquet")

2023-11-14 16:13:54 INFO     Parquet loaded ! Importing tmdb_updated_append...


In [4]:
df.describe()

Unnamed: 0,budget,id,popularity,revenue,runtime,vote_average,vote_count
count,3414.0,3414.0,3414.0,3414.0,3414.0,3414.0,3414.0
mean,36928474.4,173682.62,38.43,134269410.41,112.64,7.1,3684.43
std,51014070.04,229146.23,84.52,232858182.73,20.87,0.51,4164.2
min,0.0,11.0,0.6,0.0,6.0,6.3,750.0
25%,3500000.0,6542.75,16.18,9263652.25,98.0,6.67,1178.0
50%,18000000.0,24336.5,23.51,48452704.0,110.0,7.04,2011.5
75%,48000000.0,337644.5,37.77,154531534.0,124.0,7.45,4288.0
max,460000000.0,1151534.0,2219.35,2923706026.0,233.0,8.71,34735.0


In [5]:
# Columns kept from the loaded dataset. The commented-out names are not part
# of the selection.
col = [
    "imdb_id",
    "title",
    # 'adult',
    # 'backdrop_path',
    # 'budget',
    "genres",
    "actors",
    "director",
    "keywords",
    "id",
    # 'original_language',
    # 'original_title',
    "overview",
    "popularity",
    # 'production_countries',
    "release_date",
    # 'revenue',
    # 'runtime',
    # 'spoken_languages',
    # 'status',
    # 'tagline',
    # 'video',
    "vote_average",
    "vote_count",
    "url",
    "image",
    "youtube",
    # 'poster_path',
    # 'production_companies_name',
]

In [6]:
# Keep only the columns listed in `col`. An explicit copy is taken because the
# following cells modify `df` in place (rename, sort, column assignment);
# working on an independent frame avoids pandas' SettingWithCopyWarning.
df = df[col].copy()

In [7]:
# Mapping from the dataset's column names to the names used afterwards.
# Several entries map a name to itself and therefore leave it unchanged.
# NOTE(review): "poster_path" is commented out of the `col` selection, so this
# entry presumably matches no column - confirm and drop it if unneeded.
col_rename = {
    "imdb_id": "titre_id",
    "id": "tmdb_id",
    "title": "titre_str",
    "genres": "titre_genres",
    "actors": "actors",
    "director": "director",
    "keywords": "keywords",
    "overview": "overview",
    "popularity": "popularity",
    "release_date": "date",
    "vote_average": "rating_avg",
    "vote_count": "rating_vote",
    "poster_path": "image",
}

In [8]:
# Apply the `col_rename` mapping to the column labels, modifying `df` in place.
df.rename(columns=col_rename, inplace=True)

In [9]:
# Reorder the rows of `df` by the "date" column, in place.
df.sort_values(by="date", inplace=True)

In [10]:
# Columns whose cells are turned into a single ", "-separated string.
# NOTE(review): assumes every cell is an iterable of items (list/array) -
# confirm there are no missing values in these columns.
tt = [
    "actors",
    "titre_genres",
    "director",
    "keywords",
]
for t in tt:
    # The original chained `.replace(" ", "")` onto the resulting Series.
    # Series.replace matches whole cell values, not substrings, so it never
    # removed any space and has been dropped. Spaces are stripped further down
    # by `full_lower`.
    df[t] = df[t].apply(lambda x: ", ".join(map(str, x)))

In [11]:
# Lower-cased working copy of the display title. The vectorised `.str.lower()`
# replaces the copy-then-lambda pair and leaves a missing title as missing
# instead of raising on a non-string value.
df["titre_clean"] = df["titre_str"].str.lower()

In [12]:
df

Unnamed: 0,titre_id,titre_str,titre_genres,actors,director,keywords,tmdb_id,overview,popularity,date,rating_avg,rating_vote,url,image,youtube,titre_clean
320,tt0053779,La dolce vita,"Comédie, Drame","Marcello Mastroianni, Anita Ekberg, Anouk Aimée",Federico Fellini,"rome, italy, sea, loss of loved one, lovesickn...",439,Le chroniqueur Marcello fait le tour des lieux...,21.65,1960-02-05,8.13,1697,https://www.imdb.com/title/tt0053779,https://image.tmdb.org/t/p/w500/dAIN4mz33ZMReI...,https://www.youtube.com/watch?v=BtrtW0SoYOg,la dolce vita
211,tt0053472,À bout de souffle,"Drame, Crime","Jean-Paul Belmondo, Jean Seberg",Jean-Luc Godard,"paris, france, loss of loved one, journalist, ...",269,Un petit escroc vole une voiture et assassine ...,15.07,1960-03-16,7.57,1644,https://www.imdb.com/title/tt0053472,https://image.tmdb.org/t/p/w500/iqnXXdyVKPuMng...,https://www.youtube.com/watch?v=MmDDJWSNr9E,à bout de souffle
225,tt0053604,La Garçonnière,"Comédie, Drame, Romance","Jack Lemmon, Shirley MacLaine, Fred MacMurray",Billy Wilder,"new york city, new year's eve, lovesickness, a...",284,"C.C. Baxter est employé à la Sauvegarde, grand...",21.58,1960-06-21,8.20,2052,https://www.imdb.com/title/tt0053604,https://image.tmdb.org/t/p/w500/puRnwIQd6VdQqe...,https://www.youtube.com/watch?v=qk9UMLzhq2g,la garçonnière
382,tt0054215,Psychose,"Horreur, Drame, Thriller","Anthony Perkins, Janet Leigh, Vera Miles",Alfred Hitchcock,"hotel, clerk, arizona, shower, motel, stolen m...",539,Marion Crane en a assez de ne pouvoir mener sa...,44.03,1960-06-22,8.44,9388,https://www.imdb.com/title/tt0054215,https://image.tmdb.org/t/p/w500/wVxZDGjd3rCl6y...,https://www.youtube.com/watch?v=BSuMxSyyKlU,psychose
726,tt0054047,Les Sept Mercenaires,"Action, Aventure, Western","Yul Brynner, Eli Wallach, Steve McQueen",John Sturges,"horse, village, friendship, remake, bandit, fa...",966,Un petit village de paysans du nord du Mexique...,31.36,1960-10-12,7.51,1607,https://www.imdb.com/title/tt0054047,https://image.tmdb.org/t/p/w500/4EJqOoRu9CtnbV...,,les sept mercenaires
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2038,tt22687790,Mystère à Venise,"Mystère, Thriller, Crime","Kenneth Branagh, Kyle Allen, Camille Cottin",Kenneth Branagh,"venice, italy, loss of loved one, detective, b...",945729,Après la Seconde Guerre mondiale en une sombre...,255.25,2023-09-13,6.83,1084,https://www.imdb.com/title/tt22687790,https://image.tmdb.org/t/p/w500/wQdOoevcY3cYDy...,https://www.youtube.com/watch?v=wP8wvBIB_Rc,mystère à venise
850,tt21807222,Saw X,"Horreur, Thriller","Tobin Bell, Shawnee Smith, Synnøve Macody Lund",Kevin Greutert,"riddle, mexico city, mexico, sadism, sequel, g...",951491,"Dans l'espoir d'une guérison miraculeuse, John...",867.21,2023-09-26,7.40,1028,https://www.imdb.com/title/tt21807222,https://image.tmdb.org/t/p/w500/u7Lp1Hi8aBS73j...,https://www.youtube.com/watch?v=Q4KyAhPmjU8,saw x
2403,tt15789472,Nowhere,"Thriller, Drame","Anna Castillo, Tamar Novas, Irina Bravo",Albert Pintó,"lost at sea, one location, survival at sea",1151534,"Enceinte, Mia fuit avec son mari un pays total...",450.30,2023-09-29,7.58,832,https://www.imdb.com/title/tt15789472,https://image.tmdb.org/t/p/w500/pPmFLClCQakJKQ...,https://www.youtube.com/watch?v=Mcu-62zd4TU,nowhere
3282,tt5537002,Killers of the Flower Moon,"Crime, Drame, Histoire","Leonardo DiCaprio, Lily Gladstone, Robert De Niro",Martin Scorsese,"based on novel or book, husband wife relations...",466420,L'histoire vraie des meurtres de plusieurs mem...,231.85,2023-10-18,7.73,901,https://www.imdb.com/title/tt5537002,https://image.tmdb.org/t/p/w500/fLVZcHRud9e8yI...,https://www.youtube.com/watch?v=LQOlGbj0ZPs,killers of the flower moon


In [13]:
def clean_overview(text: str) -> str:
    """Reduce an overview to space-separated, lower-case, filtered tokens.

    Steps, in order: lower-case the text, replace every character outside
    ``a-z`` with a space, split on whitespace, drop the tokens found in NLTK's
    French stopword list, lemmatize each remaining token with
    ``WordNetLemmatizer`` and join the result with single spaces.

    Args:
        text: The overview to clean.

    Returns:
        The cleaned overview; an empty string when no token survives.

    NOTE(review): the ``[^a-z]`` pattern also replaces accented letters, so
    accents are expected to be stripped beforehand. The stopword list is used
    as provided, so any accented entries will not match de-accented tokens.
    WordNet is an English lexicon, so lemmatizing French words may alter them
    unexpectedly - confirm this is intended.
    """
    # Build the stopword set once per call: the original rebuilt the list and
    # scanned it linearly for every single token.
    french_stopwords = set(stopwords.words("french"))
    lemmatizer = WordNetLemmatizer()

    tokens = re.sub(r"[^a-z]", " ", text.lower()).split()
    return " ".join(
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in french_stopwords
    )

In [14]:
def supprimer_accents(texte):
    """Return ``texte`` without its combining marks (accents, cedillas, ...).

    The text is decomposed with Unicode NFKD normalisation, after which every
    character whose combining class is non-zero is discarded. Characters that
    have no decomposition are kept unchanged.
    """
    decompose = normalize("NFKD", texte)
    sans_marques = filter(lambda caractere: combining(caractere) == 0, decompose)
    return "".join(sans_marques)


# Text columns from which accents are stripped. Each value is cast to str
# first so that `supprimer_accents` always receives a string.
tt = ["actors", "titre_genres", "keywords", "director", "titre_clean", "overview"]
for t in tt:
    df[t] = df[t].astype(str).map(supprimer_accents)

In [15]:
# Spot check: select the rows whose "actors" text contains "Funes" and print
# them as a Markdown table.
t = df[df["actors"].str.contains("Funes")]
print(t.to_markdown())

|      | titre_id   | titre_str                    | titre_genres             | actors                                         | director     | keywords                                                                                                                                   |   tmdb_id | overview                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     |   popularity | date             

In [16]:
# Rewrite every overview through `clean_overview`; values are cast to str first.
print("cleaning overview")
df["overview"] = df["overview"].astype(str).apply(clean_overview)

cleaning overview


In [17]:
def full_lower(text: str):
    """Lower-case ``text`` after deleting spaces, hyphens, apostrophes and colons.

    Every other character (commas included) is kept.
    """
    deletions = str.maketrans("", "", " -':")
    return text.translate(deletions).lower()


# Columns collapsed into compact lower-case text by `full_lower`.
tt = ["actors", "titre_genres", "director", "keywords", "titre_clean"]
for t in tt:
    print(f"lowering everything in {t}")
    df[t] = df[t].map(full_lower)

lowering everything in actors
lowering everything in titre_genres
lowering everything in director
lowering everything in keywords
lowering everything in titre_clean


In [18]:
name = "machine_learning.parquet"

In [19]:
# Snapshot of `df` with a fresh 0..n-1 index, written to the parquet file `name`.
# `reset_index` returns a new frame, so `df` itself is left untouched.
# `drop=True` discards the old index; the original passed the string "index",
# which only worked because a non-empty string is truthy.
dfff = df.reset_index(drop=True)
dfff.to_parquet(name)

In [20]:
# Read the parquet file `name` into `dff`, the frame the lookup helpers and
# recommendation functions below work on.
name = "machine_learning.parquet"

dff = pd.read_parquet(name)

In [21]:
dff

Unnamed: 0,titre_id,titre_str,titre_genres,actors,director,keywords,tmdb_id,overview,popularity,date,rating_avg,rating_vote,url,image,youtube,titre_clean
0,tt0053779,La dolce vita,"comedie,drame","marcellomastroianni,anitaekberg,anoukaimee",federicofellini,"rome,italy,sea,lossoflovedone,lovesickness,sun...",439,chroniqueur marcello fait tour lieux a scandal...,21.65,1960-02-05,8.13,1697,https://www.imdb.com/title/tt0053779,https://image.tmdb.org/t/p/w500/dAIN4mz33ZMReI...,https://www.youtube.com/watch?v=BtrtW0SoYOg,ladolcevita
1,tt0053472,À bout de souffle,"drame,crime","jeanpaulbelmondo,jeanseberg",jeanlucgodard,"paris,france,lossoflovedone,journalist,carthie...",269,petit escroc vole voiture assassine impulsivem...,15.07,1960-03-16,7.57,1644,https://www.imdb.com/title/tt0053472,https://image.tmdb.org/t/p/w500/iqnXXdyVKPuMng...,https://www.youtube.com/watch?v=MmDDJWSNr9E,aboutdesouffle
2,tt0053604,La Garçonnière,"comedie,drame,romance","jacklemmon,shirleymaclaine,fredmacmurray",billywilder,"newyorkcity,newyearseve,lovesickness,agediffer...",284,baxter employe a sauvegarde grande compagnie a...,21.58,1960-06-21,8.20,2052,https://www.imdb.com/title/tt0053604,https://image.tmdb.org/t/p/w500/puRnwIQd6VdQqe...,https://www.youtube.com/watch?v=qk9UMLzhq2g,lagarconniere
3,tt0054215,Psychose,"horreur,drame,thriller","anthonyperkins,janetleigh,veramiles",alfredhitchcock,"hotel,clerk,arizona,shower,motel,stolenmoney,t...",539,marion crane a assez pouvoir mener vie comme e...,44.03,1960-06-22,8.44,9388,https://www.imdb.com/title/tt0054215,https://image.tmdb.org/t/p/w500/wVxZDGjd3rCl6y...,https://www.youtube.com/watch?v=BSuMxSyyKlU,psychose
4,tt0054047,Les Sept Mercenaires,"action,aventure,western","yulbrynner,eliwallach,stevemcqueen",johnsturges,"horse,village,friendship,remake,bandit,farmer,...",966,petit village paysans nord mexique subit raid ...,31.36,1960-10-12,7.51,1607,https://www.imdb.com/title/tt0054047,https://image.tmdb.org/t/p/w500/4EJqOoRu9CtnbV...,,lesseptmercenaires
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3409,tt22687790,Mystère à Venise,"mystere,thriller,crime","kennethbranagh,kyleallen,camillecottin",kennethbranagh,"venice,italy,lossoflovedone,detective,basedonn...",945729,apres seconde guerre mondiale sombre veille to...,255.25,2023-09-13,6.83,1084,https://www.imdb.com/title/tt22687790,https://image.tmdb.org/t/p/w500/wQdOoevcY3cYDy...,https://www.youtube.com/watch?v=wP8wvBIB_Rc,mystereavenise
3410,tt21807222,Saw X,"horreur,thriller","tobinbell,shawneesmith,synnøvemacodylund",kevingreutert,"riddle,mexicocity,mexico,sadism,sequel,gore,co...",951491,espoir guerison miraculeuse john kramer rend m...,867.21,2023-09-26,7.40,1028,https://www.imdb.com/title/tt21807222,https://image.tmdb.org/t/p/w500/u7Lp1Hi8aBS73j...,https://www.youtube.com/watch?v=Q4KyAhPmjU8,sawx
3411,tt15789472,Nowhere,"thriller,drame","annacastillo,tamarnovas,irinabravo",albertpinto,"lostatsea,onelocation,survivalatsea",1151534,enceinte mia fuit mari pay totalitaire cachant...,450.30,2023-09-29,7.58,832,https://www.imdb.com/title/tt15789472,https://image.tmdb.org/t/p/w500/pPmFLClCQakJKQ...,https://www.youtube.com/watch?v=Mcu-62zd4TU,nowhere
3412,tt5537002,Killers of the Flower Moon,"crime,drame,histoire","leonardodicaprio,lilygladstone,robertdeniro",martinscorsese,"basedonnovelorbook,husbandwiferelationship,war...",466420,histoire vraie meurtres plusieurs membres trib...,231.85,2023-10-18,7.73,901,https://www.imdb.com/title/tt5537002,https://image.tmdb.org/t/p/w500/fLVZcHRud9e8yI...,https://www.youtube.com/watch?v=LQOlGbj0ZPs,killersoftheflowermoon


In [22]:
def _first_value(idx: int, column: str):
    """Return the ``column`` value of the first ``dff`` row labelled ``idx``.

    Raises:
        IndexError: if no row of ``dff`` carries the label ``idx``.
    """
    return dff[dff.index == idx][column].values[0]


def titre_index(titre: str):
    """Return the index label of the first ``dff`` row whose ``titre_str`` equals ``titre``.

    Raises:
        IndexError: if no row matches.
    """
    return dff[dff.titre_str == titre].index[0]


def director_index(director: str):
    """Return the index label of the first ``dff`` row whose ``director`` contains ``director``.

    The argument is handed to ``str.contains`` as is, so it is interpreted as
    a regular expression.

    Raises:
        IndexError: if no row matches.
    """
    # Fixed: the column is named "director"; the original read the
    # non-existent `dff.directors` and always failed with AttributeError.
    return dff[dff.director.str.contains(director)].index[0]


def actor_index(actor: str):
    """Return the index label of the first ``dff`` row whose ``actors`` contains ``actor``.

    The argument is handed to ``str.contains`` as is, so it is interpreted as
    a regular expression.

    Raises:
        IndexError: if no row matches.
    """
    return dff[dff.actors.str.contains(actor)].index[0]


def idx_titre(idx: int):
    """Return the ``titre_str`` value of the ``dff`` row labelled ``idx``."""
    return _first_value(idx, "titre_str")


def idx_actor(idx: int):
    """Return the ``actors`` value of the ``dff`` row labelled ``idx``."""
    return _first_value(idx, "actors")


def idx_titre_id(idx: int):
    """Return the ``titre_id`` value of the ``dff`` row labelled ``idx``."""
    return _first_value(idx, "titre_id")


def idx_popularity(idx: int):
    """Return the ``popularity`` value of the ``dff`` row labelled ``idx``."""
    return _first_value(idx, "popularity")


def idx_keywords(idx: int):
    """Return the ``keywords`` value of the ``dff`` row labelled ``idx``."""
    return _first_value(idx, "keywords")


def idx_image(idx: int):
    """Return the ``image`` value of the ``dff`` row labelled ``idx``."""
    return _first_value(idx, "image")


def idx_youtube(idx: int):
    """Return the ``youtube`` value of the ``dff`` row labelled ``idx``."""
    return _first_value(idx, "youtube")


def idx_url(idx: int):
    """Return the ``url`` value of the ``dff`` row labelled ``idx``."""
    return _first_value(idx, "url")


def check_titre_str(d: pd.DataFrame, movie: str):
    """Return the rows of ``d`` whose ``titre_str`` contains ``movie``.

    ``movie`` is handed to ``str.contains`` as is, so it is interpreted as a
    regular expression.
    """
    # Fixed: the original ignored `d` and always searched the global `dff`.
    return d[d["titre_str"].str.contains(movie)]

In [23]:
check_titre(dff, "ring")

Unnamed: 0,titre_id,titre_str,titre_genres,actors,director,keywords,tmdb_id,overview,popularity,date,rating_avg,rating_vote,url,image,youtube,titre_clean
783,tt0178868,Ring,"horreur,thriller","nanakomatsushima,hiroyukisanada,rikiyaotaka",hideonakata,"urbanlegend,videotape",2671,soir seules a maison deux lyceennes font peur ...,19.06,1998-01-30,7.05,1279,https://www.imdb.com/title/tt0178868,https://image.tmdb.org/t/p/w500/yyGZFjVLSBvTqU...,https://www.youtube.com/watch?v=Sd26Z-pF1Ok,ring


In [24]:
# Year extracted from the "date" column through the `.dt` accessor.
dff["date_only"] = dff["date"].dt.year

In [25]:
def combine(r):
    """Concatenate the text fields of one row into a single string.

    The ``keywords``, ``actors``, ``director`` and ``titre_genres`` values of
    ``r`` are joined, in that order, with single spaces.
    """
    fields = ("keywords", "actors", "director", "titre_genres")
    return " ".join(r[field] for field in fields)


# One combined text per row, produced by calling `combine` on every row.
dff["one_for_all"] = dff.apply(combine, axis=1)

In [26]:
def get_best_match_index_tfidf(movies: str, df: pd.DataFrame):
    """
    Resolve a title query to a row index with FuzzyWuzzy and TfidfVectorizer.

    FuzzyWuzzy shortlists up to ten ``titre_clean`` values for ``movies``. A
    TF-IDF model is fitted on that shortlist and the candidate with the highest
    cosine similarity to the query is selected. Diagnostics are printed along
    the way. The index label of the first row of ``df`` whose ``titre_clean``
    equals the selected candidate is returned.
    """
    matches = process.extract(movies, df["titre_clean"].values, limit=10)
    candidates = [match[0] for match in matches]
    print("best_matches", matches)
    print("best_candidate", candidates)
    print("movies", movies)

    preview_columns = ["titre_id", "titre_str", "titre_genres", "keywords"]
    shortlist = df[df["titre_clean"].isin(candidates)]
    print(shortlist[preview_columns].to_markdown())
    print()

    vectorizer = TfidfVectorizer()
    candidate_vectors = vectorizer.fit_transform(candidates)
    query_vector = vectorizer.transform([movies])

    scores = cosine_similarity(query_vector, candidate_vectors).ravel()
    winner_position = scores.argmax()
    winner = candidates[winner_position]
    print("best_match_idx", winner_position)
    print("best_match_titre", winner)

    return df[df["titre_clean"] == winner].index[0]

In [27]:
def get_best_match_index_knn(movies: str, df: pd.DataFrame):
    """
    Resolve a title query to a row index with FuzzyWuzzy.

    The single best fuzzy match for ``movies`` among the ``titre_clean`` values
    is looked up, and the index label of the first row of ``df`` carrying that
    title is returned.

    Despite the name (kept for existing callers), no nearest-neighbour model
    is involved. The original returned right after the fuzzy lookup, which
    left its TF-IDF / NearestNeighbors section unreachable; that dead code and
    the unused intermediate frame have been removed.

    Raises:
        ValueError: if the fuzzy lookup yields no candidate at all.
    """
    best_match = process.extractOne(movies, df["titre_clean"].values)
    if best_match is None:
        # Fail with a clear message instead of "NoneType is not subscriptable".
        raise ValueError(f"No title candidate found for {movies!r}.")
    return df[df["titre_clean"] == best_match[0]].index[0]

In [28]:
def get_best_match_index_rf(movies: str, df: pd.DataFrame):
    """
    Placeholder for a FuzzyWuzzy / RandomForestClassifier title matcher.

    Not implemented yet: the author's note states that the approach is not
    understood well enough and needs more research. The function always raises.

    The unreachable draft that followed the ``raise`` has been removed. It
    could not have worked as written, because it called ``inverse_transform``
    on the encoded label array instead of on the ``LabelEncoder`` instance.

    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError(
        "The random-forest title matcher is not implemented yet."
    )

In [29]:
def pick_algo(movies: str, df: pd.DataFrame, algo: str = "tfidf"):
    """Normalise a title query and resolve it with the selected matcher.

    The query is stripped of spaces, hyphens, apostrophes and colons and
    lower-cased, then passed with ``df`` to the matcher chosen by ``algo``.

    Args:
        movies: The title typed by the user.
        df: Frame forwarded to the matcher.
        algo: ``"tfidf"``, ``"knn"`` or ``"rf"``.

    Returns:
        Whatever the selected matcher returns.

    Raises:
        ValueError: if ``algo`` is not one of the supported names. The
            original silently returned ``None`` in that case.
    """
    movies = (
        movies.replace(" ", "")
        .replace("-", "")
        .replace("'", "")
        .replace(":", "")
        .lower()
    )
    if algo == "tfidf":
        return get_best_match_index_tfidf(movies, df)
    elif algo == "knn":
        return get_best_match_index_knn(movies, df)
    elif algo == "rf":
        return get_best_match_index_rf(movies, df)
    raise ValueError(f"Unknown algo {algo!r}; expected 'tfidf', 'knn' or 'rf'.")

In [30]:
def tfidf_algo(
    df: pd.DataFrame, movies: str, top: int = 10, algo: str = "tfidf"
):
    """Print the ``top`` titles most similar to ``movies`` using weighted TF-IDF.

    One TF-IDF matrix is built per text column, scaled by that column's weight
    and stacked horizontally. Cosine similarity between all rows is then
    computed. The query is resolved to a row with ``pick_algo`` and the
    highest-scoring other rows are printed.

    Args:
        df: Frame holding the text columns listed in the weight table.
        movies: Title typed by the user.
        top: Number of recommendations to print.
        algo: Title matcher passed on to ``pick_algo``.

    NOTE(review): the matched index label is used as a row position in the
    similarity matrix, and the printed details are read through the ``idx_*``
    helpers, which look rows up in the notebook-level ``dff``. ``df`` is
    therefore expected to be that frame, with a default 0..n-1 index.
    """
    poids_ = {
        "titre_genres": 0.2,
        "actors": 0.15,
        "director": 0.15,
        "overview": 0.2,
        "keywords": 0.3,
    }

    # One weighted TF-IDF matrix per column, placed side by side.
    full_matrix = [
        TfidfVectorizer().fit_transform(df[column]) * weight
        for column, weight in poids_.items()
    ]
    combined_matrix = hstack(full_matrix)
    cosine = cosine_similarity(combined_matrix)
    mov_idx = pick_algo(movies, df, algo)

    # Highest scores first. Position 0 is skipped as it is expected to be the
    # queried movie itself.
    sim_mov_idx = cosine[mov_idx].argsort()[::-1][1 : top + 1]

    # imdb_url = "https://www.imdb.com/title/"
    # tmdb_image = "https://image.tmdb.org/t/p/w500"
    poster = f"Poster : {idx_image(mov_idx)}\n"

    print(color("~" * len(poster), "red"))
    # Fixed: the header used to read "Top 10" whatever `top` was.
    print(f"Top {top} similar movies to {idx_titre(mov_idx)} are :")
    print(f"Popularity {idx_popularity(mov_idx)}")
    print(f"IMdb link : {idx_url(mov_idx)}")
    print(f"Poster : {idx_image(mov_idx)}")
    print(f"Youtube : {idx_youtube(mov_idx)}")
    print(color("~" * len(poster), "red"))
    print()
    for idx in sim_mov_idx:
        cmt = (
            f"Movie : {idx_titre(idx)} | popularity {idx_popularity(idx)}\n"
            + f"IMdb link : {idx_url(idx)}\n"
            + f"Poster : {idx_image(idx)}\n"
            + f"Youtube : {idx_youtube(idx)}\n"
        )
        # The separator is as wide as the longest line of the entry.
        line = cmt.split("\n")
        print(cmt + color("-" * len(max(line, key=len)), "green"))

In [31]:
def cv_algo(df: pd.DataFrame, movies: str, top: int = 5, algo: str = "knn"):
    """Print the ``top`` titles most similar to ``movies`` using token counts.

    A ``CountVectorizer`` is fitted on the ``one_for_all`` column and cosine
    similarity is computed between all rows. The query is resolved to a row
    with ``pick_algo`` and the highest-scoring other rows are printed.

    Args:
        df: Frame holding the ``one_for_all`` column.
        movies: Title typed by the user.
        top: Number of recommendations to print.
        algo: Title matcher passed on to ``pick_algo``.

    NOTE(review): the matched index label is used as a row position in the
    similarity matrix, and the printed details are read through the ``idx_*``
    helpers, which look rows up in the notebook-level ``dff``. ``df`` is
    therefore expected to be that frame, with a default 0..n-1 index.
    """
    cv = CountVectorizer()
    count_matrix = cv.fit_transform(df["one_for_all"])
    cosine_sim = cosine_similarity(count_matrix, count_matrix)
    mov_idx = pick_algo(movies, df, algo)

    # (position, score) pairs, best score first. Entry 0 is skipped as it is
    # expected to be the queried movie itself.
    sim_scores = sorted(
        enumerate(cosine_sim[mov_idx]), key=lambda x: x[1], reverse=True
    )
    movie_indices = [i[0] for i in sim_scores[1 : top + 1]]

    # imdb_url = "https://www.imdb.com/title/"
    # tmdb_image = "https://image.tmdb.org/t/p/w500"
    poster = f"Poster : {idx_image(mov_idx)}\n"

    print(color("~" * len(poster), "red"))
    # Fixed: the header used to read "Top 10" whatever `top` was.
    print(f"Top {top} similar movies to {idx_titre(mov_idx)} are :")
    print(f"Popularity {idx_popularity(mov_idx)}")
    print(f"IMdb link : {idx_url(mov_idx)}")
    print(f"Poster : {idx_image(mov_idx)}")
    print(f"Youtube : {idx_youtube(mov_idx)}")
    print(color("~" * len(poster), "red"))
    print()
    for idx in movie_indices:
        cmt = (
            f"Movie : {idx_titre(idx)} | popularity {idx_popularity(idx)}\n"
            + f"IMdb link : {idx_url(idx)}\n"
            + f"Poster : {idx_image(idx)}\n"
            + f"Youtube : {idx_youtube(idx)}\n"
        )
        # The separator is as wide as the longest line of the entry.
        line = cmt.split("\n")
        print(cmt + color("-" * len(max(line, key=len)), "green"))

In [32]:
def knn_algo(df: pd.DataFrame, movies: str, top: int = 5, algo: str = "knn"):
    """Print the ``top`` nearest neighbours of ``movies`` in token-count space.

    A ``CountVectorizer`` is fitted on the ``one_for_all`` column and a
    brute-force ``NearestNeighbors`` model with the cosine metric is fitted on
    the resulting matrix. The query is resolved to a row with ``pick_algo`` and
    its ``top + 1`` neighbours are requested; the first one is skipped as it
    is expected to be the queried movie itself.

    Args:
        df: Frame holding the ``one_for_all`` column.
        movies: Title typed by the user.
        top: Number of recommendations to print.
        algo: Title matcher passed on to ``pick_algo``.

    NOTE(review): the matched index label is used as a row position in the
    count matrix, and the printed details are read through the ``idx_*``
    helpers, which look rows up in the notebook-level ``dff``. ``df`` is
    therefore expected to be that frame, with a default 0..n-1 index.
    """
    cv = CountVectorizer()
    count_matrix = cv.fit_transform(df["one_for_all"])
    mov_idx = pick_algo(movies, df, algo)

    knn_model = NearestNeighbors(metric="cosine", algorithm="brute").fit(
        count_matrix
    )
    # Only the neighbour positions are used; the distances are not printed.
    _, indices = knn_model.kneighbors(
        count_matrix[mov_idx], n_neighbors=top + 1
    )

    poster = f"Poster : {idx_image(mov_idx)}\n"
    print(color("~" * len(poster), "red"))
    # Fixed: the header used to read "Top 10" whatever `top` was.
    print(f"Top {top} similar movies to {idx_titre(mov_idx)} are :")
    print(f"Popularity {idx_popularity(mov_idx)}")
    print(f"IMdb link : {idx_url(mov_idx)}")
    print(f"Poster : {idx_image(mov_idx)}")
    print(f"Youtube : {idx_youtube(mov_idx)}")
    print(color("~" * len(poster), "red"))
    print()
    for idx in indices.flatten()[1:]:
        cmt = (
            f"Movie : {idx_titre(idx)} | popularity {idx_popularity(idx)}\n"
            + f"IMdb link : {idx_url(idx)}\n"
            + f"Poster : {idx_image(idx)}\n"
            + f"Youtube : {idx_youtube(idx)}\n"
        )
        # The separator is as wide as the longest line of the entry.
        line = cmt.split("\n")
        print(cmt + color("-" * len(max(line, key=len)), "green"))

AUTOPLAY : 
https://www.youtube.com/embed/MJ3Up7By5cw?autoplay=1&autohide=2&border=0&wmode=opaque&enablejsapi=1&modestbranding=1&controls=0&showinfo=1&mute=1

In [33]:
movies = "fight club"

In [34]:
tfidf_algo(dff, movies, algo="knn", top=5)

[38;5;1m~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~[0m
Top 10 similar movies to Fight Club are :
Popularity 78.346
IMdb link : https://www.imdb.com/title/tt0137523
Poster : https://image.tmdb.org/t/p/w500/t1i10ptOivG4hV7erkX3tmKpiqm.jpg
Youtube : https://www.youtube.com/watch?v=tZpXdiB_pg0
[38;5;1m~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~[0m

Movie : The Social Network | popularity 42.296
IMdb link : https://www.imdb.com/title/tt1285016
Poster : https://image.tmdb.org/t/p/w500/cvUfwhoAReL4e5eegFCHM73rIda.jpg
Youtube : https://www.youtube.com/watch?v=SbQ19-ePch8
[38;5;2m------------------------------------------------------------------------[0m
Movie : Cake | popularity 18.407
IMdb link : https://www.imdb.com/title/tt3442006
Poster : https://image.tmdb.org/t/p/w500/dAo1FQVCav0kxUoMpAltIvQwaEj.jpg
Youtube : https://www.youtube.com/watch?v=Dv0QEJq6cLw
[38;5;2m--------------------------------------------------------------

In [35]:
knn_algo(dff, movies, algo="knn", top=5)

[38;5;1m~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~[0m
Top 10 similar movies to Fight Club are :
Popularity 78.346
IMdb link : https://www.imdb.com/title/tt0137523
Poster : https://image.tmdb.org/t/p/w500/t1i10ptOivG4hV7erkX3tmKpiqm.jpg
Youtube : https://www.youtube.com/watch?v=tZpXdiB_pg0
[38;5;1m~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~[0m

Movie : Seven | popularity 62.046
IMdb link : https://www.imdb.com/title/tt0114369
Poster : https://image.tmdb.org/t/p/w500/h4X4tJFxJobAImCMekVZXUpJVJC.jpg
Youtube : https://www.youtube.com/watch?v=ObOexjW0dgY
[38;5;2m------------------------------------------------------------------------[0m
Movie : Brooklyn Affairs | popularity 19.256
IMdb link : https://www.imdb.com/title/tt0385887
Poster : https://image.tmdb.org/t/p/w500/lI0pdnSsyQvWcJa1k1mFMFZNbMY.jpg
Youtube : https://www.youtube.com/watch?v=yeAItVQa6ok
[38;5;2m---------------------------------------------------------------

In [36]:
cv_algo(dff, movies, algo="knn", top=5)

[38;5;1m~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~[0m
Top 10 similar movies to Fight Club are :
Popularity 78.346
IMdb link : https://www.imdb.com/title/tt0137523
Poster : https://image.tmdb.org/t/p/w500/t1i10ptOivG4hV7erkX3tmKpiqm.jpg
Youtube : https://www.youtube.com/watch?v=tZpXdiB_pg0
[38;5;1m~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~[0m

Movie : Seven | popularity 62.046
IMdb link : https://www.imdb.com/title/tt0114369
Poster : https://image.tmdb.org/t/p/w500/h4X4tJFxJobAImCMekVZXUpJVJC.jpg
Youtube : https://www.youtube.com/watch?v=ObOexjW0dgY
[38;5;2m------------------------------------------------------------------------[0m
Movie : Brooklyn Affairs | popularity 19.256
IMdb link : https://www.imdb.com/title/tt0385887
Poster : https://image.tmdb.org/t/p/w500/lI0pdnSsyQvWcJa1k1mFMFZNbMY.jpg
Youtube : https://www.youtube.com/watch?v=yeAItVQa6ok
[38;5;2m---------------------------------------------------------------

In [37]:
# random_forest_algo(dff, movies, top=5)