In [1]:
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

import pandas as pd
import numpy as np
import hjson
import re
import matplotlib.pyplot as plt
from fuzzywuzzy import process
from unicodedata import normalize, combining
from datetime import datetime, timedelta

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import NearestNeighbors
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

from get_dataframes import GetDataframes
from tools import import_config, import_datasets, check_titre, color
from scipy.sparse import hstack

# Display every column when a DataFrame is rendered (None disables column truncation).
pd.set_option("display.max_columns", None)

In [2]:
# Load the project configuration through the local `tools.import_config` helper.
# NOTE(review): `config` is not referenced by any cell visible in this notebook
# section — confirm it is still needed.
config = import_config()

In [3]:
# Read the machine-learning dataset and work on an independent copy of the
# frame returned by the project loader.
df = import_datasets(
    "clean_datasets/machine_learning.parquet",
    "parquet",
).copy()

2023-11-15 15:36:59 INFO     Parquet loaded ! Importing machine_learning...


In [4]:
# Columns kept for the rest of the notebook, in the order they are selected.
# Not selected: adult, backdrop_path, budget, original_language,
# original_title, production_countries, revenue, runtime, spoken_languages,
# status, tagline, video, poster_path, production_companies_name.
col = [
    "imdb_id", "title",
    "genres", "actors", "director", "keywords",
    "id",
    "overview", "popularity", "release_date",
    "vote_average", "vote_count",
    "url", "image", "youtube",
]

In [5]:
# Keep only the columns listed in `col`, in that order
# (pandas raises KeyError if any of them is missing from the frame).
df = df[col]

In [6]:
# Preview only: display the frame ordered by title. The result is not
# assigned, so `df` itself keeps its current row order.
df.sort_values(by="title")

Unnamed: 0,imdb_id,title,genres,actors,director,keywords,id,overview,popularity,release_date,vote_average,vote_count,url,image,youtube
236,tt10620868,#Alive,"[Horreur, Action, Aventure, Thriller]","[Yoo Ah-in, Park Shin-hye, Lee Hyun-wook]",[Cho Il],"[alone, survival, escape, drone, zombie, apart...",614696,La propagation d'un mystérieux virus plonge un...,25.70,2020-06-24,7.29,1595,https://www.imdb.com/title/tt10620868,https://image.tmdb.org/t/p/w500/cZ9DYfSY68PVw7...,https://www.youtube.com/watch?v=RZC1Zk6HlGk
3339,tt2883512,#Chef,[Comédie],"[Jon Favreau, Sofía Vergara, Emjay Anthony]",[Jon Favreau],"[parent child relationship, restaurant owner, ...",212778,"Carl Casper, Chef cuisinier, préfère démission...",22.99,2014-05-08,7.12,3034,https://www.imdb.com/title/tt2883512,https://image.tmdb.org/t/p/w500/9w7DrlzxxOWJRr...,https://www.youtube.com/watch?v=yrdZD4Efkxc
2545,tt6119504,#realityhigh,[Comédie],"[Nesta Cooper, Keith Powers, Alicia Sanz]",[Fernando Lebrija],"[nerd, high school, teenage crush, social media]",455656,Quand une élève brillante attire enfin l'atten...,14.87,2017-07-17,6.31,997,https://www.imdb.com/title/tt6119504,https://image.tmdb.org/t/p/w500/iZliPeiiDta9Kb...,https://www.youtube.com/watch?v=Yn03DOeuj4o
2919,tt2614684,'71,"[Thriller, Action, Drame, Guerre]","[Jack O'Connell, Sean Harris, Paul Anderson]",[Yann Demange],"[1970s, riot, northern ireland, survival, sold...",252178,"Belfast, 1971. Tandis que le conflit dégénère ...",12.64,2014-10-10,6.79,1035,https://www.imdb.com/title/tt2614684,https://image.tmdb.org/t/p/w500/9KWEosDTqbFMAW...,https://www.youtube.com/watch?v=0Jlmf1-f9Y0
2001,tt1022603,(500) jours ensemble,"[Comédie, Drame, Romance]","[Joseph Gordon-Levitt, Zooey Deschanel, Geoffr...",[Marc Webb],"[date, jealousy, fight, architect, gallery, in...",19913,"Tom croit encore en un amour qui transfigure, ...",37.10,2009-07-17,7.28,9433,https://www.imdb.com/title/tt1022603,https://image.tmdb.org/t/p/w500/Au6ac2MEQuka3D...,https://www.youtube.com/watch?v=DLbCGDcPr5Q
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1350,tt0093565,Éclair de lune,"[Comédie, Drame, Romance]","[Cher, Nicolas Cage, Vincent Gardenia]",[Norman Jewison],"[new york city, italian american, lover, full ...",2039,Alors que son mariage avec un vieil ami approc...,12.34,1987-12-16,6.85,829,https://www.imdb.com/title/tt0093565,https://image.tmdb.org/t/p/w500/xmK7CWzkRpMEP1...,https://www.youtube.com/watch?v=JfPdrRHXfGo
1140,tt0463998,Écrire pour exister,"[Crime, Drame]","[Hilary Swank, Scott Glenn, Imelda Staunton]",[Richard LaGravenese],"[black people, based on novel or book, holocau...",1646,"Erin Gruwell, enseignante novice de 23 ans, a ...",34.23,2007-01-05,7.90,2033,https://www.imdb.com/title/tt0463998,https://image.tmdb.org/t/p/w500/t6tqTwQYzCiImC...,https://www.youtube.com/watch?v=OYg_IorBaT4
1182,tt15789038,Élémentaire,"[Animation, Comédie, Familial, Fantastique, Ro...","[Leah Lewis, Mamoudou Athie]",[Peter Sohn],"[fire, computer animation, duringcreditsstinge...",976573,"Dans la ville d’Element City, le feu, l’eau, l...",474.38,2023-06-14,7.70,2960,https://www.imdb.com/title/tt15789038,https://image.tmdb.org/t/p/w500/rzY5kUJJ1zGfin...,https://www.youtube.com/watch?v=Z-TOoh6x-4g
2990,tt6021478,Épouse-moi mon pote,[Comédie],"[Tarek Boudali, Philippe Lacheau, Charlotte Ga...",[Tarek Boudali],"[gay marriage, pretending to be gay, gay theme]",432613,"Yassine, jeune étudiant marocain vient à Paris...",8.57,2017-10-25,6.33,912,https://www.imdb.com/title/tt6021478,https://image.tmdb.org/t/p/w500/gRMs1FweDTh5Jn...,https://www.youtube.com/watch?v=0illlpw6TZ0


In [7]:
# Mapping from source column names to the names used in the rest of the
# notebook. Entries that map a name to itself (actors, director, keywords,
# overview, popularity) leave the column unchanged.
col_rename = {
    "imdb_id": "titre_id",
    "id": "tmdb_id",
    "title": "titre_str",
    "genres": "titre_genres",
    "actors": "actors",
    "director": "director",
    "keywords": "keywords",
    "overview": "overview",
    "popularity": "popularity",
    "release_date": "date",
    "vote_average": "rating_avg",
    "vote_count": "rating_vote",
    # NOTE(review): "poster_path" is commented out of the `col` selection and
    # an "image" column is selected there instead, so this entry appears to
    # match nothing (DataFrame.rename ignores unknown keys by default) — confirm.
    "poster_path": "image",
}

In [8]:
# Apply the `col_rename` mapping to the column labels.
df = df.rename(columns=col_rename)

In [9]:
# Order the rows by the "date" column (ascending, the pandas default).
df = df.sort_values(by="date")

In [10]:
# Renumber the rows 0..n-1 and discard the old index.
# `drop` expects a boolean; the former drop="index" only worked because a
# non-empty string is truthy.
df = df.reset_index(drop=True)

In [11]:
# Flatten the list-valued columns into single comma-separated strings,
# e.g. a list of three names becomes "name1, name2, name3".
#
# The previous version chained `.replace(" ", "")` on the resulting Series.
# `Series.replace` matches whole cell values, not substrings, so it never
# removed the spaces inside the joined strings. The misleading call is
# dropped; the joined strings keep their spaces exactly as before, which the
# parquet export and the later `str.contains` lookup rely on. Spaces are
# stripped later by `full_lower` for the machine-learning export.
list_columns = ["actors", "titre_genres", "director", "keywords"]
for column in list_columns:
    df[column] = df[column].apply(lambda values: ", ".join(map(str, values)))

In [12]:
# Lower-cased copy of the title, kept alongside the original "titre_str".
df["titre_clean"] = df["titre_str"].map(str.lower)

# Reduce the release date to its year.
df["date"] = pd.to_datetime(df["date"]).dt.year

In [13]:
# Persist the frame as it stands at this point (lists flattened, lower-cased
# title and release year added) to clean_datasets/site_web.parquet.
# `drop` expects a boolean; the former drop="index" only worked because a
# non-empty string is truthy.
name = "clean_datasets/site_web.parquet"
df = df.reset_index(drop=True)
df.to_parquet(name)

In [15]:
# One-element cache so the NLTK stopword corpus is read a single time.
_FRENCH_STOPWORDS_CACHE = []


def _french_stopwords() -> frozenset:
    """Return the French stopwords (with accent-free variants), loaded once."""
    if not _FRENCH_STOPWORDS_CACHE:
        raw = stopwords.words("french")
        # clean_overview keeps only the letters a-z, so an accented stopword
        # such as "été" can never equal a token. Add the accent-free spelling
        # ("ete") so those words are filtered from accent-stripped text too.
        folded = (
            "".join(c for c in normalize("NFKD", word) if not combining(c))
            for word in raw
        )
        _FRENCH_STOPWORDS_CACHE.append(frozenset(raw) | frozenset(folded))
    return _FRENCH_STOPWORDS_CACHE[0]


def clean_overview(text: str) -> str:
    """Turn an overview into a space-separated string of filtered tokens.

    The text is lower-cased, every character outside ``a-z`` is replaced by a
    space, the result is split on whitespace, French stopwords are removed and
    each remaining token goes through NLTK's ``WordNetLemmatizer``.

    Args:
        text: Overview text. Any character that is not an unaccented ASCII
            letter (digits, punctuation, accented letters) acts as a separator.

    Returns:
        The surviving tokens joined by single spaces ("" if none survive).
    """
    text = re.sub(r"[^a-z]", " ", text.lower())
    # Set membership plus a one-time corpus read; the previous version
    # re-read the stopword list for every single word.
    stop_words = _french_stopwords()
    lemmatizer = WordNetLemmatizer()
    return " ".join(
        lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words
    )

In [16]:
def supprimer_accents(texte):
    """Return `texte` with diacritics removed.

    The string is decomposed with Unicode NFKD normalisation, then every
    combining character (accents, cedillas, ...) is dropped.
    """
    decomposed = normalize("NFKD", texte)
    base_chars = filter(lambda ch: combining(ch) == 0, decomposed)
    return "".join(base_chars)


# Strip diacritics from every text column used downstream.
# Values are cast to str first, so missing values become their string form.
accent_columns = (
    "actors",
    "titre_genres",
    "keywords",
    "director",
    "titre_clean",
    "overview",
)
for column in accent_columns:
    as_text = df[column].astype(str)
    df[column] = as_text.map(supprimer_accents)

In [17]:
# Sanity check: list the rows whose actors contain "Funes", i.e. the
# accent-free spelling, and print them as a markdown table.
has_funes = df["actors"].str.contains("Funes")
funes_rows = df.loc[has_funes]
print(funes_rows.to_markdown())

|     | titre_id   | titre_str                    | titre_genres             | actors                                         | director     | keywords                                                                                                                                   |   tmdb_id | overview                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     |   popularity |   date |   rating_

In [18]:
# Tokenise and filter each overview with clean_overview (values cast to str first).
print("cleaning overview")
df["overview"] = df["overview"].astype(str).map(clean_overview)

cleaning overview


In [19]:
def full_lower(text: str):
    """Lower-case `text` after deleting spaces, hyphens, apostrophes and colons.

    Other characters, including commas, are kept as they are.
    """
    removed = str.maketrans("", "", " -':")
    return text.translate(removed).lower()

# Collapse the categorical text columns and the cleaned title with
# full_lower, announcing each column before it is processed.
tt = ["actors", "titre_genres", "director", "keywords", "titre_clean"]
for t in tt:
    print(f"lowering everything in {t}")
    df[t] = df[t].map(full_lower)

lowering everything in actors
lowering everything in titre_genres
lowering everything in director
lowering everything in keywords
lowering everything in titre_clean


In [20]:
# Persist the fully normalised frame (accent-free, cleaned overview,
# collapsed text columns) to clean_datasets/machine_learning_final.parquet.
# `drop` expects a boolean; the former drop="index" only worked because a
# non-empty string is truthy.
name = "clean_datasets/machine_learning_final.parquet"
df = df.reset_index(drop=True)
df.to_parquet(name)