In [43]:
import duckdb as db
import pandas as pd
import numpy as np

import joblib

def sql(query, database = "../data/database.duckdb"):
    """Run a SQL query against a DuckDB database and return the result as a DataFrame.

    Parameters
    ----------
    query : str
        SQL statement to execute.
    database : str
        Path to the DuckDB database file.

    Returns
    -------
    The object returned by ``.df()`` on the executed query (a pandas DataFrame).

    Any exception raised while executing the query propagates to the caller;
    the connection is closed in every case.
    """
    con = db.connect(database)
    try:
        return con.execute(query).df()
    finally:
        # Always release the connection, even when the query fails, so the
        # database file is not left open for the rest of the kernel session.
        con.close()



In [2]:
# Load every column of `dataframe_view` from the database into a DataFrame
view_query = """
    SELECT
        *
    FROM
       dataframe_view
    """
data = pd.DataFrame(sql(view_query))

ANALYSE EXPLORATOIRE DES DONNEES

In [3]:
# Work on an independent copy so that cleaning never alters the frame loaded
# from the database. A plain assignment would only bind a second name to the
# same object, so in-place operations would modify both.
data_copy = data.copy()

CLEAN DATA AVANT ML

In [None]:
def clean_row(row):
    """Remove square brackets and single quotes from a string.

    Turns a stringified list such as "['US', 'FR']" into "US, FR".
    """
    return row.replace("[", "").replace("]", "").replace("'", "")


# Profit, Revenue and Budget have too many missing values to be usable.
# drop() returns a new frame instead of mutating the existing one, and
# errors='ignore' lets this cell be re-run after the columns are already gone.
data_copy = data_copy.drop(columns=['Profit', 'Revenue', 'Budget'], errors='ignore')

# The literal string '\N' is used as a placeholder in these text columns;
# replace it with an explicit 'Unknown' label.
placeholder_cols = ['ProductionCountries', 'Writers', 'Resume', 'Poster']
data_copy[placeholder_cols] = data_copy[placeholder_cols].replace('\\N', 'Unknown')

# Year: turn the '\N' placeholder into NaN, drop those rows, then convert to
# int so the column can be used by the model. The explicit .copy() makes the
# filtered frame independent, so the assignments below are not chained writes.
data_copy['Year'] = data_copy['Year'].mask(data_copy['Year'] == '\\N')
data_copy = data_copy.dropna(subset=['Year']).copy()
data_copy['Year'] = data_copy['Year'].astype(int)

# Runtime: impute the '\N' placeholder with the median of the known runtimes.
# The median is computed once rather than once per row.
runtime_known = data_copy['Runtime'] != '\\N'
runtime_median = data_copy.loc[runtime_known, 'Runtime'].astype(int).median()
data_copy['Runtime'] = data_copy['Runtime'].where(runtime_known, runtime_median).astype(int)

# Remaining missing text values get the same 'Unknown' label. Assigning the
# result back (instead of fillna(inplace=True) on a selected column) is what
# guarantees the frame is actually updated.
fill_cols = ['Resume', 'ProductionCountries', 'Writers']
data_copy[fill_cols] = data_copy[fill_cols].fillna('Unknown')

# Strip the list punctuation, then remove duplicated countries while keeping
# their first-seen order (dict.fromkeys preserves insertion order).
data_copy['ProductionCountries'] = data_copy['ProductionCountries'].apply(clean_row)
data_copy['ProductionCompanies'] = data_copy['ProductionCompanies'].apply(clean_row)
data_copy['ProductionCountries'] = data_copy['ProductionCountries'].apply(
    lambda x: ', '.join(dict.fromkeys(str(x).split(', ')))
)

PREMIER ESSAI D'UN KNN SUR YEAR POUR ESSAYER L'ALGO

In [None]:
#import de la bibliothèque scikit-learn
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer  
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer


une fois les indices trouvés : boucle pour afficher le poster

In [49]:
pd.options.display.max_rows = 999
url = 'https://image.tmdb.org/t/p/w185'
# NOTE(review): `indices` is not defined above this cell in the notebook; a
# KNN cell must have been run first for this loop to work.
for i in indices[0]:
    poster_path = data_copy['Poster'].iloc[i]
    # Skip rows without a usable poster path: a NaN would make the string
    # concatenation raise, and the 'Unknown' placeholder would give a dead link.
    if not isinstance(poster_path, str) or poster_path == 'Unknown':
        continue
    print(url + poster_path)


https://image.tmdb.org/t/p/w185/cXOLaxcNjNAYmEx1trZxOTKhK3Q.jpg
https://image.tmdb.org/t/p/w185/uluhlXubGu1VxU63X9VHCLWDAYP.jpg
https://image.tmdb.org/t/p/w185/momkKuWburNTqKBF6ez7rvhYVhE.jpg
https://image.tmdb.org/t/p/w185/jRXYjXNq0Cs2TcJjLkki24MLp7u.jpg
https://image.tmdb.org/t/p/w185/z7uo9zmQdQwU5ZJHFpv2Upl30i1.jpg


In [48]:
# Spot-check the poster URLs at two fixed row positions, then show the
# neighbour positions returned by the last KNN query.
for row_pos in (97, 202):
    print(url + data_copy['Poster'].iloc[row_pos])
print(indices[0])

https://image.tmdb.org/t/p/w185/j5jM5pq78ObAXX1WhTsb115EkLl.jpg
https://image.tmdb.org/t/p/w185/uluhlXubGu1VxU63X9VHCLWDAYP.jpg
[ 57 202 203   0  84]


Tentative avec Realisator : pas possible, il faut d'abord encoder les données, car KNN n'accepte que des données numériques

sur les colonnes numériques (pas très représentatif car DF classé par recettes)

In [None]:
# Nearest-neighbour search on three numeric columns.
X_rates_year = data_copy[['Rating', 'Votes', 'Year']]

pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', NearestNeighbors(n_neighbors=10, metric='euclidean'))
])

film = 0
pipeline.fit(X_rates_year)

# The KNN step was fitted on the scaler's output, so the query row has to go
# through the same fitted scaler before being passed to kneighbors();
# querying with raw values would compare points in two different spaces.
query_scaled = pipeline.named_steps['scaler'].transform(X_rates_year.iloc[[film]])
distances, indices = pipeline.named_steps['knn'].kneighbors(query_scaled)
print("Distances:", distances)
print("Indices:", indices)
print(data_copy['Title'].iloc[film])
for i in indices[0]:
    print(data_copy['Title'].iloc[i])

UTILISATION DE TfidfVectorizer POUR LES COLONNES NON NUMERIQUES

In [None]:
# Nearest-neighbour search on two text columns, each vectorised with its own TF-IDF.
X_text = data_copy[['Resume', 'Title']]

preprocessor = ColumnTransformer(
    transformers= [
        ('Resume', TfidfVectorizer(stop_words='english'), 'Resume'),
        ('Title', TfidfVectorizer(stop_words='english'), 'Title')
    ],
    )

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('knn', NearestNeighbors(n_neighbors=5, metric='cosine'))

])

pipeline.fit(X_text)
# The KNN step only understands the vectorised representation it was fitted
# on, so the query row is first passed through the fitted preprocessor rather
# than handed over as raw text.
query_vector = pipeline.named_steps['preprocessor'].transform(X_text.iloc[[0]])
distances, indices = pipeline.named_steps['knn'].kneighbors(query_vector)
print("Distances:", distances)
print("Indices:", indices)

In [None]:
# Text feature names: the object-typed columns, minus the ones excluded from the model.
excluded_text_cols = ['tconst', 'Poster', 'ProductionCountries']
non_num_cols = (
    data_copy
    .select_dtypes(exclude='number')
    .drop(columns=excluded_text_cols)
    .select_dtypes(include='object')
    .columns.tolist()
)

# Numeric feature names. Selecting on 'number' rather than on the exact
# int64/float64 dtypes keeps numeric columns stored with another width
# (for example int32) from being silently left out.
num_cols = data_copy.select_dtypes(include='number').columns.tolist()

In [None]:
# Build one (name, TfidfVectorizer, column) entry per text column
text_transformers = [
    (col, TfidfVectorizer(stop_words='english'), col)
    for col in non_num_cols
]

# Add the numeric part, scaled as a single group of columns
all_transformers = text_transformers + [
    ('num', StandardScaler(), num_cols)
]

preprocessor = ColumnTransformer(transformers=all_transformers)

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('knn', NearestNeighbors(n_neighbors=10, metric='cosine'))
])

# Feature frame fed to the pipeline: exactly the columns the transformers use.
# NOTE(review): `X` was not defined anywhere in the visible notebook, so it is
# built here explicitly; confirm this matches the originally intended features.
X = data_copy[non_num_cols + num_cols]

film_index = 3
pipeline.fit(X)
# Keep the vectorised matrix so individual rows can be used as KNN queries.
vectorized_data = pipeline.named_steps['preprocessor'].transform(X)
distances, indices = pipeline.named_steps['knn'].kneighbors(vectorized_data.getrow(film_index))
print("Distances:", distances)
print("Indices:", indices)
for i in indices[0]:
    print(data_copy['Title'].iloc[i])
    print(data_copy['Resume'].iloc[i])
    print(data_copy['Director'].iloc[i])
    print()

In [None]:
# Query the fitted KNN step with the vectorised row of another film and
# print the title, summary and director of each neighbour.
film_index = 1
knn_step = pipeline.named_steps['knn']
query_row = vectorized_data.getrow(film_index)
distances, indices = knn_step.kneighbors(query_row)
print("Distances:", distances)
print("Indices:", indices)
for neighbor_pos in indices[0]:
    for column in ('Title', 'Resume', 'Director'):
        print(data_copy[column].iloc[neighbor_pos])
    print()

In [44]:
# Serialize the current `pipeline` object to '../KNN/knn_model.pkl' with joblib.
# NOTE(review): this cell's execution count (44) directly follows the import
# cell (43), so it was run out of notebook order; confirm that the object
# saved is the intended fitted pipeline and that the ../KNN directory exists.
joblib.dump(pipeline, '../KNN/knn_model.pkl')

['../KNN/knn_model.pkl']