In [1]:
import duckdb as db
import pandas as pd
import numpy as np

def sql(query, database = "../data/database.duckdb"):
    con = db.connect(database)
    res = con.execute(query).df()
    con.close()
    return res

In [2]:
#Conversion de notre base de donnée acquise via SQL en DataFrame
data = pd.DataFrame(sql("""
    SELECT
        *
    FROM
       dataframe_view
    """))

ANALYSE EXPLORATOIRE DES DONNEES

In [3]:
data.isna().sum()

tconst                     0
Title                      0
Genres                     0
Year                       0
Runtime                    0
Actors                     0
Director                   0
Writers                 2151
Resume                  1151
Rating                     0
Votes                      0
Budget                     0
Revenue                    0
Profit                     0
ProductionCompanies        0
ProductionCountries    60111
Poster                  2231
dtype: int64

In [4]:
#Creation d'un DF_Copy pour ne pas modifier le DF d'origine
data_copy = data

In [5]:
data_copy.sample(2)

Unnamed: 0,tconst,Title,Genres,Year,Runtime,Actors,Director,Writers,Resume,Rating,Votes,Budget,Revenue,Profit,ProductionCompanies,ProductionCountries,Poster
527,tt0496806,Ocean's Thirteen,"Crime,Thriller",2007,122,"Ray Xifo, Matt Damon, Al Pacino, Eddie Jemison...",Steven Soderbergh,"George Clayton Johnson, David Levien, Brian Ko...",Danny Ocean's team of criminals are back and c...,6.9,377655,85000000,311312624,226312624,"['Village Roadshow Pictures', 'Warner Bros. Pi...","['US', 'US', 'US', 'US']",/pBsZs4zYUiUTemqbikTZ76iQRaU.jpg
63525,tt4279250,La Gunguna,"Action,Comedy,Crime",2015,87,"Isaac Saviñón, Gerardo Mercedes, Janina Irizar...",Ernesto Alemany,Miguel Yarull,The story of a small .22 caliber gun ties toge...,7.0,460,0,0,0,['Productora Ojodepez'],[''],/sGiUFlwijLxL4vEs5xHc6QVyJoy.jpg


In [6]:
data_copy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101563 entries, 0 to 101562
Data columns (total 17 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   tconst               101563 non-null  object 
 1   Title                101563 non-null  object 
 2   Genres               101563 non-null  object 
 3   Year                 101563 non-null  object 
 4   Runtime              101563 non-null  object 
 5   Actors               101563 non-null  object 
 6   Director             101563 non-null  object 
 7   Writers              99412 non-null   object 
 8   Resume               100412 non-null  object 
 9   Rating               101563 non-null  float64
 10  Votes                101563 non-null  int64  
 11  Budget               101563 non-null  int64  
 12  Revenue              101563 non-null  int64  
 13  Profit               101563 non-null  int64  
 14  ProductionCompanies  101563 non-null  object 
 15  ProductionCountri

In [7]:
#60111 valeurs manquantes dans la colonne ProductionCountries
data_copy['ProductionCountries'].isna().sum()
#On remplace les valeurs manquantes par une ['Unknown'] pour garder la structure de la colonne
data_copy['ProductionCountries'] = data_copy['ProductionCountries'].apply(lambda x: ['Unknown'] if pd.isna(x) else x)

In [8]:
#Remplacer les valeurs manquantes des colonnes Writers, Resume, Poster 
data_copy['Writers'] = data_copy['Writers'].apply(lambda x: ['Unknown'] if pd.isna(x) else x)
data_copy['Resume'] = data_copy['Resume'].apply(lambda x: ['Unknown'] if pd.isna(x) else x)
data_copy['Poster'] = data_copy['Poster'].apply(lambda x: ['Unknown'] if pd.isna(x) else x)

In [9]:
data_copy['Resume'].sample(10)

56894    A young turn-of-the-century newspaper man find...
26404    A fiercely independent and unflinchingly candi...
74086    35-year old Magnus Edkvist hates class reunion...
22758    Hot shot teen Sonny Martin stops to help the v...
21303    There is a thin line between money and loyalty...
93293    This documentary from Albert and David Maysles...
84007    A female cop's sister is kidnapped by the Asia...
32536    At an exclusive boys' school, a new gym teache...
26062    When Aleesha's friends throw her a military de...
51566    Buenos Aires at the outskirts of XIX century. ...
Name: Resume, dtype: object

PREMIER ESSAI d'UN KNN SUR YEAR POUR ESSAYER L'ALGO

In [41]:
#import de la bibliothèque scikit-learn
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer  



In [11]:
data_copy[data_copy['Year'].str.contains('\\\\N')]

Unnamed: 0,tconst,Title,Genres,Year,Runtime,Actors,Director,Writers,Resume,Rating,Votes,Budget,Revenue,Profit,ProductionCompanies,ProductionCountries,Poster
19705,tt2075291,Cyborg Nemesis: The Dark Rift,"Action,Sci-Fi",\N,100,"Cazzy Golomb, Sasha Mitchell, Terrie Batson, D...",Albert Pyun,Cynthia Curnan,A U.S. Marine special ops team awakens from hy...,6.4,122,0,0,0,[],[],/hiGNRlOk956lvxIBV6dZ5rJGn0l.jpg


In [12]:
#nettoyage de la colonne Year (il semblerait qu'il y ait qu'une ligne avec une valeur '\N')
data_copy['Year'] = data_copy['Year'].replace('\\N', np.nan)
data_copy = data_copy.dropna(subset=['Year'])
#On convertit la colonne Year en int pour pouvoir l'utiliser dans le modèle
data_copy['Year'] = data_copy['Year'].astype(int)
#On vérifie que la colonne Year ne contient plus de valeurs manquantes
data_copy.info()


<class 'pandas.core.frame.DataFrame'>
Index: 101562 entries, 0 to 101562
Data columns (total 17 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   tconst               101562 non-null  object 
 1   Title                101562 non-null  object 
 2   Genres               101562 non-null  object 
 3   Year                 101562 non-null  int64  
 4   Runtime              101562 non-null  object 
 5   Actors               101562 non-null  object 
 6   Director             101562 non-null  object 
 7   Writers              101562 non-null  object 
 8   Resume               101562 non-null  object 
 9   Rating               101562 non-null  float64
 10  Votes                101562 non-null  int64  
 11  Budget               101562 non-null  int64  
 12  Revenue              101562 non-null  int64  
 13  Profit               101562 non-null  int64  
 14  ProductionCompanies  101562 non-null  object 
 15  ProductionCountries  1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_copy['Year'] = data_copy['Year'].astype(int)


In [20]:
#Tentative de KNN sur Year
X_year = data_copy[['Year']]

knn = NearestNeighbors(n_neighbors=5, metric='euclidean')
knn.fit(X_year)

distances, indices = knn.kneighbors(X_year.iloc[[0]])
print("Distances:", distances)
print("Indices:", indices)

Distances: [[0. 0. 0. 0. 0.]]
Indices: [[ 97 378 120   0 101]]


les résultats indiquent que le knn fonctionnent mais ne sert à rien puisque basé que sur des années 

une fois les indices trouvés : boucle pour afficher le poster

In [29]:
url = 'https://image.tmdb.org/t/p/w500'
for i in indices:
    print(url+data_copy['Poster'].iloc[i])


97     https://image.tmdb.org/t/p/w500/j5jM5pq78ObAXX...
378    https://image.tmdb.org/t/p/w500/2rq96Ihbqb1eU3...
120    https://image.tmdb.org/t/p/w500/zaqam2RNscH5oo...
0      https://image.tmdb.org/t/p/w500/jRXYjXNq0Cs2Tc...
101    https://image.tmdb.org/t/p/w500/pLBb0whOzVDtJv...
Name: Poster, dtype: object


In [30]:
data_copy['Poster'].iloc[97]

'/j5jM5pq78ObAXX1WhTsb115EkLl.jpg'

tentative avec Realisator : pas possible, il faut encoder les données, KNN n'accepte que les données numériques

In [34]:
data_copy.select_dtypes('number').columns

Index(['Year', 'Rating', 'Votes', 'Budget', 'Revenue', 'Profit'], dtype='object')

sur les colonnes numériques (pas très représentatif car DF classé par recettes)

In [42]:
X_num = data_copy.select_dtypes('number')

pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', NearestNeighbors(n_neighbors=5, metric='euclidean'))
])


pipeline.fit(X_num)

distances, indices = knn.kneighbors(X_num.iloc[[0]])
print("Distances:", distances)
print("Indices:", indices)
for i in indices:
    print(data_copy['Title'].iloc[i])

Distances: [[0.00000000e+00 3.03787108e+08 9.07704517e+08 1.04859275e+09
  1.21553124e+09]]
Indices: [[0 1 2 3 4]]
0                                        Avatar
1                             Avengers: Endgame
2                                       Titanic
3                      Avatar: The Way of Water
4    Star Wars: Episode VII - The Force Awakens
Name: Title, dtype: object


In [80]:
X_rates = data_copy[['Rating', 'Votes']]

pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', NearestNeighbors(n_neighbors=10, metric='euclidean'))
])

film = 1500
pipeline.fit(X_rates)

distances, indices = pipeline.named_steps['knn'].kneighbors(X_rates.iloc[[film]])
print("Distances:", distances)
print("Indices:", indices)
print(data_copy['Title'].iloc[film])
for i in indices[0]:
    print(data_copy['Title'].iloc[i])

Distances: [[168633.58168363 168633.92923995 168639.09439831 168642.43878002
  168643.75309634 168644.2684781  168644.43252455 168647.17001465
  168647.64496185 168647.90622371]]
Indices: [[6289   54   94 2490  111  145  581  227  484   63]]
House of Gucci
The Shawshank Redemption
The Dark Knight
Inception
Fight Club
Forrest Gump
Interstellar
Pulp Fiction
The Matrix
The Godfather
The Lord of the Rings: The Fellowship of the Ring


