In [7]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

print("Librairies pour KNN et SVM importées!")

Librairies pour KNN et SVM importées!


In [3]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("rakkesharv/imdb-5000-movies-multiple-genres-dataset")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/imdb-5000-movies-multiple-genres-dataset


In [8]:
# Chargement des données
df = pd.read_csv('/kaggle/input/imdb-5000-movies-multiple-genres-dataset/IMDb_All_Genres_etf_clean1.csv')

# Nettoyage
cols_to_keep = ['Movie_Title', 'Year', 'Director', 'Actors', 'Rating', 'Runtime(Mins)', 'main_genre', 'side_genre']
df = df[cols_to_keep]
df = df.dropna(subset=['Movie_Title', 'main_genre'])
df = df.reset_index(drop=True)
df['movie_id'] = df.index

print(f"Dataset: {df.shape}")

Dataset: (5562, 9)


In [10]:
# Creation des features pour KNN/SVM
def create_movie_signature(row):
    """Cree une signature unique pour chaque film"""
    signature = []
    
    # Genres (poids fort)
    if pd.notna(row['main_genre']):
        main_genre = str(row['main_genre']).lower().replace(' ', '_')
        signature.append(f"main_genre_{main_genre}")
    
    if pd.notna(row['side_genre']):
        side_genres = str(row['side_genre']).split(',')
        for genre in side_genres[:2]:
            genre_clean = genre.strip().lower().replace(' ', '_')
            if genre_clean:
                signature.append(f"side_genre_{genre_clean}")
    
    # Director (poids moyen)
    if pd.notna(row['Director']):
        director = str(row['Director']).lower().replace(' ', '_').replace('.', '')
        signature.append(f"director_{director}")
    
    # Top 2 actors (poids moyen)
    if pd.notna(row['Actors']):
        actors = str(row['Actors']).split(',')[:2]
        for actor in actors:
            actor_clean = actor.strip().lower().replace(' ', '_').replace('.', '')
            if actor_clean:
                signature.append(f"actor_{actor_clean}")
    
    # Era (poids faible)
    if pd.notna(row['Year']):
        year = int(row['Year'])
        if year < 1970:
            signature.append("era_classic")
        elif year < 2000:
            signature.append("era_modern")
        else:
            signature.append("era_contemporary")
    
    return ' '.join(signature)

df['movie_signature'] = df.apply(create_movie_signature, axis=1)
print("Signatures des films creees")

Signatures des films creees


#  MODÈLE 1: K-NEAREST NEIGHBORS (KNN)


In [15]:
class KNNRecommenderSimple:
    
    def __init__(self, n_neighbors=50):
        self.n_neighbors = n_neighbors
        self.knn = None
        self.tfidf = TfidfVectorizer(max_features=3000, stop_words='english')
        self.feature_matrix = None
    
    def fit(self, df):
        """Entraine le modele KNN"""
        print("Entrainement du modele KNN...")
        
        # Vectorisation TF-IDF
        self.feature_matrix = self.tfidf.fit_transform(df['movie_signature'])
        
        # Conversion en array dense pour eviter les problemes
        self.feature_matrix_dense = self.feature_matrix.toarray()
        
        # KNN avec similarite cosinus
        self.knn = NearestNeighbors(
            n_neighbors=min(self.n_neighbors, len(df)),
            metric='cosine',
            algorithm='brute'
        )
        self.knn.fit(self.feature_matrix_dense)
        print("KNN entraine!")
        return self
    
    def recommend(self, selected_titles, df, n_recommendations=5, min_rating=7.0):
        """Genere des recommandations KNN"""
        selected_indices = []
        
        # Trouve les indices des films selectionnes
        for title in selected_titles:
            matches = df[df['Movie_Title'].str.lower() == title.lower()]
            if len(matches) > 0:
                selected_indices.append(matches.index[0])
        
        if not selected_indices:
            print("Aucun film valide trouve")
            return pd.DataFrame()
        
        print(f"Films selectionnes: {[df.iloc[idx]['Movie_Title'] for idx in selected_indices]}")
        
        # Vecteur moyen - deja en format dense
        combined_vector = self.feature_matrix_dense[selected_indices].mean(axis=0).reshape(1, -1)
        
        # Trouve les plus proches voisins
        distances, indices = self.knn.kneighbors(
            combined_vector, 
            n_neighbors=min(n_recommendations + len(selected_indices) + 10, len(df))
        )
        
        # Filtre les recommandations
        recommendations = []
        for idx, distance in zip(indices[0], distances[0]):
            if (idx not in selected_indices and 
                df.iloc[idx]['Rating'] >= min_rating):
                
                similarity = 1 - distance
                recommendations.append((idx, similarity))
        
        # Trie et selectionne le top N
        recommendations.sort(key=lambda x: x[1], reverse=True)
        final_indices = [r[0] for r in recommendations[:n_recommendations]]
        
        if not final_indices:
            print("Aucune recommandation trouvee avec les criteres")
            return pd.DataFrame()
        
        results = df.iloc[final_indices].copy()
        results['knn_similarity'] = [r[1] for r in recommendations[:n_recommendations]]
        
        return results

# Test avec la version simplifiee
print("TEST DU MODELE KNN (VERSION SIMPLIFIEE)...")
knn_recommender_simple = KNNRecommenderSimple(n_neighbors=50)
knn_recommender_simple.fit(df)

knn_recommendations_simple = knn_recommender_simple.recommend(test_movies, df, n_recommendations=5)

if not knn_recommendations_simple.empty:
    print("Recommandations KNN (version simplifiee):")
    for idx, row in knn_recommendations_simple.iterrows():
        print(f"- {row['Movie_Title']} ({int(row['Year'])})")
        print(f"  Rating: {row['Rating']} | Genre: {row['main_genre']} | Similarite: {row['knn_similarity']:.3f}")
        print()
else:
    print("Aucune recommandation KNN generee")

TEST DU MODELE KNN (VERSION SIMPLIFIEE)...
Entrainement du modele KNN...
KNN entraine!
Films selectionnes: ['The Dark Knight', 'Inception', 'The Matrix']
Recommandations KNN (version simplifiee):
- The Matrix Reloaded (2003)
  Rating: 7.2 | Genre: Action | Similarite: 0.583

- The Prestige (2006)
  Rating: 8.5 | Genre: Drama | Similarite: 0.536

- Batman Begins (2005)
  Rating: 8.2 | Genre: Action | Similarite: 0.521

- The Dark Knight Rises (2012)
  Rating: 8.4 | Genre: Action | Similarite: 0.501

- Tenet (2020)
  Rating: 7.3 | Genre: Action | Similarite: 0.425



# SVM

In [16]:
class SVMRecommender:
    """Recommandation basee sur SVM - Approche par classification"""
    
    def __init__(self):
        self.svm = None
        self.tfidf = TfidfVectorizer(max_features=2000, stop_words='english')
        self.label_encoder = LabelEncoder()
        self.feature_matrix = None
    
    def fit(self, df):
        """Entraine le modele SVM"""
        print("Entrainement du modele SVM...")
        
        # Vectorisation
        self.feature_matrix = self.tfidf.fit_transform(df['movie_signature'])
        
        # Conversion en array dense
        self.feature_matrix_dense = self.feature_matrix.toarray()
        
        # Utilise les genres comme labels
        genres = df['main_genre'].fillna('Unknown')
        self.encoded_genres = self.label_encoder.fit_transform(genres)
        
        # SVM pour classification
        self.svm = SVC(
            kernel='linear',
            probability=True,
            random_state=42
        )
        
        # Entrainement sur echantillon pour performance
        sample_size = min(2000, len(df))
        X_sample = self.feature_matrix_dense[:sample_size]
        y_sample = self.encoded_genres[:sample_size]
        
        self.svm.fit(X_sample, y_sample)
        print("SVM entraine!")
        return self
    
    def recommend(self, selected_titles, df, n_recommendations=5, min_rating=7.0):
        """Genere des recommandations SVM"""
        selected_indices = []
        
        # Trouve les indices des films selectionnes
        for title in selected_titles:
            matches = df[df['Movie_Title'].str.lower() == title.lower()]
            if len(matches) > 0:
                selected_indices.append(matches.index[0])
        
        if not selected_indices:
            print("Aucun film valide trouve")
            return pd.DataFrame()
        
        print(f"Films selectionnes: {[df.iloc[idx]['Movie_Title'] for idx in selected_indices]}")
        
        # Vecteur moyen des films selectionnes
        combined_vector = self.feature_matrix_dense[selected_indices].mean(axis=0).reshape(1, -1)
        
        # Predict les probabilites pour tous les genres
        genre_probabilities = self.svm.predict_proba(combined_vector)[0]
        
        # Trouve les genres les plus probables
        top_genre_indices = np.argsort(genre_probabilities)[-3:][::-1]
        top_genres = self.label_encoder.inverse_transform(top_genre_indices)
        
        print(f"Genres predits par SVM: {top_genres}")
        
        # Recommande des films dans ces genres
        recommendations = []
        for idx, row in df.iterrows():
            if (idx not in selected_indices and 
                row['Rating'] >= min_rating and
                row['main_genre'] in top_genres):
                
                # Score base sur la probabilite du genre
                genre_score = genre_probabilities[
                    self.label_encoder.transform([row['main_genre']])[0]
                ]
                recommendations.append((idx, genre_score))
        
        # Trie et selectionne
        recommendations.sort(key=lambda x: x[1], reverse=True)
        final_indices = [r[0] for r in recommendations[:n_recommendations]]
        
        if not final_indices:
            print("Aucune recommandation trouvee avec les criteres SVM")
            return pd.DataFrame()
        
        results = df.iloc[final_indices].copy()
        results['svm_confidence'] = [r[1] for r in recommendations[:n_recommendations]]
        
        return results

# Test SVM
print("TEST DU MODELE SVM...")
svm_recommender = SVMRecommender()
svm_recommender.fit(df)

svm_recommendations = svm_recommender.recommend(test_movies, df, n_recommendations=5)

if not svm_recommendations.empty:
    print("Recommandations SVM:")
    for idx, row in svm_recommendations.iterrows():
        print(f"- {row['Movie_Title']} ({int(row['Year'])})")
        print(f"  Rating: {row['Rating']} | Genre: {row['main_genre']} | Confiance: {row['svm_confidence']:.3f}")
        print()
else:
    print("Aucune recommandation SVM generee")

TEST DU MODELE SVM...
Entrainement du modele SVM...
SVM entraine!
Films selectionnes: ['The Dark Knight', 'Inception', 'The Matrix']
Genres predits par SVM: ['Action' 'Biography' 'Animation']
Recommandations SVM:
- Kantara (2022)
  Rating: 9.3 | Genre: Action | Confiance: 0.998

- The Lord of the Rings: The Return of the King (2003)
  Rating: 9.0 | Genre: Action | Confiance: 0.998

- The Lord of the Rings: The Two Towers (2002)
  Rating: 8.8 | Genre: Action | Confiance: 0.998

- The Lord of the Rings: The Fellowship of the Ring (2001)
  Rating: 8.8 | Genre: Action | Confiance: 0.998

- The Empire Strikes Back (1980)
  Rating: 8.7 | Genre: Action | Confiance: 0.998



In [17]:
def compare_recommendations(knn_rec, svm_rec, test_movies, df):
    """Compare les recommandations des deux modeles"""
    print("=" * 70)
    print("COMPARAISON KNN vs SVM")
    print("=" * 70)
    
    print(f"Films de test: {test_movies}")
    
    print(f"\nKNN - {len(knn_rec)} recommandations:")
    print("-" * 50)
    for idx, row in knn_rec.iterrows():
        print(f"   {row['Movie_Title']} ({row['Year']})")
        print(f"   Rating: {row['Rating']} | Genre: {row['main_genre']} | Similarite: {row['knn_similarity']:.3f}")
    
    print(f"\nSVM - {len(svm_rec)} recommandations:")
    print("-" * 50)
    for idx, row in svm_rec.iterrows():
        print(f"   {row['Movie_Title']} ({row['Year']})")
        print(f"   Rating: {row['Rating']} | Genre: {row['main_genre']} | Confiance: {row['svm_confidence']:.3f}")
    
    # Metriques de comparaison
    knn_titles = set(knn_rec['Movie_Title'].str.lower())
    svm_titles = set(svm_rec['Movie_Title'].str.lower())
    
    overlap = knn_titles.intersection(svm_titles)
    print(f"\nOVERLAP: {len(overlap)} film(s) en commun")
    if overlap:
        print(f"   Films communs: {list(overlap)}")
    
    # Diversite des genres
    knn_genres = set(knn_rec['main_genre'])
    svm_genres = set(svm_rec['main_genre'])
    
    print(f"\nDIVERSITE DES GENRES:")
    print(f"   KNN: {len(knn_genres)} genres differents")
    print(f"   SVM: {len(svm_genres)} genres differents")
    
    # Score moyen
    knn_avg_score = knn_rec['knn_similarity'].mean()
    svm_avg_score = svm_rec['svm_confidence'].mean()
    
    print(f"\nSCORES MOYENS:")
    print(f"   KNN: {knn_avg_score:.3f}")
    print(f"   SVM: {svm_avg_score:.3f}")

# Comparaison
if not knn_recommendations_simple.empty and not svm_recommendations.empty:
    compare_recommendations(knn_recommendations_simple, svm_recommendations, test_movies, df)
else:
    print("Impossible de comparer - un des modeles n'a pas genere de recommandations")

COMPARAISON KNN vs SVM
Films de test: ['The Dark Knight', 'Inception', 'The Matrix']

KNN - 5 recommandations:
--------------------------------------------------
   The Matrix Reloaded (2003)
   Rating: 7.2 | Genre: Action | Similarite: 0.583
   The Prestige (2006)
   Rating: 8.5 | Genre: Drama | Similarite: 0.536
   Batman Begins (2005)
   Rating: 8.2 | Genre: Action | Similarite: 0.521
   The Dark Knight Rises (2012)
   Rating: 8.4 | Genre: Action | Similarite: 0.501
   Tenet (2020)
   Rating: 7.3 | Genre: Action | Similarite: 0.425

SVM - 5 recommandations:
--------------------------------------------------
   Kantara (2022)
   Rating: 9.3 | Genre: Action | Confiance: 0.998
   The Lord of the Rings: The Return of the King (2003)
   Rating: 9.0 | Genre: Action | Confiance: 0.998
   The Lord of the Rings: The Two Towers (2002)
   Rating: 8.8 | Genre: Action | Confiance: 0.998
   The Lord of the Rings: The Fellowship of the Ring (2001)
   Rating: 8.8 | Genre: Action | Confiance: 0.998


In [18]:
# Explication du fonctionnement KNN
print("POURQUOI KNN FONCTIONNE MIEUX:")
print("=" * 50)

# Montre les similarités détectées par KNN
selected_features = knn_recommender_simple.tfidf.transform([
    df[df['Movie_Title'] == 'The Dark Knight']['movie_signature'].iloc[0],
    df[df['Movie_Title'] == 'Inception']['movie_signature'].iloc[0],
    df[df['Movie_Title'] == 'The Matrix']['movie_signature'].iloc[0]
])

# Features importantes pour les films sélectionnés
feature_names = knn_recommender_simple.tfidf.get_feature_names_out()
for i, movie in enumerate(["The Dark Knight", "Inception", "The Matrix"]):
    print(f"\n{movie}:")
    feature_scores = selected_features[i].toarray().flatten()
    top_features_idx = np.argsort(feature_scores)[-5:][::-1]
    for idx in top_features_idx:
        if feature_scores[idx] > 0:
            print(f"  - {feature_names[idx]}: {feature_scores[idx]:.3f}")

POURQUOI KNN FONCTIONNE MIEUX:

The Dark Knight:
  - director_christopher_nolan: 0.561
  - actor_heath_ledger: 0.561
  - actor_christian_bale: 0.500
  - side_genre_crime: 0.234
  - main_genre_action: 0.178

Inception:
  - director_christopher_nolan: 0.478
  - actor_joseph_gordon: 0.451
  - levitt: 0.451
  - actor_leonardo_dicaprio: 0.432
  - side_genre_sci: 0.235

The Matrix:
  - lana_wachowski: 0.481
  - _lilly_wachowski: 0.473
  - actor_laurence_fishburne: 0.453
  - actor_keanu_reeves: 0.384
  - director_directors: 0.228
