In [2]:
import pandas as pd
import streamlit as st
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
import os

In [5]:
def load_tsv(file_path):
    try:
        data = pd.read_csv(file_path, sep='\t', low_memory=False)
        if 'streamlit' in globals(): st.write(f"Fichier chargé : {file_path}")
        if 'streamlit' in globals(): st.write(f"Colonnes disponibles : {list(data.columns)}")
        return data
    except FileNotFoundError:
        st.error(f"Fichier non trouvé : {file_path}")
        return None
    except Exception as e:
        st.error(f"Erreur lors du chargement du fichier {file_path} : {e}")
        return None

# Charger les fichiers nécessaires
# Mise à jour avec les chemins corrects pour les fichiers uploadés
def load_data():
    basics = load_tsv(r"data/title.basics-10k.tsv")
    ratings = load_tsv(r"data/title.ratings-10k.tsv")
    crew = load_tsv(r"data/title.crew-10k.tsv")
    principals = load_tsv(r"data/title.principals-10k.tsv")
    names = load_tsv(r"data/name.basics-10k.tsv")
    return basics, ratings, crew, principals, names

# Étape 2 : Préparer et fusionner les données
def prepare_data():
    """Préparer et fusionner les données nécessaires."""
    # Charger les fichiers
    basics, ratings, crew, principals, names = load_data()

    if basics is None or ratings is None or principals is None or names is None:
        st.stop()

    # Vérification des colonnes essentielles
    required_columns = {
        "basics": ['tconst', 'primaryTitle', 'genres'],
        "ratings": ['tconst', 'averageRating', 'numVotes'],
        "principals": ['tconst', 'nconst', 'category'],
        "names": ['nconst', 'primaryName']
    }

    for dataset_name, columns in required_columns.items():
        dataset = locals()[dataset_name]
        for col in columns:
            if col not in dataset.columns:
                st.error(f"La colonne '{col}' est absente dans le fichier {dataset_name}.")
                st.stop()

    # Fusion basics et ratings
    df = pd.merge(basics, ratings, on='tconst', how='inner')

    # Fusion principals avec names pour obtenir primaryName
    principals = pd.merge(principals, names[['nconst', 'primaryName']], on='nconst', how='left')

    # Fusion principale avec primaryName et category
    df = pd.merge(df, principals[['tconst', 'primaryName', 'category']], on='tconst', how='left')

    # Nettoyage des colonnes et renommage
    df = df.rename(columns={
        'primaryTitle': 'title',
        'averageRating': 'rating',
        'numVotes': 'votes',
        'genres': 'genre',
    })

    final_columns = ['tconst', 'title', 'genre', 'rating', 'votes', 'primaryName', 'category']
    for col in final_columns:
        if col not in df.columns:
            st.error(f"La colonne requise '{col}' est absente après la fusion.")
            st.stop()

    return df[final_columns]


In [31]:
from sklearn.pipeline import FunctionTransformer
from sklearn.preprocessing import MultiLabelBinarizer


def recommend_movies(movie_title, df):
    """Recommande des films basés sur le genre, sans répétition dans les tconst et titres."""
    movie = df[df['title'].str.contains(movie_title, case=False, na=False)]
    if movie.empty:
        st.warning("Aucun film trouvé avec ce titre.")
        return pd.DataFrame()
    genre = movie['genre'].iloc[0] if not movie.empty else None
    recommendations = (
        df[(df['genre'] == genre) & (~df['tconst'].isin(movie['tconst'].tolist())) & (~df['title'].isin(movie['title'].tolist()))]
        .sort_values(by='rating', ascending=False)
        .drop_duplicates(subset=['tconst', 'title'])
        .head(5)
        if genre else pd.DataFrame()
    )
    return recommendations

def prepare_model_data(df):
    """Prépare les données pour le modèle prédictif."""
    # Sélection des colonnes pertinentes pour la prédiction
    features = ['genre', 'primaryName', 'category']
    target = 'votes'

    # Remplir les valeurs manquantes
    df = df.dropna(subset=features + [target])

    # Créer les variables X (caractéristiques) et y (cible)
    X = df[features]
    y = df[target]

    return X, y

class MultiLabelBinarizerWrapper(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.mlb = MultiLabelBinarizer()

    def fit(self, X, y=None):
        self.mlb.fit(X)
        return self

    def transform(self, X):
        return self.mlb.transform(X)

def train_popularity_model(X, y):
    """Entraîne un modèle pour prédire la popularité des films."""
    # Définir un préprocesseur pour encoder les variables catégoriques
    categorical_features = ['primaryName', 'category']
    genre_features = ['genre']
    
    preprocessor = ColumnTransformer(
        transformers=[
            ('genre_encoder', Pipeline([
                ('splitter', FunctionTransformer(lambda x: x["genre"].str.split(','))),
                ('mlb', MultiLabelBinarizerWrapper())
            ]), genre_features),
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
        ]
    )

    # Créer un pipeline avec un modèle RandomForest
    model = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
    ])

    # Diviser les données en ensembles d'entraînement et de test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Entraîner le modèle
    model.fit(X_train, y_train)

    return model, X_test, y_test

def predict_popularity(model, genre, actor, category):
    """Prédit la popularité d'un film en fonction de ses caractéristiques."""
    # Vérifiez et assurez-vous que les valeurs sont sous forme de chaînes
    genre = str(genre)
    actor = str(actor)
    category = str(category)

    # Créer une entrée pour prédiction
    input_data = pd.DataFrame({
        'genre': [genre],
        'primaryName': [actor],
        'category': [category]
    })

    # Essayer de prédire et capturer les erreurs
    try:
        prediction = model.predict(input_data)[0]
    except ValueError as e:
        st.error(f"Erreur : Les caractéristiques fournies ne sont pas compatibles avec le modèle. {e}")
        return None

    return prediction



In [32]:
df = prepare_data()
X, y = prepare_model_data(df)
model, X_test, y_test = train_popularity_model(X, y)

genre = df['genre'].unique()[1]
print(genre)
actor = df['primaryName'].unique()[1]
category = df['category'].unique()[1]
if genre not in df['genre'].unique():
    print("Le genre saisi est inconnu. Veuillez sélectionner un genre valide.")
elif actor not in df['primaryName'].unique():
    print("L'acteur ou actrice saisi(e) est inconnu(e). Veuillez sélectionner une valeur valide.")
elif category not in df['category'].unique():
    print("La catégorie saisie est inconnue. Veuillez sélectionner une valeur valide.")
else:
    popularity = predict_popularity(model, genre, actor, category)
    if popularity is not None:
        print(f"La popularité estimée est de : {popularity:.2f} votes")

apres
['Action' 'Adventure' 'Animation' 'Biography' 'Comedy' 'Crime'
 'Documentary' 'Drama' 'Family' 'Fantasy' 'History' 'Horror' 'Music'
 'Mystery' 'News' 'Romance' 'Short' 'Sport' 'War' 'Western' '\\N']
Animation,Short
La popularité estimée est de : 528.36 votes




In [43]:
def recommend_movies(movie_title, df):
    """Recommande des films basés sur le genre, sans répétition dans les tconst et titres."""
    movie = df[df['title'].str.contains(movie_title, case=False, na=False)]
    if movie.empty:
        st.warning("Aucun film trouvé avec ce titre.")
        return pd.DataFrame()
    genre = movie['genre'].str.split(",").explode().unique() if not movie.empty else []
    print(genre)
    print('|'.join(genre))
    recommendations = (
        df[df['genre'].str.contains('|'.join(genre)) & (~df['tconst'].isin(movie['tconst'].tolist())) & (~df['title'].isin(movie['title'].tolist()))]        .sort_values(by='rating', ascending=False)
        .drop_duplicates(subset=['tconst', 'title'])
        .head(5)
        if len(genre) > 0 else pd.DataFrame()
    )
    return recommendations

df = prepare_data()
recommendations = recommend_movies("The Widow Casey's Return", df)
recommendations

['Comedy' 'Short']
Comedy|Short


Unnamed: 0,tconst,title,genre,rating,votes,primaryName,category
8091,tt0007254,The Red Widow,"Comedy,Romance,Short",9.0,16,,
6966,tt0002434,The Pony Express Girl,"Short,Western",9.0,13,,
6866,tt0002145,The Dream of a Moving Picture Director,"Comedy,Short",9.0,14,,
6967,tt0002437,A Prize Package,"Comedy,Short",9.0,13,,
7391,tt0004260,Love and Bullets,"Comedy,Short",9.0,18,,


In [27]:
def load_tsv(file_path):
    try:
        data = pd.read_csv(file_path, sep='\t', low_memory=False)
        if 'streamlit' in globals(): st.write(f"Fichier chargé : {file_path}")
        if 'streamlit' in globals(): st.write(f"Colonnes disponibles : {list(data.columns)}")
        return data
    except FileNotFoundError:
        st.error(f"Fichier non trouvé : {file_path}")
        return None
    except Exception as e:
        st.error(f"Erreur lors du chargement du fichier {file_path} : {e}")
        return None

# Charger les fichiers nécessaires
def load_data():
    basics = load_tsv(r"data/title.basics-10k.tsv")
    ratings = load_tsv(r"data/title.ratings-10k.tsv")
    crew = load_tsv(r"data/title.crew-10k.tsv")
    principals = load_tsv(r"data/title.principals-10k.tsv")
    names = load_tsv(r"data/name.basics-10k.tsv")
    return basics, ratings, principals, names

basics, ratings, principals, names = load_data()

if basics is None or ratings is None or principals is None or names is None:
    st.stop()

required_columns = {
    "basics": ['tconst', 'primaryTitle', 'genres'],
    "ratings": ['tconst', 'averageRating', 'numVotes'],
    "principals": ['tconst', 'nconst', 'category'],
    "names": ['nconst', 'primaryName']
}

for dataset_name, columns in required_columns.items():
    dataset = locals()[dataset_name]
    for col in columns:
        if col not in dataset.columns:
            st.error(f"La colonne '{col}' est absente dans le fichier {dataset_name}.")
            st.stop()

df = pd.merge(basics, ratings, on='tconst', how='inner')
principals = pd.merge(principals, names[['nconst', 'primaryName']], on='nconst', how='left')
df = pd.merge(df, principals[['tconst', 'primaryName', 'category']], on='tconst', how='left')
df.head(35)

# df = df.rename(columns={
#     'primaryTitle': 'title',
#     'averageRating': 'rating',
#     'numVotes': 'votes',
#     'genres': 'genre'
# })

# final_columns = ['tconst', 'title', 'genre', 'rating', 'votes', 'primaryName', 'category']
# for col in final_columns:
#     if col not in df.columns:
#         st.error(f"La colonne requise '{col}' est absente après la fusion.")
#         st.stop()

# df[final_columns]

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes,primaryName,category
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short",5.7,2118,,self
1,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short",5.7,2118,William K.L. Dickson,director
2,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short",5.7,2118,William K.L. Dickson,producer
3,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short",5.7,2118,,cinematographer
4,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short",5.6,285,,director
5,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short",5.6,285,,composer
6,tt0000003,short,Poor Pierrot,Pauvre Pierrot,0,1892,\N,5,"Animation,Comedy,Romance",6.4,2158,,director
7,tt0000003,short,Poor Pierrot,Pauvre Pierrot,0,1892,\N,5,"Animation,Comedy,Romance",6.4,2158,,producer
8,tt0000003,short,Poor Pierrot,Pauvre Pierrot,0,1892,\N,5,"Animation,Comedy,Romance",6.4,2158,,producer
9,tt0000003,short,Poor Pierrot,Pauvre Pierrot,0,1892,\N,5,"Animation,Comedy,Romance",6.4,2158,,composer


In [25]:
df = pd.merge(df, principals[['tconst', 'primaryName', 'category']], on='tconst', how='left')
df.head(35)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes,primaryName,category
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short",5.7,2118,,self
1,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short",5.7,2118,William K.L. Dickson,director
2,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short",5.7,2118,William K.L. Dickson,producer
3,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short",5.7,2118,,cinematographer
4,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short",5.6,285,,director
5,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short",5.6,285,,composer
6,tt0000003,short,Poor Pierrot,Pauvre Pierrot,0,1892,\N,5,"Animation,Comedy,Romance",6.4,2158,,director
7,tt0000003,short,Poor Pierrot,Pauvre Pierrot,0,1892,\N,5,"Animation,Comedy,Romance",6.4,2158,,producer
8,tt0000003,short,Poor Pierrot,Pauvre Pierrot,0,1892,\N,5,"Animation,Comedy,Romance",6.4,2158,,producer
9,tt0000003,short,Poor Pierrot,Pauvre Pierrot,0,1892,\N,5,"Animation,Comedy,Romance",6.4,2158,,composer
