In [176]:
import pandas as pd 

# Load the cleaned dataset from 'df_cleaned.csv' (path is relative to the working directory)
df_cleaned = pd.read_csv('df_cleaned.csv')

In [177]:
# Print the column names, non-null counts and dtypes of the freshly loaded frame
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5619 entries, 0 to 5618
Data columns (total 21 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   title                  5619 non-null   object 
 1   release_date           5619 non-null   object 
 2   genre                  5619 non-null   object 
 3   duration               5619 non-null   int64  
 4   director               5619 non-null   object 
 5   producers              5619 non-null   object 
 6   cast                   5619 non-null   object 
 7   nationality            5619 non-null   object 
 8   distributor            5619 non-null   object 
 9   box_office_first_week  5619 non-null   int64  
 10  press_eval             5619 non-null   float64
 11  viewers_eval           5619 non-null   float64
 12  views                  5619 non-null   float64
 13  budget                 5619 non-null   float64
 14  Temperature Moyenne    5619 non-null   float64
 15  Lill

In [178]:
from vacances_scolaires_france import SchoolHolidayDates
import datetime

# Instantiate the SchoolHolidayDates class; check_holiday queries this instance
school_holidays = SchoolHolidayDates()

# Helper that tells whether a given date is a school-holiday day
def check_holiday(date):
    """Return ``school_holidays.is_holiday`` for the calendar day of ``date``.

    ``date`` must expose a ``.date()`` method (for example a
    ``datetime.datetime`` or a ``pandas.Timestamp``); only the value returned
    by ``.date()`` is passed to the lookup.
    """
    calendar_day = date.date()
    holiday_flag = school_holidays.is_holiday(calendar_day)
    return holiday_flag

# Parse the release dates once, store them back, then flag each one with check_holiday
release_dates = pd.to_datetime(df_cleaned['release_date'])
df_cleaned['release_date'] = release_dates
df_cleaned['is_holiday'] = release_dates.apply(check_holiday)


In [179]:
# NOTE(review): absolute, machine-specific path -- this cell only runs where that file exists
stars_csv_path = '/home/apprenant/DevIA/Popularity_corner/popularity_corner/stars/stars.csv'
df_stars = pd.read_csv(stars_csv_path)

In [180]:
def _split_names(value):
    """Turn a comma-separated string into a list of whitespace-stripped names.

    A value that is already a list is returned unchanged, so re-running this
    cell does not fail with ``AttributeError`` (lists have no ``split``).
    """
    if isinstance(value, list):
        return value
    return [name.strip() for name in value.split(',')]


# Each of these columns holds comma-separated names; convert every cell to a list of names.
for _column in ['cast', 'producers', 'director', 'distributor']:
    df_cleaned[_column] = df_cleaned[_column].apply(_split_names)

In [181]:
# Plain Python list of the values in the 'name' column of df_stars
stars = df_stars['name'].to_list()

In [182]:
def count_stars(cast):
    """Return how many entries of ``cast`` are found in the module-level ``stars``.

    Every element is tested on its own, so a name that occurs twice in
    ``cast`` is counted twice. An empty ``cast`` gives 0.
    """
    return sum(1 for person in cast if person in stars)

In [183]:
# Share of star names in each list-valued column: count_stars(names) / len(names), rounded to 2 decimals.
# The director column previously stored the raw count, which can exceed 1 for co-directed films
# and was inconsistent with its "proportion_" name; it is now computed like the other two.
_proportion_targets = {
    'cast': 'proportion_stars_actors',
    'producers': 'proportion_stars_producers',
    'director': 'proportion_stars_director',
}
for _source_column, _target_column in _proportion_targets.items():
    _names = df_cleaned[_source_column]
    df_cleaned[_target_column] = (_names.apply(count_stars) / _names.apply(len)).round(2)

In [184]:
import pandas as pd
from collections import Counter

# The frame is df_cleaned and its 'distributor' column holds the distributors of each row.

# Step 1: explode so that every distributor of a row gets a row of its own
df_exploded = df_cleaned.explode(column='distributor')
exploded_distributors = df_exploded['distributor']

# Step 2: count how many times each distributor occurs
distributor_counts = exploded_distributors.value_counts()

# Step 3: the "frequency" is the raw occurrence count (same object, no normalisation)
distributor_frequency = distributor_counts

# Step 4: dictionary mapping each distributor to its count
distributor_freq_dict = distributor_counts.to_dict()

# Step 5: map the counts onto each row, averaging when a row lists several distributors
def calculate_avg_frequency(distributors):
    """Look up ``distributors`` in the module-level ``distributor_freq_dict``.

    If ``distributors`` is a list, return the arithmetic mean of the values
    stored for its elements; otherwise return the value stored for
    ``distributors`` itself.
    """
    if not isinstance(distributors, list):
        return distributor_freq_dict[distributors]

    total = 0
    for name in distributors:
        total += distributor_freq_dict[name]
    return total / len(distributors)

# One value per row, produced by calculate_avg_frequency from that row's 'distributor' entry
avg_distributor_frequency = df_cleaned['distributor'].apply(calculate_avg_frequency)
df_cleaned['distributor_avg_frequency'] = avg_distributor_frequency

In [185]:
# Lower-case every value of the "genre" column and trim leading/trailing whitespace
df_cleaned['genre'] = df_cleaned['genre'].str.lower().str.strip()

# Remove the space after each comma (literal replacement, not a regex)
df_cleaned['genre'] = df_cleaned['genre'].str.replace(', ', ',', regex=False)

# Genre encoding: one indicator column per genre, prefixed with 'genre_'
genre_encoding = df_cleaned['genre'].str.get_dummies(sep=',')
genre_encoding.columns = ['genre_' + col for col in genre_encoding.columns]

# Append the indicator columns to the main frame. Indicator columns left over from a
# previous run of this cell are dropped first; otherwise a re-run would append a second
# copy of every 'genre_*' column and later selections by label would return duplicates.
df_cleaned = pd.concat(
    [df_cleaned.drop(columns=genre_encoding.columns, errors='ignore'), genre_encoding],
    axis=1,
)

In [186]:
# Show the column labels of df_cleaned at this point in the notebook
df_cleaned.columns

Index(['title', 'release_date', 'genre', 'duration', 'director', 'producers',
       'cast', 'nationality', 'distributor', 'box_office_first_week',
       'press_eval', 'viewers_eval', 'views', 'budget', 'Temperature Moyenne',
       'Lille', 'Bordeaux', 'Lyon', 'Paris', 'season', 'year', 'is_holiday',
       'proportion_stars_actors', 'proportion_stars_producers',
       'proportion_stars_director', 'distributor_avg_frequency',
       'genre_action', 'genre_animation', 'genre_arts martiaux',
       'genre_aventure', 'genre_biopic', 'genre_bollywood', 'genre_comédie',
       'genre_comédie dramatique', 'genre_comédie musicale', 'genre_divers',
       'genre_drame', 'genre_epouvante-horreur', 'genre_erotique',
       'genre_espionnage', 'genre_expérimental', 'genre_famille',
       'genre_fantastique', 'genre_guerre', 'genre_historique',
       'genre_judiciaire', 'genre_musical', 'genre_policier', 'genre_péplum',
       'genre_romance', 'genre_science fiction', 'genre_sport event',
   

In [187]:
# Non-genre columns kept for modelling, in output order
base_feature_columns = [
    'genre', 'duration', 'nationality', 'box_office_first_week',
    'press_eval', 'viewers_eval', 'views', 'budget',
    'Temperature Moyenne', 'Lille', 'Bordeaux', 'Lyon', 'Paris',
    'season', 'is_holiday',
    'proportion_stars_actors', 'proportion_stars_producers',
    'proportion_stars_director', 'distributor_avg_frequency',
]

# Genre indicator columns kept for modelling, in output order
genre_indicator_columns = [
    'genre_action', 'genre_animation', 'genre_arts martiaux',
    'genre_aventure', 'genre_biopic', 'genre_bollywood', 'genre_comédie',
    'genre_comédie dramatique', 'genre_comédie musicale', 'genre_divers',
    'genre_drame', 'genre_epouvante-horreur', 'genre_erotique',
    'genre_espionnage', 'genre_expérimental', 'genre_famille',
    'genre_fantastique', 'genre_guerre', 'genre_historique',
    'genre_judiciaire', 'genre_musical', 'genre_policier', 'genre_péplum',
    'genre_romance', 'genre_science fiction', 'genre_sport event',
    'genre_thriller', 'genre_western',
]

# Select the modelling columns from df_cleaned by label
df_model = df_cleaned[base_feature_columns + genre_indicator_columns]

In [188]:
# Write the modelling frame to disk. index=True is the to_csv default, spelled out here:
# the row index is written as an extra, unnamed first column of the CSV.
df_model.to_csv('df_model.csv', index=True)