In [31]:
import pandas as pd 

# Load the dataset from the CSV file located in the working directory.
df_cleaned = pd.read_csv(filepath_or_buffer='df_cleaned_sans_meteo.csv')

In [32]:
# Summarise the column names, non-null counts and dtypes of the loaded frame.
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7519 entries, 0 to 7518
Data columns (total 16 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   title                  7519 non-null   object 
 1   release_date           7519 non-null   object 
 2   genre                  7519 non-null   object 
 3   duration               7519 non-null   int64  
 4   director               7519 non-null   object 
 5   producers              7519 non-null   object 
 6   cast                   7519 non-null   object 
 7   nationality            7519 non-null   object 
 8   distributor            7519 non-null   object 
 9   box_office_first_week  7519 non-null   int64  
 10  press_eval             7519 non-null   float64
 11  viewers_eval           7519 non-null   float64
 12  views                  7519 non-null   float64
 13  budget                 7519 non-null   float64
 14  season                 7519 non-null   object 
 15  year

In [33]:
from vacances_scolaires_france import SchoolHolidayDates
import datetime
# Parse the release dates, then keep only the rows whose release year is 2010
# or later.
df_cleaned['release_date'] = pd.to_datetime(df_cleaned['release_date'])
# The explicit .copy() makes the filtered frame independent of the unfiltered
# one, so the column assignments performed on df_cleaned afterwards do not
# raise pandas' SettingWithCopyWarning.
df_cleaned = df_cleaned[df_cleaned['release_date'].dt.year >= 2010].copy()
# Instantiate the SchoolHolidayDates lookup queried by check_holiday.
school_holidays = SchoolHolidayDates()

# Check whether a date is a school-holiday day
def check_holiday(date):
    """Return the result of ``school_holidays.is_holiday`` for ``date``.

    ``date`` must expose a ``.date()`` method (for example a pandas Timestamp
    or a ``datetime.datetime``); only its calendar-day part is passed on to
    the module-level ``school_holidays`` object.
    """
    return school_holidays.is_holiday(date.date())

# 'release_date' has already been converted to datetime earlier in this cell,
# so it is not parsed a second time here.
# Apply check_holiday to every value of the "release_date" column.
df_cleaned['is_holiday'] = df_cleaned['release_date'].apply(check_holiday)


In [34]:
import os

# Location of the stars CSV file. The path was hard-coded to one developer's
# home directory; it can now be overridden through the STARS_CSV environment
# variable and falls back to the original location when the variable is unset.
STARS_CSV = os.environ.get(
    'STARS_CSV',
    '/home/apprenant/DevIA/Popularity_corner/popularity_corner/stars/stars.csv',
)
df_stars = pd.read_csv(STARS_CSV)

In [35]:
def _split_names(value):
    """Split a comma-separated string into a list of stripped names.

    A value that is already a list is returned unchanged, so running this
    cell a second time does not fail with
    ``AttributeError: 'list' object has no attribute 'split'``.
    """
    if isinstance(value, list):
        return value
    return [name.strip() for name in value.split(',')]


# Convert each comma-separated column into a list of individual names.
for _col in ['cast', 'producers', 'director', 'distributor']:
    df_cleaned[_col] = df_cleaned[_col].apply(_split_names)

In [36]:
# Plain Python list holding the values of the 'name' column of df_stars.
stars = df_stars['name'].tolist()

In [37]:
def count_stars(cast):
    """Count the entries of ``cast`` that appear in the module-level ``stars``.

    Every matching entry is counted, so a name repeated in ``cast`` is counted
    once per occurrence. An empty ``cast`` yields 0.
    """
    return sum(1 for person in cast if person in stars)

In [38]:
# Apply count_stars to each credit column. The results are counts, despite the
# 'proportion_' prefix carried by two of the column names.
for _target, _source in [
    ('stars_actors', 'cast'),
    ('proportion_stars_producers', 'producers'),
    ('proportion_stars_director', 'director'),
]:
    df_cleaned[_target] = df_cleaned[_source].apply(count_stars)

# Combined count for producers and directors.
df_cleaned['stars_producers_director'] = df_cleaned[
    'proportion_stars_producers'
].add(df_cleaned['proportion_stars_director'])

In [39]:
import pandas as pd
from collections import Counter

# The DataFrame is df_cleaned and the distributors are held in its 'distributor' column

# Step 1: explode the 'distributor' column so each distributor gets its own row
df_exploded = df_cleaned.explode('distributor')

# Step 2: count the number of occurrences of each distributor
distributor_counts = df_exploded['distributor'].value_counts()

# Step 3: the "frequency" is the raw occurrence count (no normalisation is applied)
distributor_frequency = distributor_counts

# Step 4: build a dictionary mapping each distributor to its count
distributor_freq_dict = distributor_frequency.to_dict()

# Step 5: map the count onto each row, averaging when there are several distributors
def calculate_avg_frequency(distributors):
    """Return the mean ``distributor_freq_dict`` value for ``distributors``.

    If ``distributors`` is a list, the looked-up values are summed and divided
    by the list length. Any other value is used directly as a single key and
    its looked-up value is returned as is. Unknown keys raise ``KeyError``.
    """
    if not isinstance(distributors, list):
        return distributor_freq_dict[distributors]

    total = 0
    for name in distributors:
        total += distributor_freq_dict[name]
    return total / len(distributors)

# Compute the per-row value from the 'distributor' column with calculate_avg_frequency.
df_cleaned['distributor_avg_frequency'] = df_cleaned['distributor'].map(calculate_avg_frequency)

In [40]:
# Lower-case the "genre" values, strip leading/trailing whitespace and remove
# the space that follows each comma. regex=False is passed explicitly because
# the default of str.replace differs between pandas versions; the pattern is a
# plain literal.
df_cleaned['genre'] = (
    df_cleaned['genre']
    .str.lower()
    .str.strip()
    .str.replace(', ', ',', regex=False)
)

# One-hot encode the genres; every indicator column is prefixed with 'genre_'.
genre_encoding = df_cleaned['genre'].str.get_dummies(sep=',').add_prefix('genre_')

# Append the indicator columns to the main DataFrame. Indicator columns left
# over from a previous run of this cell are dropped first, so re-running the
# cell does not create duplicated 'genre_*' columns.
df_cleaned = pd.concat(
    [df_cleaned.drop(columns=genre_encoding.columns, errors='ignore'), genre_encoding],
    axis=1,
)

In [41]:
# Display the column labels of df_cleaned, including the added feature columns.
df_cleaned.columns

Index(['title', 'release_date', 'genre', 'duration', 'director', 'producers',
       'cast', 'nationality', 'distributor', 'box_office_first_week',
       'press_eval', 'viewers_eval', 'views', 'budget', 'season', 'year',
       'is_holiday', 'stars_actors', 'proportion_stars_producers',
       'proportion_stars_director', 'stars_producers_director',
       'distributor_avg_frequency', 'genre_action', 'genre_animation',
       'genre_arts martiaux', 'genre_aventure', 'genre_biopic',
       'genre_bollywood', 'genre_comédie', 'genre_comédie dramatique',
       'genre_comédie musicale', 'genre_divers', 'genre_drame',
       'genre_epouvante-horreur', 'genre_erotique', 'genre_espionnage',
       'genre_expérimental', 'genre_famille', 'genre_fantastique',
       'genre_guerre', 'genre_historique', 'genre_judiciaire', 'genre_musical',
       'genre_policier', 'genre_péplum', 'genre_romance',
       'genre_science fiction', 'genre_sport event', 'genre_thriller',
       'genre_western'],
    

In [42]:
# Re-inspect row count, columns and dtypes after filtering and feature creation.
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5611 entries, 0 to 7518
Data columns (total 50 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   title                       5611 non-null   object        
 1   release_date                5611 non-null   datetime64[ns]
 2   genre                       5611 non-null   object        
 3   duration                    5611 non-null   int64         
 4   director                    5611 non-null   object        
 5   producers                   5611 non-null   object        
 6   cast                        5611 non-null   object        
 7   nationality                 5611 non-null   object        
 8   distributor                 5611 non-null   object        
 9   box_office_first_week       5611 non-null   int64         
 10  press_eval                  5611 non-null   float64       
 11  viewers_eval                5611 non-null   float64     

In [43]:
# Non-genre columns selected into df_model.
_base_features = [
    'duration', 'nationality', 'box_office_first_week', 'views', 'budget',
    'season', 'stars_actors', 'stars_producers_director',
    'distributor_avg_frequency',
]

# Genre indicator columns selected into df_model.
_genre_features = [
    'genre_action', 'genre_animation', 'genre_arts martiaux',
    'genre_aventure', 'genre_biopic', 'genre_bollywood', 'genre_comédie',
    'genre_comédie dramatique', 'genre_comédie musicale', 'genre_divers',
    'genre_drame', 'genre_epouvante-horreur', 'genre_erotique',
    'genre_espionnage', 'genre_expérimental', 'genre_famille',
    'genre_fantastique', 'genre_guerre', 'genre_historique',
    'genre_judiciaire', 'genre_musical', 'genre_policier', 'genre_péplum',
    'genre_romance', 'genre_science fiction', 'genre_sport event',
    'genre_thriller', 'genre_western',
]

df_model = df_cleaned[_base_features + _genre_features]

In [44]:
# Write df_model to disk. The row index is written as the first (unnamed)
# column, which is the to_csv default made explicit here.
df_model.to_csv('df_model_sans_meteo2.csv', index=True)