## Dummy Model

### 1. Load Data

In [1]:
import pandas as pd

original_data = pd.read_csv("static/films_jp_box.csv")
original_data.head()

Unnamed: 0,film_id,titre,genre_principale,date_sortie_france,date_sortie_usa,image_url,synopsis,duree,note_moyenne,acteurs,entrees_demarrage_france,entrees_totales_france,budget,recette_usa,recette_reste_du_monde,recette_monde
0,20156,L'Extraordinaire Voyage de Marona,Animation,08/01/2020,00/00/0000,https://www.jpbox-office.com/cinema/images/pos...,"Victime d’un accident, Marona, une petite chie...",1h 32min,Non disponible,Non disponible,12 065,37 632,?,- $,41 568 $,41 568 $
1,19778,Les Filles du Docteur March,Adaptation Livre,01/01/2020,25/12/2019,https://www.jpbox-office.com/cinema/images/pos...,Une nouvelle adaptation du classique de Louisa...,2h 15min,3,Emma Watson (Rôle principal - - Meg March) | T...,242 663,805 211,40 000 000 $,108 101 214 $,92 100 000 $,200 201 214 $
2,18940,Birds of Prey et la fabuleuse histoire de Harl...,Adaptation BD,05/02/2020,07/02/2020,https://www.jpbox-office.com/cinema/images/pos...,"Vous connaissez l'histoire du flic, de l'oisea...",1h 49min,3,Margot Robbie (Rôle principal - - Harley Quinn...,412 178,1 047 460,84 500 000 $,84 158 461 $,117 700 000 $,201 858 461 $
3,18815,"Sonic, le hérisson",Adapt. Jeu Vidéo,12/02/2020,14/02/2020,https://www.jpbox-office.com/cinema/images/pos...,L'histoire du hérisson bleu le plus rapide du ...,1h 40min,3,Jim Carrey (Rôle principal - - Dr. Ivo Robotni...,771 015,2 113 220,95 000 000 $,146 066 470 $,168 755 633 $,314 822 103 $
4,21042,Uncharted,Adapt. Jeu Vidéo,16/02/2022,18/02/2022,https://www.jpbox-office.com/cinema/images/pos...,"Nathan Drake, voleur astucieux et intrépide, e...",1h 55min,3,Tom Holland (Rôle principal - - Nathan Drake) ...,955 650,2 514 261,120 000 000 $,148 648 820 $,252 041 094 $,400 689 914 $


In [2]:
original_data.columns

Index(['film_id', 'titre', 'genre_principale', 'date_sortie_france',
       'date_sortie_usa', 'image_url', 'synopsis', 'duree', 'note_moyenne',
       'acteurs', 'entrees_demarrage_france', 'entrees_totales_france',
       'budget', 'recette_usa', 'recette_reste_du_monde', 'recette_monde'],
      dtype='object')

### 2. Feature selection

In [3]:
dropped_columns= ["film_id", "titre", "genre_principale", "image_url", "synopsis", "note_moyenne", "acteurs", "recette_reste_du_monde"]

def select_data(current_data : pd.DataFrame) -> pd.DataFrame :
    selected_data = current_data.copy()
    for column_to_drop in dropped_columns:
        selected_data = selected_data.drop(column_to_drop, axis=1)

    return selected_data

selected_data = select_data(original_data)
selected_data.head()


Unnamed: 0,date_sortie_france,date_sortie_usa,duree,entrees_demarrage_france,entrees_totales_france,budget,recette_usa,recette_monde
0,08/01/2020,00/00/0000,1h 32min,12 065,37 632,?,- $,41 568 $
1,01/01/2020,25/12/2019,2h 15min,242 663,805 211,40 000 000 $,108 101 214 $,200 201 214 $
2,05/02/2020,07/02/2020,1h 49min,412 178,1 047 460,84 500 000 $,84 158 461 $,201 858 461 $
3,12/02/2020,14/02/2020,1h 40min,771 015,2 113 220,95 000 000 $,146 066 470 $,314 822 103 $
4,16/02/2022,18/02/2022,1h 55min,955 650,2 514 261,120 000 000 $,148 648 820 $,400 689 914 $


In [4]:
selected_data.columns

Index(['date_sortie_france', 'date_sortie_usa', 'duree',
       'entrees_demarrage_france', 'entrees_totales_france', 'budget',
       'recette_usa', 'recette_monde'],
      dtype='object')

In [5]:
from sklearn.base import BaseEstimator, TransformerMixin

# Custom transformer pour supprimer des colonnes
class ColumnDropper(BaseEstimator, TransformerMixin):
    def __init__(self, columns_to_drop):
        self.columns_to_drop = columns_to_drop

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X.drop(columns=self.columns_to_drop)

### 3. Clean data

In [6]:
def get_year(date_obj) -> int :
    date_str = str(date_obj)
    tab = date_str.split('/')
    if len(tab)==0 :
        return 0
    
    year_part = str(tab[-1])
    if year_part.isdigit() :
        return int(year_part)
    
    return 0
    
def get_minutes(period) -> int:  
    period_str = str(period)
    tab = period_str.split('h')
    if len(tab) ==0 :
        return 0
        
    total = 0
    for content in tab : 
        number = 0
        is_minutes=False
        if content.find('min') :
            is_minutes = True
            number = content.replace("min", "").strip()
     
        if number.isdigit() :
            number = int(number)
            if not is_minutes :
                number *= 60

            total+=number

    return total

def get_number(spaced_num) -> int:
    spaced_num = str(spaced_num)
    number = spaced_num.replace('$', '')
    number = number.replace(' ', '')
    if not number.isdigit() :
        return 0
    
    return int(number)

def clean_data( current_data : pd.DataFrame) -> pd.DataFrame :
    cleaned_data = current_data.copy()
    cleaned_data["date_sortie_france"] = cleaned_data["date_sortie_france"].apply(lambda x : get_year(x)).astype(int)
    cleaned_data["date_sortie_usa"] = cleaned_data["date_sortie_usa"].apply(lambda x : get_year(x)).astype(int)

    cleaned_data["duree"] = cleaned_data["duree"].apply(lambda x : get_minutes(x)).astype(int)
    
    cleaned_data["entrees_demarrage_france"] = cleaned_data["entrees_demarrage_france"].apply(lambda x : get_number(x)).astype(int)
    cleaned_data["entrees_totales_france"] = cleaned_data["entrees_totales_france"].apply(lambda x : get_number(x)).astype(int)
    cleaned_data["budget"] = cleaned_data["budget"].apply(lambda x : get_number(x)).astype(int)
    cleaned_data["recette_usa"] = cleaned_data["recette_usa"].apply(lambda x : get_number(x)).astype(int)
    cleaned_data["recette_monde"] = cleaned_data["recette_monde"].apply(lambda x : get_number(x)).astype(int)
    return cleaned_data 

cleaned_data = clean_data(selected_data)
cleaned_data.head()

Unnamed: 0,date_sortie_france,date_sortie_usa,duree,entrees_demarrage_france,entrees_totales_france,budget,recette_usa,recette_monde
0,2020,0,33,12065,37632,0,0,41568
1,2020,2019,17,242663,805211,40000000,108101214,200201214
2,2020,2020,50,412178,1047460,84500000,84158461,201858461
3,2020,2020,41,771015,2113220,95000000,146066470,314822103
4,2022,2022,56,955650,2514261,120000000,148648820,400689914


In [None]:
from sklearn.pipeline import Pipeline, make_pipeline 
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer, StandardScaler
import numpy as np
from pandas import DataFrame 

def get_year_tab(date_columns : pd.DataFrame):
    return date_columns.applymap(get_year).to_numpy()

def get_minutes_tab(date_columns : pd.DataFrame):
    return date_columns.applymap(get_minutes).to_numpy()

def get_number_tab(number_columns : pd.DataFrame):
    return number_columns.applymap(get_number).to_numpy()

# Pipeline de transformation
preprocessing = make_pipeline(
    ColumnDropper(columns_to_drop=dropped_columns),
    ColumnTransformer(transformers=[
        ('keep_year', FunctionTransformer(get_year_tab), ['date_sortie_france', 'date_sortie_usa']),
        ('get_minutes', FunctionTransformer(get_minutes_tab, validate=False), ['duree']),
        ('get_number', FunctionTransformer(get_number_tab, validate=False), ['entrees_totales_france', 'budget', 'recette_usa', 'recette_monde']),
    ], remainder='passthrough')  # 'passthrough' pour garder 'id'
)



Meilleur alpha trouvé : 26786943447.22933
Score R² train : 0.8314991767639591
Score R² test  : 1.0
RMSE : 96875.98077994295


  return date_columns.applymap(get_year).to_numpy()
  return date_columns.applymap(get_minutes).to_numpy()
  return number_columns.applymap(get_number).to_numpy()
  return date_columns.applymap(get_year).to_numpy()
  return date_columns.applymap(get_minutes).to_numpy()
  return number_columns.applymap(get_number).to_numpy()
  return date_columns.applymap(get_year).to_numpy()
  return date_columns.applymap(get_minutes).to_numpy()
  return number_columns.applymap(get_number).to_numpy()
  return date_columns.applymap(get_year).to_numpy()
  return date_columns.applymap(get_minutes).to_numpy()
  return number_columns.applymap(get_number).to_numpy()


### 4. Train model

In [9]:
from sklearn.linear_model import LassoCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error

#y = cleaned_data['entrees_demarrage_france']
#X = cleaned_data.drop('entrees_demarrage_france', axis=1)

#y = original_data['entrees_demarrage_france']
y = original_data['entrees_demarrage_france'].apply(lambda x: get_number(x))
X = original_data.drop('entrees_demarrage_france', axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

full_pipeline = Pipeline(steps=[
    ('preprocessing', preprocessing),
    ('regressor', LassoCV(cv=5))
])

#lasso = LassoCV(cv=5, random_state=42)
#lasso.fit(X_train, y_train)

full_pipeline.fit(X_train, y_train)

print("Meilleur alpha trouvé :", full_pipeline.named_steps['regressor'].alpha_)
print("Score R² train :", full_pipeline.score(X_train, y_train))

y_pred = full_pipeline.predict(X_test)
print("Score R² test  :", full_pipeline.score(X_test, y_pred))

rmse = root_mean_squared_error(y_test, y_pred)
print("RMSE :", rmse)
#print("RMSE divided by 2000 :", rmse /2000)


Meilleur alpha trouvé : 26786943447.22933
Score R² train : 0.8314991767639591
Score R² test  : 1.0
RMSE : 96875.98077994295


  return date_columns.applymap(get_year).to_numpy()
  return date_columns.applymap(get_minutes).to_numpy()
  return number_columns.applymap(get_number).to_numpy()
  return date_columns.applymap(get_year).to_numpy()
  return date_columns.applymap(get_minutes).to_numpy()
  return number_columns.applymap(get_number).to_numpy()
  return date_columns.applymap(get_year).to_numpy()
  return date_columns.applymap(get_minutes).to_numpy()
  return number_columns.applymap(get_number).to_numpy()
  return date_columns.applymap(get_year).to_numpy()
  return date_columns.applymap(get_minutes).to_numpy()
  return number_columns.applymap(get_number).to_numpy()


In [10]:
import joblib

joblib.dump(full_pipeline, "pipeline.joblib")

['pipeline.joblib']