## Dummy Model

### 1. Load Data

In [23]:
import pandas as pd

#original_data = pd.read_csv("static/films_jp_box.csv")
original_data = pd.read_csv("static/films_jp_box_V2.csv", sep=';')
#original_data = pd.read_csv("static/massive_jpbox_clean.csv")

print(f"{original_data.shape}")
#original_data.head()


(5300, 15)


In [8]:
#original_data["duree_minutes"].describe()

In [9]:
original_data.columns

Index(['film_id', 'titre', 'genre_principale', 'date_sortie_france',
       'date_sortie_usa', 'synopsis', 'acteurs', 'budget',
       'entrees_demarrage_france', 'entrees_totales_france', 'recette_usa',
       'recette_monde', 'image_url', 'note_moyenne', 'duree'],
      dtype='object')

### 2. Feature selection

In [10]:
#dropped_columns= ["film_id", "titre", "genre_principale", "image_url", "synopsis", "note_moyenne", "acteurs", "recette_reste_du_monde"]
dropped_columns= ["film_id", "titre", "genre_principale", "image_url", "synopsis", "note_moyenne", "acteurs"]
# dropped_columns = ["film_id", "titre", "genre_principale", "genres", "date_sortie_usa",
#                     "synopsis", "realisateur", "acteurs", "pays_origine", 
#                     "box_office_demarrage", "box_office_france", "recette_usa", "recette_monde", 
#                     "image_url",
#                     "note_moyenne"],

def select_data(current_data : pd.DataFrame) -> pd.DataFrame :
    selected_data = current_data.copy()
    for column_to_drop in dropped_columns:
        selected_data = selected_data.drop(column_to_drop, axis=1)

    return selected_data

selected_data = select_data(original_data)
selected_data.head()


Unnamed: 0,date_sortie_france,date_sortie_usa,budget,entrees_demarrage_france,entrees_totales_france,recette_usa,recette_monde,duree
0,08/01/2020,00/00/0000,20 000 000 $,,,- $,1 835 634 $,1h 51min
1,10/02/2010,12/02/2010,150 000 000 $,214 497,334 885,61 937 495 $,77 810 085 $,1h 43min
2,01/01/2020,15/11/2019,10 000 000 $,66 229,105 205,17 156 058 $,11 200 000 $,1h 50min
3,06/01/2010,00/00/0000,,16 820,60 220,,,1h 32min
4,14/04/2010,16/10/2009,14 700 000 $,39 797,103 782,1 585 787 $,6 427 072 $,1h 43min


In [11]:
selected_data.columns

Index(['date_sortie_france', 'date_sortie_usa', 'budget',
       'entrees_demarrage_france', 'entrees_totales_france', 'recette_usa',
       'recette_monde', 'duree'],
      dtype='object')

### 3. Clean data

In [12]:
from dummy_model_utils import get_year, get_minutes, get_number

def clean_data( current_data : pd.DataFrame) -> pd.DataFrame :
    cleaned_data = current_data.copy()
    cleaned_data["date_sortie_france"] = cleaned_data["date_sortie_france"].apply(lambda x : get_year(x)).astype(int)
    cleaned_data["date_sortie_usa"] = cleaned_data["date_sortie_usa"].apply(lambda x : get_year(x)).astype(int)

    cleaned_data["duree"] = cleaned_data["duree"].apply(lambda x : get_minutes(x)).astype(int)
    
    cleaned_data["entrees_demarrage_france"] = cleaned_data["entrees_demarrage_france"].apply(lambda x : get_number(x)).astype(int)
    cleaned_data["entrees_totales_france"] = cleaned_data["entrees_totales_france"].apply(lambda x : get_number(x)).astype(int)
    cleaned_data["budget"] = cleaned_data["budget"].apply(lambda x : get_number(x)).astype(int)
    cleaned_data["recette_usa"] = cleaned_data["recette_usa"].apply(lambda x : get_number(x)).astype(int)
    cleaned_data["recette_monde"] = cleaned_data["recette_monde"].apply(lambda x : get_number(x)).astype(int)
    return cleaned_data 

cleaned_data = clean_data(selected_data)
cleaned_data.head()

Unnamed: 0,date_sortie_france,date_sortie_usa,budget,entrees_demarrage_france,entrees_totales_france,recette_usa,recette_monde,duree
0,2020,0,20000000,0,0,0,1835634,52
1,2010,2010,150000000,214497,334885,61937495,77810085,44
2,2020,2019,10000000,66229,105205,17156058,11200000,51
3,2010,0,0,16820,60220,0,0,33
4,2010,2009,14700000,39797,103782,1585787,6427072,44


In [13]:
from sklearn.pipeline import Pipeline, make_pipeline 
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer, StandardScaler
import numpy as np
from pandas import DataFrame 

from dummy_model_utils import ColumnDropper, get_year_tab, get_minutes_tab, get_number_tab



# Pipeline de transformation
preprocessing = make_pipeline(
    ColumnDropper(columns_to_drop=dropped_columns),
    ColumnTransformer(transformers=[
        ('keep_year', FunctionTransformer(get_year_tab), ['date_sortie_france', 'date_sortie_usa']),
        ('get_minutes', FunctionTransformer(get_minutes_tab, validate=False), ['duree']),
        ('get_number', FunctionTransformer(get_number_tab, validate=False), ['entrees_totales_france', 'budget', 'recette_usa', 'recette_monde']),
    ], remainder='passthrough')  # 'passthrough' pour garder 'id'
)



### 4. Train model

In [14]:
from sklearn.linear_model import LassoCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error

#y = cleaned_data['entrees_demarrage_france']
#X = cleaned_data.drop('entrees_demarrage_france', axis=1)

#y = original_data['entrees_demarrage_france']
y = original_data['entrees_demarrage_france'].apply(lambda x: get_number(x))
X = original_data.drop('entrees_demarrage_france', axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

full_pipeline = Pipeline(steps=[
    ('preprocessing', preprocessing),
    ('regressor', LassoCV(cv=5))
])

#lasso = LassoCV(cv=5, random_state=42)
#lasso.fit(X_train, y_train)

full_pipeline.fit(X_train, y_train)

print("Meilleur alpha trouvé :", full_pipeline.named_steps['regressor'].alpha_)
print("Score R² train :", full_pipeline.score(X_train, y_train))

y_pred = full_pipeline.predict(X_test)
print("Score R² test  :", full_pipeline.score(X_test, y_pred))

rmse = root_mean_squared_error(y_test, y_pred)
print("RMSE :", rmse)
#print("RMSE divided by 2000 :", rmse /2000)


Meilleur alpha trouvé : 28128634443.278652
Score R² train : 0.8774792695221488
Score R² test  : 1.0
RMSE : 118687.67771175392


  return date_columns.applymap(get_year).to_numpy()
  return date_columns.applymap(get_minutes).to_numpy()
  return number_columns.applymap(get_number).to_numpy()
  return date_columns.applymap(get_year).to_numpy()
  return date_columns.applymap(get_minutes).to_numpy()
  return number_columns.applymap(get_number).to_numpy()
  return date_columns.applymap(get_year).to_numpy()
  return date_columns.applymap(get_minutes).to_numpy()
  return number_columns.applymap(get_number).to_numpy()
  return date_columns.applymap(get_year).to_numpy()
  return date_columns.applymap(get_minutes).to_numpy()
  return number_columns.applymap(get_number).to_numpy()


In [15]:
import joblib

joblib.dump(full_pipeline, "dummy_pipeline.joblib")

['dummy_pipeline.joblib']