In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the dataset from clean.csv (path is relative to the working directory).
df = pd.read_csv("clean.csv")

In [3]:
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.diagnostic import het_breuschpagan

# Fit an ordinary least squares model of A_values on A_units and
# Avg_Price_unit, using the formula interface.
model = ols(formula='A_values ~ A_units + Avg_Price_unit', data=df).fit()

# Breusch-Pagan heteroscedasticity test on the fitted model's residuals,
# using the model's own design matrix as explanatory variables.
# The result is (LM statistic, LM p-value, F statistic, F p-value);
# only the LM p-value (position 1) is kept.
bp_results = het_breuschpagan(model.resid, model.model.exog)
pvalue = bp_results[1]

print("P-value du test de Breusch-Pagan:", pvalue)

P-value du test de Breusch-Pagan: 1.0786274831210795e-13


In [4]:
from scipy.stats import kstest, expon

# Sample under test: the A_units column.
data_sample = df['A_units']

# Estimate the exponential distribution's location and scale from the sample.
loc, scale = expon.fit(data_sample)

# One-sample Kolmogorov-Smirnov test of the sample against the exponential
# distribution with the fitted parameters.
# NOTE(review): the parameters were estimated from the same sample that is
# being tested, so the reported p-value is presumably only approximate —
# confirm whether a bootstrap-based p-value is needed.
ks_result = kstest(data_sample, 'expon', args=(loc, scale))
ks_statistic = ks_result.statistic
p_value = ks_result.pvalue

print("Statistique KS:", ks_statistic, "P-value:", p_value)

Statistique KS: 0.5767195767195767 P-value: 4.0990247182561265e-60


In [5]:
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.diagnostic import het_breuschpagan

# Natural-log transform of A_values, stored as a new column on df.
df['log_A_values'] = np.log(df['A_values'])

# Refit the same linear specification with the log-transformed response.
model_log = ols(formula='log_A_values ~ A_units + Avg_Price_unit', data=df).fit()

# Breusch-Pagan test on the residuals of the log model; position 1 of the
# result tuple is the LM p-value.
bp_results_log = het_breuschpagan(model_log.resid, model_log.model.exog)
pvalue_log = bp_results_log[1]

print("Résumé du modèle avec transformation logarithmique:")
print(model_log.summary())
print("P-value du test de Breusch-Pagan après transformation logarithmique:", pvalue_log)


Résumé du modèle avec transformation logarithmique:
                            OLS Regression Results                            
Dep. Variable:           log_A_values   R-squared:                       0.702
Model:                            OLS   Adj. R-squared:                  0.699
Method:                 Least Squares   F-statistic:                     219.0
Date:                Fri, 12 Apr 2024   Prob (F-statistic):           1.28e-49
Time:                        15:37:36   Log-Likelihood:                -159.30
No. Observations:                 189   AIC:                             324.6
Df Residuals:                     186   BIC:                             334.3
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------

In [6]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Design matrix of the log model; the recorded output shows it includes the
# Intercept column.
variables = model_log.model.exog

# One variance inflation factor per design-matrix column, in column order.
# NOTE(review): the value reported for the Intercept row is usually not
# interpreted as a collinearity measure — confirm before drawing conclusions.
vif = []
for column_index in range(variables.shape[1]):
    vif.append(variance_inflation_factor(variables, column_index))

# Tabulate each VIF next to the name of its design-matrix column.
vif_dataframe = pd.DataFrame({'Variable': model_log.model.exog_names, 'VIF': vif})
print(vif_dataframe)

         Variable        VIF
0       Intercept  15.442136
1         A_units   1.062211
2  Avg_Price_unit   1.062211


In [7]:
from scipy.stats import spearmanr

# Spearman rank correlation between A_units and A_values; the result holds
# the correlation coefficient first and its p-value second.
spearman_result = spearmanr(a=df['A_units'], b=df['A_values'])
spearman_corr = spearman_result[0]
spearman_pvalue = spearman_result[1]

print("Corrélation de Spearman entre A_units et A_values:", spearman_corr)
print("P-value de Spearman:", spearman_pvalue)

Corrélation de Spearman entre A_units et A_values: 0.8795210334909249
P-value de Spearman: 3.229497379621004e-62


In [8]:
from statsmodels.tsa.holtwinters import ExponentialSmoothing

# Forecast settings.
FORECAST_HORIZON = 9   # number of future steps forecast per territory
MIN_OBSERVATIONS = 3   # territories with fewer observed rows are skipped


def forecast_territory_units(units, horizon=FORECAST_HORIZON):
    """Forecast future A_units values with additive-trend exponential smoothing.

    The model is fitted on log1p(units) and the forecasts are mapped back to
    the original scale with expm1, so zero values in the input are tolerated.

    Parameters
    ----------
    units : pandas.Series
        Observed A_units values, already in the order they should be smoothed.
    horizon : int
        Number of steps to forecast.

    Returns
    -------
    pandas.Series
        ``horizon`` forecasts on the original scale.
    """
    # Drop the row labels inherited from the DataFrame: a plain 0..n-1
    # RangeIndex is an index statsmodels supports, which avoids the
    # "unsupported index" warnings emitted at fit and forecast time.
    log_units = np.log1p(units.reset_index(drop=True))

    # seasonal=None means there is no seasonal component, so seasonal_periods
    # is deliberately not passed (it would be ignored).
    smoothing_fit = ExponentialSmoothing(log_units, trend='add', seasonal=None).fit()

    # expm1 is the exact inverse of log1p.
    return np.expm1(smoothing_fit.forecast(horizon))


predictions_dict = {}

for territory in df['Territory_label'].unique():
    # Use observed rows only: rows without A_units (for example forecast rows
    # appended to df by a later cell on a previous run) must not feed the model.
    territory_data = (
        df[df['Territory_label'] == territory]
        .dropna(subset=['A_units'])
        .sort_values('Months_since_Release')
    )

    # NOTE(review): if a territory has several rows per month (e.g. one per
    # platform/format), consecutive points of this series are not consecutive
    # months — confirm whether A_units should be aggregated per month first.
    if len(territory_data) >= MIN_OBSERVATIONS:
        predictions_dict[territory] = forecast_territory_units(territory_data['A_units'])
    else:
        print(f"Not enough data to forecast for {territory}")

# Display the forecasts of every territory that had enough data.
for territory, prediction in predictions_dict.items():
    print(f"Predictions for {territory}:")
    print(prediction)

Predictions for EASTERN EUROPE:
9     105.814794
10     91.459402
11     79.033305
12     68.277215
13     58.966692
14     50.907459
15     43.931347
16     37.892791
17     32.665788
dtype: float64
Predictions for FRANCE:
18    68.211469
19    57.756303
20    48.880507
21    41.345499
22    34.948739
23    29.518280
24    24.908153
25    20.994437
26    17.671932
dtype: float64
Predictions for MIDDLE EAST:
9     81.116549
10    65.812668
11    53.360938
12    43.229810
13    34.986797
14    28.280017
15    22.823164
16    18.383294
17    14.770872
dtype: float64
Predictions for NORDICS:
18    101.563794
19     93.607868
20     86.269086
21     79.499578
22     73.255184
23     67.495171
24     62.181965
25     57.280907
26     52.760027
dtype: float64
Predictions for BENE:
18    82.940739
19    76.336118
20    70.251161
21    64.644980
22    59.479905
23    54.721228
24    50.336972
25    46.297679
26    42.576205
dtype: float64
Predictions for CENTRAL EUROPE:
9     100.0
10    100.0

  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  aic = self.nobs * np.log(sse / self.nobs) + k * 2
  bic = self.nobs * np.log(sse / self.nobs) + k * np.log(self.nobs)
  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  

In [9]:
# Add a Forecasted_A_units column to df, filled with NaN for every existing row.
df['Forecasted_A_units'] = np.nan

In [14]:
# Rebuild the forecast rows from the observed rows only, so re-running this
# cell replaces earlier forecast rows instead of appending duplicates and
# instead of computing the last month from already-appended forecast rows.
# Observed rows are the ones whose Forecasted_A_units is still NaN.
if 'Forecasted_A_units' in df.columns:
    observed_rows = df[df['Forecasted_A_units'].isna()]
else:
    observed_rows = df

new_rows = []

for territory, predictions in predictions_dict.items():
    # Last month present in the observed rows of this territory.
    max_month = observed_rows.loc[
        observed_rows['Territory_label'] == territory, 'Months_since_Release'
    ].max()

    # One new row per forecast step, numbered onward from the last observed month.
    for step, prediction in enumerate(predictions, start=1):
        new_rows.append({
            'Territory_label': territory,
            'Months_since_Release': max_month + step,
            'Forecasted_A_units': prediction,
        })

# Build the forecast frame once and append it below the observed rows;
# columns missing from the forecast rows are filled with NaN.
new_data = pd.DataFrame(new_rows)
df = pd.concat([observed_rows, new_data], ignore_index=True)

In [16]:
# Display the last 69 rows of df.
df.tail(69)

Unnamed: 0,Months_since_Release,Format,A_units,A_values,Platform_Label,Territory_label,Avg_Price_unit,log_A_values,Forecasted_A_units
255,7,,,,,IBERIA,,,75.889359
256,8,,,,,IBERIA,,,70.458228
257,9,,,,,IBERIA,,,65.410729
258,10,,,,,IBERIA,,,60.719763
259,11,,,,,IBERIA,,,56.360147
...,...,...,...,...,...,...,...,...,...
319,8,,,,,UNITED KINGDOM & IRELAND,,,35.287415
320,9,,,,,UNITED KINGDOM & IRELAND,,,30.113884
321,10,,,,,UNITED KINGDOM & IRELAND,,,25.677948
322,11,,,,,UNITED KINGDOM & IRELAND,,,21.874448


In [17]:
# Month-specific adjustment of the forecast: multiplied by 1.05 in month 6,
# by 1.01 in month 8, and left unchanged in every other month.
release_month = df['Months_since_Release']
base_forecast = df['Forecasted_A_units']

df['Discount_Forecast'] = np.select(
    condlist=[release_month == 6, release_month == 8],
    choicelist=[base_forecast * 1.05, base_forecast * 1.01],
    default=base_forecast,
)

In [18]:
# Display df as the cell's output.
df

Unnamed: 0,Months_since_Release,Format,A_units,A_values,Platform_Label,Territory_label,Avg_Price_unit,log_A_values,Forecasted_A_units,Discount_Forecast
0,3,DIGITAL,300.0,11217.48925,PS4,EASTERN EUROPE,37.391631,9.325229,,
1,3,DIGITAL,300.0,13831.88372,PS4,FRANCE,46.106279,9.534732,,
2,3,DIGITAL,300.0,12625.53589,PS4,MIDDLE EAST,42.085120,9.443477,,
3,3,DIGITAL,300.0,14551.99476,PS4,NORDICS,48.506649,9.585483,,
4,3,DIGITAL,100.0,4491.29932,PC,BENE,44.912993,8.409897,,
...,...,...,...,...,...,...,...,...,...,...
319,8,,,,,UNITED KINGDOM & IRELAND,,,35.287415,35.640290
320,9,,,,,UNITED KINGDOM & IRELAND,,,30.113884,30.113884
321,10,,,,,UNITED KINGDOM & IRELAND,,,25.677948,25.677948
322,11,,,,,UNITED KINGDOM & IRELAND,,,21.874448,21.874448


In [19]:
# Write df to forecasts.csv, omitting the row index.
df.to_csv('forecasts.csv', index=False)