In [None]:
pip install prophet

In [1]:
import numpy as np
import pandas as pd

from prophet import Prophet

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the training data and the sample submission file from the Kaggle input directory.
train = pd.read_csv('/kaggle/input/forecast/Train (11).csv')
sample_submission = pd.read_csv('/kaggle/input/forecast/SampleSubmission (18).csv')
# Display both shapes as a quick check on what was loaded.
train.shape, sample_submission.shape

((289926, 19), (370, 2))

In [None]:
# Count how many training rows carry each distinct value of the `currency` column.
train["currency"].value_counts()

# Data preprocessing

In [3]:
# train data
# The model expects the date and target columns to be named 'ds' and 'y'.
# rename() without inplace returns a new frame and skips labels that are no longer
# present, so re-running this cell does not fail with a KeyError on 'date'.
train = train.rename(columns={'date': 'ds', 'clicks': 'y'})
train['ds'] = pd.to_datetime(train['ds'])  # convert to datetime; harmless if already converted

# We predict for each ID, so sum the target over every (ID, date) pair.
train_grouped = train.groupby(['ID', 'ds'])['y'].sum().to_frame()
train_grouped

Unnamed: 0_level_0,Unnamed: 1_level_0,y
ID,ds,Unnamed: 2_level_1
ID_5da86e71bf5dee4cf5047046,2020-01-01,36.0
ID_5da86e71bf5dee4cf5047046,2020-01-02,50.0
ID_5da86e71bf5dee4cf5047046,2020-01-03,54.0
ID_5da86e71bf5dee4cf5047046,2020-01-04,47.0
ID_5da86e71bf5dee4cf5047046,2020-01-05,59.0
...,...,...
ID_65b0f65c7fe62e56c5593d55,2024-02-09,37.0
ID_65b0f65c7fe62e56c5593d55,2024-02-10,55.0
ID_65b0f65c7fe62e56c5593d55,2024-02-11,94.0
ID_65b0f65c7fe62e56c5593d55,2024-02-12,78.0


In [4]:
# test data
# The dates to predict are embedded in sample_submission['ID'], e.g.
# ID_5da86e71bf5dee4cf5047046_2024_01_22, so the last 10 characters are the date.
# Vectorised string slicing replaces the Python loop (which also shadowed the
# builtin `id`) and yields a one-column frame named 'ds' with the original index.
test = pd.to_datetime(
    sample_submission['ID'].str[-10:], format='%Y_%m_%d'
).to_frame(name='ds')
test.head()

Unnamed: 0,ds
0,2024-01-22
1,2024-01-29
2,2024-02-21
3,2024-02-28
4,2023-12-04


# Forecasting with Prophet

In [None]:
from prophet.make_holidays import make_holidays_df
from prophet.diagnostics import cross_validation

In [None]:
# Horizon in whole days: 25% of the 1461-day initial period, truncated by int().
horizon_days = int(1461 * 0.25)
horizon = str(horizon_days) + ' days'

In [None]:
# Display the horizon string (e.g. '<n> days') computed in the previous cell.
horizon

In [None]:
def forecast_for_id(train_data, test_data, id):
    """Fit a Prophet model on one ID's history and forecast two dates.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Frame indexed by (ID, date) holding the target; ``train_data.loc[id]``
        must yield that ID's dates and values.
    test_data : pandas.DataFrame
        Frame whose first column holds the dates to forecast. Only the first
        two rows are used and any further columns are ignored.
    id : hashable
        First-level index label selecting the series in ``train_data``.

    Returns
    -------
    pandas.DataFrame
        Columns ``ds`` and ``yhat`` as produced by ``model.predict``.
    """
    train_id = train_data.loc[id].reset_index()
    train_id.columns = ['ds', 'y']

    #holidays = make_holidays_df(year_list=[2020, 2021, 2022,2023,2024],country='US')

    model = Prophet(n_changepoints=27, changepoint_prior_scale=0.08, changepoint_range=0.5,
                    seasonality_prior_scale=7, holidays_prior_scale=7)

    # model.add_country_holidays(country_name='US')
    # model.add_seasonality(name='weekly', period=7, fourier_order=1)
    model.fit(train_id)

    #cv_results = cross_validation(model)

    # Forecast for the first 2 dates of test_data. Only the first column is taken,
    # as an independent copy: relabelling a full-width slice raised a length-mismatch
    # ValueError as soon as the caller's frame had gained a 'target' column.
    future_dates = test_data.iloc[:2, [0]].copy()
    future_dates.columns = ['ds']
    forecast = model.predict(future_dates)

    return forecast[['ds', 'yhat']]

# now we loop over unique IDs in train data and forecast
# The predictions are written back positionally below, so rows 2*i and 2*i + 1 of
# `test` receive the forecasts of the i-th ID. Hand each ID exactly those two rows,
# otherwise every ID is forecast on the first two dates of the file instead of its own.
# NOTE(review): like the positional assignment, this assumes the submission file lists
# the IDs in the same order as train_grouped's index, two rows per ID.
date_rows = test[['ds']]  # dates only, so a 'target' column left by an earlier run is ignored
forecasts = []
for position, series_id in enumerate(train_grouped.index.levels[0]):
    own_dates = date_rows.iloc[2 * position:2 * position + 2]
    forecasts.append(forecast_for_id(train_grouped, own_dates, series_id))


all_forecasts = pd.concat(forecasts) # concatenate forecasts for all ID's
test['target'] = all_forecasts['yhat'].values # add predicted values to test data


In [None]:
from statsmodels.tsa.arima.model import ARIMA

def forecast_for_id(train_data, test_data, id):
    """Fit an ARIMA(5,1,0) model on one ID's history and forecast two steps.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Frame indexed by (ID, date) holding the target; ``train_data.loc[id]``
        must yield that ID's dates and values.
    test_data : pandas.DataFrame
        Frame whose first column holds the dates used to label the forecasts.
        Only the first two rows are used and any further columns are ignored.
    id : hashable
        First-level index label selecting the series in ``train_data``.

    Returns
    -------
    pandas.DataFrame
        Columns ``ds`` (the two labelling dates) and ``yhat`` (the forecasts).
    """
    train_id = train_data.loc[id].reset_index()
    train_id.columns = ['ds', 'y']

    # Train ARIMA model
    model = ARIMA(train_id['y'], order=(5,1,0))  # Example order: ARIMA(5,1,0)
    model_fit = model.fit()

    # Dates for the 2 forecasts. Only the first column is taken, as an independent
    # copy: relabelling a full-width slice raised a length-mismatch ValueError as
    # soon as the caller's frame had gained a 'target' column.
    future_dates = test_data.iloc[:2, [0]].copy()
    future_dates.columns = ['ds']

    # NOTE(review): forecast(steps=2) is called without the dates, so the two values
    # are the steps right after the training sample; `future_dates` only labels them.
    forecast = model_fit.forecast(steps=2)  # Forecast 2 steps ahead

    # Convert forecast to DataFrame
    forecast_df = pd.DataFrame({'ds': future_dates['ds'].values, 'yhat': forecast})

    return forecast_df

# now we loop over unique IDs in train data and forecast
# The predictions are written back positionally below, so rows 2*i and 2*i + 1 of
# `test` receive the forecasts of the i-th ID. Hand each ID exactly those two rows,
# otherwise every forecast is labelled with the first two dates of the file.
# NOTE(review): like the positional assignment, this assumes the submission file lists
# the IDs in the same order as train_grouped's index, two rows per ID.
date_rows = test[['ds']]  # dates only, so a 'target' column left by an earlier run is ignored
forecasts = []
for position, series_id in enumerate(train_grouped.index.levels[0]):
    own_dates = date_rows.iloc[2 * position:2 * position + 2]
    forecasts.append(forecast_for_id(train_grouped, own_dates, series_id))

all_forecasts = pd.concat(forecasts)  # concatenate forecasts for all IDs

test['target'] = all_forecasts['yhat'].values  # add predicted values to test data


In [6]:
pip install pmdarima

Collecting pmdarima
  Downloading pmdarima-2.0.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl.metadata (7.8 kB)
Downloading pmdarima-2.0.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m39.1 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: pmdarima
Successfully installed pmdarima-2.0.4
Note: you may need to restart the kernel to use updated packages.


In [7]:
from pmdarima import auto_arima
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error

def forecast_for_id(train_data, test_data, id):
    """Select a non-seasonal ARIMA with ``auto_arima`` for one ID and forecast two steps.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Frame indexed by (ID, date) holding the target; ``train_data.loc[id]``
        must yield that ID's dates and values.
    test_data : pandas.DataFrame
        Frame whose first column holds the dates used to label the forecasts.
        Only the first two rows are used and any further columns are ignored.
    id : hashable
        First-level index label selecting the series in ``train_data``.

    Returns
    -------
    pandas.DataFrame
        Columns ``ds`` (the two labelling dates) and ``yhat`` (the forecasts).
    """
    train_id = train_data.loc[id].reset_index()
    train_id.columns = ['ds', 'y']

    # Define TimeSeriesSplit for cross-validation
    tscv = TimeSeriesSplit(n_splits=5) ## try another cross validation

    # Perform auto ARIMA model selection.
    # NOTE(review): the recorded trace reads "stepwise search to minimize aic", and `cv`
    # is not among auto_arima's documented arguments -- confirm whether `cv=tscv` and
    # `scoring='mse'` influence the selection at all.
    model = auto_arima(train_id['y'], seasonal=False, trace=True, error_action='ignore', suppress_warnings=True, scoring='mse', cv=tscv)

    # Dates for the 2 forecasts. Only the first column is taken, as an independent
    # copy: relabelling a full-width slice raised a length-mismatch ValueError as
    # soon as the caller's frame had gained a 'target' column.
    future_dates = test_data.iloc[:2, [0]].copy()
    future_dates.columns = ['ds']

    # The dates are not passed to predict(); they only label the two forecast values.
    forecast = model.predict(n_periods=2)  # Forecast 2 steps ahead

    # Convert forecast to DataFrame
    forecast_df = pd.DataFrame({'ds': future_dates['ds'].values, 'yhat': forecast})

    return forecast_df

# now we loop over unique IDs in train data and forecast
# The predictions are written back positionally below, so rows 2*i and 2*i + 1 of
# `test` receive the forecasts of the i-th ID. Hand each ID exactly those two rows,
# otherwise every forecast is labelled with the first two dates of the file.
# NOTE(review): like the positional assignment, this assumes the submission file lists
# the IDs in the same order as train_grouped's index, two rows per ID.
date_rows = test[['ds']]  # dates only, so a 'target' column left by an earlier run is ignored
forecasts = []
for position, series_id in enumerate(train_grouped.index.levels[0]):
    own_dates = date_rows.iloc[2 * position:2 * position + 2]
    forecasts.append(forecast_for_id(train_grouped, own_dates, series_id))

all_forecasts = pd.concat(forecasts)  # concatenate forecasts for all IDs

test['target'] = all_forecasts['yhat'].values  # add predicted values to test data


Performing stepwise search to minimize aic
 ARIMA(2,1,2)(0,0,0)[0] intercept   : AIC=13135.007, Time=2.04 sec
 ARIMA(0,1,0)(0,0,0)[0] intercept   : AIC=13802.624, Time=0.09 sec
 ARIMA(1,1,0)(0,0,0)[0] intercept   : AIC=13490.149, Time=0.09 sec
 ARIMA(0,1,1)(0,0,0)[0] intercept   : AIC=13145.041, Time=0.14 sec
 ARIMA(0,1,0)(0,0,0)[0]             : AIC=13800.630, Time=0.03 sec
 ARIMA(1,1,2)(0,0,0)[0] intercept   : AIC=13129.066, Time=2.34 sec
 ARIMA(0,1,2)(0,0,0)[0] intercept   : AIC=13132.403, Time=0.51 sec
 ARIMA(1,1,1)(0,0,0)[0] intercept   : AIC=13131.879, Time=0.34 sec
 ARIMA(1,1,3)(0,0,0)[0] intercept   : AIC=13135.675, Time=1.73 sec
 ARIMA(0,1,3)(0,0,0)[0] intercept   : AIC=13134.206, Time=0.88 sec
 ARIMA(2,1,1)(0,0,0)[0] intercept   : AIC=13133.457, Time=0.45 sec
 ARIMA(2,1,3)(0,0,0)[0] intercept   : AIC=inf, Time=3.67 sec
 ARIMA(1,1,2)(0,0,0)[0]             : AIC=13127.704, Time=0.58 sec
 ARIMA(0,1,2)(0,0,0)[0]             : AIC=13130.667, Time=0.14 sec
 ARIMA(1,1,1)(0,0,0)[0]  

You can also experiment with other time series models, ARIMA or traditional ML models.

In [9]:
# save prediction to file
# errors='ignore' tolerates a template without a 'clicks' column and keeps the cell
# re-runnable after the column has already been removed (previously a KeyError).
sample_submission = sample_submission.drop(columns=['clicks'], errors='ignore')
sample_submission['Target'] = all_forecasts['yhat'].values
sample_submission.to_csv('ccc.csv', index=False)

In [None]:
#sample_submission.to_csv('cli2pp2.csv', index=False)

In [8]:
from sklearn.metrics import mean_squared_error
import numpy as np

def calculate_rmse(y_true, y_pred):
    """Return the root mean squared error between ``y_true`` and ``y_pred``."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

# Compare the predictions stored in test['target'] against a reference vector.
# NOTE(review): `y_true` is simply the first 370 rows of the raw train['y'] column;
# nothing here aligns those rows with the IDs/dates the predictions were made for,
# so the RMSE printed below does not look like a valid accuracy estimate -- confirm,
# and replace it with a hold-out of known values for the forecast dates.
y_true = train['y'][:370].values  # 370 matches the number of rows in sample_submission
y_pred = test['target'].values

rmse = calculate_rmse(y_true, y_pred)
print("Root Mean Squared Error (RMSE):", rmse)


Root Mean Squared Error (RMSE): 83.74531436173734
