In [7]:
import pandas as pd
import numpy as np
from prophet import Prophet
import jdatetime
import holidays
from sklearn.model_selection import TimeSeriesSplit
from statsmodels.tsa.stattools import adfuller
from datetime import datetime

In [8]:

# 1. Date Conversion Function (Corrected)
def persian_to_gregorian(persian_date):
    """Converts a Persian date string to a Gregorian date string."""
    year, month, day = map(int, persian_date.split('-'))
    persian_datetime = jdatetime.datetime(year, month, day)
    gregorian_datetime = persian_datetime.togregorian()
    return gregorian_datetime.strftime('%Y-%m-%d')

In [9]:

# 2. Load Data and Apply Date Conversion
df = pd.read_csv("train.csv")
df['ds'] = df['date'].apply(persian_to_gregorian)
df['ds'] = pd.to_datetime(df['ds'])
df['y'] = df['sale']

In [10]:
print(df.head())
print(df.info())

         date   sale         ds      y
0  1399-05-30  384.0 2020-08-20  384.0
1  1399-05-31  393.0 2020-08-21  393.0
2  1399-06-01  414.0 2020-08-22  414.0
3  1399-06-02  410.0 2020-08-23  410.0
4  1399-06-03  398.0 2020-08-24  398.0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 114 entries, 0 to 113
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   date    114 non-null    object        
 1   sale    114 non-null    float64       
 2   ds      114 non-null    datetime64[ns]
 3   y       114 non-null    float64       
dtypes: datetime64[ns](1), float64(2), object(1)
memory usage: 3.7+ KB
None


In [11]:

# 3. Lag Features
df['Sales_lag1'] = df['sale'].shift(1)
df['Sales_lag7'] = df['sale'].shift(7)  # Sales from 1 week ago
df.fillna(0, inplace=True) # Fill NaN values



#4.Stationary
def check_stationarity(series, max_diff=2):

    d = 0  # Number of differencing applied
    stationary_series = series.copy() # So the code doesn't affect you series

    while d <= max_diff: # To check stationary
        result = adfuller(stationary_series.dropna()) # Remove that none value
        print(f'Differencing {d}:') # To see and show number
        print('ADF Statistic: %f' % result[0])
        print('p-value: %f' % result[1])
        print('Critical Values:')
        for key, value in result[4].items():
            print('\t%s: %.3f' % (key, value))

        if result[1] <= 0.05: # check and know if correct
            print("Series is stationary after", d, "differencing(s)")
            return stationary_series, d

        else:
            if d < max_diff: # if the maximum diff is reach, the user has already select the maxium amount to convert, so print it
                stationary_series = stationary_series.diff().dropna()
                d += 1
            else:
                print("Series could not be made stationary within the maximum differencing limit.")
                return None, d  # Indicate failure

    return None, d # Indicates fail to return for max diff

In [12]:

# Example usage (replace df['Sales'] with your actual sales column)
stationary_sales, d_value = check_stationarity(df['sale'])
if stationary_sales is not None:
    print("Number of differencing applied:", d_value)
else:
    print("Could not make sales stationary")
    # 5. Holiday Features
def add_holiday_features(df, country_code='IR'):
    # Get holidays for the specified country
    country_holidays = holidays.country_holidays(country_code, years=df['ds'].dt.year.unique().tolist())

    # Create a holiday feature
    df['holiday'] = df['ds'].apply(lambda date: 1 if date.to_pydatetime().date() in country_holidays else 0)

    # Create features for days leading up to and after holidays (e.g., window of 3 days)
    window = 3
    for n in range(1, window + 1):
        df[f'holiday_plus_{n}'] = df['ds'].apply(lambda date: 1 if (date + pd.Timedelta(days=n)).to_pydatetime().date() in country_holidays else 0)
        df[f'holiday_minus_{n}'] = df['ds'].apply(lambda date: 1 if (date - pd.Timedelta(days=n)).to_pydatetime().date() in country_holidays else 0)
    return df

df = add_holiday_features(df)

Differencing 0:
ADF Statistic: -2.977901
p-value: 0.037010
Critical Values:
	1%: -3.498
	5%: -2.891
	10%: -2.582
Series is stationary after 0 differencing(s)
Number of differencing applied: 0


In [13]:
# 6. Define SMAPE metric
def smape(y_true, y_pred):
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    return np.mean(2 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred))) * 100

In [14]:
# 7. Time Series Split for Cross-Validation
tscv = TimeSeriesSplit(n_splits=5)  # Adjust n_splits as needed

smape_scores = []

for train_index, test_index in tscv.split(df):
    train_df = df.iloc[train_index]
    test_df = df.iloc[test_index]
    print(f"Train size: {len(train_df)}, Test size: {len(test_df)}")  # Debugging: Check sizes

    # 8. Train Prophet Model
    model = Prophet()

    model.add_regressor('holiday')
    model.add_regressor('holiday_plus_1')
    model.add_regressor('holiday_minus_1')
    model.add_regressor('holiday_plus_2')
    model.add_regressor('holiday_minus_2')
    model.add_regressor('holiday_plus_3')
    model.add_regressor('holiday_minus_3')
    model.add_regressor('Sales_lag1')
    model.add_regressor('Sales_lag7')

    model.fit(train_df[['ds', 'y', 'holiday', 'holiday_plus_1', 'holiday_minus_1', 'holiday_plus_2', 'holiday_minus_2', 'holiday_plus_3', 'holiday_minus_3','Sales_lag1','Sales_lag7']])

    # 9. Make Future DataFrame
    future = model.make_future_dataframe(periods=len(test_df))
    future = add_holiday_features(future)  # Ensure holiday features are in future dataframe
    future['Sales_lag1'] = train_df['Sales_lag1'].iloc[-1]  # Example: last known value
    future['Sales_lag7'] = train_df['Sales_lag7'].iloc[-1]

    # 10. Predict
    forecast = model.predict(future)

    # 11. Evaluate
    y_true = test_df['y'].values
    y_pred = forecast['yhat'].tail(len(test_df)).values

    smape_score = smape(y_true, y_pred)
    smape_scores.append(smape_score)

print("SMAPE Scores for each fold:", smape_scores)
print("Mean SMAPE Score:", np.mean(smape_scores))

Train size: 19, Test size: 19


12:06:51 - cmdstanpy - INFO - Chain [1] start processing
12:06:52 - cmdstanpy - INFO - Chain [1] done processing


Train size: 38, Test size: 19


12:06:53 - cmdstanpy - INFO - Chain [1] start processing
12:06:54 - cmdstanpy - INFO - Chain [1] done processing


Train size: 57, Test size: 19


12:06:55 - cmdstanpy - INFO - Chain [1] start processing
12:06:56 - cmdstanpy - INFO - Chain [1] done processing


Train size: 76, Test size: 19


12:06:57 - cmdstanpy - INFO - Chain [1] start processing
12:06:58 - cmdstanpy - INFO - Chain [1] done processing


Train size: 95, Test size: 19


12:06:59 - cmdstanpy - INFO - Chain [1] start processing
12:07:00 - cmdstanpy - INFO - Chain [1] done processing


SMAPE Scores for each fold: [np.float64(5.348011032031598), np.float64(18.182943946706978), np.float64(14.884162030832627), np.float64(13.27805161673333), np.float64(23.211511696087143)]
Mean SMAPE Score: 14.980936064478334


In [15]:
import pandas as pd
import jdatetime

import pandas as pd
import jdatetime
from datetime import timedelta

# 12. Generate Future Dates for Prediction (30 days from the end of the dataset)
last_date = df['ds'].max()
future_dates = [last_date + timedelta(days=i) for i in range(1, 31)]
future_df = pd.DataFrame({'ds': future_dates})

# 13. Add Holiday Features to Future Dates
future_df = add_holiday_features(future_df)

# Add lag features with last known values
future_df['Sales_lag1'] = df['Sales_lag1'].iloc[-1]  # Example: last known value
future_df['Sales_lag7'] = df['Sales_lag7'].iloc[-1]

# 14. Predict Future Sales
forecast = model.predict(future_df)

# Prepare DataFrame with Gregorian dates and predicted sales
output_df = pd.DataFrame({
    'date_gregorian': forecast['ds'],
    'sale': forecast['yhat']
})

# Convert Gregorian dates to Persian (Jalali) calendar
def gregorian_to_persian(gregorian_date):
    jdate = jdatetime.date.fromgregorian(date=gregorian_date.date())  # Extract date part only
    return jdate.strftime('%Y-%m-%d')

output_df['date'] = output_df['date_gregorian'].apply(gregorian_to_persian)

# Keep only Persian date and sale columns as required
output_df = output_df[['date', 'sale']]

# Save to CSV
output_df.to_csv('output.csv', index=False)

print("Future sales predictions saved to output.csv")
print(output_df.head())

Future sales predictions saved to output.csv
         date        sale
0  1399-09-22  352.522473
1  1399-09-23  353.142740
2  1399-09-24  354.579030
3  1399-09-25  355.786375
4  1399-09-26  367.554832
