In [28]:
import pandas as pd
import numpy as np
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('/content/train.csv', parse_dates=['date'])
print(df.head(5))

    id       date    city       lat      long       pop    shop        brand  \
0  0.0 2012-01-31  Athens  37.97945  23.71622  672130.0  shop_1  kinder-cola   
1  1.0 2012-01-31  Athens  37.97945  23.71622  672130.0  shop_1  kinder-cola   
2  2.0 2012-01-31  Athens  37.97945  23.71622  672130.0  shop_1  kinder-cola   
3  3.0 2012-01-31  Athens  37.97945  23.71622  672130.0  shop_1   adult-cola   
4  4.0 2012-01-31  Athens  37.97945  23.71622  672130.0  shop_1   adult-cola   

  container capacity  price  quantity  
0     glass    500ml   0.96   13280.0  
1   plastic    1.5lt   2.86    6727.0  
2       can    330ml   0.87    9848.0  
3     glass    500ml   1.00   20050.0  
4       can    330ml   0.39   25696.0  


  df = pd.read_csv('/content/train.csv', parse_dates=['date'])


In [3]:
df['price'] = pd.to_numeric(df['price'], errors='coerce')
df['quantity'] = pd.to_numeric(df['quantity'], errors='coerce')
df = df.dropna(subset=['price', 'quantity'])


In [4]:
groups = df.groupby('city')

In [5]:
models = {}
results = {}

In [6]:
for name, group in groups:
    print(f'Processing {name}')


Processing Athens
Processing Irakleion
Processing Larisa
Processing Patra
Processing Thessaloniki


In [7]:
group = group.sort_values(by='date')

In [9]:
group['sales'] = group['price'] * group['quantity']
daily_sales = group.resample('D', on='date').sum()['sales']

In [47]:
try:
    model = ExponentialSmoothing(daily_sales, trend='add', seasonal='add', seasonal_periods=7).fit()
    models[name] = model
    forecast = model.forecast(steps=10)
    results[name] = forecast



SyntaxError: incomplete input (<ipython-input-47-82448b9e84c5>, line 7)

In [12]:
if 'city1' in models:
    print(models['city1'].summary())
    print(results['city1'])
else:
    print("City 'city1' not found in the dataset")

City 'city1' not found in the dataset


In [13]:
def test_single_city(city_name):
    if city_name in df['city'].unique():
        city_data = df[df['city'] == city_name]
        city_data = city_data.sort_values(by='date')
        city_data['sales'] = city_data['price'] * city_data['quantity']
        daily_sales = city_data.resample('D', on='date').sum()['sales']
        model = ExponentialSmoothing(daily_sales, trend='add', seasonal='add', seasonal_periods=7).fit()
        forecast = model.forecast(steps=10)
        assert len(forecast) == 10
    else:
        print(f"City '{city_name}' not found in the dataset")

test_single_city('city1')

City 'city1' not found in the dataset


In [48]:
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error
import numpy as np

# Define a function for time series cross-validation
def time_series_cv(data, n_splits):
    # Create a TimeSeriesSplit object
    tscv = TimeSeriesSplit(n_splits=n_splits)

    # Initialize a list to store the errors
    errors = []

    # Loop over the splits
    for train_index, test_index in tscv.split(data):
        # Split the data into training and test sets
        train, test = data.iloc[train_index], data.iloc[test_index]

        # Fit the Exponential Smoothing model
        model = ExponentialSmoothing(train, trend='add', seasonal='add', seasonal_periods=7).fit()

        # Forecast the test set
        forecast = model.forecast(len(test))

        # Calculate the mean squared error
        error = mean_squared_error(test, forecast)

        # Append the error to the list
        errors.append(error)

    # Return the average error
    return np.mean(errors)

# Example usage:
# Assuming 'data' is a pandas Series with your time series data
# data = pd.Series([...])
# print(time_series_cv(data, 5))



In [22]:
if 'city1' in df['city'].unique():
    city_data = df[df['city'] == 'city1']
    city_data = city_data.sort_values(by='date')
    city_data['sales'] = city_data['price'] * city_data['quantity']
    daily_sales = city_data.resample('D', on='date').sum()['sales'].values
    cv_error = time_series_cv(daily_sales, n_splits=5)
    print(f'Cross-Validation Error: {cv_error}')
else:
    print("City 'city1' not found in the dataset")


City 'city1' not found in the dataset


In [24]:
performance = {}
for name, group in groups:
    group = group.sort_values(by='date')
    group['sales'] = group['price'] * group['quantity']
    daily_sales = group.resample('D', on='date').sum()['sales']
    model = models.get(name)
    if model:
        forecast = results[name]
        true_values = daily_sales[-len(forecast):]  # Assuming the last 'len(forecast)' entries are the test set
        mse = mean_squared_error(true_values, forecast)
        performance[name] = mse
        print(performance)

In [25]:
def plot_forecast(city_name):
    if city_name in groups.groups:
        group = groups.get_group(city_name).sort_values(by='date')
        group['sales'] = group['price'] * group['quantity']
        daily_sales = group.resample('D', on='date').sum()['sales']
        model = models.get(city_name)
        if model:
            forecast = results[city_name]
            plt.figure(figsize=(10, 6))
            plt.plot(daily_sales, label='True Values')
            plt.plot(forecast, label='Forecast', linestyle='--')
            plt.title(f'Forecast vs True Values for {city_name}')
            plt.legend()
            plt.show()
        else:
            print(f"No model found for {city_name}")
    else:
        print(f"City '{city_name}' not found in the dataset")

In [26]:
summary = pd.DataFrame.from_dict(performance, orient='index', columns=['MSE'])
print(summary)

Empty DataFrame
Columns: [MSE]
Index: []
