In [1]:
# imports
from pathlib import Path

import pandas as pd
from statsmodels.tsa.statespace.sarimax import SARIMAX

In [2]:
# read data
dtype = {'ID_LAT_LON_YEAR_WEEK':'string',
         'latitude': 'string',
         'longitude': 'string',
         'year': 'int',
         'week_no': 'int',
         'emission': 'float'}
df = pd.read_csv('files/train.csv', dtype=dtype)
# df

In [3]:
# prepare data
df = df[['ID_LAT_LON_YEAR_WEEK', 'year', 'week_no', 'emission']]
df['id'] = df['ID_LAT_LON_YEAR_WEEK'].str[:16]
day_of_week = {2019:2, 2020:3, 2021:5, 2022:6, 2023:0}
df.loc[:, 'day_of_week'] = df['year'].map(day_of_week)
df.loc[:, 'date'] = df['year'].astype('string') + '-' + df['week_no'].astype('string') + '-' + df['day_of_week'].astype('string')
df.loc[:, 'date'] = pd.to_datetime(df['date'], format='%Y-%W-%w')
df = df[['id', 'date', 'emission']]
# df

In [4]:
# function to create model and forecast
def arima(data, endog_name='emission', steps=49):
    arima = SARIMAX(endog = df_id.loc[:, endog_name].values,
                    order = (1, 0, 0),
                    dates = df_id.index.values,
                    freq = 'W')
    arima = arima.fit(full_output = False,
                      disp = False)
    return arima.forecast(steps=steps)

In [5]:
%%time
# run arima function for every location

groups = df.groupby('id')
results = {}

for name, group in groups:
    df_id = group.drop(columns='id').set_index('date', drop=True).resample('W').nearest()
    results[name] = arima(df_id)
df_results = pd.DataFrame(results).T.reset_index()
# df_results

CPU times: total: 20.7 s
Wall time: 20.7 s


In [6]:
# format output
df_results = df_results.melt(id_vars=['index'], value_vars=range(49))
df_results['id'] = df_results['index'] + '_2022_' + df_results['variable'].astype('string').str.zfill(2)
df_results = df_results[['id', 'value']]
df_results = df_results.sort_values('id').reset_index(drop=True)
df_results.columns = ['ID_LAT_LON_YEAR_WEEK', 'emission']
df_results

Unnamed: 0,ID_LAT_LON_YEAR_WEEK,emission
0,ID_-0.510_29.290_2022_00,4.266863
1,ID_-0.510_29.290_2022_01,4.264218
2,ID_-0.510_29.290_2022_02,4.261574
3,ID_-0.510_29.290_2022_03,4.258932
4,ID_-0.510_29.290_2022_04,4.256292
...,...,...
24348,ID_-3.299_30.301_2022_44,25.822472
24349,ID_-3.299_30.301_2022_45,25.791839
24350,ID_-3.299_30.301_2022_46,25.761242
24351,ID_-3.299_30.301_2022_47,25.730681


In [7]:
# save output to csv
df_results.to_csv('output/arima.csv', index=False)