In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from datetime import datetime, timedelta

def week_year_to_date(year, week):
    """Convert a (year, week number) pair into a calendar date.

    Week 1 maps to January 1st of ``year`` and every further week adds seven
    days; week 0 therefore maps to seven days before January 1st.

    Args:
        year: Calendar year; anything accepted by ``int()``.
        week: Week number; anything accepted by ``int()``.

    Returns:
        A ``datetime`` at midnight on the first day of the requested week.
    """
    jan_first = datetime(int(year), 1, 1)
    days_from_jan_first = 7 * (int(week) - 1)
    return jan_first + timedelta(days=days_from_jan_first)

In [6]:
# Load the raw training and test tables from the working directory.
train_path, test_path = 'train.csv', 'test.csv'
df = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)

In [7]:
# Feature engineering for the training set:
#   1. per location, add centred rolling means of `emission`;
#   2. shift every row forward by one year to obtain "last year" features;
#   3. left-join those features back onto the rows of the following year;
#   4. derive a calendar date from (year, week_no).
rolling_windows = (3, 5, 7, 9)
rolling_cols = [f'rolling_emission{window}' for window in rolling_windows]
join_keys = ['year', 'week_no', 'longitude', 'latitude']

df_grouped = df.groupby(['longitude', 'latitude'])
lst = []
for name, group_df in df_grouped:
    # Work on an explicit copy: the frame yielded while iterating a groupby
    # is derived from `df`, and adding columns to it directly may raise
    # SettingWithCopyWarning.
    group_df = group_df.copy()
    for window, col in zip(rolling_windows, rolling_cols):
        group_df[col] = group_df['emission'].rolling(window=window, center=True).mean()
    # Interpolate numeric columns only. Calling interpolate() on the whole
    # frame also touches object (string) columns, which is deprecated since
    # pandas 2.1. limit_direction='both' also fills the leading/trailing NaNs
    # left at the edges by the centred rolling windows.
    numeric_cols = group_df.select_dtypes(include='number').columns
    group_df[numeric_cols] = group_df[numeric_cols].interpolate(
        method='linear', limit_direction='both')
    lst.append(group_df)
df = pd.concat(lst)

# Relabel each row with year + 1 so that, after the join below, a row for
# year Y carries the emission values observed in year Y - 1.
last_year_df = (df
                .assign(year=df['year'] + 1)
                .loc[:, join_keys + ['emission'] + rolling_cols]
                .rename(columns={'emission': 'last_year_emission',
                                 **{col: f'ly_{col}' for col in rolling_cols}}))
# Left join: rows without a previous-year match keep NaN in the ly_* columns.
df = pd.merge(df, last_year_df, on=join_keys, how='left')
df['date'] = df.apply(lambda row: week_year_to_date(row.year, row.week_no), axis=1)

In [8]:
# Keep only the rows and columns used to fit the model.
date_start = datetime(2020, 3, 1)
date_end = datetime(2020, 9, 30)

model_columns = ['year', 'week_no', 'last_year_emission']
model_columns += [f'ly_rolling_emission{window}' for window in (3, 5, 7, 9)]
model_columns.append('emission')

# Rows from 2019 are dropped (presumably because they have no previous-year
# values to join against -- confirm against the data).
not_2019 = df['year'] != 2019
# Rows dated inside [date_start, date_end] are excluded from training.
outside_window = (df['date'] < date_start) | (df['date'] > date_end)

df = df.loc[not_2019 & outside_window, model_columns]

In [None]:
# Split the prepared training frame into target and features, then fit an
# XGBoost regressor with its default hyper-parameters.
target_col = 'emission'
y_train = df[target_col]
x_train = df.drop(columns=[target_col])
# NOTE(review): x_test is bound to df_test as it stands at this point and is
# not used by the fit below.
x_test = df_test
model = XGBRegressor()
model.fit(x_train, y_train)

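Design x_test: build the test-set features (previous-year emission and its rolling means) the same way as for the training set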

In [14]:
# Reload both CSVs so that the feature pipeline below starts again from the
# raw, unfiltered tables.
train_path, test_path = 'train.csv', 'test.csv'
df = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)

In [15]:
# Feature engineering for the test set: recompute the rolling emission
# features on the training data, shift them forward by one year and join them
# onto the test rows, so each test row carries the previous year's values.
rolling_windows = (3, 5, 7, 9)
rolling_cols = [f'rolling_emission{window}' for window in rolling_windows]
join_keys = ['year', 'week_no', 'longitude', 'latitude']

df_grouped = df.groupby(['longitude', 'latitude'])
lst = []
for name, group_df in df_grouped:
    # Work on an explicit copy: the frame yielded while iterating a groupby
    # is derived from `df`, and adding columns to it directly may raise
    # SettingWithCopyWarning.
    group_df = group_df.copy()
    for window, col in zip(rolling_windows, rolling_cols):
        group_df[col] = group_df['emission'].rolling(window=window, center=True).mean()
    # Interpolate numeric columns only. Calling interpolate() on the whole
    # frame also touches object (string) columns, which is deprecated since
    # pandas 2.1. limit_direction='both' also fills the leading/trailing NaNs
    # left at the edges by the centred rolling windows.
    numeric_cols = group_df.select_dtypes(include='number').columns
    group_df[numeric_cols] = group_df[numeric_cols].interpolate(
        method='linear', limit_direction='both')
    lst.append(group_df)
df = pd.concat(lst)

# Relabel each training row with year + 1 so that the join below attaches the
# values observed one year earlier.
last_year_df = (df
                .assign(year=df['year'] + 1)
                .loc[:, join_keys + ['emission'] + rolling_cols]
                .rename(columns={'emission': 'last_year_emission',
                                 **{col: f'ly_{col}' for col in rolling_cols}}))
# Left join: test rows without a previous-year match keep NaN in ly_* columns.
df_test = pd.merge(df_test, last_year_df, on=join_keys, how='left')
df_test['date'] = df_test.apply(lambda row: week_year_to_date(row.year, row.week_no), axis=1)

date_start = datetime(2020, 3, 1)
date_end = datetime(2020, 9, 30)
# NOTE(review): these are the same row filters applied to the training data.
# Any test row they remove gets no prediction and will be missing from the
# submission -- confirm that filtering the test set is intended.
df_test = (df_test
      .query("year != 2019")
      .query("date < @date_start or date > @date_end")
      .loc[:, ['year', 'week_no', 'latitude', 'longitude', 'last_year_emission']
              + [f'ly_{col}' for col in rolling_cols]])


In [16]:

# Predict emissions for the prepared test rows.
# The model was fitted without the coordinate columns, so drop them here.
# 'emission' is dropped too (if present) so the cell can be re-run after it
# has already added that column to df_test.
test_features = df_test.drop(columns=['latitude', 'longitude', 'emission'], errors='ignore')
# Build the prediction frame on df_test's own index. Column assignment aligns
# on index labels, so a fresh 0..n-1 index would put predictions on the wrong
# rows (or produce NaN) whenever df_test's index is not exactly 0..n-1, e.g.
# after rows were filtered out.
y_pred = pd.DataFrame({'prediction': model.predict(test_features)}, index=df_test.index)
df_test['emission'] = y_pred['prediction']
final_result = df_test

In [18]:
# Attach the submission identifier to every predicted row and write the
# two-column submission file.
join_keys = ['year', 'week_no', 'longitude', 'latitude']
id_columns = ['longitude', 'latitude', 'year', 'week_no', 'ID_LAT_LON_YEAR_WEEK']

# The identifier is looked up from a fresh read of the raw test file.
id_table = pd.read_csv('test.csv')[id_columns]

# Left join keeps exactly the rows present in final_result.
predictions_with_id = final_result.merge(id_table, on=join_keys, how='left')
export = predictions_with_id[['ID_LAT_LON_YEAR_WEEK', 'emission']]
export.to_csv('submission_3.csv', index=False)