In [31]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import make_regression
from scipy.io import arff
import statsmodels.api as sm


def load_real_data(aoi):
    df = pd.read_csv('dataset.csv', sep=';')
    df = df.replace([np.inf, -np.inf], np.nan)
    
    df = df.loc[df.AOI == aoi]
    df['Date'] = pd.to_datetime(df.Date, format='%Y%m%d')
    df = df.sort_values(by='Date')
    
    df = df.set_index(df.Date)
    
    df = df.fillna(method='pad')
    
    df = df.drop(['AOI', 'Date'], axis=1)
    df = df.rename(columns = {'Gt':'y'})
    df['y'] = df['y'].diff()
    df['y'] = df['y'].fillna(0)
    
    return df


def get_data_in_window(raw_df, t0, window_size):
    columns = list(raw_df.columns)
    columns.remove('y')
    t_columns = []
    x_t = []
    for dt in range(window_size):
        t_columns.append(['t-{}_{}'.format(dt, col_name) for col_name in columns])
        x_t.append(raw_df.drop('y', axis=1).iloc[t0-dt].values)
        
    x = np.concatenate(x_t)
    y = raw_df['y'].iloc[t0]
    data = np.hstack([x, y]).reshape(1, -1)
    columns = [item for sublist in t_columns for item in sublist] + ['y']
    return pd.DataFrame(data, columns=columns)

def format_windowed_data(raw_df, t_start, t_end, window_size):
    data = []
    for t in range(t_start, t_end):
        df_x_t = get_data_in_window(raw_df, t, window_size)
        x_t = df_x_t.values.ravel()
        columns = df_x_t.columns
        data.append(x_t)
    return pd.DataFrame(data, columns=columns)

In [130]:
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
import math
import seaborn as sns

def plot_feature_importances(feature_importances, columns):
    asd = pd.Series(feature_importances, columns)
    asd.sort_values(ascending=False, inplace=True)
    plt.figure(figsize=(12,40))
    sns.barplot(x=asd.values, y=asd.index)
    plt.title('Feature Importance')
    plt.show()

def do_everything(aoi):
    df_raw = load_real_data(aoi)
    n = len(df_raw)
    df_train = format_windowed_data(df_raw, 13, n-14, window_size=14)
    df_test = format_windowed_data(df_raw, n-14, n, window_size=14)
    
    X_train, y_train = df_train.drop('y', axis=1).values, df_train['y'].values
    X_test, y_test = df_test.drop('y', axis=1).values, df_test['y'].values

    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    #X_test = scaler.transform(X_test)

    model = RandomForestRegressor(random_state=42)
    model.fit(X_train, y_train)
    
    def mae(y_true, y_pred):
        abs_errors = [math.fabs(x - y) for x, y in zip(y_true, y_pred)]
        return np.mean(np.array(abs_errors))

    y_test_pred = model.predict(X_test)
    performance = mae(y_test, y_test_pred)
    print('MAE:', performance)

    #plt.figure(figsize=(20,10))
    #plt.plot(np.arange(y_train.shape[0]), y_train)
    #plt.plot(np.arange(y_train.shape[0], y_train.shape[0]+14), y_test)
    #plt.plot(np.arange(y_train.shape[0], y_train.shape[0]+14), y_test_pred)
    #plt.show()
    
    #plot_feature_importances(model.feature_importances_, df_train.drop('y', axis=1).columns)
    
    #pred_columns = ['AOI'] + ['Date'] + list(np.arange(y_test_pred.shape[0]))
    #print(df_test.index)
    #data = [aoi, df_test.index , y_test_pred]
    #df_pred = pd.DataFrame(data, columns=pred_columns)
    #print(df_pred)
    
    #pred_data = np.array(aoi) + date + y_test_pred
    
    df_feat = pd.DataFrame([aoi] + [model.feature_importances_], columns=['AOI'] + [df_test.drop('y', axis=1).columns])
    
    #return df_pred, df_feat
    return None, df_feat

In [131]:
df_pred, df_feat = do_everything('New York')

MAE: 302.2778571428571


ValueError: Shape of passed values is (2, 1), indices imply (2, 2)

In [127]:
df_feat

Unnamed: 0,t-0_no2Mean,t-0_no2Std,t-0_no2Min,t-0_no2Max,t-0_no2Median,t-0_ozoneMean,t-0_ozoneStd,t-0_ozoneMin,t-0_ozoneMax,t-0_ozoneMedian,...,t-13_ozoneMean,t-13_ozoneStd,t-13_ozoneMin,t-13_ozoneMax,t-13_ozoneMedian,t-13_aerosolMean,t-13_aerosolStd,t-13_aerosolMin,t-13_aerosolMax,t-13_aerosolMedian
0,0.000279,0.007313,8.8e-05,0.0227,1.2e-05,0.0,0.003627,4.6e-05,0.004661,0.00618,...,0.000666,0.001264,0.002919,0.000969,0.001794,0.001577,0.001177,0.0,0.00232,0.0


In [30]:
df = pd.read_csv('dataset.csv', sep=';')
aois = list(df.AOI.unique())
for aoi in aois:
    df_pred, df_feat = do_everything(aoi)
    

array(['Georgia', 'Delaware', 'Connecticut', 'Pennsylvania', 'Maine',
       'New Hampshire', 'Massachusetts', 'New York', 'Rhode Island',
       'New Jersey', 'District of Columbia', 'Maryland', 'Virginia',
       'North Carolina', 'South Carolina', 'Florida'], dtype=object)

In [134]:
def do(aoi):
    col = ["AOI", "date","pred"]
    df_tmp = pd.DataFrame(columns=col)
    col2 = ['AOI',]
    df_raw = load_real_data(aoi)
    n = len(df_raw)
    df_train = format_windowed_data(df_raw, 13, n-14, window_size=14)
    df_test = format_windowed_data(df_raw, n-14, n, window_size=14)
    X_train, y_train = df_train.drop('y', axis=1).values, df_train['y'].values
    X_test, y_test = df_test.drop('y', axis=1).values, df_test['y'].values
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    #X_test = scaler.transform(X_test)
    model = RandomForestRegressor(random_state=42)
    model.fit(X_train, y_train)
    def mae(y_true, y_pred):
        abs_errors = [math.fabs(x - y) for x, y in zip(y_true, y_pred)]
        return np.mean(np.array(abs_errors))
    y_test_pred = model.predict(X_test)
    performance = mae(y_test, y_test_pred)
    print('MAE:', performance)
    #plt.figure(figsize=(20,10))
    #plt.plot(np.arange(y_train.shape[0]), y_train)
    #plt.plot(np.arange(y_train.shape[0], y_train.shape[0]+14), y_test)
    #plt.plot(np.arange(y_train.shape[0], y_train.shape[0]+14), y_test_pred)
    #plt.show()
    #plot_feature_importances(model.feature_importances_, df_train.drop('y', axis=1).columns)
    pred_columns = ['AOI'] + ['Date'] + list(np.arange(y_test_pred.shape[0]))
    for i in range(len(y_test_pred)):
      val = [aoi,  df_raw.iloc[-14:].index[i].strftime('%Y%m%d'),y_test_pred[i]]
      df_tmp.loc[len(df_tmp)] = val

    return df_tmp

In [135]:
do('New York')

MAE: 302.2778571428571


Unnamed: 0,AOI,date,pred
0,New York,20200516,1524.65
1,New York,20200517,1889.51
2,New York,20200518,1661.09
3,New York,20200519,1826.39
4,New York,20200520,1781.94
5,New York,20200521,1754.89
6,New York,20200522,1505.68
7,New York,20200523,1648.25
8,New York,20200524,1537.07
9,New York,20200525,1593.78
