In [27]:
from pathlib import Path
import pandas as pd
import numpy as np
import shogun as sg
from scipy.stats import skew
from scipy import stats
import util
import process
# from process import train_leo_johnson

Let's consider Italy first

In [28]:
from pathlib import Path

path = Path.cwd()
cleaned_data_path = path.parent / 'data' / 'cleaned'

countries = ['austria', 'belgium', 'germany', 'italy', 'netherlands']

df = {}
for country in countries :
    file_path = cleaned_data_path / (country+'.csv')
    df[country] = pd.read_csv(file_path)

country = 'italy'

In [29]:
process.add_polynomial_features(country, df, 10)

In [30]:

numerical_features = df[country].select_dtypes(exclude=["object"]).columns
numerical_features = numerical_features.drop('incidence')

In [31]:
process.hot_encode_weeks(country, df)
df_italy = df[country]
df[country] = df[country].drop(columns=['week', 'date'])

Partition the dataset in train + validation sets

In [32]:
train = {'italy': df[country].sample(frac=0.8,random_state=200)}
test = {'italy': df[country].drop(train[country].index)}
# train[country]=df[country].sample(frac=0.8,random_state=200)
# test[country]=df[country].drop(train.index)
train_italy = train['italy']

y_train = pd.Series(train[country]['incidence']).to_frame('incidence')
y_test = pd.Series(test[country]['incidence']).to_frame('incidence')
X_train = train[country].drop(columns=['incidence'])
X_test = test[country].drop(columns=['incidence'])

print("X_train : " + str(X_train.shape))
print("X_test : " + str(X_test.shape))
print("y_train : " + str(y_train.shape))
print("y_test : " + str(y_test.shape))

X_train : (250, 439)
X_test : (62, 439)
y_train : (250, 1)
y_test : (62, 1)


now we will apply our transformations.

In [33]:
lmbda = {'austria'    : {}, 'belgium': {}, 'germany': {}, 'italy': {},
             'netherlands': {}}

# skewness of >0.5 can be considered to be moderately skewed.
skewness = X_train[numerical_features].apply(lambda x: skew(x))
skewness = skewness[abs(skewness) > 0.5]
skewness

Acrocianosi                            0.501738
Acroosteolisi                          1.703032
Adiadococinesia                        1.097576
Alfuy_virus                            6.839026
Alitosi                                1.746707
Allucinazione_uditiva                  1.616044
Anatossina                             0.723457
Angioma_stellare                       1.309031
Anossia                                0.729909
Apoi_virus                             2.429184
Arco_senile                            1.120637
Aroa_virus                             7.438045
Asterissi                              1.187344
Bacillo_di_Calmette-Guérin             1.351822
Bagaza_virus                          10.739271
Baiyangdian_virus                      8.655033
Banzi_virus                            3.282595
Brina_uremica                          0.636195
Bronchite                              1.246843
Broncorrea                             0.550191
Broncospasmo                           0

In [34]:
skewed_features = skewness.index
skewed_features

Index(['Acrocianosi', 'Acroosteolisi', 'Adiadococinesia', 'Alfuy_virus',
       'Alitosi', 'Allucinazione_uditiva', 'Anatossina', 'Angioma_stellare',
       'Anossia', 'Apoi_virus',
       ...
       'Virus_adeno-associati-s3', 'Anossia-s2', 'Anossia-s3', 'Anossia-sq',
       'Febbre_gialla-s2', 'Febbre_gialla-s3', 'Febbre_gialla-sq',
       'Soffio_di_Flint-s2', 'Soffio_di_Flint-s3', 'Soffio_di_Flint-sq'],
      dtype='object', length=312)

In [35]:
def train_leo_johnson(df, lmbda, skewed_features):
    for feature in skewed_features:
        column = df[feature] + 1
        column, lmbda[feature] = stats.boxcox(column)
        column = pd.Series(column)
        df[feature] = column

In [36]:
process.train_leo_johnson(X_train, lmbda[country], skewed_features)
process.train_leo_johnson(y_train, lmbda[country], ['incidence'])

In [37]:
def train_std_normal(df, numerical_features, means, std_deviations):
    for feature in numerical_features:
        column = df[feature]
        means[feature] = column.mean()
        std_deviations[feature] = column.std()
        column -= means[feature]
        column /= std_deviations[feature]
        df[feature] = column

In [38]:
means = {}
std_deviations = {}
process.train_std_normal(X_train, numerical_features, means, std_deviations)

In [39]:
# we apply the Yeo Johnson Transformation.
def apply_leo_johnson(df, lmbda, skewed_features):
    for feature in skewed_features:
        column = df[feature] + 1
        column = stats.boxcox(column, lmbda=lmbda[feature])
        column = pd.Series(column)
        df[feature] = column

In [40]:
process.apply_leo_johnson(X_test, lmbda[country], skewed_features)
process.apply_leo_johnson(y_test, lmbda[country], ['incidence'])

In [41]:
# we transform features so that mean is 0 and std deviation is 1.
def apply_std_normal(df, numerical_features, means, std_deviations):
    for feature in numerical_features:
        column = df[feature]
        column -= means[feature]
        column /= std_deviations[feature]
        df[feature] = column

In [42]:
process.apply_std_normal(X_test, numerical_features, means, std_deviations)

Transformations have been applied, and data is ready.

applying Linear Regression without regularization

In [43]:
X_train = X_train.fillna(0)
X_test = X_test.fillna(0)
y_train = y_train.fillna(0)
y_test = y_test.fillna(0)

In [44]:
final_data_path = path.parent / 'data' / 'test'
final_file_path = final_data_path / 'X_train.csv'
X_train.to_csv(final_file_path, index=False)

f_feats_train = sg.read_csv(str(final_file_path.absolute()))

features_train = sg.create_features(f_feats_train)

In [45]:
final_data_path = path.parent / 'data' / 'test'
final_file_path = final_data_path / 'X_test.csv'
X_test.to_csv(final_file_path, index=False)

f_feats_test = sg.read_csv(str(final_file_path.absolute()))

features_test = sg.create_features(f_feats_test)

In [46]:
final_data_path = path.parent / 'data' / 'test'
final_file_path = final_data_path / 'y_train.csv'
y_train.to_csv(final_file_path, index=False)

f_labels_train = sg.read_csv(str(final_file_path.absolute()))

labels_train = sg.create_labels(f_labels_train)

In [47]:
final_data_path = path.parent / 'data' / 'test'
final_file_path = final_data_path / 'y_test.csv'
y_test.to_csv(final_file_path, index=False)

f_labels_test = sg.read_csv(str(final_file_path.absolute()))

labels_test = sg.create_labels(f_labels_test)

In [48]:
lrr = sg.create_machine("LinearRidgeRegression", tau=0.001, labels=labels_train, use_bias=False)

In [49]:
lrr.train(features_train)

True

In [50]:
labels_predict = lrr.apply(features_test)
labels_predict

RegressionLabels(num_subscriptions={function}, subset_stack=SubsetStack(...), current_values=Vector<0> ), labels=Vector<63>( -nan, -nan, -nan, -nan, -nan, -nan, -nan, -nan, -nan, -nan, -nan, -nan, -nan, -nan, -nan, -nan, -nan, -nan, -nan, -nan, -nan, -nan, -nan, -nan, -nan, -nan, -nan, -nan, -nan, -nan, -nan, -nan, -nan, -nan, -nan, -nan, -nan, -nan, -nan, -nan, -nan, -nan, -nan, -nan, -nan, -nan, -nan, -nan, -nan, -nan, -nan, -nan, -nan, -nan, -nan, -nan, -nan, -nan, -nan, -nan, -nan, -nan, -nan ), num_labels={function})

In [51]:
b = lrr.get("bias")
w = lrr.get("w")
w

array([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, na

In [52]:
evaluation = sg.create_evaluation("MeanSquaredError")
mse = evaluation.evaluate(labels_predict, labels_test)
mse

nan