#### Basic Regression model with selected Data

#### Initialization

In [None]:
# Import main libraries for data analysis and modelling
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

from matplotlib import pyplot as plt
from mpl_toolkits import mplot3d

# Import additional helper libraries
import os
from IPython.display import display


In [None]:
# Build the locations of the CSV inputs. They are expected in a `data`
# folder inside the parent of the current working directory.

parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
data_dir = f'{parent_dir}/data/'

train_csv = f'{data_dir}train_data.csv'
test_csv = f'{data_dir}test_data.csv'

# Echo both resolved paths so a wrong working directory is easy to spot
for csv_path in (train_csv, test_csv):
    print(csv_path)

In [None]:
# Read the training CSV into a DataFrame
train_df = pd.read_csv(filepath_or_buffer=train_csv)

No analysis is performed on the data here, as this has already been covered in `exhaustive_data_analysis.ipynb`.

In [None]:
# Name of the target column: the column of the training frame that is
# copied out as `y` and that the regression model is fitted to predict.
target_feature = 'contest-tmp2m-14d__tmp2m'

#### Data Preparation

In [None]:
# Parse `startdate` (text in month/day/two-digit-year form) into datetimes and
# derive numeric calendar columns from it. The month of the year has more
# impact on weather, so year, month and day of year each get their own column.
start_dates = pd.to_datetime(train_df['startdate'], format='%m/%d/%y')
train_df['startdate'] = start_dates
train_df['startdate_ordinal'] = start_dates.apply(lambda ts: ts.toordinal())

# new column name -> attribute of the `.dt` accessor that supplies its values
calendar_parts = {'year': 'year', 'month': 'month', 'dayofyear': 'day_of_year'}
for column, dt_attr in calendar_parts.items():
    train_df[column] = getattr(start_dates.dt, dt_attr)

In [None]:
# Cast the climate region column to strings, then give every distinct region
# label an integer code in a separate column so it can be used numerically.
region_labels = train_df['climateregions__climateregion'].astype(str)
train_df['climateregions__climateregion'] = region_labels

region_encoder = LabelEncoder()
train_df['climateregions_num'] = region_encoder.fit_transform(region_labels)

In [None]:
# Impute null/NaN features with the mean of their column.
# `null_features` holds every column that currently contains a missing value.
null_features = train_df.columns[train_df.isnull().any()]

imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# SimpleImputer cannot be fitted on a selection with zero columns, so only
# fit/transform when something is actually missing. This keeps the cell safe
# to re-run after the nulls have already been filled.
if len(null_features) > 0:
    imputer = imputer.fit(train_df[null_features])
    train_df[null_features] = imputer.transform(train_df[null_features])

# Sanity check: list the columns that still contain missing values.
# `isna` is an alias of `isnull`, so a single check covers both.
remaining_nulls = train_df.columns[train_df.isnull().any()]
print(f'Columns with null values in Training data are {remaining_nulls}')

In [None]:
# Columns excluded from the model by name.
# NOTE(review): this list was originally labelled "non-numeric", but it also
# names 'startdate_ordinal', which holds integer ordinals - treat it as a
# manual exclusion list rather than a dtype-based one.
nonum_features = [
    'startdate',
    'startdate_ordinal',
    'climateregions__climateregion',
    'lat',
    'lon',
    'index',
]

all_features = list(train_df.columns)

# Columns that are predictions from other models, recognised by name prefix
predict_prefix = ('nmme', 'cancm', 'ccsm', 'cfsv20', 'gfdl', 'nasa')
predict_features = [name for name in all_features if name.startswith(predict_prefix)]

# Columns whose name contains the '-2010-' marker
d2010_features = [name for name in all_features if '-2010-' in name]

In [None]:
# Drop features not selected for this model.
# Two of the '-2010-' columns are deliberately kept as model inputs, so they
# are filtered out of the drop list.
retained_features = {'sst-2010-1', 'icec-2010-1'}
features_to_drop = [
    name
    for name in nonum_features + predict_features + d2010_features
    if name not in retained_features
]

# The target must not be among the inputs: none of the drop lists matches it,
# so without this explicit removal the model would be handed the very value it
# is asked to predict (target leakage) and every score would be meaningless.
# It is dropped separately so that `features_to_drop` names input columns only.
X = train_df.drop(columns=features_to_drop + [target_feature])
y = train_df[target_feature].copy()

#### Modelling

In [None]:
# Shuffle the rows and use 80% of them for training, the remainder for
# evaluation; the fixed seed keeps the split reproducible between runs.
split_options = {'train_size': 0.8, 'shuffle': True, 'random_state': 104}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
# Create an ordinary linear regression and fit it on the training split
model = LinearRegression()
model.fit(X_train, y_train)

In [None]:
# Score the fitted model on the held-out split and show the result
holdout_score = model.score(X_test, y_test)
print(holdout_score)

In [None]:
# Predict the target for every row of the held-out split
pred = model.predict(X=X_test)

In [None]:
# Model evaluation: compare the held-out targets with the predictions using
# each metric in turn and print it next to its label.
evaluation_metrics = (
    ('mean_squared_error : ', mean_squared_error),
    ('mean_absolute_error : ', mean_absolute_error),
    ('R2 score : ', r2_score),
)
for label, metric in evaluation_metrics:
    print(label, metric(y_test, pred))