#### Random Forest Regression model with selected Data

#### Initialization

In [1]:
# Import main libraries for data analysis and modelling
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Import additional helper libraries
import os


In [2]:
# Define the filepath

data_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir)) + '/data/'

train_csv = data_dir + 'train_data.csv'
test_csv = data_dir + 'test_data.csv'

print(train_csv)
print(test_csv)

c:\Sam\wids\WiDS-datathon-2023/data/train_data.csv
c:\Sam\wids\WiDS-datathon-2023/data/test_data.csv


In [3]:
# Load the training data set
train_df = pd.read_csv(train_csv)

No Analysis will be done on the data as this has already been covered in `exhaustive_data_analysis.ipynb`

In [4]:
# Find the target column
target_feature = 'contest-tmp2m-14d__tmp2m'

#### Data Preparation

In [5]:
# Convert startdate from object to various usable types. Month of the year has more impact on weather so Year, Month and Day will be separated
train_df['startdate'] = pd.to_datetime(train_df['startdate'], format='%m/%d/%y')
train_df['startdate_ordinal'] = train_df['startdate'].apply(lambda x:x.toordinal())
train_df['year'] = train_df['startdate'].dt.year
train_df['month'] = train_df['startdate'].dt.month
train_df['dayofyear'] = train_df['startdate'].dt.day_of_year

In [6]:
# Convert Climate regions to string and dummy numerical data for processing
train_df['climateregions__climateregion'] = train_df['climateregions__climateregion'].astype(str)
train_df['climateregions_num'] = LabelEncoder().fit_transform(train_df['climateregions__climateregion'])

In [7]:
# Impute null/nan features
null_features = train_df.columns[train_df.isnull().any()]

imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer = imputer.fit(train_df[null_features])

train_df[null_features] = imputer.transform(train_df[null_features])
print(f'Columns with null vaules in Training data are {train_df.columns[train_df.isnull().any()]}')
print(f'Columns with null vaules in Training data are {train_df.columns[train_df.isna().any()]}')

Columns with null vaules in Training data are Index([], dtype='object')
Columns with null vaules in Training data are Index([], dtype='object')


In [8]:
# features that are non-numeric
nonum_features = ['startdate','startdate_ordinal','climateregions__climateregion','lat','lon','index']

# features that are predictions from other models
all_features = list(train_df.columns)
predict_prefix = ('nmme','cancm','ccsm','cfsv20','gfdl','nasa')

predict_features = []
for f in all_features:
    if f.startswith(predict_prefix):
        predict_features.append(f)
# print(predict_features)

d2010_features = [i for i in all_features if '-2010-' in i]
# print(d2010_features)

extra_features_drop = ['year','dayofyear','elevation__elevation','contest-pres-sfc-gauss-14d__pres','contest-precip-14d__precip','contest-wind-h100-14d__wind-hgt-100','contest-wind-h10-14d__wind-hgt-10','contest-wind-h850-14d__wind-hgt-850','contest-wind-uwnd-250-14d__wind-uwnd-250','contest-wind-vwnd-250-14d__wind-vwnd-250','mei__mei', 'mei__meirank', 'mei__nip', 'mjo1d__amplitude', 'mjo1d__phase']
extra_features_add = ['icec-2010-','sst-2010-1']

In [9]:
# Drop features not selected for this model
features_to_drop = nonum_features + predict_features + d2010_features + extra_features_drop
features_to_drop = [i for i in features_to_drop if i not in extra_features_add]

X = train_df.copy()
X = X.drop(features_to_drop, axis=1)
y = train_df[target_feature].copy()

print(X.columns)

Index(['contest-pevpr-sfc-gauss-14d__pevpr', 'contest-rhum-sig995-14d__rhum',
       'contest-tmp2m-14d__tmp2m', 'contest-slp-14d__slp',
       'contest-wind-vwnd-925-14d__wind-vwnd-925',
       'contest-prwtr-eatm-14d__prwtr',
       'contest-wind-uwnd-925-14d__wind-uwnd-925',
       'contest-wind-h500-14d__wind-hgt-500', 'sst-2010-1', 'month',
       'climateregions_num'],
      dtype='object')


#### Modelling

In [10]:
# Split data in train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=77, train_size=0.8, shuffle=True)

In [11]:
# Train the model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train,y_train)

In [12]:
# Check results
print(model.score(X_test, y_test))

0.9999999891778538


In [13]:
# Predictions
pred = model.predict(X_test)

In [14]:
# model evaluation
print('mean_squared_error : ', mean_squared_error(y_test, pred, squared=False))
print('mean_absolute_error : ', mean_absolute_error(y_test, pred))
print('R2 score : ', r2_score(y_test, pred))

mean_squared_error :  0.0010301513037367894
mean_absolute_error :  0.0001405010153404143
R2 score :  0.9999999891778538
