In [1]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import KFold

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

def get_num_folds(cv_percent=20):
    import math
    return math.floor(X.shape[0] / (X.shape[0] * (cv_percent / 100)))

### Data preprocessing

This dataset contains few non-numerical features. Here we restructure cathegorical data into boolean-valued columns to be able to include them into model in the numerical form

In [3]:
df = pd.read_csv('forestfires.csv')
df.dropna()

df = pd.get_dummies(df)

### Preparing the subsets

In [4]:
X = df.drop(["X", "Y", "area"], axis=1).values
y = df["area"].values

scaler = RobustScaler()
X = scaler.fit_transform(X)

For later interpretation of the model performance, we split dataset into three subsets: <br>
   1 - train <br>
   2 - cross-validation [20%] <br>
   3 - test [20%] <br>

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)

### Training regression model

Train a Linear Regression model and perform regression analysis using the coefficient of determination ('r2').  

In [24]:
from sklearn.model_selection import KFold

lr = LinearRegression()
kf = KFold(n_splits=5, random_state=None, shuffle=False) 

scores = []
for train_index, test_index in kf.split(X):
    X_tr, X_cv = X[train_index], X[test_index]
    y_tr, y_cv = y[train_index], y[test_index]
    
    lr.fit(X_tr, y_tr)
    y_pred = lr.predict(X_cv)
    scores.append(r2_score(y_cv, y_pred, multioutput='variance_weighted'))    

lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

print("r2 scores for c-v set: \n\t {}; \n\t mean: {}".format(scores, np.mean(scores)))
print("Mean r2 score for test set: {}".format(r2_score(y_test, y_pred, multioutput='variance_weighted')))

r2 scores for c-v set: 
	 [0.0, -23.201428061357053, -0.06549228965656928, -0.36743826067068475, -0.06306239239599809]; 
	 mean: -4.739484200816061
Mean r2 score for test set: -0.06306239239599809
