# Megatutorial 04: Regression

In [52]:
from pandas import read_csv

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor

from sklearn.metrics import r2_score, mean_squared_error, root_mean_squared_error, mean_absolute_error

## Daten laden

In [11]:
# Read the bike-sharing CSV; the file's first column is used as the DataFrame index.
data = read_csv(
    "../data/bikesharing(1).csv",
    index_col=0,
)

In [9]:
# Print each column's dtype and non-null count to spot text-valued columns
# and columns with missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 731 entries, 0 to 730
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   season      731 non-null    object 
 1   holiday     731 non-null    object 
 2   weekday     731 non-null    object 
 3   workingday  731 non-null    object 
 4   weathersit  731 non-null    object 
 5   temp        731 non-null    float64
 6   atemp       731 non-null    float64
 7   hum         698 non-null    float64
 8   windspeed   731 non-null    float64
 9   casual      731 non-null    int64  
 10  registered  731 non-null    int64  
 11  cnt         731 non-null    int64  
 12  day         731 non-null    int64  
 13  month       731 non-null    int64  
 14  year        731 non-null    int64  
dtypes: float64(4), int64(6), object(5)
memory usage: 91.4+ KB


# Data Preprocessing

## Imputing

In [20]:
# Replace the missing "hum" entries with the median of the observed values.
# NOTE(review): the median is fitted on all rows, i.e. before the train/test
# split further down, so held-out rows influence the fill value - confirm
# that this is acceptable for the tutorial.
imputer = SimpleImputer(strategy="median")
data["hum"] = imputer.fit_transform(data[["hum"]])

## Encoding

In [None]:
# Label-encode every text-valued categorical column, replacing the strings in
# `data` with integer codes. Each column gets its own fitted LabelEncoder so the
# original labels can be recovered later via `inverse_transform`.
# NOTE(review): scikit-learn documents LabelEncoder as a target (y) encoder and
# assigns codes in sorted label order, so the integers carry no domain ordering;
# consider OrdinalEncoder/OneHotEncoder for features - confirm before changing.
categorical_columns = ["season", "holiday", "weekday", "workingday", "weathersit"]

encoders = {}
for column in categorical_columns:
    encoders[column] = LabelEncoder()
    data[column] = encoders[column].fit_transform(data[column])

# Keep the individual encoder names available for any cell that refers to them.
season_encoder = encoders["season"]
holiday_encoder = encoders["holiday"]
weekday_encoder = encoders["weekday"]
workingday_encoder = encoders["workingday"]
weathersit_encoder = encoders["weathersit"]


## Features/Target

In [29]:
# List the available column names before choosing features and target.
data.columns

Index(['season', 'holiday', 'weekday', 'workingday', 'weathersit', 'temp',
       'atemp', 'hum', 'windspeed', 'casual', 'registered', 'cnt', 'day',
       'month', 'year'],
      dtype='object')

In [34]:
# Columns fed to the models: the label-encoded categoricals, the float-valued
# measurements, and the month. "workingday", "casual", "registered", "day"
# and "year" are deliberately not part of this list.
features = [
    "season",
    "holiday",
    "weekday",
    "weathersit",
    "temp",
    "atemp",
    "hum",
    "windspeed",
    "month",
]

# Kept as a one-element list, so `y` is a single-column DataFrame, not a Series.
target = ["cnt"]

x, y = data[features], data[target]

## Train/Test Split

In [66]:
# Hold out 40% of the rows for evaluation; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.4,
    random_state=875,
)

## Modelling

## Lineare Regression

In [69]:
# Fit a linear regression on the training split and score it on the held-out rows.
linear_model = LinearRegression().fit(x_train, y_train)
predictions = linear_model.predict(x_test)

# Compute the three metrics first, then report them on a single line.
linear_r2 = r2_score(y_test, predictions)
linear_rmse = root_mean_squared_error(y_test, predictions)
linear_mae = mean_absolute_error(y_test, predictions)
print("R²", linear_r2, ", RMSE", linear_rmse, ", MAE", linear_mae)

R² 0.5405958921616075 , RMSE 1373.1227620068084 , MAE 1163.1284232192186


## Decision Tree Regressor

In [78]:
# Fit a depth-limited regression tree on the training split and score it on the
# held-out rows.
# random_state is fixed because scikit-learn permutes the features at every
# split, so equally good splits could otherwise be chosen differently from run
# to run. The seed value matches the one passed to train_test_split.
tree_model = DecisionTreeRegressor(max_depth=4, random_state=875)
tree_model.fit(x_train, y_train)

predictions = tree_model.predict(x_test)

print(
    "R²", r2_score(y_test, predictions),
    ", RMSE", root_mean_squared_error(y_test, predictions),
    ", MAE", mean_absolute_error(y_test, predictions),
)

R² 0.5545543435155273 , RMSE 1352.1014968746574 , MAE 1129.6405799590682
