# Megatutorial 04: Regression

In [71]:
from pandas import read_csv

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor

from sklearn.metrics import r2_score, mean_squared_error
from sklearn.metrics import root_mean_squared_error, mean_absolute_error

---
# Load Data

In [72]:
# Read the bike-sharing CSV; its first column is used as the DataFrame index.
data = read_csv(
    "../data/bikesharing.csv",
    index_col=0,
)

---
# Data Preprocessing
---

## Imputing

In [73]:
# Replace missing values in the "hum" column with the column median.
# The double brackets select a one-column DataFrame instead of a Series,
# which gives the imputer the 2-D input it works on.
# NOTE(review): the imputer is fitted on the full dataset, before the
# train/test split further down, so test rows contribute to the median —
# confirm this is acceptable for the tutorial.
imputer = SimpleImputer(strategy="median")
data["hum"] = imputer.fit_transform(data[["hum"]])  # fit and write back in one step

## Encoding

In [74]:
# Integer-encode the categorical columns, one LabelEncoder per column.
# Every fitted encoder stays reachable under its own name, so the integer
# codes can be mapped back to the original labels with `inverse_transform`.
# NOTE(review): scikit-learn documents LabelEncoder as an encoder for the
# target (y); OrdinalEncoder / OneHotEncoder are the feature-side tools —
# consider switching if the encoders are not needed in their current form.
season_encoder = LabelEncoder()
holiday_encoder = LabelEncoder()
weekday_encoder = LabelEncoder()
workingday_encoder = LabelEncoder()
weathersit_encoder = LabelEncoder()

# Column name -> encoder responsible for that column.
categorical_encoders = {
    "season": season_encoder,
    "holiday": holiday_encoder,
    "weekday": weekday_encoder,
    "workingday": workingday_encoder,
    "weathersit": weathersit_encoder,
}

# One loop instead of five copy-pasted stanzas: fit the encoder on the
# column, then overwrite the column with the resulting integer codes.
for column_name, column_encoder in categorical_encoders.items():
    data[column_name] = column_encoder.fit_transform(data[column_name])

## Feature/Target Split

In [75]:
# Display the column labels of `data`.
data.columns

Index(['season', 'holiday', 'weekday', 'workingday', 'weathersit', 'temp',
       'atemp', 'hum', 'windspeed', 'casual', 'registered', 'cnt', 'day',
       'month', 'year'],
      dtype='object')

In [76]:
# The attribute names below are copied from the `data.columns` output.
# 'casual' and 'registered' are left out — presumably because they add up to
# the target 'cnt' (TODO confirm).
# NOTE(review): 'workingday', 'day' and 'year' are not used as features
# either, although 'workingday' is label-encoded — confirm this is intentional.
features = [
    "season",
    "holiday",
    "weekday",
    "weathersit",
    "temp",
    "atemp",
    "hum",
    "windspeed",
    "month",
]

# A one-element list keeps `y` a one-column DataFrame rather than a Series.
target = ["cnt"]

X = data[features]
y = data[target]

## Train/Test Split

In [80]:
# Hold out 40 % of the rows for testing; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=875,
)

## Modelling: Linear Regression

In [83]:
# Fit a linear-regression baseline on the training split.
linear_model = LinearRegression().fit(X_train, y_train)

# NOTE(review): `predicitions` is misspelled and is reassigned by the
# decision-tree cell; the name is kept unchanged in case other cells read it.
predicitions = linear_model.predict(X_test)

# Score the predictions against the held-out targets.
linear_r2 = r2_score(y_test, predicitions)
linear_rmse = root_mean_squared_error(y_test, predicitions)
linear_mae = mean_absolute_error(y_test, predicitions)

print("R² = ", linear_r2, ", RMSE = ", linear_rmse, ", MAE = ", linear_mae)

R² =  0.5405958921616075 , RMSE =  1373.1227620068084 , MAE =  1163.1284232192186


## Decision Tree Regressor

In [88]:
# Fit a regression tree limited to depth 5 on the training split.
# random_state is fixed because scikit-learn permutes the candidate features
# at every split, so ties between equally good splits could otherwise be
# broken differently from run to run and the metrics below would not be
# reproducible. The seed value mirrors the one used for the train/test split.
tree_model = DecisionTreeRegressor(max_depth=5, random_state=875)
tree_model.fit(X_train, y_train)

# NOTE(review): `predicitions` is misspelled; the name is kept unchanged in
# case other cells read it.
predicitions = tree_model.predict(X_test)

# Score the predictions against the held-out targets.
print(
    "R² = ", r2_score(y_test, predicitions),
    ", RMSE = ", root_mean_squared_error(y_test, predicitions),
    ", MAE = ", mean_absolute_error(y_test, predicitions)
)

R² =  0.5559437170404367 , RMSE =  1349.9912044826604 , MAE =  1122.168289469677
