# Demonstration der Aufteilung

In [9]:
import pandas as pd

# Read the hotel data and keep only the first six rows, so that the
# fold indices printed further down stay small enough to follow by eye.
df = pd.read_csv("./hotels.csv").head(6)

# Bare last expression: rendered as a table by the notebook.
df

Unnamed: 0,Gewinn,Preis in Mio,Quadratmeter,Stadt
0,119000.0,21.88,3938.0,Berlin
1,250000.0,27.95,3986.0,München
2,250000.0,16.09,2574.0,Köln
3,145000.0,27.58,4155.0,München
4,110000.0,23.76,3795.0,Berlin
5,246000.0,22.88,2773.0,München


In [10]:
# Feature matrix: the "Gewinn" (profit) and "Quadratmeter" (square metres) columns.
X = df.loc[:, ["Gewinn", "Quadratmeter"]].to_numpy()
# Target: selected with a column *list*, so Y stays two-dimensional (n_rows, 1).
Y = df.loc[:, ["Preis in Mio"]].to_numpy()

In [11]:
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression

# Pin the number of folds explicitly instead of relying on the default:
# scikit-learn changed KFold's default n_splits from 3 to 5 in version 0.22.
# With only six rows, five folds would produce single-sample test sets, for
# which the R^2 score is not well-defined.
kf = KFold(n_splits=3)

for train_index, test_index in kf.split(X):
    # Show which row positions end up in which part of the split.
    print("train: " + str(train_index))
    print("test: " + str(test_index))
    print("-----------")

    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = Y[train_index], Y[test_index]

    # A fresh model per fold, fitted on the training rows only.
    model = LinearRegression()
    model.fit(X_train, y_train)

    # R^2 on the held-out rows of this fold.
    print(model.score(X_test, y_test))


train: [2 3 4 5]
test: [0 1]
-----------
-0.23437406150775944
train: [0 1 4 5]
test: [2 3]
-----------
0.28817502863345124
train: [0 1 2 3]
test: [4 5]
-----------
-87.19441129592327




# Demonstration Shuffle und n_splits

In [13]:
import pandas as pd

# This time load the complete file (no row limit).
df = pd.read_csv("./hotels.csv")

# Display the rows ordered by city. The sorted frame is only shown, not
# assigned, so `df` itself keeps its original row order.
df.sort_values(by="Stadt")

Unnamed: 0,Gewinn,Preis in Mio,Quadratmeter,Stadt
0,119000.0,21.88,3938.0,Berlin
81,25000.0,9.07,1002.0,Berlin
80,124000.0,26.31,5201.0,Berlin
77,64000.0,14.90,2220.0,Berlin
148,35000.0,15.80,2281.0,Berlin
...,...,...,...,...
115,76000.0,18.78,2186.0,München
82,58000.0,19.93,3306.0,München
117,33000.0,16.04,1505.0,München
13,62000.0,17.08,1941.0,München


In [14]:
# Feature matrix built from the "Gewinn" and "Quadratmeter" columns.
X = df.loc[:, ["Gewinn", "Quadratmeter"]].to_numpy()
# Target taken via a one-element column list, which keeps Y two-dimensional.
Y = df.loc[:, ["Preis in Mio"]].to_numpy()

In [15]:
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression

# shuffle=True randomises which rows land in which fold. Fixing random_state
# makes the folds (and therefore the printed scores) identical on every
# Restart & Run All; change the seed to see a different shuffle.
kf = KFold(n_splits=4, shuffle=True, random_state=42)

for train_index, test_index in kf.split(X):
    # Show which row positions end up in which part of the split.
    print("train: " + str(train_index))
    print("test: " + str(test_index))
    print("-----------")

    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = Y[train_index], Y[test_index]

    # Train a linear regression on the training rows of this fold.
    model = LinearRegression()
    model.fit(X_train, y_train)

    # R^2 on the held-out rows of this fold.
    print(model.score(X_test, y_test))

train: [  0   1   2   3   6   7   8   9  11  13  16  17  18  19  20  21  22  23
  24  26  27  28  29  30  31  32  33  35  37  38  39  40  42  43  44  45
  47  48  49  50  51  53  54  55  58  59  60  61  62  63  64  65  67  69
  70  71  72  73  75  76  77  78  80  81  82  83  84  85  86  87  88  89
  91  92  94  95  96  99 101 103 107 108 109 110 112 114 116 117 120 121
 122 123 124 126 127 128 129 130 131 132 133 134 137 139 140 141 143 144
 145 146 148 149]
test: [  4   5  10  12  14  15  25  34  36  41  46  52  56  57  66  68  74  79
  90  93  97  98 100 102 104 105 106 111 113 115 118 119 125 135 136 138
 142 147]
-----------
0.8217555873813684
train: [  0   1   2   3   4   5   6   7   8   9  10  12  14  15  16  17  19  20
  21  25  26  27  28  32  33  34  35  36  38  39  40  41  44  45  46  48
  49  50  51  52  53  54  56  57  59  61  62  65  66  67  68  69  70  71
  72  73  74  78  79  80  81  82  85  86  87  88  89  90  91  92  93  94
  96  97  98 100 102 104 105 106 107 109 110 

# Kurzschreibweise

In [16]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

import numpy as np

# Shorthand for the manual loop: cross_val_score does the splitting, fitting
# and scoring itself and returns one score per fold (10 folds here).
scores = cross_val_score(
    LinearRegression(),
    X,
    Y,
    cv=KFold(n_splits=10),
)

# Per-fold scores first, then their average.
print(scores)
print(scores.mean())

[0.65017794 0.64157437 0.7682274  0.75577885 0.81121291 0.87869648
 0.82420155 0.82443676 0.89919227 0.92067196]
0.7974170485910209


# Zusatz: Repeated K-Fold Cross-Validation

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold

import numpy as np

# Repeated K-fold: the K-fold split is redone n_repeats times with a
# different shuffle each time, giving n_splits * n_repeats scores in total.
# n_splits is spelled out (5 is RepeatedKFold's default) and random_state is
# fixed so that the shuffles, and with them the mean score, are reproducible.
repeated_cv = RepeatedKFold(n_splits=5, n_repeats=1000, random_state=42)

scores = cross_val_score(LinearRegression(), X, Y, cv=repeated_cv)

# One score per (repeat, fold) pair, followed by the overall average.
print(scores)
print(np.mean(scores))