In [39]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import matplotlib

# Load the raw dataset; skipinitialspace=True discards blanks that follow each delimiter.
df = pd.read_csv("../data/fakedata.csv", skipinitialspace=True)

In [40]:
# Preview the first 24 rows of the raw frame.
df.head(24)

Unnamed: 0,id,typeofday,day,hour,temperature,client
0,0,2,1,0,6.9,36
1,1,2,1,1,7.483333,61
2,2,2,1,2,8.066667,48
3,3,2,1,3,8.65,54
4,4,2,1,4,9.233333,63
5,5,2,1,5,9.816667,57
6,6,2,1,6,10.4,45
7,7,2,1,7,10.983333,56
8,8,2,1,8,11.566667,63
9,9,2,1,9,12.15,49


In [41]:
# (number of rows, number of columns) of the raw frame.
df.shape

(24442, 6)

In [42]:
# In the preview, `id` just mirrors the row index, so it is dropped before modelling.
# errors='ignore' keeps this cell idempotent: without it, re-running the cell after
# `id` is already gone raises KeyError.
df = df.drop(columns='id', errors='ignore')
df.head()

Unnamed: 0,typeofday,day,hour,temperature,client
0,2,1,0,6.9,36
1,2,1,1,7.483333,61
2,2,1,2,8.066667,48
3,2,1,3,8.65,54
4,2,1,4,9.233333,63


In [43]:
from sklearn.model_selection import train_test_split

# Summary statistics (count, mean, std, quartiles, min/max) for every numeric column.
df.describe()

Unnamed: 0,typeofday,day,hour,temperature,client
count,24442.0,24442.0,24442.0,24442.0,24442.0
mean,0.321659,2.998691,11.515424,12.572956,50.300589
std,0.534138,2.000204,6.927508,9.943218,27.71471
min,0.0,0.0,0.0,-7.4,0.0
25%,0.0,1.0,6.0,4.233333,29.0
50%,0.0,3.0,12.0,12.35,47.0
75%,1.0,5.0,18.0,21.033333,72.0
max,2.0,6.0,24.0,33.2,139.0


In [44]:
# Column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24442 entries, 0 to 24441
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   typeofday    24442 non-null  int64  
 1   day          24442 non-null  int64  
 2   hour         24442 non-null  int64  
 3   temperature  24442 non-null  float64
 4   client       24442 non-null  int64  
dtypes: float64(1), int64(4)
memory usage: 954.9 KB


In [48]:
from sklearn.model_selection import ShuffleSplit


In [53]:
# A single shuffled 80/20 partition of the row labels; the seed pins which rows land where.
split = ShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# n_splits=1 means the generator yields exactly one (train, test) pair, so take it directly.
train_index, test_index = next(split.split(df, df["client"]))
train_set, test_set = df.loc[train_index], df.loc[test_index]


Unnamed: 0,typeofday,day,hour,temperature,client
15973,0,6,19,-0.066667,58
4273,0,3,9,21.05,38
14682,0,2,4,-0.066667,24
10034,0,5,0,0.3,15
3745,0,2,9,-2.15,19


In [55]:
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import Lasso, LinearRegression
from sklearn.tree import DecisionTreeRegressor

# `client` is the regression target; every remaining column is a feature.
y = df['client']
X = df.drop(columns=['client'])
def find_best_model_using_gridsearchcv(X, y, random_state=0):
    """Grid-search three regressors and report the best configuration of each.

    Linear regression, lasso and a decision tree are each tuned with
    ``GridSearchCV`` over a small parameter grid, using the same five shuffled
    80/20 splits for every candidate.

    Parameters
    ----------
    X : feature table passed unchanged to ``GridSearchCV.fit``.
    y : target values passed unchanged to ``GridSearchCV.fit``.
    random_state : int, default 0
        Seed for the CV splitter and for the estimators that draw random
        numbers (``Lasso(selection='random')`` and ``DecisionTreeRegressor``).
        Without it, repeated runs can report different scores.

    Returns
    -------
    pandas.DataFrame
        One row per algorithm with columns ``model``, ``best_score`` (mean
        cross-validated value of the estimator's default ``score``) and
        ``best_params``.
    """
    # NOTE: `normalize` (LinearRegression) and criterion 'mse' (DecisionTreeRegressor)
    # were deprecated in scikit-learn 1.0 and removed in 1.2. On newer versions,
    # drop `normalize` and use 'squared_error' instead of 'mse'.
    algos = {
        'linear_regression': {
            'model': LinearRegression(),
            'params': {
                'normalize': [True, False]
            }
        },
        'lasso': {
            # selection='random' updates coefficients in random order, so seed it.
            'model': Lasso(random_state=random_state),
            'params': {
                'alpha': [1, 2],
                'selection': ['random', 'cyclic']
            }
        },
        'decision_tree': {
            # splitter='random' (and tie-breaking in 'best') depends on the RNG.
            'model': DecisionTreeRegressor(random_state=random_state),
            'params': {
                'criterion': ['mse', 'friedman_mse'],
                'splitter': ['best', 'random']
            }
        }
    }
    # One splitter shared by all searches so every model sees identical folds.
    cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=random_state)
    scores = []
    for algo_name, config in algos.items():
        gs = GridSearchCV(config['model'], config['params'], cv=cv, return_train_score=False)
        gs.fit(X, y)
        scores.append({
            'model': algo_name,
            'best_score': gs.best_score_,
            'best_params': gs.best_params_
        })

    return pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])

# Run the comparison on the full feature table; the returned frame is the cell output.
find_best_model_using_gridsearchcv(X,y)

Unnamed: 0,model,best_score,best_params
0,linear_regression,0.372235,{'normalize': True}
1,lasso,0.367864,"{'alpha': 1, 'selection': 'cyclic'}"
2,decision_tree,0.910376,"{'criterion': 'mse', 'splitter': 'best'}"


In [56]:
# Hyper-parameters taken from the decision_tree row of the grid-search results.
dt = DecisionTreeRegressor(criterion='mse', splitter='best')

In [57]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for evaluation; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.1,random_state=10)

dt.fit(X_train, y_train)
# For a regressor, score() is the R^2 on the held-out rows.
dt.score(X_test, y_test)

0.9121786415688183

In [59]:
# Inspect the training features (column order matters for predictions made later).
X_train

Unnamed: 0,typeofday,day,hour,temperature
10539,0,5,1,0.483333
11794,0,1,8,24.166667
11876,0,4,18,12.316667
6673,0,5,7,1.483333
2261,0,3,20,14.433333
...,...,...,...,...
9372,0,5,10,15.633333
7291,1,2,21,-0.633333
17728,0,2,20,22.250000
7293,1,2,23,-1.800000


In [72]:
def predict_client(typeofday, day, hour, temperature):
    """Predict `client` for one observation with the fitted tree `dt`.

    The four arguments are placed in the same order as the training columns
    (typeofday, day, hour, temperature). Previously the arguments were ignored
    and a fixed sample was always predicted.

    Returns the first (and only) element of ``dt.predict`` for that row.
    """
    # Shape (1, 4): a single sample with four features, as predict() expects 2-D input.
    features = np.array([[typeofday, day, hour, temperature]], dtype=float)
    return dt.predict(features)[0]


# Example query: typeofday=0, day=4, hour=16, temperature=-7.0.
predict_client(0,4,16,-7.0)

26.0

In [74]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [82]:
# define base model
def baseline_model():
    """Build and compile the baseline network.

    Architecture: 4 inputs -> Dense(4, relu) -> Dense(4, relu) -> Dense(1),
    all with the 'normal' kernel initializer, trained with MSE loss and Adam.
    """
    layers = [
        Dense(4, input_dim=4, kernel_initializer='normal', activation='relu'),
        Dense(4, kernel_initializer='normal', activation='relu'),
        # Single linear output unit for the regression target.
        Dense(1, kernel_initializer='normal'),
    ]
    network = Sequential(layers)
    network.compile(loss='mse', optimizer='adam')
    return network

In [None]:
# Cross-validate the network on standardized features. Scaling lives inside the
# pipeline, so it is fitted together with the model on each training fold.
estimators = [
    ('standardize', StandardScaler()),
    ('mlp', KerasRegressor(build_fn=baseline_model, epochs=20, batch_size=5, verbose=0)),
]
pipeline = Pipeline(estimators)
kfold = KFold(n_splits=10)
results = cross_val_score(pipeline, X, y, cv=kfold)
print("Standardized: %.2f (%.2f) MSE" % (results.mean(), results.std()))