In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [2]:
data = pd.read_csv('./perf-invoices.csv')
data.head()

Unnamed: 0,sleep,class,study,activity,happiness,performance
0,2,10,7,10,3,68
1,2,14,9,9,0,87
2,3,3,11,6,5,49
3,10,14,11,1,7,79
4,9,5,8,4,9,54


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   sleep        10000 non-null  int64
 1   class        10000 non-null  int64
 2   study        10000 non-null  int64
 3   activity     10000 non-null  int64
 4   happiness    10000 non-null  int64
 5   performance  10000 non-null  int64
dtypes: int64(6)
memory usage: 468.9 KB


In [11]:
data_copy = data.copy()
data_copy.describe()

Unnamed: 0,sleep,class,study,activity,happiness,performance
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,10.0719,7.9922,7.9506,5.5028,5.0041,50.288
std,5.497504,4.31029,4.32507,2.874199,3.156942,28.543748
min,1.0,1.0,1.0,1.0,0.0,1.0
25%,5.0,4.0,4.0,3.0,2.0,26.0
50%,10.0,8.0,8.0,5.0,5.0,50.0
75%,15.0,12.0,12.0,8.0,8.0,75.0
max,19.0,15.0,15.0,10.0,10.0,100.0


In [19]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
import xgboost as xgb

from sklearn.metrics import r2_score, mean_squared_error, accuracy_score, confusion_matrix
from sklearn.model_selection import cross_val_score, RandomizedSearchCV, GridSearchCV
from sklearn.preprocessing import StandardScaler

In [20]:
X_c = data_copy.drop('performance',axis=1).values
y_c = data_copy['performance'].values.reshape(-1,1)

X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(X_c,y_c,test_size=0.2, random_state=42)

X_train_scaled = StandardScaler().fit_transform(X_train_c)
y_train_scaled = StandardScaler().fit_transform(y_train_c)
X_test_scaled = StandardScaler().fit_transform(X_test_c)
y_test_scaled = StandardScaler().fit_transform(y_test_c)

svr = SVR()

parameters =  { 'kernel' : ['rbf', 'sigmoid'],
                 'gamma' : [0.001, 0.01, 0.1, 1, 'scale'],
                 'tol' : [0.0001],
                 'C': [0.001, 0.01, 0.1, 1, 10, 100] }
svr_grid = GridSearchCV(estimator=svr, param_grid=parameters, cv=10, verbose=4, n_jobs=-1)
svr_grid.fit(X_train_scaled, y_train_scaled.ravel())

Fitting 10 folds for each of 60 candidates, totalling 600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    3.9s
[Parallel(n_jobs=-1)]: Done  74 tasks      | elapsed:   22.1s
[Parallel(n_jobs=-1)]: Done 197 tasks      | elapsed:   53.4s
[Parallel(n_jobs=-1)]: Done 368 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed:  5.9min finished


GridSearchCV(cv=10, estimator=SVR(), n_jobs=-1,
             param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100],
                         'gamma': [0.001, 0.01, 0.1, 1, 'scale'],
                         'kernel': ['rbf', 'sigmoid'], 'tol': [0.0001]},
             verbose=4)

In [21]:
svr = SVR(C=10, gamma=0.1, tol=0.0001)
svr.fit(X_train_scaled, y_train_scaled.ravel())
print(svr_grid.best_estimator_)
print(svr_grid.best_score_)

SVR(C=0.001, gamma=0.1, kernel='sigmoid', tol=0.0001)
-0.0010121261136006754


In [22]:
cv_svr = svr_grid.best_score_

y_pred_svr_train = svr.predict(X_train_scaled)
r2_score_svr_train = r2_score(y_train_scaled, y_pred_svr_train)

y_pred_svr_test = svr.predict(X_test_scaled)
r2_score_svr_test = r2_score(y_test_scaled, y_pred_svr_test)

rmse_svr = (np.sqrt(mean_squared_error(y_test_scaled, y_pred_svr_test)))

print('CV : {0:.3f}'.format(cv_svr.mean()))
print('R2_score (train) : {0:.3f}'.format(r2_score_svr_train))
print('R2 score (test) : {0:.3f}'.format(r2_score_svr_test))
print('RMSE : {0:.3f}'.format(rmse_svr))

CV : -0.001
R2_score (train) : 0.016
R2 score (test) : -0.051
RMSE : 1.025


In [23]:
import pickle

In [24]:
filename="model.pkl"
pickle.dump(svr,open(filename, "wb"))