In [1]:
import pandas as pd
import pathlib2 as pathlib

from gensim.utils import tokenize
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, KFold
from sklearn.model_selection import GridSearchCV

import numpy as np
import re

import joblib
randomstate=313

In [7]:
cwd=pathlib.Path.cwd()
datapath=cwd.joinpath('data')
datawreviewsfile=datapath.joinpath('processed/innerJoinData.csv')
# dataworeviewsfile=datapath.joinpath('raw/collaborative_book_metadata_with_genredummies.csv')
models_path=cwd.joinpath('models')

datadf=pd.read_csv(datawreviewsfile,sep=';')
# fulldatadf=pd.read_csv(dataworeviewsfile,sep=';')
datadf=datadf.drop(['biography', 'children', 'comics', 'crime', 'fantasy', 'fiction',
       'graphic', 'historicalfiction', 'history', 'mystery', 'nonfiction',
       'paranormal', 'poetry', 'romance', 'thriller', 'youngadult'],axis=1)

def remove_punctuation(x):
    cleanedtext=re.sub('[^A-Za-z0-9]+', ' ', x)
    return cleanedtext
datadf['description']=datadf.description.transform(remove_punctuation)
datadf['genre']=datadf.genre.apply(lambda x: x.replace("'","")) # remove '
datadf['genre']=datadf.genre.apply(lambda x: x.replace("-","")) # revome - 
datadf['genre']=datadf.genre.apply(lambda x: x.replace(" ","")) # remove spaces
datadf['genre']=datadf.genre.apply(lambda x: x[1:-1].split(',')) # split into list

mlbmodelpath=models_path.joinpath('mlb_model')
mlbmodel=joblib.load(mlbmodelpath.joinpath('mlbmodel.pkl').as_posix())
newcols=mlbmodel.classes_
pred=mlbmodel.transform(datadf.genre)
preddf=pd.DataFrame(pred,columns=newcols)
datadf=datadf.join(preddf)

descriptionmodelpath=models_path.joinpath('description_models')
descmodel=joblib.load(descriptionmodelpath.joinpath('gridamodel.pkl').as_posix())
newcols=descmodel.best_estimator_.named_steps['lda'].get_feature_names_out()
pred=descmodel.best_estimator_.transform(datadf.description)
preddf=pd.DataFrame(pred,columns=newcols)
datadf=datadf.join(preddf)


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [13]:
# datadf.head()
datadf.columns

Index(['title_x', 'book_id_x', 'user_id_mapping', 'book_id_mapping',
       'Predicted Rating', 'Actual Rating', 'book_id_y', 'title_y',
       'image_url', 'url', 'num_pages', 'ratings_count', 'description',
       'genre', 'name', 'num_genres', 'biography', 'children', 'comics',
       'crime', 'fantasy', 'fiction', 'graphic', 'historicalfiction',
       'history', 'mystery', 'nonfiction', 'paranormal', 'poetry', 'romance',
       'thriller', 'youngadult', 'latentdirichletallocation0',
       'latentdirichletallocation1', 'latentdirichletallocation2'],
      dtype='object')

In [3]:
# drop columns not for prediction
# copied list:
# ['title_x','book_id_x','book_id_mapping','Predicted Rating','book_id_y', 'title_y', 'image_url', 'url','description', 'genre','num_genres']
# keep:'user_id_mapping' for user identification --> would be replaced by cluster representation, concern: as a numerical value implies false relationship between values
# target: 'Actual Rating'
dropcols=['title_x','book_id_x','book_id_mapping','Predicted Rating','book_id_y', 'title_y', 'image_url', 'url','description', 'genre','num_genres','name'] #drop author name too, as binarization too big and few duplicates
datadf=datadf.drop(dropcols,axis=1)

In [40]:
datadf.columns

Index(['user_id_mapping', 'Actual Rating', 'num_pages', 'ratings_count',
       'biography', 'children', 'comics', 'crime', 'fantasy', 'fiction',
       'graphic', 'historicalfiction', 'history', 'mystery', 'nonfiction',
       'paranormal', 'poetry', 'romance', 'thriller', 'youngadult',
       'latentdirichletallocation0', 'latentdirichletallocation1',
       'latentdirichletallocation2'],
      dtype='object')

In [4]:
xtrain,xtest,ytrain,ytest=train_test_split(datadf.drop(['Actual Rating'],axis=1),datadf['Actual Rating'],test_size=0.1,random_state=randomstate,shuffle=True)

In [5]:
print(len(xtrain), len(ytrain))
print(len(xtest),len(ytest))

17841 17841
1983 1983


In [49]:
xtrain.dtypes

user_id_mapping                 int64
num_pages                       int64
ratings_count                   int64
biography                       int32
children                        int32
comics                          int32
crime                           int32
fantasy                         int32
fiction                         int32
graphic                         int32
historicalfiction               int32
history                         int32
mystery                         int32
nonfiction                      int32
paranormal                      int32
poetry                          int32
romance                         int32
thriller                        int32
youngadult                      int32
latentdirichletallocation0    float64
latentdirichletallocation1    float64
latentdirichletallocation2    float64
dtype: object

## Regression

In [9]:
from sklearn import svm
from sklearn.tree import DecisionTreeRegressor
predictionmodelpath=models_path.joinpath('prediction_models')


In [58]:
modela=svm.SVR(max_iter=150)
paramsa={'kernel':['linear','poly','rbf','sigmoid'],
         'degree':[3,4],
         'gamma':['auto','scale'],
         'coef0':[0.0,0.5,1],
         'tol':[0.001,0.003,0.01],
         'C':[1.0,0.5,2.0],
         'epsilon':[0.1,0.5],
         'shrinking':[True,False]
         }
grida=GridSearchCV(modela,paramsa,n_jobs=-1,cv=5,return_train_score=True)

modelb=DecisionTreeRegressor(random_state=randomstate)
paramsb={'criterion':['squared_error','friedman_mse','absolute_error','poisson'],
         'splitter':['best','random'],
         'max_depth':[None,5,10],
         'min_samples_split':[2,25,100],
         'min_samples_leaf':[1,100,50],
         'max_features':[None,'log2',5],
         'max_leaf_nodes':[None,10,20],
         'ccp_alpha':[0.0,0.2,0.5]
         }
gridb=GridSearchCV(modelb,paramsb,n_jobs=-1,cv=5,return_train_score=True)

In [59]:
gridamodel=grida.fit(xtrain,ytrain)
joblib.dump(gridamodel,predictionmodelpath.joinpath('regsvmmodel.pkl').as_posix())

gridbmodel=gridb.fit(xtrain,ytrain)
joblib.dump(gridbmodel,predictionmodelpath.joinpath('regdtmodel.pkl').as_posix())

540 fits failed out of a total of 8640.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
540 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\lekle\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\lekle\anaconda3\Lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\lekle\anaconda3\Lib\site-packages\sklearn\svm\_base.py", line 268, in fit
    raise ValueError(
ValueError: The dual coefficients or intercepts are not finite. The input data may contain large values and 

['c:/Users/lekle/Projects_Code/HA_MaschinellesLernen_MADS23oB/models/prediction_models/regdtmodel.pkl']

## Classification

In [15]:
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [16]:
modelc=SVC(max_iter=150,random_state=randomstate)
paramsc={
    'C':[1.0,0.5,2.0],
    'kernel':['linear','poly','rbf','sigmoid'],
    'degree':[3,4],
    'gamma':['auto','scale'],
    'coef0':[0.0,0.5,1],
    'tol':[0.001,0.003,0.01],
    'shrinking':[True,False],
    'decision_function_shape':['ovo','ovr']
}
gridc=GridSearchCV(modelc,paramsc,n_jobs=-1,cv=5,return_train_score=True)

modeld=DecisionTreeClassifier(random_state=randomstate)
paramsd={
    'criterion':['gini','entropy','log_loss'],
    'splitter':['best','random'],
    'max_depth':[None,5,10],
    'min_samples_split':[2,25,100],
    'min_samples_leaf':[1,100,50],
    'max_features':[None,'log2',5],
    'max_leaf_nodes':[None,10,20],
    'ccp_alpha':[0.0,0.2,0.5]    
}
gridd=GridSearchCV(modeld,paramsd,n_jobs=-1,cv=5,return_train_score=True)

In [17]:
# gridcmodel=gridc.fit(xtrain,ytrain)
# joblib.dump(gridcmodel,predictionmodelpath.joinpath('classsvmmodel.pkl').as_posix())

griddmodel=gridd.fit(xtrain,ytrain)
joblib.dump(griddmodel,predictionmodelpath.joinpath('clasdtmodel.pkl').as_posix())

['c:/Users/lekle/Projects_Code/HA_MaschinellesLernen_MADS23oB/models/prediction_models/clasdtmodel.pkl']

In [10]:
clasdtmodel=joblib.load(predictionmodelpath.joinpath('clasdtmodel.pkl').as_posix())
classvmmodel=joblib.load(predictionmodelpath.joinpath('classsvmmodel.pkl').as_posix())

regdtmodel=joblib.load(predictionmodelpath.joinpath('clasdtmodel.pkl').as_posix())
regsvmmodel=joblib.load(predictionmodelpath.joinpath('regsvmmodel.pkl').as_posix())


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [11]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
# from sklearn.metrics import confusion_matrix
from sklearn.metrics import r2_score

In [12]:
predregsvm=regsvmmodel.predict(xtest)
predregdt=regdtmodel.predict(xtest)

msesvm=mean_squared_error(ytest,predregsvm)
msedt=mean_squared_error(ytest,predregdt)
r2svm=r2_score(ytest,predregsvm)
r2dt=r2_score(ytest,predregdt)

In [11]:
print(f'MSE SVM: {msesvm}, MSE DT: {msedt}')

MSE SVM: 1.3435918142788636, MSE DT: 1.0332349609141325


In [14]:
print(f'R2 SVM: {r2svm}, R2 DT: {r2dt}')

R2 SVM: -0.20169660528880606, R2 DT: -0.2723536997823319


In [24]:
predclassvm=classvmmodel.predict(xtest)
predclasdt=clasdtmodel.predict(xtest)

accsvm=accuracy_score(ytest,predclassvm)
accdt=accuracy_score(ytest,predclasdt)
msesvm2=mean_squared_error(ytest,predclassvm)
msedt2=mean_squared_error(ytest,predclasdt)

In [25]:
print(f'Accuracy SVM: {accsvm}, Accuracy DT: {accdt}')
print(f'MSE SVM: {msesvm2}, MSE DT: {msedt2}')

Accuracy SVM: 0.3398890569843671, Accuracy DT: 0.38628340897629854
MSE SVM: 2.017650025214322, MSE DT: 1.4225920322743317


## Regression and Classification NN

In [4]:
import tensorflow as tf
from optuna import Trial, create_study
from sklearn.model_selection import KFold
import numpy as np


  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# kf = KFold(n_splits=5, shuffle=True, random_state=randomstate)

In [16]:
# for x in kf.split(xtrain):
#     s=datadf.loc[x[0]]
#     print(s.head())
#     break

In [28]:
# xtrain.shape[1]

22

In [5]:
xtrain,xtest,ytrain,ytest=train_test_split(datadf.drop(['Actual Rating'],axis=1),datadf['Actual Rating'],test_size=0.1,random_state=randomstate,shuffle=True)
xtrain,xtest,ytrain,ytest=xtrain.reset_index(drop=True),xtest.reset_index(drop=True),ytrain.reset_index(drop=True),ytest.reset_index(drop=True) #kf.split() needs reset index; if drop=False, wird index zu neuer Spalte
studyname='prediction_regression'
thismodelpath=models_path.joinpath(f'{studyname}')
try: 
    thismodelpath.mkdir(exist_ok=False)
except FileExistsError:
    print('Directory already exists')


Directory already exists


In [16]:
# kf = KFold(n_splits=5, shuffle=True, random_state=randomstate)
# for x in kf.split(xtrain):
#         trdata=tf.convert_to_tensor(xtrain.loc[train])
#         trlabel=tf.convert_to_tensor(ytrain.loc[train])
#         print(trdata.shape,trlabel.shape)
#     break

       index  user_id_mapping  num_pages  ratings_count  biography  children  \
0       6101            28678        388          86210          0         0   
1      15750            59915        184         314286          1         1   
5       9736            69526        217         141600          1         0   
6       5147             7295        432         675927          0         0   
7       9540            39264        288         748926          0         0   
...      ...              ...        ...            ...        ...       ...   
17834  13312             5035        748         215791          0         1   
17835  19016            70029        359         152039          0         0   
17836  13257            57629        748         215791          0         1   
17837   6183            10617        388          86210          0         0   
17838  13343             6309        748         215791          0         1   

       comics  crime  fantasy  fiction 

In [6]:
def init_regmodel(trial):
    # model definition
    nlayers=trial.suggest_int('n_layers',1,10)
    model=tf.keras.Sequential()
    activation=trial.suggest_categorical('activation',['relu','tanh','sigmoid'])
    inputshape=xtrain.shape[1]
    model.add(tf.keras.layers.Dense(16, input_shape=(inputshape,),activation=activation))
    for i in range(nlayers):
        numhidden=trial.suggest_int(f'n_units_l{i}',32,128,log=True)
        model.add(tf.keras.layers.Dense(units=numhidden,activation=activation,name=f'layer{i}'))
    model.add(tf.keras.layers.Dense(units=1,activation='linear'))
    model.compile(optimizer='adam', loss='mean_absolute_error', metrics=['mean_squared_error'])
    return model

def reg_objective(trial:Trial,xtrain:pd.DataFrame,ytrain:pd.Series)->float:
    kf = KFold(n_splits=5, shuffle=True, random_state=randomstate)
    model=init_regmodel(trial)
    # training and evaluation
    msetest=[]
    for train,test in kf.split(xtrain):
        trdata=tf.convert_to_tensor(xtrain.loc[train])
        trlabel=tf.convert_to_tensor(ytrain.loc[train])
        evdata=tf.convert_to_tensor(xtrain.loc[test])
        evlabel=tf.convert_to_tensor(ytrain.loc[test])




        model.fit(trdata,trlabel,validation_split=0.1,epochs=10, batch_size=128,verbose=0) #more verbosity seems to crash in execution
        
        loss,mse=model.evaluate(evdata, evlabel)
        # predtrain=tmodel.predict(evdata)
        # mse=tf.keras.losses.MSE(evlabel, predtrain)
        msetest.append(mse)
    
    return np.mean(msetest)

In [7]:
storage=thismodelpath.joinpath(f"{studyname}.db")
if storage.exists():
    storage.unlink()
else:
    print("No sqlite db found")

In [8]:
study=create_study(study_name=studyname,direction='minimize',storage=f'sqlite:///{storage.as_posix()}',load_if_exists=False)
study.optimize(lambda trial: reg_objective(trial,xtrain,ytrain),n_trials=20,n_jobs=-1,show_progress_bar=True)

[I 2024-09-06 21:22:13,168] A new study created in RDB with name: prediction_regression
  0%|          | 0/20 [00:00<?, ?it/s]

  8/112 [=>............................] - ETA: 1s - loss: 0.8369 - mean_squared_error: 1.0851

                                      



  0%|          | 0/20 [01:45<?, ?it/s]

 25/112 [=====>........................] - ETA: 1s - loss: 0.8038 - mean_squared_error: 1.1218

                                      

 19/112 [====>.........................] - ETA: 1s - loss: 0.8218 - mean_squared_error: 1.1389



[I 2024-09-06 21:23:58,900] Trial 0 finished with value: 1.103722834587097 and parameters: {'n_layers': 3, 'activation': 'tanh', 'n_units_l0': 85, 'n_units_l1': 115, 'n_units_l2': 36}. Best is trial 5 with value: 1.1023431539535522.


  0%|          | 0/20 [01:45<?, ?it/s]



Best trial: 5. Best value: 1.10234:   0%|          | 0/20 [01:45<?, ?it/s]

 22/112 [====>.........................] - ETA: 1s - loss: 0.7980 - mean_squared_error: 1.0944

Best trial: 5. Best value: 1.10234:   5%|▌         | 1/20 [01:45<33:31, 105.87s/it]



Best trial: 5. Best value: 1.10234:  10%|█         | 2/20 [01:46<13:06, 43.69s/it] 



Best trial: 5. Best value: 1.10234:  10%|█         | 2/20 [01:47<13:06, 43.69s/it]

[I 2024-09-06 21:24:00,856] Trial 3 finished with value: 1.1031191110610963 and parameters: {'n_layers': 3, 'activation': 'sigmoid', 'n_units_l0': 68, 'n_units_l1': 110, 'n_units_l2': 39}. Best is trial 5 with value: 1.1023431539535522.


Best trial: 5. Best value: 1.10234:  15%|█▌        | 3/20 [01:48<06:59, 24.67s/it]

[I 2024-09-06 21:24:01,142] Trial 4 finished with value: 1.1027643442153932 and parameters: {'n_layers': 4, 'activation': 'tanh', 'n_units_l0': 78, 'n_units_l1': 91, 'n_units_l2': 58, 'n_units_l3': 54}. Best is trial 5 with value: 1.1023431539535522.


Best trial: 5. Best value: 1.10234:  20%|██        | 4/20 [01:48<04:00, 15.01s/it]



Best trial: 5. Best value: 1.10234:  20%|██        | 4/20 [01:52<04:00, 15.01s/it]

[I 2024-09-06 21:24:05,549] Trial 9 finished with value: 1.104007315635681 and parameters: {'n_layers': 2, 'activation': 'sigmoid', 'n_units_l0': 49, 'n_units_l1': 35}. Best is trial 5 with value: 1.1023431539535522.
[I 2024-09-06 21:24:05,605] Trial 11 finished with value: 205.5865447998047 and parameters: {'n_layers': 4, 'activation': 'relu', 'n_units_l0': 37, 'n_units_l1': 64, 'n_units_l2': 58, 'n_units_l3': 116}. Best is trial 5 with value: 1.1023431539535522.


Best trial: 5. Best value: 1.10234:  30%|███       | 6/20 [01:52<01:33,  6.68s/it]



Best trial: 5. Best value: 1.10234:  30%|███       | 6/20 [01:56<01:33,  6.68s/it]

[I 2024-09-06 21:24:09,451] Trial 2 finished with value: 1.1047450304031372 and parameters: {'n_layers': 8, 'activation': 'tanh', 'n_units_l0': 34, 'n_units_l1': 92, 'n_units_l2': 98, 'n_units_l3': 95, 'n_units_l4': 47, 'n_units_l5': 32, 'n_units_l6': 126, 'n_units_l7': 64}. Best is trial 5 with value: 1.1023431539535522.


Best trial: 5. Best value: 1.10234:  35%|███▌      | 7/20 [01:56<01:16,  5.92s/it]



Best trial: 5. Best value: 1.10234:  35%|███▌      | 7/20 [01:59<01:16,  5.92s/it]

[I 2024-09-06 21:24:12,861] Trial 13 finished with value: 1.1010230302810669 and parameters: {'n_layers': 9, 'activation': 'sigmoid', 'n_units_l0': 61, 'n_units_l1': 34, 'n_units_l2': 90, 'n_units_l3': 120, 'n_units_l4': 33, 'n_units_l5': 61, 'n_units_l6': 47, 'n_units_l7': 109, 'n_units_l8': 57}. Best is trial 13 with value: 1.1010230302810669.


Best trial: 13. Best value: 1.10102:  45%|████▌     | 9/20 [02:00<00:41,  3.76s/it]

[I 2024-09-06 21:24:13,063] Trial 12 finished with value: 1.1073248147964478 and parameters: {'n_layers': 4, 'activation': 'tanh', 'n_units_l0': 126, 'n_units_l1': 61, 'n_units_l2': 62, 'n_units_l3': 95}. Best is trial 13 with value: 1.1010230302810669.

                                                                                   



Best trial: 13. Best value: 1.10102:  45%|████▌     | 9/20 [02:01<00:41,  3.76s/it]

[I 2024-09-06 21:24:14,780] Trial 15 finished with value: 1.1041677236557006 and parameters: {'n_layers': 10, 'activation': 'tanh', 'n_units_l0': 47, 'n_units_l1': 63, 'n_units_l2': 87, 'n_units_l3': 72, 'n_units_l4': 82, 'n_units_l5': 37, 'n_units_l6': 93, 'n_units_l7': 109, 'n_units_l8': 50, 'n_units_l9': 40}. Best is trial 13 with value: 1.1010230302810669.

Best trial: 13. Best value: 1.10102:  50%|█████     | 10/20 [02:01<00:31,  3.19s/it]



                                                                                    



Best trial: 13. Best value: 1.10102:  50%|█████     | 10/20 [02:02<00:31,  3.19s/it]

[I 2024-09-06 21:24:15,294] Trial 6 finished with value: 2.0939876079559325 and parameters: {'n_layers': 9, 'activation': 'relu', 'n_units_l0': 63, 'n_units_l1': 82, 'n_units_l2': 52, 'n_units_l3': 75, 'n_units_l4': 55, 'n_units_l5': 33, 'n_units_l6': 64, 'n_units_l7': 48, 'n_units_l8': 89}. Best is trial 13 with value: 1.1010230302810669.

                                                                                    

[I 2024-09-06 21:24:15,317] Trial 1 finished with value: 1.1048293352127074 and parameters: {'n_layers': 10, 'activation': 'tanh', 'n_units_l0': 117, 'n_units_l1': 37, 'n_units_l2': 101, 'n_units_l3': 64, 'n_units_l4': 110, 'n_units_l5': 127, 'n_units_l6': 47, 'n_units_l7': 116, 'n_units_l8': 43, 'n_units_l9': 36}. Best is trial 13 with value: 1.1010230302810669.

Best trial: 13. Best value: 1.10102:  50%|█████     | 10/20 [02:02<00:31,  3.19s/it]



Best trial: 13. Best value: 1.10102:  50%|█████     | 10/20 [02:02<00:31,  3.19s/it]



Best trial: 13. Best value: 1.10102:  55%|█████▌    | 11/20 [02:02<00:21,  2.41s/it]



Best trial: 13. Best value: 1.10102:  55%|█████▌    | 11/20 [02:02<00:21,  2.41s/it]



Best trial: 13. Best value: 1.10102:  60%|██████    | 12/20 [02:02<00:19,  2.41s/it]

[I 2024-09-06 21:24:15,623] Trial 8 finished with value: 1.1060708999633788 and parameters: {'n_layers': 8, 'activation': 'tanh', 'n_units_l0': 43, 'n_units_l1': 58, 'n_units_l2': 95, 'n_units_l3': 52, 'n_units_l4': 83, 'n_units_l5': 99, 'n_units_l6': 52, 'n_units_l7': 127}. Best is trial 13 with value: 1.1010230302810669.

Best trial: 13. Best value: 1.10102:  65%|██████▌   | 13/20 [02:02<00:09,  1.37s/it]



Best trial: 13. Best value: 1.10102:  65%|██████▌   | 13/20 [02:02<00:09,  1.37s/it]

[I 2024-09-06 21:24:15,936] Trial 7 finished with value: 1.1041629552841186 and parameters: {'n_layers': 10, 'activation': 'sigmoid', 'n_units_l0': 77, 'n_units_l1': 80, 'n_units_l2': 36, 'n_units_l3': 49, 'n_units_l4': 42, 'n_units_l5': 69, 'n_units_l6': 36, 'n_units_l7': 46, 'n_units_l8': 101, 'n_units_l9': 35}. Best is trial 13 with value: 1.1010230302810669.


Best trial: 13. Best value: 1.10102:  70%|███████   | 14/20 [02:02<00:06,  1.11s/it]



Best trial: 13. Best value: 1.10102:  70%|███████   | 14/20 [02:03<00:06,  1.11s/it]

[I 2024-09-06 21:24:16,965] Trial 14 finished with value: 2.5916846990585327 and parameters: {'n_layers': 9, 'activation': 'relu', 'n_units_l0': 39, 'n_units_l1': 38, 'n_units_l2': 70, 'n_units_l3': 74, 'n_units_l4': 98, 'n_units_l5': 73, 'n_units_l6': 34, 'n_units_l7': 119, 'n_units_l8': 33}. Best is trial 13 with value: 1.1010230302810669.

Best trial: 13. Best value: 1.10102:  75%|███████▌  | 15/20 [02:03<00:05,  1.09s/it]



Best trial: 13. Best value: 1.10102:  75%|███████▌  | 15/20 [02:04<00:05,  1.09s/it]

[I 2024-09-06 21:24:17,166] Trial 10 finished with value: 1.1838032960891725 and parameters: {'n_layers': 10, 'activation': 'relu', 'n_units_l0': 71, 'n_units_l1': 43, 'n_units_l2': 32, 'n_units_l3': 128, 'n_units_l4': 86, 'n_units_l5': 48, 'n_units_l6': 73, 'n_units_l7': 120, 'n_units_l8': 73, 'n_units_l9': 57}. Best is trial 13 with value: 1.1010230302810669.


Best trial: 13. Best value: 1.10102:  80%|████████  | 16/20 [02:04<00:03,  1.19it/s]



Best trial: 13. Best value: 1.10102:  85%|████████▌ | 17/20 [02:23<00:17,  5.92s/it]

[I 2024-09-06 21:24:36,257] Trial 18 finished with value: 1.1052341699600219 and parameters: {'n_layers': 1, 'activation': 'tanh', 'n_units_l0': 93}. Best is trial 13 with value: 1.1010230302810669.


Best trial: 13. Best value: 1.10102:  85%|████████▌ | 17/20 [02:23<00:17,  5.92s/it]

[I 2024-09-06 21:24:36,823] Trial 16 finished with value: 42923.60053710938 and parameters: {'n_layers': 2, 'activation': 'relu', 'n_units_l0': 90, 'n_units_l1': 54}. Best is trial 13 with value: 1.1010230302810669.


Best trial: 13. Best value: 1.10102:  90%|█████████ | 18/20 [02:23<00:08,  4.39s/it]



Best trial: 13. Best value: 1.10102:  95%|█████████▌| 19/20 [02:24<00:03,  3.46s/it]

[I 2024-09-06 21:24:37,994] Trial 19 finished with value: 1.1027844905853272 and parameters: {'n_layers': 2, 'activation': 'tanh', 'n_units_l0': 98, 'n_units_l1': 52}. Best is trial 13 with value: 1.1010230302810669.


Best trial: 13. Best value: 1.10102: 100%|██████████| 20/20 [02:33<00:00,  7.67s/it]

[I 2024-09-06 21:24:46,585] Trial 17 finished with value: 1.2242587327957153 and parameters: {'n_layers': 9, 'activation': 'relu', 'n_units_l0': 41, 'n_units_l1': 38, 'n_units_l2': 54, 'n_units_l3': 63, 'n_units_l4': 68, 'n_units_l5': 77, 'n_units_l6': 98, 'n_units_l7': 72, 'n_units_l8': 92}. Best is trial 13 with value: 1.1010230302810669.





In [9]:
joblib.dump(study,thismodelpath.joinpath(f'study_{study.study_name}.pkl').as_posix())

['c:/Users/lekle/Projects_Code/HA_MaschinellesLernen_MADS23oB/models/prediction_regression/study_prediction_regression.pkl']

In [12]:
model = init_regmodel(study.best_trial)
model.fit(tf.convert_to_tensor(xtrain),tf.convert_to_tensor(ytrain),validation_split=0.1,epochs=10, batch_size=128,verbose=0)
model.save(thismodelpath.joinpath(f'fmodel_reg.h5').as_posix())
model.save(thismodelpath.joinpath(f'fmodel_reg.h5').as_posix())

INFO:tensorflow:Assets written to: c:/Users/lekle/Projects_Code/HA_MaschinellesLernen_MADS23oB/models/prediction_regression/fmodel_reg.pkl\assets


In [5]:
# ytrain=pd.Series(tf.keras.utils.to_categorical(datadf['Actual Rating']))
xtrain,xtest,ytrain,ytest=train_test_split(datadf.drop(['Actual Rating'],axis=1),datadf['Actual Rating'],test_size=0.1,random_state=randomstate,shuffle=True)
xtrain,xtest,ytrain,ytest=xtrain.reset_index(drop=True),xtest.reset_index(drop=True),ytrain.reset_index(drop=True),ytest.reset_index(drop=True)

studyname='prediction_classification'
thismodelpath=models_path.joinpath(f'{studyname}')
try: 
    thismodelpath.mkdir(exist_ok=False)
except FileExistsError:
    print('Directory already exists')

def init_clasmodel(trial):
    # model definition
    nlayers=trial.suggest_int('n_layers',1,10)
    model=tf.keras.Sequential()
    activation=trial.suggest_categorical('activation',['relu','tanh','sigmoid'])
    inputshape=xtrain.shape[1]
    model.add(tf.keras.layers.Dense(16, input_shape=(inputshape,),activation=activation))
    for i in range(nlayers):
        numhidden=trial.suggest_int(f'n_units_l{i}',32,128,log=True)
        model.add(tf.keras.layers.Dense(units=numhidden,activation=activation,name=f'layer{i}'))
    model.add(tf.keras.layers.Dense(units=5,activation='softmax'))
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

def clas_objective(trial:Trial,xtrain:pd.DataFrame,ytrain:pd.Series)->float:
    kf = KFold(n_splits=5, shuffle=True, random_state=randomstate)
    ytrain=datadf['Actual Rating'].apply(lambda x: x-1)
    model=init_clasmodel(trial)
    # training and evaluation
    acctest=[]
    for train,test in kf.split(xtrain):
        trdata=tf.convert_to_tensor(xtrain.loc[train])
        trlabel=tf.convert_to_tensor(tf.keras.utils.to_categorical(ytrain.loc[train],num_classes=5))
        evdata=tf.convert_to_tensor(xtrain.loc[test])
        evlabel=tf.convert_to_tensor(tf.keras.utils.to_categorical(ytrain.loc[test],num_classes=5))




        model.fit(trdata,trlabel,validation_split=0.1,epochs=10, batch_size=128,verbose=0) #more verbosity seems to crash in execution
        
        loss,acc=model.evaluate(evdata, evlabel)
        # predtrain=tmodel.predict(evdata)
        # mse=tf.keras.losses.MSE(evlabel, predtrain)
        acctest.append(acc)
    
    return np.mean(acctest)

storage=thismodelpath.joinpath(f"{studyname}.db")
if storage.exists():
    storage.unlink()
else:
    print("No sqlite db found")

study=create_study(study_name=studyname,direction='maximize',storage=f'sqlite:///{storage.as_posix()}',load_if_exists=False)
study.optimize(lambda trial: clas_objective(trial,xtrain,ytrain),n_trials=20,n_jobs=-1,show_progress_bar=True)

joblib.dump(study,thismodelpath.joinpath(f'study_{study.study_name}.pkl').as_posix())

model = init_clasmodel(study.best_trial)
ytrain=datadf['Actual Rating'].apply(lambda x: x-1)
model.fit(tf.convert_to_tensor(xtrain),tf.convert_to_tensor(tf.keras.utils.to_categorical(ytrain,num_classes=5)),validation_split=0.1,epochs=10, batch_size=128,verbose=0)
model.save(thismodelpath.joinpath(f'fmodel_clas.h5').as_posix())
model.save(thismodelpath.joinpath(f'fmodel_clas.h5').as_posix())

Directory already exists


[I 2024-09-06 22:07:13,694] A new study created in RDB with name: prediction_classification
  0%|          | 0/20 [00:00<?, ?it/s]



                                      



  0%|          | 0/20 [01:40<?, ?it/s]



Best trial: 2. Best value: 0.340451:   0%|          | 0/20 [01:40<?, ?it/s]



Best trial: 2. Best value: 0.340451:   5%|▌         | 1/20 [01:40<31:48, 100.46s/it]



Best trial: 2. Best value: 0.340451:   5%|▌         | 1/20 [01:49<31:48, 100.46s/it]

[I 2024-09-06 22:09:03,383] Trial 1 finished with value: 0.32755870223045347 and parameters: {'n_layers': 4, 'activation': 'relu', 'n_units_l0': 45, 'n_units_l1': 41, 'n_units_l2': 90, 'n_units_l3': 123}. Best is trial 2 with value: 0.3404510736465454.


Best trial: 2. Best value: 0.340451:  10%|█         | 2/20 [01:49<14:03, 46.86s/it] 

[I 2024-09-06 22:09:03,466] Trial 13 finished with value: 0.2646112784743309 and parameters: {'n_layers': 1, 'activation': 'relu', 'n_units_l0': 40}. Best is trial 2 with value: 0.3404510736465454.
 21/112 [====>.........................] - ETA: 2s - loss: 1.3410 - accuracy: 0.3408

                                                                                   





[I 2024-09-06 22:09:09,867] Trial 14 finished with value: 0.34274940490722655 and parameters: {'n_layers': 3, 'activation': 'tanh', 'n_units_l0': 43, 'n_units_l1': 123, 'n_units_l2': 79}. Best is trial 14 with value: 0.34274940490722655.

Best trial: 2. Best value: 0.340451:  15%|█▌        | 3/20 [01:56<13:16, 46.86s/it]



Best trial: 14. Best value: 0.342749:  20%|██        | 4/20 [01:56<05:13, 19.59s/it]



                                                                                    

  5/112 [>.............................] - ETA: 1s - loss: 1.2556 - accuracy: 0.3562

Best trial: 14. Best value: 0.342749:  20%|██        | 4/20 [01:58<05:13, 19.59s/it]

[I 2024-09-06 22:09:12,484] Trial 7 finished with value: 0.33832228779792783 and parameters: {'n_layers': 8, 'activation': 'sigmoid', 'n_units_l0': 95, 'n_units_l1': 33, 'n_units_l2': 38, 'n_units_l3': 60, 'n_units_l4': 54, 'n_units_l5': 39, 'n_units_l6': 36, 'n_units_l7': 32}. Best is trial 8 with value: 0.34801824688911437.
 21/112 [====>.........................] - ETA: 2s - loss: 1.3377 - accuracy: 0.3408

                                                                                    





 23/112 [=====>........................] - ETA: 2s - loss: 1.3469 - accuracy: 0.3383[I 2024-09-06 22:09:12,868] Trial 3 finished with value: 0.34560939073562624 and parameters: {'n_layers': 5, 'activation': 'relu', 'n_units_l0': 72, 'n_units_l1': 104, 'n_units_l2': 38, 'n_units_l3': 88, 'n_units_l4': 64}. Best is trial 6 with value: 0.3508210301399231.


Best trial: 14. Best value: 0.342749:  20%|██        | 4/20 [01:59<05:13, 19.59s/it]

 25/112 [=====>........................] - ETA: 2s - loss: 1.3582 - accuracy: 0.3400

                                                                                    





[I 2024-09-06 22:09:12,915] Trial 8 finished with value: 0.34801824688911437 and parameters: {'n_layers': 8, 'activation': 'relu', 'n_units_l0': 83, 'n_units_l1': 39, 'n_units_l2': 90, 'n_units_l3': 73, 'n_units_l4': 123, 'n_units_l5': 79, 'n_units_l6': 114, 'n_units_l7': 44}. Best is trial 6 with value: 0.3508210301399231.


Best trial: 14. Best value: 0.342749:  20%|██        | 4/20 [01:59<05:13, 19.59s/it]

  1/112 [..............................] - ETA: 9s - loss: 1.1413 - accuracy: 0.4375

                                                                                    





  6/112 [>.............................] - ETA: 3s - loss: 1.2898 - accuracy: 0.3542[I 2024-09-06 22:09:13,020] Trial 15 finished with value: 0.2757110595703125 and parameters: {'n_layers': 5, 'activation': 'relu', 'n_units_l0': 55, 'n_units_l1': 58, 'n_units_l2': 50, 'n_units_l3': 103, 'n_units_l4': 45}. Best is trial 6 with value: 0.3508210301399231.
  3/112 [..............................] - ETA: 4s - loss: 1.2272 - accuracy: 0.3333

Best trial: 14. Best value: 0.342749:  20%|██        | 4/20 [01:59<05:13, 19.59s/it]



                                                                                    

[I 2024-09-06 22:09:13,150] Trial 6 finished with value: 0.3508210301399231 and parameters: {'n_layers': 10, 'activation': 'sigmoid', 'n_units_l0': 41, 'n_units_l1': 101, 'n_units_l2': 91, 'n_units_l3': 113, 'n_units_l4': 114, 'n_units_l5': 101, 'n_units_l6': 47, 'n_units_l7': 37, 'n_units_l8': 97, 'n_units_l9': 39}. Best is trial 6 with value: 0.3508210301399231.

Best trial: 14. Best value: 0.342749:  20%|██        | 4/20 [01:59<05:13, 19.59s/it]


  9/112 [=>............................] - ETA: 4s - loss: 1.2935 - accuracy: 0.3542

                                                                                    

 10/112 [=>............................] - ETA: 4s - loss: 1.3028 - accuracy: 0.3625[I 2024-09-06 22:09:13,150] Trial 10 finished with value: 0.3338371813297272 and parameters: {'n_layers': 6, 'activation': 'tanh', 'n_units_l0': 43, 'n_units_l1': 41, 'n_units_l2': 112, 'n_units_l3': 84, 'n_units_l4': 43, 'n_units_l5': 42}. Best is trial 6 with value: 0.3508210301399231.


Best trial: 6. Best value: 0.350821:  20%|██        | 4/20 [01:59<05:13, 19.59s/it]



Best trial: 6. Best value: 0.350821:  25%|██▌       | 5/20 [01:59<03:38, 14.55s/it]

 15/112 [===>..........................] - ETA: 3s - loss: 1.3467 - accuracy: 0.3521

Best trial: 6. Best value: 0.350821:  25%|██▌       | 5/20 [01:59<03:38, 14.55s/it]

 16/112 [===>..........................] - ETA: 4s - loss: 1.3511 - accuracy: 0.3418

Best trial: 6. Best value: 0.350821:  35%|███▌      | 7/20 [02:00<01:10,  5.46s/it]

 17/112 [===>..........................] - ETA: 4s - loss: 1.3548 - accuracy: 0.3382

Best trial: 6. Best value: 0.350821:  35%|███▌      | 7/20 [02:00<01:10,  5.46s/it]

 19/112 [====>.........................] - ETA: 3s - loss: 1.3559 - accuracy: 0.3421

Best trial: 6. Best value: 0.350821:  35%|███▌      | 7/20 [02:00<01:10,  5.46s/it]

 21/112 [====>.........................] - ETA: 3s - loss: 1.3429 - accuracy: 0.3408

                                                                                   



Best trial: 4. Best value: 0.350821:  40%|████      | 8/20 [02:00<01:05,  5.46s/it]

 22/112 [====>.........................] - ETA: 5s - loss: 1.3460 - accuracy: 0.3366

Best trial: 4. Best value: 0.350821:  50%|█████     | 10/20 [02:00<00:19,  1.94s/it]

  9/112 [=>............................] - ETA: 7s - loss: 1.2912 - accuracy: 0.3542 

Best trial: 4. Best value: 0.350821:  50%|█████     | 10/20 [02:00<00:19,  1.94s/it]



                                                                                    



Best trial: 4. Best value: 0.350821:  55%|█████▌    | 11/20 [02:01<00:17,  1.94s/it]



Best trial: 4. Best value: 0.350821:  55%|█████▌    | 11/20 [02:01<00:17,  1.94s/it]



Best trial: 4. Best value: 0.350821:  60%|██████    | 12/20 [02:01<00:12,  1.52s/it]



                                                                                    



Best trial: 4. Best value: 0.350821:  60%|██████    | 12/20 [02:02<00:12,  1.52s/it]

[I 2024-09-06 22:09:15,675] Trial 11 finished with value: 0.3487470388412476 and parameters: {'n_layers': 4, 'activation': 'sigmoid', 'n_units_l0': 55, 'n_units_l1': 61, 'n_units_l2': 99, 'n_units_l3': 64}. Best is trial 4 with value: 0.3508210301399231.


Best trial: 4. Best value: 0.350821:  60%|██████    | 12/20 [02:02<00:12,  1.52s/it]

[I 2024-09-06 22:09:15,773] Trial 9 finished with value: 0.3508210301399231 and parameters: {'n_layers': 5, 'activation': 'tanh', 'n_units_l0': 114, 'n_units_l1': 32, 'n_units_l2': 85, 'n_units_l3': 95, 'n_units_l4': 54}. Best is trial 4 with value: 0.3508210301399231.


Best trial: 4. Best value: 0.350821:  75%|███████▌  | 15/20 [02:02<00:04,  1.21it/s]

[I 2024-09-06 22:09:16,068] Trial 5 finished with value: 0.34633798599243165 and parameters: {'n_layers': 10, 'activation': 'sigmoid', 'n_units_l0': 35, 'n_units_l1': 111, 'n_units_l2': 44, 'n_units_l3': 63, 'n_units_l4': 47, 'n_units_l5': 59, 'n_units_l6': 121, 'n_units_l7': 66, 'n_units_l8': 124, 'n_units_l9': 72}. Best is trial 4 with value: 0.3508210301399231.


Best trial: 12. Best value: 0.350933:  75%|███████▌  | 15/20 [02:04<00:04,  1.21it/s]

[I 2024-09-06 22:09:18,081] Trial 12 finished with value: 0.35093313455581665 and parameters: {'n_layers': 9, 'activation': 'tanh', 'n_units_l0': 49, 'n_units_l1': 35, 'n_units_l2': 94, 'n_units_l3': 65, 'n_units_l4': 126, 'n_units_l5': 114, 'n_units_l6': 43, 'n_units_l7': 45, 'n_units_l8': 72}. Best is trial 12 with value: 0.35093313455581665.


Best trial: 12. Best value: 0.350933:  80%|████████  | 16/20 [02:04<00:04,  1.05s/it]



Best trial: 12. Best value: 0.350933:  80%|████████  | 16/20 [02:32<00:04,  1.05s/it]

[I 2024-09-06 22:09:45,844] Trial 18 finished with value: 0.2839562505483627 and parameters: {'n_layers': 3, 'activation': 'relu', 'n_units_l0': 44, 'n_units_l1': 47, 'n_units_l2': 74}. Best is trial 12 with value: 0.35093313455581665.


Best trial: 12. Best value: 0.350933:  85%|████████▌ | 17/20 [02:32<00:22,  7.38s/it]



Best trial: 12. Best value: 0.350933:  85%|████████▌ | 17/20 [02:33<00:22,  7.38s/it]

[I 2024-09-06 22:09:47,007] Trial 19 finished with value: 0.30194753110408784 and parameters: {'n_layers': 3, 'activation': 'relu', 'n_units_l0': 117, 'n_units_l1': 65, 'n_units_l2': 99}. Best is trial 12 with value: 0.35093313455581665.


Best trial: 12. Best value: 0.350933:  90%|█████████ | 18/20 [02:33<00:11,  5.81s/it]



Best trial: 12. Best value: 0.350933:  90%|█████████ | 18/20 [02:34<00:11,  5.81s/it]

[I 2024-09-06 22:09:47,832] Trial 16 finished with value: 0.3429735004901886 and parameters: {'n_layers': 9, 'activation': 'sigmoid', 'n_units_l0': 45, 'n_units_l1': 73, 'n_units_l2': 72, 'n_units_l3': 53, 'n_units_l4': 39, 'n_units_l5': 62, 'n_units_l6': 39, 'n_units_l7': 48, 'n_units_l8': 92}. Best is trial 12 with value: 0.35093313455581665.


Best trial: 12. Best value: 0.350933:  95%|█████████▌| 19/20 [02:34<00:04,  4.48s/it]



Best trial: 12. Best value: 0.350933:  95%|█████████▌| 19/20 [02:37<00:04,  4.48s/it]

[I 2024-09-06 22:09:51,132] Trial 17 finished with value: 0.3370324671268463 and parameters: {'n_layers': 9, 'activation': 'relu', 'n_units_l0': 109, 'n_units_l1': 72, 'n_units_l2': 38, 'n_units_l3': 90, 'n_units_l4': 49, 'n_units_l5': 35, 'n_units_l6': 113, 'n_units_l7': 61, 'n_units_l8': 33}. Best is trial 12 with value: 0.35093313455581665.


Best trial: 12. Best value: 0.350933: 100%|██████████| 20/20 [02:37<00:00,  7.87s/it]


NameError: name 'init_regmodel' is not defined

In [8]:
# model = init_clasmodel(study.best_trial)
# ytrain=datadf['Actual Rating'].apply(lambda x: x-1)
# model.fit(tf.convert_to_tensor(xtrain),tf.convert_to_tensor(tf.keras.utils.to_categorical(ytrain,num_classes=5)),validation_split=0.1,epochs=10, batch_size=128,verbose=0)
# model.save(thismodelpath.joinpath(f'fmodel_clas.h5').as_posix())
# model.save(thismodelpath.joinpath(f'fmodel_clas.h5').as_posix())

In [14]:
tf.keras.utils.to_categorical(ytrain)

array([[0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1., 0.],
       ...,
       [0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 1., 0., 0., 0.]], dtype=float32)

In [16]:
tf.keras.utils.to_categorical(datadf['Actual Rating'])

array([[0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 1.],
       ...,
       [0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 1.]], dtype=float32)

In [43]:
tf.keras.utils.to_categorical(datadf['Actual Rating'],num_classes=5)

array([[0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1.],
       ...,
       [0., 0., 0., 0., 1.],
       [0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 1.]], dtype=float32)

In [42]:
datadf['Actual Rating']=datadf['Actual Rating'].apply(lambda x:x-1)

In [46]:
tf.convert_to_tensor(tf.keras.utils.to_categorical(datadf['Actual Rating'],num_classes=5))

<tf.Tensor: shape=(19824, 5), dtype=float32, numpy=
array([[0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1.],
       ...,
       [0., 0., 0., 0., 1.],
       [0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 1.]], dtype=float32)>

In [12]:
import joblib
import pandas as pd
import tensorflow as tf

In [14]:
predictionmodelpath=models_path.joinpath('prediction_models')

regsvmmodel=joblib.load(predictionmodelpath.joinpath('regsvmmodel.pkl').as_posix())
regdtmodel=joblib.load(predictionmodelpath.joinpath('regdtmodel.pkl').as_posix())
classvmmodel=joblib.load(predictionmodelpath.joinpath('classsvmmodel.pkl').as_posix())
clasdtmodel=joblib.load(predictionmodelpath.joinpath('clasdtmodel.pkl').as_posix())
regmodel = tf.keras.models.load_model(models_path.joinpath(f'prediction_regression/fmodel_reg.h5').as_posix())
classmodel=tf.keras.models.load_model(models_path.joinpath(f'prediction_classification/fmodel_clas.h5').as_posix())

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [15]:
dropcols=['title_x','book_id_x','book_id_mapping','book_id_y', 'title_y', 'image_url', 'url','description', 'genre','num_genres','name'] #drop author name too, as binarization too big and few duplicates
datadf=datadf.drop(dropcols,axis=1)
datadf.columns

KeyError: "['title_x', 'book_id_x', 'book_id_mapping', 'book_id_y', 'title_y', 'image_url', 'url', 'description', 'genre', 'num_genres', 'name'] not found in axis"

In [16]:
x=datadf[['user_id_mapping', 'num_pages',
       'ratings_count', 'biography', 'children', 'comics', 'crime', 'fantasy',
       'fiction', 'graphic', 'historicalfiction', 'history', 'mystery',
       'nonfiction', 'paranormal', 'poetry', 'romance', 'thriller',
       'youngadult', 'latentdirichletallocation0',
       'latentdirichletallocation1', 'latentdirichletallocation2']]
y=datadf[['Predicted Rating', 'Actual Rating',]]

In [51]:
y['regsvm']=regsvmmodel.best_estimator_.predict(x)
y['regdt']=regdtmodel.best_estimator_.predict(x)
y['classvm']=classvmmodel.best_estimator_.predict(x)
y['clasdt']=clasdtmodel.best_estimator_.predict(x)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y['regsvm']=regsvmmodel.best_estimator_.predict(x)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y['regdt']=regdtmodel.best_estimator_.predict(x)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y['classvm']=classvmmodel.best_estimator_.predict(x)
A value is trying to be set on a copy of a slice fro

In [45]:
y['regmodel']=regmodel.predict(tf.convert_to_tensor(x))



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y['regmodel']=regmodel.predict(tf.convert_to_tensor(x))


In [43]:
def get_class_pred(x):
    pred=classmodel.predict(tf.convert_to_tensor(x))
    classpred=[]
    for x in pred:
        classpred.append(np.argmax(x)+1)
    return classpred

y['clasmodel']=get_class_pred(x)





A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y['clasmodel']=get_class_pred(x)


In [49]:
from sklearn.metrics import mean_squared_error

In [46]:
y.head()

Unnamed: 0,Predicted Rating,Actual Rating,regsvm,regdt,classmv,clasdt,clasmodel,regmodel
0,4.0,5,4.311521,4.487047,5,5,5,4.02972
1,4.1,4,4.300884,4.487047,5,5,5,4.02972
2,4.5,5,4.282279,4.487047,5,5,5,4.029719
3,4.2,5,4.291284,4.487047,5,5,5,4.02972
4,4.4,5,4.29893,4.487047,5,5,5,4.02972


In [52]:
baseline=mean_squared_error(y['Actual Rating'],y['Predicted Rating'])
regsvmscore=mean_squared_error(y['Actual Rating'],y['regsvm'])
regtdtscore=mean_squared_error(y['Actual Rating'],y['regdt'])
classvmscore=mean_squared_error(y['Actual Rating'],y['classvm'])
clasdtscore=mean_squared_error(y['Actual Rating'],y['clasdt'])
clasmodelscore=mean_squared_error(y['Actual Rating'],y['clasmodel'])
regmodelscore=mean_squared_error(y['Actual Rating'],y['regmodel'])

In [53]:
print(f'MSE Predicted Rating: {baseline}')
print(f'MSE regsvm: {regsvmscore}')
print(f'MSE regdt: {regtdtscore}')
print(f'MSE classsvm: {classvmscore}')
print(f'MSE clasdt: {clasdtscore}')
print(f'MSE clasmodel: {clasmodelscore}') # --> only predicts 5*s
print(f'MSE regmodel: {regmodelscore}')

MSE Predicted Rating: 0.7844668079096044
MSE regsvm: 1.3151503911513545
MSE regdt: 0.9944644826843734
MSE classsvm: 2.0205306698950767
MSE clasdt: 1.4129338175948345
MSE clasmodel: 2.3582526230831315
MSE regmodel: 1.1142164139117647


- Classification works worse
- regression tree works best
- still not as good as predicted rating, but closest (0,78 vs 0,99)

- a working user clustering might work
- normalization?, Scaling?

- improve 