In [1]:
import pandas as pd
import pathlib2 as pathlib

from gensim.utils import tokenize
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, KFold
from sklearn.model_selection import GridSearchCV

import numpy as np
import re

import joblib
randomstate=313

In [38]:
cwd=pathlib.Path.cwd()
datapath=cwd.joinpath('data')
datawreviewsfile=datapath.joinpath('processed/innerJoinData.csv')
# dataworeviewsfile=datapath.joinpath('raw/collaborative_book_metadata_with_genredummies.csv')
models_path=cwd.joinpath('models')

datadf=pd.read_csv(datawreviewsfile,sep=';')
# fulldatadf=pd.read_csv(dataworeviewsfile,sep=';')
datadf=datadf.drop(['biography', 'children', 'comics', 'crime', 'fantasy', 'fiction',
       'graphic', 'historicalfiction', 'history', 'mystery', 'nonfiction',
       'paranormal', 'poetry', 'romance', 'thriller', 'youngadult'],axis=1)

def remove_punctuation(x):
    cleanedtext=re.sub('[^A-Za-z0-9]+', ' ', x)
    return cleanedtext
datadf['description']=datadf.description.transform(remove_punctuation)
datadf['genre']=datadf.genre.apply(lambda x: x.replace("'","")) # remove '
datadf['genre']=datadf.genre.apply(lambda x: x.replace("-","")) # revome - 
datadf['genre']=datadf.genre.apply(lambda x: x.replace(" ","")) # remove spaces
datadf['genre']=datadf.genre.apply(lambda x: x[1:-1].split(',')) # split into list

mlbmodelpath=models_path.joinpath('mlb_model')
mlbmodel=joblib.load(mlbmodelpath.joinpath('mlbmodel.pkl').as_posix())
newcols=mlbmodel.classes_
pred=mlbmodel.transform(datadf.genre)
preddf=pd.DataFrame(pred,columns=newcols)
datadf=datadf.join(preddf)

descriptionmodelpath=models_path.joinpath('description_models')
descmodel=joblib.load(descriptionmodelpath.joinpath('gridamodel.pkl').as_posix())
newcols=descmodel.best_estimator_.named_steps['lda'].get_feature_names_out()
pred=descmodel.best_estimator_.transform(datadf.description)
preddf=pd.DataFrame(pred,columns=newcols)
datadf=datadf.join(preddf)


In [13]:
# datadf.head()
datadf.columns

Index(['title_x', 'book_id_x', 'user_id_mapping', 'book_id_mapping',
       'Predicted Rating', 'Actual Rating', 'book_id_y', 'title_y',
       'image_url', 'url', 'num_pages', 'ratings_count', 'description',
       'genre', 'name', 'num_genres', 'biography', 'children', 'comics',
       'crime', 'fantasy', 'fiction', 'graphic', 'historicalfiction',
       'history', 'mystery', 'nonfiction', 'paranormal', 'poetry', 'romance',
       'thriller', 'youngadult', 'latentdirichletallocation0',
       'latentdirichletallocation1', 'latentdirichletallocation2'],
      dtype='object')

In [39]:
# drop columns not for prediction
# copied list:
# ['title_x','book_id_x','book_id_mapping','Predicted Rating','book_id_y', 'title_y', 'image_url', 'url','description', 'genre','num_genres']
# keep:'user_id_mapping' for user identification --> would be replaced by cluster representation, concern: as a numerical value implies false relationship between values
# target: 'Actual Rating'
dropcols=['title_x','book_id_x','book_id_mapping','Predicted Rating','book_id_y', 'title_y', 'image_url', 'url','description', 'genre','num_genres','name'] #drop author name too, as binarization too big and few duplicates
datadf=datadf.drop(dropcols,axis=1)

In [40]:
datadf.columns

Index(['user_id_mapping', 'Actual Rating', 'num_pages', 'ratings_count',
       'biography', 'children', 'comics', 'crime', 'fantasy', 'fiction',
       'graphic', 'historicalfiction', 'history', 'mystery', 'nonfiction',
       'paranormal', 'poetry', 'romance', 'thriller', 'youngadult',
       'latentdirichletallocation0', 'latentdirichletallocation1',
       'latentdirichletallocation2'],
      dtype='object')

In [41]:
xtrain,xtest,ytrain,ytest=train_test_split(datadf.drop(['Actual Rating'],axis=1),datadf['Actual Rating'],test_size=0.1,random_state=randomstate,shuffle=True)

In [43]:
print(len(xtrain), len(ytrain))
print(len(xtest),len(ytest))

17841 17841
1983 1983


In [49]:
xtrain.dtypes

user_id_mapping                 int64
num_pages                       int64
ratings_count                   int64
biography                       int32
children                        int32
comics                          int32
crime                           int32
fantasy                         int32
fiction                         int32
graphic                         int32
historicalfiction               int32
history                         int32
mystery                         int32
nonfiction                      int32
paranormal                      int32
poetry                          int32
romance                         int32
thriller                        int32
youngadult                      int32
latentdirichletallocation0    float64
latentdirichletallocation1    float64
latentdirichletallocation2    float64
dtype: object

## Regression

In [57]:
from sklearn import svm
from sklearn.tree import DecisionTreeRegressor
predictionmodelpath=models_path.joinpath('prediction_models')


In [58]:
modela=svm.SVR(max_iter=150)
paramsa={'kernel':['linear','poly','rbf','sigmoid'],
         'degree':[3,4],
         'gamma':['auto','scale'],
         'coef0':[0.0,0.5,1],
         'tol':[0.001,0.003,0.01],
         'C':[1.0,0.5,2.0],
         'epsilon':[0.1,0.5],
         'shrinking':[True,False]
         }
grida=GridSearchCV(modela,paramsa,n_jobs=-1,cv=5,return_train_score=True)

modelb=DecisionTreeRegressor(random_state=randomstate)
paramsb={'criterion':['squared_error','friedman_mse','absolute_error','poisson'],
         'splitter':['best','random'],
         'max_depth':[None,5,10],
         'min_samples_split':[2,25,100],
         'min_samples_leaf':[1,100,50],
         'max_features':[None,'log2',5],
         'max_leaf_nodes':[None,10,20],
         'ccp_alpha':[0.0,0.2,0.5]
         }
gridb=GridSearchCV(modelb,paramsb,n_jobs=-1,cv=5,return_train_score=True)

In [59]:
gridamodel=grida.fit(xtrain,ytrain)
joblib.dump(gridamodel,predictionmodelpath.joinpath('regsvmmodel.pkl').as_posix())

gridbmodel=gridb.fit(xtrain,ytrain)
joblib.dump(gridbmodel,predictionmodelpath.joinpath('regdtmodel.pkl').as_posix())

540 fits failed out of a total of 8640.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
540 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\lekle\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\lekle\anaconda3\Lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\lekle\anaconda3\Lib\site-packages\sklearn\svm\_base.py", line 268, in fit
    raise ValueError(
ValueError: The dual coefficients or intercepts are not finite. The input data may contain large values and 

['c:/Users/lekle/Projects_Code/HA_MaschinellesLernen_MADS23oB/models/prediction_models/regdtmodel.pkl']

## Classification

In [60]:
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [61]:
modelc=SVC(max_iter=150,random_state=randomstate)
paramsc={
    'C':[1.0,0.5,2.0],
    'kernel':['linear','poly','rbf','sigmoid'],
    'degree':[3,4],
    'gamma':['auto','scale'],
    'coef0':[0.0,0.5,1],
    'tol':[0.001,0.003,0.01],
    'shrinking':[True,False],
    'decision_function_shape':['ovo','ovr']
}
gridc=GridSearchCV(modelc,paramsc,n_jobs=-1,cv=5,return_train_score=True)

modeld=DecisionTreeClassifier(random_state=randomstate)
paramsd={
    'criterion':['gini','entropy','log_loss'],
    'splitter':['best','random'],
    'max_depth':[None,5,10],
    'min_samples_split':[2,25,100],
    'min_samples_leaf':[1,100,50],
    'max_features':[None,'log2',5],
    'max_leaf_nodes':[None,10,20],
    'ccp_alpha':[0.0,0.2,0.5]    
}
gridd=GridSearchCV(modeld,paramsd,n_jobs=-1,cv=5,return_train_score=True)

In [62]:
gridcmodel=gridc.fit(xtrain,ytrain)
joblib.dump(gridcmodel,predictionmodelpath.joinpath('classsvmmodel.pkl').as_posix())

gridbdmodel=gridd.fit(xtrain,ytrain)
joblib.dump(gridbmodel,predictionmodelpath.joinpath('clasdtmodel.pkl').as_posix())

540 fits failed out of a total of 8640.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
540 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\lekle\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\lekle\anaconda3\Lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\lekle\anaconda3\Lib\site-packages\sklearn\svm\_base.py", line 268, in fit
    raise ValueError(
ValueError: The dual coefficients or intercepts are not finite. The input data may contain large values and 

['c:/Users/lekle/Projects_Code/HA_MaschinellesLernen_MADS23oB/models/prediction_models/clasdtmodel.pkl']