In [1]:
import pandas as pd
import json
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
import wordcloud
import tensorflow as tf
import keras_tuner
from nltk.probability import FreqDist
from funciones import build_optimizer, get_sent_tokens,get_lemma,get_stems,remove_stop_words,get_word_tokens, remove_special_characters, tokenize_summary, filter_and_tokenize_words, get_word_counter
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score 
from sklearn.metrics import classification_report 
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from optuna.integration import OptunaSearchCV
from optuna.distributions import IntDistribution, CategoricalDistribution,LogUniformDistribution
from keras.preprocessing import sequence
from keras.layers import LSTM, Dense, Embedding, SimpleRNN, Dropout
from keras.models import Sequential
from keras.callbacks import EarlyStopping
from keras.utils import to_categorical, pad_sequences




[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Alvaro\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Alvaro\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Alvaro\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import pickle

# Load the dataset object stored in data_clean.pkl (presumably the cleaned corpus, judging by the file name).
# NOTE(review): unpickling can execute arbitrary code, so only load files you trust.
with open("data_clean.pkl", "rb") as clean_file:
    data = pickle.load(clean_file)

In [3]:
# Use a label encoder to map every genre name to an integer class id.
LE = LabelEncoder()
y = LE.fit_transform(data['genre'])

# Display the genre -> integer id mapping learned by the encoder.
genre_ids = LE.transform(LE.classes_)
dict(zip(LE.classes_, genre_ids))

{"Children's literature": 0,
 'Crime Fiction': 1,
 'Fantasy': 2,
 'Fiction': 3,
 'Historical novel': 4,
 'Horror': 5,
 'Mystery': 6,
 'Science Fiction': 7,
 'Thriller': 8}

In [4]:
# Hold out 20% of the summaries as a test set; the fixed seed keeps the split repeatable.
# NOTE(review): the split is not stratified although genre supports are uneven; consider stratify=y.
X_train, X_test, y_train, y_test = train_test_split(
    data['summary'],
    y,
    test_size=0.2,
    random_state=557,
)

In [5]:
# Apply TF-IDF so the text summaries can be fed to the models.
# Initialize TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer() # it was observed to work better without limiting max_df or max_features

# Fit the vectorizer on the training summaries and transform them in one step
tf_train  = tfidf_vectorizer.fit_transform(X_train)

# Transform the test summaries with the vectorizer fitted on the training set (no refit here)
tf_test = tfidf_vectorizer.transform(X_test)



In [6]:
def apply_tfidf(string, vectorizer=None):
    """Turn raw text into TF-IDF features usable by the trained models.

    The documents are transformed with an already-fitted vectorizer instead of
    fitting a fresh ``TfidfVectorizer`` on ``X_train`` on every call, which
    repeated the whole fit just to vectorize a sentence or two.

    Args:
        string: An iterable of raw text documents. A single ``str`` is also
            accepted and is treated as one document.
        vectorizer: A fitted object exposing ``transform``. When omitted, the
            notebook-level ``tfidf_vectorizer`` (fitted on ``X_train``) is used,
            so the features line up with the ones the models were trained on.

    Returns:
        The result of ``vectorizer.transform`` for the given documents.
    """
    if vectorizer is None:
        # Reuse the vectorizer fitted on the training split; looked up at call
        # time so the function can be defined before or after that cell runs.
        vectorizer = tfidf_vectorizer
    # A bare string is not a collection of documents; wrap it so it is
    # vectorized as exactly one document.
    documents = [string] if isinstance(string, str) else string
    return vectorizer.transform(documents)
    

# Modelos

## Random Forest

In [7]:
# Hyper-parameter grid explored exhaustively for the random forest
# (5 * 3 * 3 * 3 * 2 = 270 combinations).
param_grid = {
    'n_estimators': [100, 300, 500, 1000, 1500],
    'max_depth': [10, 20, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['log2', 'sqrt'],
}
# Seed the forest so repeated runs of the search give the same scores and the
# same winning configuration; 557 is the seed already used for the data split.
rf = RandomForestClassifier(random_state=557)

# Exhaustive search with 2-fold cross-validation, using all available cores.
grid = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=2,
    n_jobs=-1,
    verbose=True,
)
# Fit every candidate on the TF-IDF features of the training split.
grid.fit(tf_train, y_train)

print("Best parameters found: ", grid.best_params_)
print("Best score found: ", grid.best_score_)
# Keep the best estimator found by the search for evaluation.
rf = grid.best_estimator_

Fitting 2 folds for each of 270 candidates, totalling 540 fits
Best parameters found:  {'max_depth': 50, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 1000}
Best score found:  0.507863537382047


In [8]:
import pickle

# Persist the fitted grid search so its results can be recovered later without re-running everything.
with open("rf_clean.pkl", 'wb') as rf_cache:
    pickle.dump(grid, rf_cache)

In [9]:
import pickle

# Restore the grid search saved in rf_clean.pkl.
# NOTE(review): unpickling can execute arbitrary code, so only load files you trust.
with open("rf_clean.pkl", 'rb') as rf_cache:
    grid = pickle.load(rf_cache)

In [10]:
# Display the restored grid-search object to inspect its configuration.
grid

In [11]:
# Take the best estimator found by the grid search for the evaluation below.
rf = grid.best_estimator_

In [12]:
# Predict genres for the held-out test set with the tuned random forest.
y_pred = rf.predict(tf_test)

# Overall accuracy on the test split.
rf_accuracy = accuracy_score(y_test, y_pred)
print('Accuracy Score :', rf_accuracy)

# Per-genre precision, recall and F1.
print('Report : ')
rf_report = classification_report(y_test, y_pred)
print(rf_report)

Accuracy Score : 0.5471698113207547
Report : 
              precision    recall  f1-score   support

           0       0.73      0.34      0.46       267
           1       0.67      0.38      0.48       138
           2       0.58      0.52      0.55       288
           3       0.43      0.80      0.56       399
           4       0.00      0.00      0.00        99
           5       1.00      0.10      0.18        82
           6       0.89      0.18      0.29       136
           7       0.59      0.89      0.71       548
           8       0.00      0.00      0.00       110

    accuracy                           0.55      2067
   macro avg       0.54      0.36      0.36      2067
weighted avg       0.56      0.55      0.49      2067



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Logistic Regression

In [13]:
# One-vs-rest wrapper around a logistic regression with default settings.
lr = LogisticRegression()
clf = OneVsRestClassifier(lr)

# Train on the TF-IDF features of the training summaries.
clf.fit(tf_train, y_train)

# Predict genres for the held-out test summaries.
y_pred_lr = clf.predict(tf_test)

# Overall accuracy on the test split.
lr_accuracy = accuracy_score(y_test, y_pred_lr)
print('Accuracy Score :', lr_accuracy)

# Per-genre precision, recall and F1.
print('Report : ')
lr_report = classification_report(y_test, y_pred_lr)
print(lr_report)
# NOTE(review): the original cell ended with a bare "#.7916666666" note, presumably a score
# from an earlier experiment; confirm its meaning or drop it.

Accuracy Score : 0.6468311562651186
Report : 
              precision    recall  f1-score   support

           0       0.63      0.53      0.57       267
           1       0.78      0.60      0.68       138
           2       0.64      0.65      0.64       288
           3       0.50      0.78      0.61       399
           4       0.85      0.22      0.35        99
           5       0.96      0.28      0.43        82
           6       0.83      0.53      0.65       136
           7       0.73      0.87      0.79       548
           8       0.88      0.20      0.33       110

    accuracy                           0.65      2067
   macro avg       0.75      0.52      0.56      2067
weighted avg       0.69      0.65      0.63      2067



In [14]:
# Display the vectorized test set to inspect its shape and sparsity.
tf_test

<2067x74701 sparse matrix of type '<class 'numpy.float64'>'
	with 319506 stored elements in Compressed Sparse Row format>

In [15]:
import numpy as np

# Hand-written sanity checks: each sentence is vectorized and classified on its own.
sample_summaries = [
    # Predicting science fiction (class 7) makes sense here.
    "space time is flexible when dealing with aliens in space",
    # Predicting crime fiction (class 1) makes sense here.
    "The detective wasn't sure if the crime would be solvable",
]
for sample_summary in sample_summaries:
    print(clf.predict(apply_tfidf([sample_summary])))

[7]
[1]


## SVM

In [16]:
# Search space for the SVC hyper-parameters, expressed as Optuna distributions.
# NOTE: this rebinds `param_grid`, which previously held the random-forest grid.
param_grid = {
    "C": CategoricalDistribution([.1, 1, 10, 100, 1000]),
    "kernel": CategoricalDistribution(['linear', 'poly', 'rbf', 'sigmoid']),
    # NOTE(review): the range includes degree 0; the logged poly/degree-0 trial scored
    # only ~0.24, so starting the range at 1 may avoid wasted trials.
    "degree": IntDistribution(0, 5),
    "gamma": CategoricalDistribution(["auto","scale"]),
    "shrinking": CategoricalDistribution([True, False]),
    # NOTE(review): `probability` is searched like a tuning knob; presumably it only
    # enables probability estimates and adds fitting cost. Confirm it belongs here.
    "probability": CategoricalDistribution([True, False])
}
svc = SVC()

# Optuna-driven search: at most 50 trials or the timeout (15*60, presumably seconds),
# scored with 5-fold cross-validation and a fixed seed for the sampler.
rs = OptunaSearchCV(
    estimator=svc,
    param_distributions=param_grid,
    n_trials=50,
    timeout=15*60,
    cv=5,
    random_state=99,
    n_jobs=-1,
    verbose=False
)
# fit model on train data.
rs.fit(tf_train, y_train)


  rs = OptunaSearchCV(
[I 2024-04-28 20:00:17,347] A new study created in memory with name: no-name-55d19cab-64e7-4006-beea-9b6212ac6530
[I 2024-04-28 20:09:52,421] Trial 2 finished with value: 0.24014034795114375 and parameters: {'C': 10, 'kernel': 'poly', 'degree': 0, 'gamma': 'scale', 'shrinking': True, 'probability': False}. Best is trial 2 with value: 0.24014034795114375.
[I 2024-04-28 20:11:18,562] Trial 4 finished with value: 0.6370687277757417 and parameters: {'C': 10, 'kernel': 'linear', 'degree': 3, 'gamma': 'scale', 'shrinking': False, 'probability': False}. Best is trial 4 with value: 0.6370687277757417.
[I 2024-04-28 20:11:33,389] Trial 5 finished with value: 0.6471096851497882 and parameters: {'C': 1, 'kernel': 'linear', 'degree': 3, 'gamma': 'scale', 'shrinking': False, 'probability': False}. Best is trial 5 with value: 0.6471096851497882.
[I 2024-04-28 20:12:16,180] Trial 10 finished with value: 0.6158973717494336 and parameters: {'C': 10, 'kernel': 'sigmoid', 'degree':

In [17]:
# len(rs.trials_)

In [18]:
import pickle

# Persist the fitted Optuna search so its results can be recovered later without re-running everything.
with open("svc_clean.pkl", 'wb') as svc_cache:
    pickle.dump(rs, svc_cache)

In [19]:
import pickle

# Restore the Optuna search saved in svc_clean.pkl.
# NOTE(review): unpickling can execute arbitrary code, so only load files you trust.
with open("svc_clean.pkl", 'rb') as svc_cache:
    rs = pickle.load(svc_cache)

In [20]:
# Report the best hyper-parameters and the best score recorded by the Optuna search.
print("Best parameters found: ", rs.best_params_)
print("Best score found: ", rs.best_score_)
# Keep the corresponding estimator for evaluation on the test set.
svc = rs.best_estimator_


Best parameters found:  {'C': 1, 'kernel': 'linear', 'degree': 3, 'gamma': 'scale', 'shrinking': False, 'probability': False}
Best score found:  0.6471096851497882


In [21]:
# Predict genres for the held-out test set with the best SVC from the search.
# NOTE: this rebinds `y_pred`, which earlier held the random-forest predictions.
y_pred = svc.predict(tf_test)

# Overall accuracy on the test split.
svc_accuracy = accuracy_score(y_test, y_pred)
print('Accuracy Score :', svc_accuracy)

# Per-genre precision, recall and F1.
print('Report : ')
svc_report = classification_report(y_test, y_pred)
print(svc_report)

Accuracy Score : 0.6686018384131591
Report : 
              precision    recall  f1-score   support

           0       0.60      0.57      0.59       267
           1       0.74      0.64      0.68       138
           2       0.64      0.68      0.66       288
           3       0.53      0.76      0.62       399
           4       0.73      0.37      0.49        99
           5       0.83      0.37      0.51        82
           6       0.74      0.56      0.64       136
           7       0.81      0.83      0.82       548
           8       0.75      0.43      0.54       110

    accuracy                           0.67      2067
   macro avg       0.71      0.58      0.62      2067
weighted avg       0.69      0.67      0.66      2067

