In [1]:
import os
import re
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import classification_report
import string
from sklearn.calibration import LabelEncoder
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.metrics import precision_score, recall_score
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras.layers import Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input, Embedding, GlobalAveragePooling1D
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import numpy as np 
import pandas as pd
from gensim.models import KeyedVectors

2024-10-20 20:42:17.471469: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-10-20 20:42:17.475044: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-10-20 20:42:17.493141: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-20 20:42:17.517841: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-20 20:42:17.524627: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-10-20 20:42:17.544955: I tensorflow/core/platform/cpu_feature_gu

# Punto 3

In [None]:
def clean_gutenberg_text(text):
    text = re.sub(r'^.*?\*\*\* START OF THE PROJECT GUTENBERG EBOOK.*?\*\*\*', '', text, flags=re.DOTALL)
    text = re.sub(r'\*\*\* END OF THE PROJECT GUTENBERG EBOOK.*?$', '', text, flags=re.DOTALL)
    authors = r'William Oberfield|James Branch Cabell|Wilhelm Raabe'
    text = re.sub(r'(?i)\b(?:' + authors + r')\b.*?(?=\n)', '', text, flags=re.DOTALL)
    text = re.sub(r'(Produced by.*?Distributed Proofreaders|Title:.*?Author:.*?Release date:.*?Language:.*?Credits:.*?Project Gutenberg Distributed Proofreaders)', '', text, flags=re.DOTALL)
    text = re.sub(r'(CONTENTS.*?THE AFTERWORD|BIBLIOGRAPHY|INDEX)', '', text, flags=re.DOTALL)
    text = re.sub(r'\n+', '\n', text)
    text = text.strip()
    
    return text

In [79]:
def load_embeddings(file_path):
    return KeyedVectors.load(file_path)

embedding_file_paths = {
    300: 'Books_300_G4.model',
    400: 'Books_400_G4.model',
    500: 'Books_500_G4.model', 
}

In [80]:
def split_into_segments(text, segment_size=200):
    words = text.split() 
    segments = [' '.join(words[i:i + segment_size]) for i in range(0, len(words), segment_size)]
    return segments

In [81]:
def load_books_data(base_path='books', segment_size=200):
    texts = []
    labels = []
    authors = os.listdir(base_path)

    for author in authors:
        author_path = os.path.join(base_path, author)
        for book_file in os.listdir(author_path):
            with open(os.path.join(author_path, book_file), 'r', encoding='utf-8') as f:
                book_text = f.read()
                clean_text = clean_gutenberg_text(book_text)
                segments = split_into_segments(clean_text, segment_size)
                texts.extend(segments)
                labels.extend([author] * len(segments))
    
    return texts, labels

In [82]:
embeddings = {size: load_embeddings(path) for size, path in embedding_file_paths.items()}

def create_dataset(texts, authors):
    df = pd.DataFrame({'text': texts, 'author': authors})
    return df

In [83]:
texts, labels = load_books_data()
dataset = create_dataset(texts, labels)

In [84]:
def build_simple_model(vocab_size, embedding_size, embedding_matrix):
    model = Sequential()
    model.add(Embedding(input_dim=vocab_size, output_dim=embedding_size, input_length=200, weights=[embedding_matrix], trainable=False))
    model.add(GlobalAveragePooling1D())
    model.add(Dense(64, activation='relu'))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(3, activation='softmax'))
    return model , "simple"

In [85]:
def build_deep_model(vocab_size, embedding_size, embedding_matrix):
    model = Sequential()
    model.add(Embedding(input_dim=vocab_size, output_dim=embedding_size, input_length=200, weights=[embedding_matrix], trainable=False))
    model.add(GlobalAveragePooling1D())
    model.add(Dense(200, activation='relu'))
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(3, activation='softmax'))
    return model, "deep"

In [86]:
def build_batchnorm_model(vocab_size, embedding_size, embedding_matrix):
    model = Sequential()
    model.add(Embedding(input_dim=vocab_size, output_dim=embedding_size, input_length=200, weights=[embedding_matrix], trainable=False))
    model.add(GlobalAveragePooling1D())
    model.add(Dense(200, activation='relu'))
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.3))
    model.add(BatchNormalization()) 
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.3))
    model.add(BatchNormalization())
    model.add(Dense(32, activation='relu'))
    model.add(Dense(3, activation='softmax'))
    return model , "batchNorm"

In [87]:
def compile_and_train_model(model, X_train, y_train, X_val, y_val, size, arquitectureLabel):
  model.compile(
    loss='sparse_categorical_crossentropy', 
    optimizer='adam', 
    metrics=['accuracy']
  )
  checkpoint_callback = ModelCheckpoint(
    f'best_model_{size}_{arquitectureLabel}.keras',
    monitor='accuracy',
    save_best_only=True,
    verbose=1 
  )

  history = model.fit(
    X_train, 
    y_train, 
    validation_data=(X_val, y_val), 
    epochs=50, 
    batch_size=32,
    callbacks=[checkpoint_callback]
  )
  return history

In [88]:
results = {}
embedding_sizes = [300, 400, 500]
group_code = "G4"

for size in embedding_sizes:
    word2vecModel = Word2Vec.load(f"Books_{size}_G4.model")
    vocab_size = len(word2vecModel.wv.key_to_index)
    tokenizer = Tokenizer(num_words=vocab_size)
    texts, labels = load_books_data()
    dataset = create_dataset(texts, labels)

    X_train, X_test, y_train, y_test = train_test_split(
        dataset['text'], dataset['author'], test_size=0.2, random_state=42)

    X_val, X_test, y_val, y_test = train_test_split(
        X_test, y_test, test_size=0.5, random_state=42)
    
    train_counts = pd.Series(y_train).value_counts().reset_index()
    train_counts.columns = ['class', 'count']
    train_counts['set'] = 'train'

    val_counts = pd.Series(y_val).value_counts().reset_index()
    val_counts.columns = ['class', 'count']
    val_counts['set'] = 'validation'

    test_counts = pd.Series(y_test).value_counts().reset_index()
    test_counts.columns = ['class', 'count']
    test_counts['set'] = 'test'

    summary = pd.concat([train_counts, val_counts, test_counts], ignore_index=True)

    summary = summary[['set', 'class', 'count']]

    display(summary)
    
    tokenizer.fit_on_texts(X_train)
    
    encoder = LabelEncoder()
    y_train = encoder.fit_transform(y_train)
    y_val = encoder.transform(y_val)
    y_test = encoder.transform(y_test)
    
    X_train = tokenizer.texts_to_sequences(X_train)
    X_val = tokenizer.texts_to_sequences(X_val)
    X_test = tokenizer.texts_to_sequences(X_test)
    
    max_length = 200
    X_train = pad_sequences(X_train, maxlen=max_length, padding='post')
    X_val = pad_sequences(X_val, maxlen=max_length, padding='post')
    X_test = pad_sequences(X_test, maxlen=max_length, padding='post')
    
    embedding_matrix = np.zeros((vocab_size, size))
    for word, index in word2vecModel.wv.key_to_index.items():
        embedding_matrix[index] = word2vecModel.wv[word]
            
    for modelToTrain in [build_simple_model , build_deep_model , build_batchnorm_model ]:
        print(f"Entrenando con embeddings de tamaño: {size}")

        model , arquitectureLabel  = modelToTrain(vocab_size , size, embedding_matrix)
        history = compile_and_train_model(model, X_train, y_train, X_val, y_val, size, arquitectureLabel)
        test_loss , test_accuracy = history.model.evaluate(X_test, y_test)
        results[f"{size}_{arquitectureLabel}"] = {"accuracy": test_accuracy , "loss": test_loss}

for size, metrics in results.items():
    print(f"Results for embeddings size {size}:")
    print(f"Accuracy: {metrics['accuracy']}")


Unnamed: 0,set,class,count
0,train,James Branch Cabell,680
1,train,Washington Irving,558
2,train,William Oberfield,51
3,validation,James Branch Cabell,86
4,validation,Washington Irving,71
5,validation,William Oberfield,4
6,test,James Branch Cabell,78
7,test,Washington Irving,77
8,test,William Oberfield,7


Entrenando con embeddings de tamaño: 300
Epoch 1/50




[1m34/41[0m [32m━━━━━━━━━━━━━━━━[0m[37m━━━━[0m [1m0s[0m 5ms/step - accuracy: 0.5166 - loss: 0.8704
Epoch 1: accuracy improved from -inf to 0.51280, saving model to best_model_300_simple.keras
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - accuracy: 0.5155 - loss: 0.8662 - val_accuracy: 0.5342 - val_loss: 0.7917
Epoch 2/50
[1m39/41[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 4ms/step - accuracy: 0.5496 - loss: 0.8083
Epoch 2: accuracy improved from 0.51280 to 0.52521, saving model to best_model_300_simple.keras
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.5479 - loss: 0.8098 - val_accuracy: 0.4410 - val_loss: 0.8042
Epoch 3/50
[1m37/41[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m0s[0m 4ms/step - accuracy: 0.5050 - loss: 0.8370
Epoch 3: accuracy did not improve from 0.52521
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.5041 - loss: 0.8367 - val_accuracy:

Unnamed: 0,set,class,count
0,train,James Branch Cabell,680
1,train,Washington Irving,558
2,train,William Oberfield,51
3,validation,James Branch Cabell,86
4,validation,Washington Irving,71
5,validation,William Oberfield,4
6,test,James Branch Cabell,78
7,test,Washington Irving,77
8,test,William Oberfield,7


Entrenando con embeddings de tamaño: 400
Epoch 1/50




[1m38/41[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m0s[0m 4ms/step - accuracy: 0.5178 - loss: 0.9219
Epoch 1: accuracy improved from -inf to 0.52754, saving model to best_model_400_simple.keras
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.5188 - loss: 0.9173 - val_accuracy: 0.5342 - val_loss: 0.7955
Epoch 2/50
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.5362 - loss: 0.8156
Epoch 2: accuracy did not improve from 0.52754
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.5360 - loss: 0.8159 - val_accuracy: 0.5342 - val_loss: 0.7891
Epoch 3/50
[1m40/41[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 4ms/step - accuracy: 0.5348 - loss: 0.8457
Epoch 3: accuracy did not improve from 0.52754
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.5344 - loss: 0.8450 - val_accuracy: 0.5342 - val_loss: 0.7916
Epoch 4/50
[1m39/41[

Unnamed: 0,set,class,count
0,train,James Branch Cabell,680
1,train,Washington Irving,558
2,train,William Oberfield,51
3,validation,James Branch Cabell,86
4,validation,Washington Irving,71
5,validation,William Oberfield,4
6,test,James Branch Cabell,78
7,test,Washington Irving,77
8,test,William Oberfield,7


Entrenando con embeddings de tamaño: 500
Epoch 1/50




[1m33/41[0m [32m━━━━━━━━━━━━━━━━[0m[37m━━━━[0m [1m0s[0m 5ms/step - accuracy: 0.5447 - loss: 0.8975
Epoch 1: accuracy improved from -inf to 0.53452, saving model to best_model_500_simple.keras
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - accuracy: 0.5430 - loss: 0.8867 - val_accuracy: 0.4410 - val_loss: 0.7963
Epoch 2/50
[1m33/41[0m [32m━━━━━━━━━━━━━━━━[0m[37m━━━━[0m [1m0s[0m 5ms/step - accuracy: 0.4883 - loss: 0.8330
Epoch 2: accuracy did not improve from 0.53452
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.4930 - loss: 0.8332 - val_accuracy: 0.5342 - val_loss: 0.7945
Epoch 3/50
[1m31/41[0m [32m━━━━━━━━━━━━━━━[0m[37m━━━━━[0m [1m0s[0m 5ms/step - accuracy: 0.5053 - loss: 0.8204
Epoch 3: accuracy did not improve from 0.53452
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.5098 - loss: 0.8224 - val_accuracy: 0.5342 - val_loss: 0.8011
Epoch 4/50
[1m36/41[

In [90]:
pd.DataFrame(results)

Unnamed: 0,300_simple,300_deep,300_batchNorm,400_simple,400_deep,400_batchNorm,500_simple,500_deep,500_batchNorm
accuracy,0.481481,0.481481,0.493827,0.481481,0.481481,0.512346,0.481481,0.481481,0.481481
loss,0.845827,0.845354,0.840599,0.843392,0.843157,0.843133,0.865047,0.847224,0.844268
