In [76]:
import os
import re
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
from sklearn.calibration import LabelEncoder
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.metrics import precision_score, recall_score
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.layers import Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.utils import to_categorical
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import numpy as np 
import pandas as pd
from gensim.models import KeyedVectors

# Punto 1

In [56]:
def clean_gutenberg_text(text):
    text = re.sub(r'^.*?\*\*\* START OF THE PROJECT GUTENBERG EBOOK.*?\*\*\*', '', text, flags=re.DOTALL)
    text = re.sub(r'\*\*\* END OF THE PROJECT GUTENBERG EBOOK.*?$', '', text, flags=re.DOTALL)
    authors = r'William Oberfield|James Branch Cabell|Wilhelm Raabe'
    text = re.sub(r'(?i)\b(?:' + authors + r')\b.*?(?=\n)', '', text, flags=re.DOTALL)
    text = re.sub(r'(Produced by.*?Distributed Proofreaders|Title:.*?Author:.*?Release date:.*?Language:.*?Credits:.*?Project Gutenberg Distributed Proofreaders)', '', text, flags=re.DOTALL)
    text = re.sub(r'(CONTENTS.*?THE AFTERWORD|BIBLIOGRAPHY|INDEX)', '', text, flags=re.DOTALL)
    text = re.sub(r'\n+', '\n', text)
    text = text.strip()
    
    return text

In [57]:
def preprocess_text(text):
    text = clean_gutenberg_text(text)
    text = re.sub(r'\d+', '', text)
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer() 
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stopwords.words('english')]
    return tokens

In [61]:
def load_books_from_directory(directory):
    corpus = []
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.endswith(".txt"):
                file_path = os.path.join(root, file)
                with open(file_path, 'r', encoding='utf-8') as f:
                    text = f.read()
                    tokens = preprocess_text(text)
                    corpus.append(tokens)
    return corpus

In [62]:
directory = 'books' 
corpus = load_books_from_directory(directory)

embedding_sizes = [200, 300, 400]
group_code = "G4"

for size in embedding_sizes:
    model = Word2Vec(sentences=corpus, vector_size=size, window=5, min_count=5, workers=4)
    
    model.save(f'Books_{size}_{group_code}.model')
    print(f"Saved model with size {size} as 'Books_{size}_{group_code}.model'")

print("Training completed.")


Saved model with size 200 as 'Books_200_G4.model'
Saved model with size 300 as 'Books_300_G4.model'
Saved model with size 400 as 'Books_400_G4.model'
Training completed.


#  Punto 2

In [None]:
pca = PCA(n_components=2)
reduced_vectors_pca = pca.fit_transform(vectors)

plt.figure(figsize=(12, 8))
plt.scatter(reduced_vectors_pca[:, 0], reduced_vectors_pca[:, 1])

for i, word in enumerate(words):
    plt.annotate(word, xy=(reduced_vectors_pca[i, 0], reduced_vectors_pca[i, 1]))

plt.title("2D Visualization of Most Similar Words to Main Characters (PCA)")
plt.show()


# Punto 3

In [102]:
def load_embeddings(file_path):
    return KeyedVectors.load(file_path)

embedding_file_paths = {
    200: 'Books_200_G4.model',
    300: 'Books_300_G4.model',
    400: 'Books_400_G4.model', 
}

In [99]:
def split_into_segments(text, segment_size=200):
    words = text.split() 
    segments = [' '.join(words[i:i + segment_size]) for i in range(0, len(words), segment_size)]
    return segments

In [100]:
def load_books_data(base_path='books', segment_size=200):
    texts = []
    labels = []
    authors = os.listdir(base_path)

    for author in authors:
        author_path = os.path.join(base_path, author)
        for book_file in os.listdir(author_path):
            with open(os.path.join(author_path, book_file), 'r', encoding='utf-8') as f:
                book_text = f.read()
                clean_text = clean_gutenberg_text(book_text)
                segments = split_into_segments(clean_text, segment_size)
                texts.extend(segments)
                labels.extend([author] * len(segments))
    
    return texts, labels

In [105]:
embeddings = {size: load_embeddings(path) for size, path in embedding_file_paths.items()}

def create_dataset(texts, authors, embedding_size):
    df = pd.DataFrame({'text': texts, 'author': authors})
    df['embedding'] = df['text'].apply(lambda x: np.mean(
        [embeddings[embedding_size].wv[word] for word in x.split() if word in embeddings[embedding_size].wv], 
        axis=0)
    )

    return df

In [None]:
texts, labels = load_books_data()
dataset = create_dataset(texts, labels)

In [91]:
train_texts, test_texts, train_authors, test_authors = train_test_split(
    dataset['embedding'].tolist(), dataset['author'].tolist(), test_size=0.2, random_state=42)

val_texts, test_texts, val_authors, test_authors = train_test_split(
    test_texts, test_authors, test_size=0.5, random_state=42)

summary = pd.DataFrame({
    'set': ['train', 'validation', 'test'],
    'count': [
        pd.Series(train_authors).value_counts(),
        pd.Series(val_authors).value_counts(),
        pd.Series(test_authors).value_counts(),
    ]
})

print(summary)

          set                                              count
0       train  James Branch Cabell    672
Wilhelm Raabe      ...
1  validation  James Branch Cabell    77
Wilhelm Raabe       ...
2        test  James Branch Cabell    95
Wilhelm Raabe       ...


In [70]:
def build_simple_model(input_dim):
    model = Sequential()
    model.add(Input(shape=(input_dim,)))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(3, activation='softmax'))  # 3 classes for classification
    return model

In [71]:
def build_deep_model(input_dim):
    model = Sequential()
    model.add(Input(shape=(input_dim,)))
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(3, activation='softmax'))
    return model

In [24]:
def build_batchnorm_model(input_dim):
    model = Sequential()
    model.add(Input(shape=(input_dim,)))
    model.add(Dense(128, activation='relu'))
    model.add(BatchNormalization()) 
    model.add(Dense(64, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dense(32, activation='relu'))
    model.add(Dense(3, activation='softmax'))
    return model

In [92]:
def compile_and_train_model(model, train_texts, train_authors, val_texts, val_authors):
  model.compile(
    loss='sparse_categorical_crossentropy', 
    optimizer='adam', 
    metrics=['accuracy']
  )
  history = model.fit(
    np.array(train_texts), 
    np.array(train_authors), 
    validation_data=(np.array(val_texts), np.array(val_authors)), 
    epochs=10, 
    batch_size=32
  )
  return history

In [74]:
embedding_sizes = [200, 300, 400]
results = {}

for size in embedding_sizes:
    print(f"Entrenando con embeddings de tamaño: {size}")
    dataset['embedding'] = dataset['text'].apply(lambda x: np.mean([embeddings[size].wv[word] for word in x.split() if word in embeddings[size].wv], axis=0))

    train_texts, test_texts, train_authors, test_authors = train_test_split(
        dataset['embedding'].tolist(), dataset['author'].tolist(), test_size=0.2, random_state=42)

    val_texts, test_texts, val_authors, test_authors = train_test_split(
        test_texts, test_authors, test_size=0.5, random_state=42)

    model = create_model(size)
    history = compile_and_train_model(model, train_texts, train_authors, val_texts, val_authors)

    results[size] = evaluate_model(model, test_texts, test_authors)

# Resumen de los resultados
for size, metrics in results.items():
    print(f"Results for embeddings size {size}:")
    print(f"Accuracy: {metrics['accuracy']}")
    print(f"Precision: {metrics['weighted avg']['precision']}")
    print(f"Recall: {metrics['weighted avg']['recall']}\n")

[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9847 - loss: 0.0551 
Accuracy: 0.9819276928901672
Precision: 0.982344763670065
Recall: 0.9819277108433735
