In [1]:
import pandas as pd
df = pd.read_csv('spooky_cleaned.csv')
print(df.head(10))

        id                                               text author
0  id26305  this proces however afforded me no means of as...    EAP
1  id17569  it never once occurred to me that the fumbling...    HPL
2  id11008  in his left hand was a gold snuff box from whi...    EAP
3  id27763  how lovely is spring as we looked from windsor...    MWS
4  id12958  finding nothing else not even gold the superin...    HPL
5  id22965  a youth passed in solitude my best years spent...    MWS
6  id09674  the astronomer perhaps at this point took refu...    EAP
7  id13515         the surcingle hung in ribands from my body    EAP
8  id19322  i knew that you could not say to yourself ster...    EAP
9  id00912  i confess that neither the structure of langua...    MWS


In [2]:
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(sparse_output=False)
author_encoded = encoder.fit_transform(df[['author']])

# Ajouter les colonnes encodées au DataFrame
df[encoder.get_feature_names_out(['author'])] = author_encoded

print(df.head())

        id                                               text author  \
0  id26305  this proces however afforded me no means of as...    EAP   
1  id17569  it never once occurred to me that the fumbling...    HPL   
2  id11008  in his left hand was a gold snuff box from whi...    EAP   
3  id27763  how lovely is spring as we looked from windsor...    MWS   
4  id12958  finding nothing else not even gold the superin...    HPL   

   author_EAP  author_HPL  author_MWS  
0         1.0         0.0         0.0  
1         0.0         1.0         0.0  
2         1.0         0.0         0.0  
3         0.0         0.0         1.0  
4         0.0         1.0         0.0  


In [3]:
y_labels = df["author"]
from sklearn.model_selection import StratifiedKFold

X = df['text'].values
y = y_labels.values

# Create stratified k-fold cross-validator
#It ensures each fold maintains the same class distribution as the original dataset, which is crucial for imbalanced data
skf = StratifiedKFold(n_splits=int(1/0.3), shuffle=True, random_state=0)  # n_splits=3 for ~30% test size


for train_index, test_index in skf.split(X, y):
    break  


X_train = df['text'].iloc[train_index]
X_test = df['text'].iloc[test_index]
y_train = y_labels.iloc[train_index]
y_test = y_labels.iloc[test_index]

Reduce Dimensionality:

    Use TfidfVectorizer with parameters like max_features to limit the number of features (e.g., max_features=5000). This reduces the encoding size and speeds up training and testing.
    
Use N-grams:

    Incorporate n-grams (e.g., bigrams or trigrams) to capture more context. This can improve precision and recall.

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

count_array_tfidf = X_train_tfidf.toarray()
df2 = pd.DataFrame(data=count_array_tfidf,columns = tfidf_vectorizer.get_feature_names_out())
df2

# Example: Assuming y_train and y_test are string labels
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.transform(y_test)

In [None]:
from sklearn.neural_network import MLPClassifier


model1 = MLPClassifier(hidden_layer_sizes=(50, 25), activation='tanh', max_iter=500, solver='adam', random_state=1)

model2 = MLPClassifier(hidden_layer_sizes=(50, 25),activation='relu' , max_iter=200, solver='adam', alpha=0.01, random_state=1)


In [5]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

num_classes = 3


input_dim = X_train_tfidf.shape[1]
model = keras.Sequential([
    layers.Input(shape=(input_dim,)),
    layers.Dense(256, activation='relu'),
    layers.BatchNormalization(),
    layers.Dropout(0.15),
    layers.Dense(128, activation='relu'),
    layers.BatchNormalization(),
    layers.Dropout(0.15),
    layers.Dense(num_classes, activation='softmax')
])

# Compile the model
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.0005),
    loss='categorical_crossentropy',
    metrics=['accuracy']
)



In [6]:

from sklearn.metrics import classification_report, accuracy_score
import pickle
import os
from tensorflow.keras.utils import to_categorical

# Debugging: Check the input data shape before training
print("Input data shape before training:", X_train.shape)

y_train_one_hot = to_categorical(y_train, num_classes=num_classes)
y_test_one_hot = to_categorical(y_test, num_classes=num_classes)

# Train the model
model.fit(X_train_tfidf.toarray(), y_train_one_hot, epochs=50, batch_size=32, validation_data=(X_test_tfidf.toarray(), y_test_one_hot))

# Make predictions
predictions = model.predict(X_test_tfidf)

# Convert probabilities to class labels
predicted_labels = np.argmax(predictions, axis=1)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, predicted_labels))
print("Classification Report:\n", classification_report(y_test, predicted_labels, target_names=label_encoder.classes_))



#save the model
model.save('models_tp3/spooky_author_model.h5')

Input data shape before training: (13052,)
Epoch 1/50
[1m408/408[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 74ms/step - accuracy: 0.6068 - loss: 0.9748 - val_accuracy: 0.7593 - val_loss: 0.7109
Epoch 2/50
[1m408/408[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 68ms/step - accuracy: 0.9454 - loss: 0.1657 - val_accuracy: 0.8142 - val_loss: 0.4910
Epoch 3/50
[1m408/408[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 68ms/step - accuracy: 0.9876 - loss: 0.0479 - val_accuracy: 0.8145 - val_loss: 0.5838
Epoch 4/50
[1m408/408[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 68ms/step - accuracy: 0.9924 - loss: 0.0270 - val_accuracy: 0.8082 - val_loss: 0.6535
Epoch 5/50
[1m408/408[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 69ms/step - accuracy: 0.9948 - loss: 0.0212 - val_accuracy: 0.8028 - val_loss: 0.7281
Epoch 6/50
[1m408/408[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 69ms/step - accuracy: 0.9942 - loss: 0.0202 - val_accuracy:



Accuracy: 0.7865788264133599
Classification Report:
               precision    recall  f1-score   support

         EAP       0.78      0.83      0.80      2634
         HPL       0.81      0.74      0.77      1878
         MWS       0.78      0.78      0.78      2015

    accuracy                           0.79      6527
   macro avg       0.79      0.78      0.78      6527
weighted avg       0.79      0.79      0.79      6527

