In [6]:
import pandas as pd
import numpy as np
import nltk
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Conv1D, GlobalMaxPooling1D, Dropout, Embedding, LSTM, Bidirectional, SpatialDropout1D
from sklearn.metrics import classification_report, accuracy_score
from keras.callbacks import EarlyStopping


nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


def load_data(file_path):
    data = pd.read_csv(file_path, sep='\t', header=None, names=['ID', 'SPELL', 'TEXT'])
    return data

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    tokens = word_tokenize(text.lower())
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token.isalpha() and token not in stop_words]
    return ' '.join(tokens)

train_data = load_data(r'hpac_corpus\hpac_training_128.tsv')
test_data = load_data(r'hpac_corpus\hpac_test_128.tsv')
dev_data = load_data(r'hpac_corpus\hpac_dev_128.tsv')

train_data['TEXT'] = train_data['TEXT'].apply(preprocess_text)
test_data['TEXT'] = test_data['TEXT'].apply(preprocess_text)
dev_data['TEXT'] = dev_data['TEXT'].apply(preprocess_text)


label_encoder = LabelEncoder()
label_encoder.fit(pd.concat([train_data['SPELL'], test_data['SPELL'], dev_data['SPELL']]))

train_data['SPELL'] = label_encoder.transform(train_data['SPELL'])
test_data['SPELL'] = label_encoder.transform(test_data['SPELL'])
dev_data['SPELL'] = label_encoder.transform(dev_data['SPELL'])


tfidf_vectorizer = TfidfVectorizer(max_features=5000)
tfidf_vectorizer.fit(pd.concat([train_data['TEXT'], test_data['TEXT'], dev_data['TEXT']]))

X_train_tfidf = tfidf_vectorizer.transform(train_data['TEXT'])
X_test_tfidf = tfidf_vectorizer.transform(test_data['TEXT'])
X_dev_tfidf = tfidf_vectorizer.transform(dev_data['TEXT'])


max_length = 500  
X_train_seq = pad_sequences(X_train_tfidf.toarray(), maxlen=max_length)
X_test_seq = pad_sequences(X_test_tfidf.toarray(), maxlen=max_length)
X_dev_seq = pad_sequences(X_dev_tfidf.toarray(), maxlen=max_length)


model = Sequential([
    Embedding(input_dim=5000, output_dim=128, input_length=max_length),
    SpatialDropout1D(0.2),
    Bidirectional(LSTM(64, return_sequences=True)),
    Conv1D(filters=64, kernel_size=5, activation='relu'),
    GlobalMaxPooling1D(),
    Dropout(0.5),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(len(label_encoder.classes_), activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

y_train_encoded = np.eye(len(label_encoder.classes_))[train_data['SPELL']]
y_test_encoded = np.eye(len(label_encoder.classes_))[test_data['SPELL']]
y_dev_encoded = np.eye(len(label_encoder.classes_))[dev_data['SPELL']]

early_stop = EarlyStopping(monitor='val_loss', patience=3)
model.fit(X_train_seq, y_train_encoded, epochs=10, batch_size=32, validation_data=(X_test_seq, y_test_encoded), callbacks=[early_stop])

y_pred = model.predict(X_test_seq)
y_pred_labels = np.argmax(y_pred, axis=1)


missing_classes = set(range(len(label_encoder.classes_))) - set(test_data['SPELL'])

print(f"Missing classes in predictions: {missing_classes}")


if missing_classes:
    for cls in missing_classes:
        y_pred_labels = np.append(y_pred_labels, cls)
        test_data = test_data.copy()
        test_data.loc[len(test_data)] = [0, cls, ""]

report = classification_report(test_data['SPELL'], y_pred_labels, target_names=label_encoder.classes_, output_dict=True)
accuracy = accuracy_score(test_data['SPELL'], y_pred_labels)

print("Classification Report:")
print(classification_report(test_data['SPELL'], y_pred_labels, target_names=label_encoder.classes_))
print(f"Accuracy: {accuracy}")


metrics_df = pd.DataFrame(report).transpose()
 
metrics_df.to_csv('classification_report.csv', index=True)
print("Metrics for all classes saved to classification_report.csv")


model.save('spell_classifier_model_v1_0.h5')
import joblib
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer_v1_0.pkl')
joblib.dump(label_encoder, 'label_encoder_v1_0.pkl')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\xiaomi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\xiaomi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\xiaomi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Epoch 1/10




[1m1906/1906[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m298s[0m 155ms/step - accuracy: 0.0903 - loss: 0.0951 - val_accuracy: 0.1192 - val_loss: 0.0524
Epoch 2/10
[1m1906/1906[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m292s[0m 153ms/step - accuracy: 0.1214 - loss: 0.0532 - val_accuracy: 0.1192 - val_loss: 0.0523
Epoch 3/10
[1m1906/1906[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m299s[0m 157ms/step - accuracy: 0.1217 - loss: 0.0528 - val_accuracy: 0.1192 - val_loss: 0.0523
Epoch 4/10
[1m1906/1906[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m297s[0m 156ms/step - accuracy: 0.1232 - loss: 0.0527 - val_accuracy: 0.1192 - val_loss: 0.0523
Epoch 5/10
[1m1906/1906[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m302s[0m 158ms/step - accuracy: 0.1225 - loss: 0.0526 - val_accuracy: 0.1184 - val_loss: 0.0523
Epoch 6/10
[1m1906/1906[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m297s[0m 156ms/step - accuracy: 0.1201 - loss: 0.0525 - val_accuracy: 0.1192 - val_loss: 0.0522
Epo

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Missing classes in predictions: {55}
Classification Report:
                       precision    recall  f1-score   support

                ACCIO       0.00      0.00      0.00       516
            AGUAMENTI       0.00      0.00      0.00        79
            ALOHOMORA       0.00      0.00      0.00       164
              ANAPNEO       0.00      0.00      0.00         4
            APARECIUM       0.00      0.00      0.00         9
        AVADA_KEDAVRA       0.00      0.00      0.00       915
                 AVIS       0.00      0.00      0.00        27
        CAVE_INIMICUM       0.00      0.00      0.00         2
          COLLOPORTUS       0.00      0.00      0.00        39
            CONFRINGO       0.00      0.00      0.00        43
             CONFUNDO       0.00      0.00      0.00        17
               CRUCIO       0.12      1.00      0.21       909
              DEFODIO       0.00      0.00      0.00         6
            DELETRIUS       0.00      0.00      0.00     

['label_encoder_v1_0.pkl']