In [125]:
import nltk
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC as SupportVectorClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import tensorflow as tf
from tensorflow.keras.layers import Dense, LSTM, Concatenate, Embedding, Input, Bidirectional, Dropout, BatchNormalization
from tensorflow.keras.models import Sequential, Model
from keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow_addons.activations import mish
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

import sys
import re
import numpy as np
import pandas as pd
import random
import contractions

from collections import deque, Counter
from itertools import permutations, repeat, combinations




In [94]:
empires = []
vectorizer = CountVectorizer()
min_sentence_length = 7 # average length of a sentence is between 15 and 20 words
level_sample_sizes = True

In [95]:
with open("Resources/Data/EmpireText.txt", "r", encoding="utf8") as file:
    line = file.readline().replace("\n", "").split(" ")
    for empire in line:
        empires.append(empire)

In [96]:
def format_sentence(sentence, current_empire):
        if len(sentence.split(" ")) > min_sentence_length:
            sentence = contractions.fix(sentence)
            contains_other_empire = False
            for empire in empires:
                if empire == current_empire:
                    continue
                elif re.search(empire.lower(), sentence.lower()) is not None:
                    contains_other_empire = True

            if not contains_other_empire:
                return sentence.strip()

text = []
labels = []
with open("Resources/Data/EmpireText.txt", "r", encoding="utf8") as file: 
    for line in file.readlines()[1:]:
        line = line.strip().replace("\n", "")
        line = re.sub("\[.{0,4}]", "", line) #remove wikipedia citings
        line = line.replace("C.", "C")
        line = line.replace("E.", "E")
        line = line.replace("D.", "D")
        
        if line in empires:
            current_empire = line
            continue
        elif len(line.replace(" ", "")) < min_sentence_length:
            continue
        elif len(line.split(" ")) < min_sentence_length:
            continue
        elif line[-1] != ".":
            line += "."
    
        line = nltk.sent_tokenize(line)
        formatted_line = ""
        for sentence in line:
            sentence = format_sentence(sentence, current_empire)
            if sentence is not None:
                formatted_line += sentence + " "
        text.append(formatted_line)
        labels.append(current_empire)
            
data = pd.DataFrame({"label":labels, "text":text})

# Data Visualization

## Training Data

In [97]:
data.head()

Unnamed: 0,label,text
0,Roman,"The Roman Empire, the ancient empire, centred ..."
1,Spanish,The Spanish Empire (Spanish: Imperio españold)...
2,Russian,"The Russian Empire, also known as Imperial Rus..."
3,Roman,A period of unrest and civil wars in the 1st c...
4,Roman,Augustus established a form of government know...


In [98]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 696 entries, 0 to 695
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   696 non-null    object
 1   text    696 non-null    object
dtypes: object(2)
memory usage: 11.0+ KB


In [99]:
training_value_counts = data["label"].value_counts()
training_value_counts

Roman      319
Spanish    228
Russian    149
Name: label, dtype: int64

# Text classification:
### The process of text classification is comprised of 4 main steps
#### - Preprocessing the text
#### - Encoding labels
#### - Vectorizing the text
#### - Training the model(s)
***

## Text preprocessing
### Steps:
- Lowercasing
- Tokenization
- POS tagging
- Lemmatization

Note: When tested, the models showed better results without the removal of stopwords

### Example:

In [100]:
example_paragraph = data['text'][1]
example_paragraph = "".join(example_paragraph)
print(data['label'][1], "empire:")
example_paragraph

Spanish empire:


'The Spanish Empire (Spanish: Imperio españold), also known as the Hispanic Monarchy (Spanish: Monarquía Hispánica) or the Catholic Monarchy (Spanish: Monarquía Católica) was a colonial empire governed by Spain and its predecessor states between 1492 and 1976. '

In [101]:
lowercase_paragraph = example_paragraph.lower()
lowercase_paragraph

'the spanish empire (spanish: imperio españold), also known as the hispanic monarchy (spanish: monarquía hispánica) or the catholic monarchy (spanish: monarquía católica) was a colonial empire governed by spain and its predecessor states between 1492 and 1976. '

In [102]:
tokenized_paragraph = nltk.word_tokenize(lowercase_paragraph)
for word in tokenized_paragraph:
    print(f"'{word}'", end=", ")

'the', 'spanish', 'empire', '(', 'spanish', ':', 'imperio', 'españold', ')', ',', 'also', 'known', 'as', 'the', 'hispanic', 'monarchy', '(', 'spanish', ':', 'monarquía', 'hispánica', ')', 'or', 'the', 'catholic', 'monarchy', '(', 'spanish', ':', 'monarquía', 'católica', ')', 'was', 'a', 'colonial', 'empire', 'governed', 'by', 'spain', 'and', 'its', 'predecessor', 'states', 'between', '1492', 'and', '1976', '.', 

In [103]:
tagged_paragraph = nltk.pos_tag(tokenized_paragraph)
for word in tagged_paragraph:
    print(word, end=', ')

('the', 'DT'), ('spanish', 'JJ'), ('empire', 'NN'), ('(', '('), ('spanish', 'JJ'), (':', ':'), ('imperio', 'NN'), ('españold', 'NN'), (')', ')'), (',', ','), ('also', 'RB'), ('known', 'VBN'), ('as', 'IN'), ('the', 'DT'), ('hispanic', 'JJ'), ('monarchy', 'NN'), ('(', '('), ('spanish', 'JJ'), (':', ':'), ('monarquía', 'NN'), ('hispánica', 'NN'), (')', ')'), ('or', 'CC'), ('the', 'DT'), ('catholic', 'JJ'), ('monarchy', 'NN'), ('(', '('), ('spanish', 'JJ'), (':', ':'), ('monarquía', 'NN'), ('católica', 'NN'), (')', ')'), ('was', 'VBD'), ('a', 'DT'), ('colonial', 'JJ'), ('empire', 'NN'), ('governed', 'VBN'), ('by', 'IN'), ('spain', 'NN'), ('and', 'CC'), ('its', 'PRP$'), ('predecessor', 'NN'), ('states', 'NNS'), ('between', 'IN'), ('1492', 'CD'), ('and', 'CC'), ('1976', 'CD'), ('.', '.'), 

In [104]:
def get_pos(tag):    
    if tag.startswith('J'):
        return nltk.corpus.wordnet.ADJ
    elif tag.startswith('V'):
        return nltk.corpus.wordnet.VERB
    elif tag.startswith('N'):
        return nltk.corpus.wordnet.NOUN
    elif tag.startswith('R'):
        return nltk.corpus.wordnet.ADV
    else:
        return nltk.corpus.wordnet.NOUN
        
wnl = WordNetLemmatizer()
lemmatized_sentence = [wnl.lemmatize(word, get_pos(pos)) for word, pos in tagged_paragraph]
for word in lemmatized_sentence:
    print(f"'{word}'", end=", ")

'the', 'spanish', 'empire', '(', 'spanish', ':', 'imperio', 'españold', ')', ',', 'also', 'know', 'a', 'the', 'hispanic', 'monarchy', '(', 'spanish', ':', 'monarquía', 'hispánica', ')', 'or', 'the', 'catholic', 'monarchy', '(', 'spanish', ':', 'monarquía', 'católica', ')', 'be', 'a', 'colonial', 'empire', 'govern', 'by', 'spain', 'and', 'it', 'predecessor', 'state', 'between', '1492', 'and', '1976', '.', 

### Application

In [105]:
sentences = []
labels = []
wnl = WordNetLemmatizer()
for row in data.iterrows():
    label = row[1]['label']
    text = row[1]['text']
    text = nltk.word_tokenize(text)
    text = nltk.pos_tag(text)
    text = [wnl.lemmatize(word, get_pos(pos)) for word, pos in text]
    sentences.append(text)
    labels.append(label)

preprocessed_data = pd.DataFrame({"label":labels, "text":sentences})

In [106]:
preprocessed_data.head()

Unnamed: 0,label,text
0,Roman,"[The, Roman, Empire, ,, the, ancient, empire, ..."
1,Spanish,"[The, Spanish, Empire, (, Spanish, :, Imperio,..."
2,Russian,"[The, Russian, Empire, ,, also, know, a, Imper..."
3,Roman,"[A, period, of, unrest, and, civil, war, in, t..."
4,Roman,"[Augustus, establish, a, form, of, government,..."


## Label Encoding

In [107]:
labels = []
for label in preprocessed_data['label']:
    labels.append(empires.index(label))
preprocessed_data['label'] = labels

In [108]:
preprocessed_data.head()

Unnamed: 0,label,text
0,0,"[The, Roman, Empire, ,, the, ancient, empire, ..."
1,1,"[The, Spanish, Empire, (, Spanish, :, Imperio,..."
2,2,"[The, Russian, Empire, ,, also, know, a, Imper..."
3,0,"[A, period, of, unrest, and, civil, war, in, t..."
4,0,"[Augustus, establish, a, form, of, government,..."


## Text Vectorization

In [109]:
vectorizer = CountVectorizer()
text = []
for sentence in preprocessed_data['text']:
    text.append(" ".join(sentence))
vectorizer.fit(text)
vectorized_text = vectorizer.transform(text)

## Training the models

In [110]:
uniques = set()
for sentence in data['text']:
    for word in sentence:
        uniques.add(word)
num_uniques = len(uniques)

In [111]:
text = vectorized_text
labels = preprocessed_data['label']
x_train, x_test, y_train, y_test = train_test_split(text, labels, train_size=0.9)

#### Decision Tree

In [112]:
decision_tree = DecisionTreeClassifier(random_state=2, max_depth=150, max_features=1000)
decision_tree.fit(x_train, y_train)
decision_tree_score = decision_tree.score(x_test, y_test)
decision_tree_score

0.9571428571428572

#### Random Forest

In [113]:
random_forest = RandomForestClassifier(random_state=0, n_estimators=300, max_depth=150, max_features=1000)
random_forest.fit(x_train, y_train)
random_forest_score = random_forest.score(x_test, y_test)
random_forest_score

0.9857142857142858

#### Support Vector Machine

In [114]:
best_c = 0.1
best_kernel = "linear"
support_vector_machine = SupportVectorClassifier(kernel=best_kernel, C=best_c)
support_vector_machine.fit(x_train, y_train)
support_vector_machine_score = support_vector_machine.score(x_test, y_test)
support_vector_machine_score

1.0

### Long Short Term Memory

Sidenote: while one could convert x_Train from SparseTensor to array it causes the lstm model to have low accuracy and take 20 min per epoch, so its better to just format the data with tensorflow methods to convert it to a format that tensorflow is better equipped to handle.

In [115]:
text = preprocessed_data['text']
labels = preprocessed_data['label']
longest_sentence = max(text, key=lambda x: len(x))
encoded_sentences = [one_hot(" ".join(sentence), round(num_uniques*1.2)) for sentence in text]
padded_sequences = pad_sequences(encoded_sentences, maxlen=len(longest_sentence), padding='post')

def encode_labels(labels):
    output = []
    for label in labels:
        label_array = np.zeros(len(empires))
        label_array[label] = 1    
        output.append(label_array)
    return output

labels = encode_labels(labels)
x_train, x_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=.33, random_state=463)
print("input example:", x_train[0])
print("label example:", y_train[0])

input example: [ 83  46  90  38  95   1 127 120  50  97  51  83  21  44 107  50 127 127
 119   1 127  25  20 100  83  15  65  41  66  79  85 116 127 103  38  83
 114 113  44  53  35  39  83  69  77 119  83  36 115  63  28  79  38 107
  89  79  31  12  51  83 103 117  83  76  63  96  84  38  83  94 127  65
 102  26 124  15  19  20 112  40  52   1  15  69 127 128  50  85  20  38
 107  52  93  83  63  79 130  35  57  14 113  36  51  83 132  20  33  77
  69  63  76 132  35  60  42 132  83  63  38  35  93  47  21  52  79  47
  24  78  17  31  27 132  50 138 134   1  83  55  44  93 106  20  27  51
  83  24  90  83   1  44  50 120  83  90  70  56  20 112  42  83  12  11
  44  24 106  51  16 119  83  50 108  79  94  57  34   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0

In [116]:
print(len(padded_sequences))

696


In [117]:
class Lstm(Model):
    def __init__(self, uniques) -> None:
        super().__init__()

        self.l1 = Embedding(uniques, 64)
        self.l2 = Bidirectional(LSTM(16))
        self.l3 = Dropout(0.2)
        self.l4 = Dense(128)
        self.l6 = Dense(3, activation="sigmoid")
    
    def call(self, inp):
        x = self.l1(inp)
        x = self.l2(x)
        x = self.l3(x)
        x = self.l4(x)
        x = mish(x)
        x = self.l6(x)
        return x

lstm = Lstm(num_uniques)
lstm.compile(optimizer="adam", loss='categorical_crossentropy', metrics=["accuracy"])



In [118]:
lstm = Sequential()
lstm.add(Embedding(num_uniques, 128))
lstm.add(Bidirectional(LSTM(4)))
lstm.add(Dropout(0.1))
lstm.add(Dense(3, activation="sigmoid"))
lstm.compile(optimizer="adam", loss='categorical_crossentropy', metrics=["accuracy"])

In [127]:
lstm = Sequential()
lstm.add(Embedding(round(num_uniques*1.2), 128))
lstm.add(Bidirectional(LSTM(32)))
lstm.add(Dense(256, activation="relu"))
lstm.add(Dropout(0.5))
lstm.add(Dense(128))
lstm.add(Dense(3, activation="sigmoid"))
opt = Adam(learning_rate=0.00001)
lstm.compile(optimizer=opt, loss='categorical_crossentropy', metrics=["accuracy"])

In [131]:
es = EarlyStopping(patience=10, restore_best_weights=True)
mc = ModelCheckpoint('best-weights.h5', monitor='val_loss', save_best_only=True, save_weights_only=True)
lstm.fit(np.array(x_train), np.array(y_train), epochs=100, verbose=1, batch_size=2, callbacks=[es, mc], validation_data=(x_test, np.array(y_test)))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100


<keras.callbacks.History at 0x1ed46e06050>

In [123]:
lstm_score = lstm.evaluate(x_test, np.array(y_test), verbose=1)
lstm_acc = lstm_score[1]*100
print(f'Test accuracy: {lstm_acc}')

Test accuracy: 64.7826075553894
