In [157]:
import nltk
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC as SupportVectorClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import tensorflow as tf
from tensorflow.keras.layers import Dense, LSTM, Concatenate, Embedding, Input, Bidirectional, Dropout
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences

import sys
import re
import numpy as np
import pandas as pd
import random

from collections import deque, Counter
from itertools import permutations, repeat, combinations




In [158]:
empires = []
vectorizer = CountVectorizer()
min_sentence_length = 5
level_sample_sizes = True

In [159]:
with open("Resources/Data/EmpireText.txt", "r", encoding="utf8") as file:
    line = file.readline().replace("\n", "").split(" ")
    for empire in line:
        empires.append(empire)

In [160]:
text = []
labels = []
with open("Resources/Data/EmpireText.txt", "r", encoding="utf8") as file: 
    for line in file.readlines()[1:]:
        line = line.strip().replace("\n", "")
        line = re.sub("\[.{0,4}]", "", line) #remove wikipedia citings


        if line in empires:
            current_empire = line
            continue
        elif len(line.replace(" ", "")) < 15:
            continue
        elif line[-1] != ".":
            line += "."
    
        line = nltk.sent_tokenize(line)
        for sentence in line:
            text.append(sentence)
            labels.append(current_empire)
            
data = pd.DataFrame({"label":labels, "text":text})

# Data Visualization

## Training Data

In [161]:
data.head()

Unnamed: 0,label,text
0,Roman,"The Roman Empire, the ancient empire, centred ..."
1,Spanish,"The Spanish Empire (Spanish: Imperio español),..."
2,Russian,"The Russian Empire, also known as Imperial Rus..."
3,Roman,A period of unrest and civil wars in the 1st c...
4,Roman,This period encompassed the career of Julius C...


In [162]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3238 entries, 0 to 3237
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   3238 non-null   object
 1   text    3238 non-null   object
dtypes: object(2)
memory usage: 50.7+ KB


In [163]:
training_value_counts = data["label"].value_counts()
training_value_counts

Roman      1308
Russian     993
Spanish     937
Name: label, dtype: int64

# Text classification:
### The process of text classification is comprised of 4 main steps
#### - Preprocessing the text
#### - Encoding labels
#### - Vectorizing the text
#### - Training the model(s)
***

## Text preprocessing
### Steps:
- Lowercasing
- Tokenization
- POS tagging
- Lemmatization

Note: When tested, the models showed better results without the removal of stopwords

### Example:

In [164]:
example_paragraph = data['text'][1]
example_paragraph = "".join(example_paragraph)
print(data['label'][1], "empire:")
example_paragraph

Spanish empire:


'The Spanish Empire (Spanish: Imperio español), also known as the Hispanic Monarchy (Spanish: Monarquía Hispánica) or the Catholic Monarchy (Spanish: Monarquía Católica) was a colonial empire governed by Spain and its predecessor states between 1492 and 1976.'

In [165]:
lowercase_paragraph = example_paragraph.lower()
lowercase_paragraph

'the spanish empire (spanish: imperio español), also known as the hispanic monarchy (spanish: monarquía hispánica) or the catholic monarchy (spanish: monarquía católica) was a colonial empire governed by spain and its predecessor states between 1492 and 1976.'

In [166]:
tokenized_paragraph = nltk.word_tokenize(lowercase_paragraph)
for word in tokenized_paragraph:
    print(f"'{word}'", end=", ")

'the', 'spanish', 'empire', '(', 'spanish', ':', 'imperio', 'español', ')', ',', 'also', 'known', 'as', 'the', 'hispanic', 'monarchy', '(', 'spanish', ':', 'monarquía', 'hispánica', ')', 'or', 'the', 'catholic', 'monarchy', '(', 'spanish', ':', 'monarquía', 'católica', ')', 'was', 'a', 'colonial', 'empire', 'governed', 'by', 'spain', 'and', 'its', 'predecessor', 'states', 'between', '1492', 'and', '1976', '.', 

In [167]:
tagged_paragraph = nltk.pos_tag(tokenized_paragraph)
for word in tagged_paragraph:
    print(word, end=', ')

('the', 'DT'), ('spanish', 'JJ'), ('empire', 'NN'), ('(', '('), ('spanish', 'JJ'), (':', ':'), ('imperio', 'NN'), ('español', 'NN'), (')', ')'), (',', ','), ('also', 'RB'), ('known', 'VBN'), ('as', 'IN'), ('the', 'DT'), ('hispanic', 'JJ'), ('monarchy', 'NN'), ('(', '('), ('spanish', 'JJ'), (':', ':'), ('monarquía', 'NN'), ('hispánica', 'NN'), (')', ')'), ('or', 'CC'), ('the', 'DT'), ('catholic', 'JJ'), ('monarchy', 'NN'), ('(', '('), ('spanish', 'JJ'), (':', ':'), ('monarquía', 'NN'), ('católica', 'NN'), (')', ')'), ('was', 'VBD'), ('a', 'DT'), ('colonial', 'JJ'), ('empire', 'NN'), ('governed', 'VBN'), ('by', 'IN'), ('spain', 'NN'), ('and', 'CC'), ('its', 'PRP$'), ('predecessor', 'NN'), ('states', 'NNS'), ('between', 'IN'), ('1492', 'CD'), ('and', 'CC'), ('1976', 'CD'), ('.', '.'), 

In [168]:
def get_pos(tag):    
    if tag.startswith('J'):
        return nltk.corpus.wordnet.ADJ
    elif tag.startswith('V'):
        return nltk.corpus.wordnet.VERB
    elif tag.startswith('N'):
        return nltk.corpus.wordnet.NOUN
    elif tag.startswith('R'):
        return nltk.corpus.wordnet.ADV
    else:
        return nltk.corpus.wordnet.NOUN
        
wnl = WordNetLemmatizer()
lemmatized_sentence = [wnl.lemmatize(word, get_pos(pos)) for word, pos in tagged_paragraph]
for word in lemmatized_sentence:
    print(f"'{word}'", end=", ")

'the', 'spanish', 'empire', '(', 'spanish', ':', 'imperio', 'español', ')', ',', 'also', 'know', 'a', 'the', 'hispanic', 'monarchy', '(', 'spanish', ':', 'monarquía', 'hispánica', ')', 'or', 'the', 'catholic', 'monarchy', '(', 'spanish', ':', 'monarquía', 'católica', ')', 'be', 'a', 'colonial', 'empire', 'govern', 'by', 'spain', 'and', 'it', 'predecessor', 'state', 'between', '1492', 'and', '1976', '.', 

### Application

In [169]:
sentences = []
labels = []
wnl = WordNetLemmatizer()
for row in data.iterrows():
    label = row[1]['label']
    text = row[1]['text']
    text = nltk.word_tokenize(text)
    text = nltk.pos_tag(text)
    text = [wnl.lemmatize(word, get_pos(pos)) for word, pos in text]
    sentences.append(text)
    labels.append(label)

preprocessed_data = pd.DataFrame({"label":labels, "text":sentences})

In [170]:
preprocessed_data.head()

Unnamed: 0,label,text
0,Roman,"[The, Roman, Empire, ,, the, ancient, empire, ..."
1,Spanish,"[The, Spanish, Empire, (, Spanish, :, Imperio,..."
2,Russian,"[The, Russian, Empire, ,, also, know, a, Imper..."
3,Roman,"[A, period, of, unrest, and, civil, war, in, t..."
4,Roman,"[This, period, encompass, the, career, of, Jul..."


## Label Encoding

In [171]:
labels = []
for label in preprocessed_data['label']:
    labels.append(empires.index(label))
preprocessed_data['label'] = labels

In [172]:
preprocessed_data.head()

Unnamed: 0,label,text
0,0,"[The, Roman, Empire, ,, the, ancient, empire, ..."
1,1,"[The, Spanish, Empire, (, Spanish, :, Imperio,..."
2,2,"[The, Russian, Empire, ,, also, know, a, Imper..."
3,0,"[A, period, of, unrest, and, civil, war, in, t..."
4,0,"[This, period, encompass, the, career, of, Jul..."


## Evening training data
Data with varied distribution will perform worse unless accounted for

In [173]:
label_counts = Counter(preprocessed_data["label"])
max_len = min(label_counts.values())
training_data = preprocessed_data.sample(frac=1).groupby('label').head(max_len)
labels = training_data['label']
text = training_data['text']
value_counts = training_data['label'].value_counts()
value_counts

0    937
2    937
1    937
Name: label, dtype: int64

## Text Vectorization

In [174]:
vectorizer = CountVectorizer()
text = []
for sentence in preprocessed_data['text']:
    text.append(" ".join(sentence))
vectorizer.fit(text)
vectorized_text = vectorizer.transform(text)

## Training the models

In [175]:
uniques = set()
for sentence in data['text']:
    for word in sentence:
        uniques.add(word)
num_uniques = len(uniques)

In [176]:
text = vectorized_text
labels = preprocessed_data['label']
x_train, x_test, y_train, y_test = train_test_split(text, labels, train_size=0.9)

#### Decision Tree

In [177]:
decision_tree = DecisionTreeClassifier(random_state=2, max_depth=150, max_features=1000)
decision_tree.fit(x_train, y_train)
decision_tree_score = decision_tree.score(x_test, y_test)
decision_tree_score

0.808641975308642

#### Random Forest

In [178]:
random_forest = RandomForestClassifier(random_state=0, n_estimators=300, max_depth=150, max_features=1000)
random_forest.fit(x_train, y_train)
random_forest_score = random_forest.score(x_test, y_test)
random_forest_score

0.8765432098765432

#### Support Vector Machine

In [179]:
best_c = 0.1
best_kernel = "linear"
support_vector_machine = SupportVectorClassifier(kernel=best_kernel, C=best_c)
support_vector_machine.fit(x_train, y_train)
support_vector_machine_score = support_vector_machine.score(x_test, y_test)
support_vector_machine_score

0.8981481481481481

### Long Short Term Memory

Sidenote: while one could convert x_Train from SparseTensor to array it causes the lstm model to have low accuracy and take 20 min per epoch, so its better to just format the data with tensorflow methods to convert it to a format that tensorflow is better equipped to handle.

In [180]:
text = preprocessed_data['text']
labels = preprocessed_data['label']
longest_sentence = max(text, key=lambda x: len(x))
encoded_sentences = [one_hot(" ".join(sentence), num_uniques) for sentence in text]
padded_sequences = pad_sequences(encoded_sentences, maxlen=len(longest_sentence), padding='post')

def encode_labels(labels):
    output = []
    for label in labels:
        label_array = np.zeros(len(empires))
        label_array[label] = 1    
        output.append(label_array)
    return output

labels = encode_labels(labels)
x_train, x_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=.1, random_state=85)
print("input example:", x_train[0])
print("label example:", y_train[0])

input example: [ 50  53  25  60 114  11  71  11  21  59  29   5  68  29 103 112  75  40
   1 112  22  39  10 111  46 111  76  76  25  55  56  45  69  25  19   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0]
label example: [1. 0. 0.]


In [181]:
lstm = Sequential()
lstm.add(Embedding(num_uniques, 32))
lstm.add(Bidirectional(LSTM(100)))
lstm.add(Dense(256, activation="relu"))
lstm.add(Dropout(0.5))
lstm.add(Dense(128))
lstm.add(Dense(3, activation="sigmoid"))
optimizer = Adam(learning_rate=0.03)
lstm.compile(optimizer="adam", loss='categorical_crossentropy', metrics=["accuracy"])
lstm.summary()


Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, None, 32)          3744      
                                                                 
 bidirectional_4 (Bidirectio  (None, 200)              106400    
 nal)                                                            
                                                                 
 dense_12 (Dense)            (None, 256)               51456     
                                                                 
 dropout_4 (Dropout)         (None, 256)               0         
                                                                 
 dense_13 (Dense)            (None, 128)               32896     
                                                                 
 dense_14 (Dense)            (None, 3)                 387       
                                                      

In [182]:
lstm.fit(np.array(x_train), np.array(y_train), epochs=60, verbose=1, batch_size=32, callbacks=[])

Epoch 1/60


In [None]:
lstm_score = lstm.evaluate(x_test, np.array(y_test), verbose=1)
lstm_acc = lstm_score[1]*100
print(f'Test accuracy: {lstm_acc}')

Test accuracy: 99.38271641731262
