In [8]:
import nltk
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC as SupportVectorClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import tensorflow as tf
from tensorflow.keras.layers import Dense, LSTM, Concatenate, Embedding, Input, Bidirectional, Dropout
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences

import sys
import re
import numpy as np
import pandas as pd
import random

from collections import deque, Counter
from itertools import permutations, repeat, combinations


In [3]:
empires = []
vectorizer = CountVectorizer()
min_sentence_length = 5
level_sample_sizes = True

In [4]:
with open("Resources/Data/EmpireText.txt", "r", encoding="utf8") as file:
    line = file.readline().replace("\n", "").split(" ")
    for empire in line:
        empires.append(empire)

In [5]:
text = []
labels = []
with open("Resources/Data/EmpireText.txt", "r", encoding="utf8") as file: 
    for line in file.readlines()[1:]:
        line = line.strip().replace("\n", "")
        line = re.sub("\[.{0,4}]", "", line) #remove wikipedia citings


        if line in empires:
            current_empire = line
            continue
        elif len(line.replace(" ", "")) < 15:
            continue
        elif line[-1] != ".":
            line += "."
    
        line = nltk.sent_tokenize(line)
        for sentence in line:
            text.append(sentence)
            labels.append(current_empire)
            
data = pd.DataFrame({"label":labels, "text":text})

# Paragraphing the text
Because the data came in paragraphs, many sentences have no coorelation with any empire, because of this, the models will not be able to a correct conclusion for these sentences. Combining several sentences into a paragraph, and using that paragraph will allow for a model to overcome this obstacle. It will also give us more training samples since were taking the permutations

In [6]:
dataframes = [df for _, df in data.groupby('label')]
print(dataframes)

[        label                                               text
1     Ottoman  Known as one of history’s most powerful empire...
1311  Ottoman  The empire’s success lay in its centralized st...
1312  Ottoman  But all empires that rise must fall, and six c...
1313  Ottoman  Osman I, a leader of a nomadic Turkic tribe fr...
1314  Ottoman  Around 1299, he declared himself supreme leade...
...       ...                                                ...
2253  Ottoman  In the 19th century, Ishak Efendi is credited ...
2254  Ottoman  The main sports Ottomans were engaged in were ...
2255  Ottoman  European model sports clubs were formed with t...
2256  Ottoman  The leading clubs, according to timeline, were...
2257  Ottoman  Football clubs were formed in other provinces ...

[948 rows x 2 columns],       label                                               text
0     Roman  The Roman Empire, the ancient empire, centred ...
4     Roman  A period of unrest and civil wars in the 1st c...
5    

In [14]:
paragraphs = []
labels = []
for df in dataframes:
    label = df['label'].tolist()[0]
    sentences = df['text']
    paragraph_permutations = list(permutations(sentences, 2))
    paragraphs.extend(paragraph_permutations)
    labels.extend(list(repeat(label, len(paragraph_permutations))))

data = pd.DataFrame({'label':labels, 'text': paragraphs})
    

# Data Visualization

In [15]:
data.head()

Unnamed: 0,label,text
0,Ottoman,(Known as one of history’s most powerful empir...
1,Ottoman,(Known as one of history’s most powerful empir...
2,Ottoman,(Known as one of history’s most powerful empir...
3,Ottoman,(Known as one of history’s most powerful empir...
4,Ottoman,(Known as one of history’s most powerful empir...


In [16]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4469400 entries, 0 to 4469399
Data columns (total 2 columns):
 #   Column  Dtype 
---  ------  ----- 
 0   label   object
 1   text    object
dtypes: object(2)
memory usage: 68.2+ MB


In [18]:
value_counts = data["label"].value_counts()
value_counts

Roman      1709556
Russian     985056
Ottoman     897756
Spanish     877032
Name: label, dtype: int64

# Text classification:
### The process of text classification is comprised of 4 main steps
#### - Preprocessing the text
#### - Encoding labels
#### - Vectorizing the text
#### - Training the model(s)
***

## Text preprocessing
### Steps:
- Lowercasing
- Tokenization
- POS tagging
- Lemmatization

Note: When tested, the models showed better results without the removal of stopwords

### Example:

In [22]:
example_paragraph = data['text'][5]
example_paragraph = " ".join(example_paragraph)
example_paragraph

'Known as one of history’s most powerful empires, the Ottoman Empire grew from a Turkish stronghold in Anatolia into a vast state that at its peak reached as far north as Vienna, Austria, as far east as the Persian Gulf, as far west as Algeria, and as far south as Yemen. The city named for Constantine, the first Christian emperor of Rome, then also became known as Istanbul (a version of stin polis, Greek for “in the city” or “to the city.”.'

In [26]:
lowercase_paragraph = example_paragraph.lower()
lowercase_paragraph

'known as one of history’s most powerful empires, the ottoman empire grew from a turkish stronghold in anatolia into a vast state that at its peak reached as far north as vienna, austria, as far east as the persian gulf, as far west as algeria, and as far south as yemen. the city named for constantine, the first christian emperor of rome, then also became known as istanbul (a version of stin polis, greek for “in the city” or “to the city.”.'

In [27]:
tokenized_paragraph = nltk.word_tokenize(lowercase_paragraph)
for word in tokenized_paragraph:
    print(f"'{word}'", end=", ")

'known', 'as', 'one', 'of', 'history', '’', 's', 'most', 'powerful', 'empires', ',', 'the', 'ottoman', 'empire', 'grew', 'from', 'a', 'turkish', 'stronghold', 'in', 'anatolia', 'into', 'a', 'vast', 'state', 'that', 'at', 'its', 'peak', 'reached', 'as', 'far', 'north', 'as', 'vienna', ',', 'austria', ',', 'as', 'far', 'east', 'as', 'the', 'persian', 'gulf', ',', 'as', 'far', 'west', 'as', 'algeria', ',', 'and', 'as', 'far', 'south', 'as', 'yemen', '.', 'the', 'city', 'named', 'for', 'constantine', ',', 'the', 'first', 'christian', 'emperor', 'of', 'rome', ',', 'then', 'also', 'became', 'known', 'as', 'istanbul', '(', 'a', 'version', 'of', 'stin', 'polis', ',', 'greek', 'for', '“', 'in', 'the', 'city', '”', 'or', '“', 'to', 'the', 'city.', '”', '.', 

In [28]:
tagged_paragraph = nltk.pos_tag(tokenized_paragraph)
for word in tagged_paragraph:
    print(word, end=', ')

('known', 'VBN'), ('as', 'IN'), ('one', 'CD'), ('of', 'IN'), ('history', 'NN'), ('’', 'NNP'), ('s', 'VBZ'), ('most', 'RBS'), ('powerful', 'JJ'), ('empires', 'NNS'), (',', ','), ('the', 'DT'), ('ottoman', 'NN'), ('empire', 'NN'), ('grew', 'VBD'), ('from', 'IN'), ('a', 'DT'), ('turkish', 'JJ'), ('stronghold', 'NN'), ('in', 'IN'), ('anatolia', 'NN'), ('into', 'IN'), ('a', 'DT'), ('vast', 'JJ'), ('state', 'NN'), ('that', 'WDT'), ('at', 'IN'), ('its', 'PRP$'), ('peak', 'NN'), ('reached', 'VBN'), ('as', 'IN'), ('far', 'RB'), ('north', 'JJ'), ('as', 'IN'), ('vienna', 'NN'), (',', ','), ('austria', 'RB'), (',', ','), ('as', 'IN'), ('far', 'RB'), ('east', 'JJ'), ('as', 'IN'), ('the', 'DT'), ('persian', 'JJ'), ('gulf', 'NN'), (',', ','), ('as', 'IN'), ('far', 'RB'), ('west', 'JJ'), ('as', 'IN'), ('algeria', 'NNS'), (',', ','), ('and', 'CC'), ('as', 'IN'), ('far', 'RB'), ('south', 'JJ'), ('as', 'IN'), ('yemen', 'NNS'), ('.', '.'), ('the', 'DT'), ('city', 'NN'), ('named', 'VBN'), ('for', 'IN'), ('

In [29]:
def get_pos(tag):    
    if tag.startswith('J'):
        return nltk.corpus.wordnet.ADJ
    elif tag.startswith('V'):
        return nltk.corpus.wordnet.VERB
    elif tag.startswith('N'):
        return nltk.corpus.wordnet.NOUN
    elif tag.startswith('R'):
        return nltk.corpus.wordnet.ADV
    else:
        return nltk.corpus.wordnet.NOUN

wnl = WordNetLemmatizer()
lemmatized_sentence = [wnl.lemmatize(word, get_pos(pos)) for word, pos in tagged_paragraph]
for word in lemmatized_sentence:
    print(f"'{word}'", end=", ")

'know', 'a', 'one', 'of', 'history', '’', 's', 'most', 'powerful', 'empire', ',', 'the', 'ottoman', 'empire', 'grow', 'from', 'a', 'turkish', 'stronghold', 'in', 'anatolia', 'into', 'a', 'vast', 'state', 'that', 'at', 'it', 'peak', 'reach', 'a', 'far', 'north', 'a', 'vienna', ',', 'austria', ',', 'a', 'far', 'east', 'a', 'the', 'persian', 'gulf', ',', 'a', 'far', 'west', 'a', 'algeria', ',', 'and', 'a', 'far', 'south', 'a', 'yemen', '.', 'the', 'city', 'name', 'for', 'constantine', ',', 'the', 'first', 'christian', 'emperor', 'of', 'rome', ',', 'then', 'also', 'become', 'know', 'a', 'istanbul', '(', 'a', 'version', 'of', 'stin', 'polis', ',', 'greek', 'for', '“', 'in', 'the', 'city', '”', 'or', '“', 'to', 'the', 'city.', '”', '.', 

### Application

In [32]:
sentences = []
labels = []
wnl = WordNetLemmatizer()
for row in data.iterrows():
    label = row[1]['label']
    text = row[1]['text']
    text = " ".join(text)
    text = nltk.word_tokenize(text)
    text = nltk.pos_tag(text)
    text = [wnl.lemmatize(word, get_pos(pos)) for word, pos in text]
    sentences.append(text)
    labels.append(label)

data = pd.DataFrame({"label":labels, "text":sentences})

In [34]:
data.head()

Unnamed: 0,label,text
0,Ottoman,"[Known, a, one, of, history, ’, s, most, power..."
1,Ottoman,"[Known, a, one, of, history, ’, s, most, power..."
2,Ottoman,"[Known, a, one, of, history, ’, s, most, power..."
3,Ottoman,"[Known, a, one, of, history, ’, s, most, power..."
4,Ottoman,"[Known, a, one, of, history, ’, s, most, power..."


## Label Encoding

In [35]:

labels = []
for label in data['label']:
    labels.append(empires.index(label))
data['label'] = labels

In [36]:
data.head()

Unnamed: 0,label,text
0,1,"[Known, a, one, of, history, ’, s, most, power..."
1,1,"[Known, a, one, of, history, ’, s, most, power..."
2,1,"[Known, a, one, of, history, ’, s, most, power..."
3,1,"[Known, a, one, of, history, ’, s, most, power..."
4,1,"[Known, a, one, of, history, ’, s, most, power..."


## Extra step: Equalizing the number of samples
Data with varied distribution will perform worse unless accounted for

In [37]:
label_counts = Counter(data["label"])
max_len = min(label_counts.values())
data = data.sample(frac=1).groupby('label').head(max_len)
labels = data['label']
text = data['text']
value_counts = data['label'].value_counts()
value_counts

0    877032
1    877032
2    877032
3    877032
Name: label, dtype: int64

## Text Vectorization

In [38]:
vectorizer = CountVectorizer()
text = []
for sentence in data['text']:
    text.append(" ".join(sentence))
vectorizer.fit(text)
text = vectorizer.transform(text)

In [39]:
vectorized_data = pd.DataFrame({"label":labels, "text":text})
vectorized_data.head()

KeyboardInterrupt: 

## Training the models

In [40]:
uniques = set()
for sentence in data['text']:
    for word in sentence:
        uniques.add(word)
num_uniques = len(uniques)

In [41]:
x_train, x_test, y_train, y_test = train_test_split(text, labels, test_size=.1, random_state=0)

#### Decision Tree

In [42]:
decision_tree = DecisionTreeClassifier(random_state=2, max_depth=150, max_features=1000)
decision_tree.fit(x_train, y_train)
decision_tree_score = decision_tree.score(x_test, y_test)
decision_tree_score

0.9993956894413834

#### Random Forest

In [None]:
random_forest = RandomForestClassifier(random_state=0, n_estimators=300, max_depth=150, max_features=1000)
random_forest.fit(x_train, y_train)
random_forest_score = random_forest.score(x_test, y_test)
random_forest_score

0.8235294117647058

#### Support Vector Machine

In [43]:
best_c = 0.1
best_kernel = "linear"
support_vector_machine = SupportVectorClassifier(kernel=best_kernel, C=best_c)
support_vector_machine.fit(x_train, y_train)
support_vector_machine_score = support_vector_machine.score(x_test, y_test)
support_vector_machine_score

### Long Short Term Memory

Sidenote: while one could convert x_Train from SparseTensor to array it causes the lstm model to have low accuracy and take 20 min per epoch, so its better to just format the data with tensorflow methods to convert it to a format that tensorflow is better equipped to handle.

In [None]:
text = data['text']
labels = data['label']
longest_sentence = max(text, key=lambda x: len(x))
encoded_sentences = [one_hot(" ".join(sentence), num_uniques) for sentence in text]
padded_sequences = pad_sequences(encoded_sentences, maxlen=len(longest_sentence), padding='post')

def encode_labels(labels):
    output = []
    for label in labels:
        label_array = np.zeros(len(empires))
        label_array[label] = 1    
        output.append(label_array)
    return output

labels = encode_labels(labels)
x_train, x_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=.1, random_state=85)
print("input example:", x_train[0])
print("label example:", y_train[0])

input example: [8438 6972 8143 2069 7777 9294 5225  427 3924  555 2612 7586 3226 9440
 2612 9395 2686 6797 5225 8126 3924 1931  665 2069 6344 3226 7690    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0]
label example: [0. 1. 0. 0.]


In [None]:
lstm = Sequential()
lstm.add(Embedding(num_uniques, 32))
lstm.add(Bidirectional(LSTM(100)))
lstm.add(Dense(256, activation="relu"))
lstm.add(Dropout(0.5))
lstm.add(Dense(128))
lstm.add(Dense(4, activation="sigmoid"))
optimizer = Adam(learning_rate=0.03)
lstm.compile(optimizer="adam", loss='categorical_crossentropy', metrics=["accuracy"])
lstm.summary()


Model: "sequential_29"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_29 (Embedding)    (None, None, 32)          316192    
                                                                 
 bidirectional_29 (Bidirecti  (None, 200)              106400    
 onal)                                                           
                                                                 
 dense_81 (Dense)            (None, 256)               51456     
                                                                 
 dropout_26 (Dropout)        (None, 256)               0         
                                                                 
 dense_82 (Dense)            (None, 128)               32896     
                                                                 
 dense_83 (Dense)            (None, 4)                 516       
                                                     

In [None]:
lstm.fit(np.array(padded_sequences), np.array(labels), epochs=5, verbose=1, batch_size=128, callbacks=[])
lstm_score = lstm.evaluate(x_test, np.array(y_test), verbose=1)[1]

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


## Ensemble learning - Stacking
Stacking is a technique that allows you to use the predictions of several machine learning models/algorithms by training a neural network to combine the ouputs of 