## Imports

In [None]:
import numpy as np

## Converting the text to vectors
First we will convert each text to a feature vector. Specifically, we will use a feature matrix $X$ where $X_{ij}$ is the number of occurrences of the $j$-th word (based on our word mapping) in the $i$-th text.

For the word mapping, we will first use the top 10,000 words in the English dictionary, then add additional words as we scan the texts, up to a certain limit.

### Parse initial words
First, a list of stop words is parsed. These stop words will be removed from consideration.

In [None]:
# Tunable settings for the vectorisation step.
data_len_per_class: int = 500      # lines read from each class file
max_words_to_consider: int = 500   # vocabulary cap; also the feature-vector width
max_word_len: int = 15             # words this long or longer are rejected
# Forward slashes on purpose: "\g" and "\s" are invalid escape sequences in an
# ordinary string literal, and a backslash separator only resolves on Windows.
initial_words_path: str = "dataset/google-10000-english.txt"
stop_words_path: str = "dataset/stop_words_english.txt"


def parse_initial_words(
    destination: dict,
    path: str,
    add_undefined_token: bool = False,
    filter_by: dict = None,
    limit_len: int = -1,
    max_len: int = None,
) -> None:
    """Load a one-word-per-line file into ``destination`` as ``word -> index``.

    Each accepted word is stored with the current size of ``destination`` as
    its index, so indices stay unique and contiguous.

    Args:
        destination: Mapping that is filled in place.
        path: UTF-8 text file containing one word per line.
        add_undefined_token: When true, ``"$$$"`` is stored at index 0 before
            the file is read.
        filter_by: Words present in this container are skipped. ``None`` (the
            default) means no filtering.
        limit_len: Stop once ``destination`` holds this many entries; ``-1``
            disables the cap.
        max_len: Words of this length or longer are rejected. Defaults to the
            module-level ``max_word_len``.
    """
    # Avoid a shared mutable default argument.
    if filter_by is None:
        filter_by = {}
    if max_len is None:
        max_len = max_word_len

    if add_undefined_token:
        destination["$$$"] = 0

    print("Parsing words")
    with open(path, 'r', encoding='utf-8') as file:
        for line in file:
            # Once the cap is reached nothing more can be added, so stop
            # reading instead of scanning the rest of the file.
            if limit_len > -1 and len(destination) >= limit_len:
                break

            word: str = line.strip()

            # Single characters, blank lines and over-long words are rejected.
            if len(word) <= 1 or len(word) >= max_len:
                continue

            if word in filter_by:
                continue

            # A repeated word must keep its first index; re-assigning
            # len(destination) would hand the same index to the next new word.
            if word in destination:
                continue

            destination[word] = len(destination)

    print("parsing succesfull")

# Lookup tables filled in place: the stop-word set and the vocabulary
# (word -> feature index).
stop_words: dict = dict()
words_to_consider: dict = dict()

parse_initial_words(destination=stop_words, path=stop_words_path, add_undefined_token=False)
# Seeding the vocabulary from the common-words list is currently disabled:
#parse_initial_words(words_to_consider, initial_words_path, True, stop_words, max_words_to_consider)

print(len(words_to_consider))

### Initialize all the pairs
Initialize the working pairs. Each pair is denoted as [first, second], where the first class is labelled 1 and the second class is labelled -1.

In [None]:
import numpy as np

# Directory holding one "<class name>.txt" training file per class.
train_path: str = "train/"
# Class names; a class is identified by its position in this list.
c: list = ["anger", "fear", "joy", "sadness"]
total_class: int = len(c)
# Number of unordered class pairs, n * (n - 1) / 2, kept as an exact integer.
total_pairs: int = total_class * (total_class - 1) // 2

# Row k holds the class indices (i, j), i < j, of the k-th pair, in
# lexicographic order. The reshape keeps the (total_pairs, 2) shape even when
# there are no pairs at all.
pairs: np.ndarray = np.array(
    [(i, j) for i in range(total_class - 1) for j in range(i + 1, total_class)],
    dtype=np.int8,
).reshape(total_pairs, 2)


### Parse all data to vectors
Read the content of each class's training data line by line, then convert it into a tensor.
Increase the length of `words_to_consider` as we read, up to a certain limit.

In [None]:
import re

# Word counts indexed as [class, line within the class file, vocabulary index].
# np.zeros produces float64 unless told otherwise, so the annotation names the
# array type and the dtype is spelled out rather than implying integer storage.
vectorized: np.ndarray = np.zeros(
    shape=(total_class, data_len_per_class, max_words_to_consider),
    dtype=np.float64,
)

def remove_trailing_special_characters(word):
    """Return ``word`` with any trailing run of non-ASCII-letter characters cut off."""
    tail = re.search(r'[^a-zA-Z]+$', word)
    if tail is None:
        # Already ends in a letter (or is empty): nothing to trim.
        return word
    return word[:tail.start()]

# Checks text word by word. If a word is not in words_to_consider we add it,
# unless it is a stop word, the vocabulary is already full, or the word is a
# bad candidate (too short or too long). We then update the occurrence count
# of the word in its corresponding matrix.
def process_text(line: str, i: int, line_idx: int) -> None:
    """Accumulate the word counts of one line into ``vectorized[i, line_idx]``.

    Args:
        line: Raw text line; it is split on whitespace.
        i: Index along the first axis of ``vectorized`` (the class).
        line_idx: Index along the second axis of ``vectorized`` (the line).

    Side effects:
        May add new words to ``words_to_consider`` and increments entries of
        ``vectorized``.
    """
    next_index: int = len(words_to_consider)

    for raw_word in line.split():
        word: str = remove_trailing_special_characters(raw_word)

        if word not in words_to_consider:
            # Stop words never enter the vocabulary. The previous check here
            # re-tested words_to_consider and could never be true.
            if word in stop_words:
                continue

            # Vocabulary is full: unknown words are ignored from now on.
            if next_index >= max_words_to_consider:
                continue

            # Reject empty/one-character tokens and over-long words.
            if len(word) <= 1 or len(word) >= max_word_len:
                continue

            words_to_consider[word] = next_index
            next_index += 1

        vectorized[i, line_idx, words_to_consider[word]] += 1
    
print("Converting training data to vectors...")
for class_idx in range(total_class):
    # Each class has its own text file named after the class.
    data_path: str = f"{train_path}{c[class_idx]}.txt"

    with open(data_path, 'r', encoding='utf-8') as file:
        # Pairing with range() stops after data_len_per_class lines.
        for line_idx, line in zip(range(data_len_per_class), file):
            process_text(line, class_idx, line_idx)

print("Conversion complete...")
print("Size of vector", vectorized.shape)
print("Total words to consider: ", len(words_to_consider))

### Train each pair
Train each pair using the modified SMO class

In [None]:
%run SMO.py

In [None]:
# Disabled experiment: the whole cell is a single bare string literal, so none
# of the statements inside it are executed.
# NOTE(review): this call passes no target vector, unlike the
# SMO_GAUSSIAN(train_x, target, M, 2, ...) call used later in this notebook.
# Confirm the constructor signature before re-enabling.
"""smo = SMO_GAUSSIAN(np.concatenate((vectorized[0], vectorized[1]), axis=0), data_len_per_class * 2, max_words_to_consider, c=.5, log=True)
smo.smo_train()

print(smo.accuracy())"""

In [None]:
from matplotlib import pyplot as plt
from sklearn.preprocessing import StandardScaler

# Fixed seed so the synthetic data set is the same on every run.
np.random.seed(200)

n = 3000    # points generated per class
D = 2       # the synthetic data is two-dimensional
M = 2 * n   # total number of samples across both classes

def gen_circle(n, center_x=0, center_y=0, radius=1, label=0):
    """Draw ``n`` random points inside a disc and tag each one with ``label``.

    The angle of each point is ``2*pi*u`` and its distance from the centre is
    ``radius*sqrt(v)``, with ``u`` and ``v`` taken from ``np.random.rand``
    (angles are drawn first, then radii).

    Returns:
        The list ``[x, y, labels]`` of three length-``n`` arrays.
    """
    angles = np.random.rand(n) * (2 * np.pi)
    radii = np.sqrt(np.random.rand(n)) * radius
    labels = label * np.ones(n)
    xs = center_x + radii * np.cos(angles)
    ys = center_y + radii * np.sin(angles)
    return [xs, ys, labels]

# Two discs of radius 1 centred at (+0.7, 0) and (-0.7, 0), labelled +1 and -1.
C0 = gen_circle(n, .7, 0, 1, 1)
C1 = gen_circle(n, -.7, 0, 1, -1)

# Join the two classes component by component: x coordinates, y coordinates, labels.
x0, x1, Y = (np.append(first, second) for first, second in zip(C0, C1))

X = np.c_[x0, x1]

# Standardise the features; the fitted scaler is kept for later reuse.
scaler = StandardScaler().fit(X)
train_x = scaler.transform(X)

# Main function
point, target = train_x, Y
model = SMO_GAUSSIAN(train_x, target, M, 2, c=1, log=True)
model.smo_train()

# Model outputs for the training points; their sign is used as the predicted
# class further down.
train_y = model.predict(point, new=False)

# Support vectors: indices of the samples whose alpha exceeds the threshold.
sv_threshold = 0
sv_idx = [idx for idx, alpha in enumerate(model.alphs) if alpha > sv_threshold]

# Bias term of the trained model
print(f'bias = {model.B}')

# Fraction of training points whose predicted sign differs from the target
train_y_sign = np.sign(train_y)
mismatches = train_y_sign != target
error_rate = np.mean(mismatches)
print('training data error rate = {:.2f}'.format(error_rate))

# Draw the plot: both classes in their original (unscaled) coordinates.
plt.plot(C0[0], C0[1], 'o', markerfacecolor='r', markeredgecolor='None', alpha=0.55)
plt.plot(C1[0], C1[1], 'o', markerfacecolor='b', markeredgecolor='None', alpha=0.55)

# Regular grid spanning the bounding box of the data.
resolution = 50
dx = np.linspace(X[:, 0].min(), X[:, 0].max(), resolution)
dy = np.linspace(X[:, 1].min(), X[:, 1].max(), resolution)
dx, dy = np.meshgrid(dx, dy)
plot_x = np.c_[dx.flatten(), dy.flatten()]

# The model was trained on scaled inputs, so the grid goes through the same
# fitted scaler before prediction.
transformed_plot_x = scaler.transform(plot_x)
dz = model.predict(transformed_plot_x, True)
dz = dz.reshape(dx.shape)

# Contour lines where the model output equals -1, 0 and 1.
plt.contour(dx, dy, dz, alpha=1, colors=('b', 'k', 'r'),
            levels=(-1, 0, 1), linestyles=('--', '-', '--'))

# Mark every support vector with a single scatter call. One call per point
# creates one artist per point, which is very slow for thousands of points.
# The guard keeps an empty selection from adding a blank legend entry.
if len(sv_idx) > 0:
    plt.scatter(X[sv_idx, 0], X[sv_idx, 1], marker='*', color='k',
                s=120, label='Support vector')

plt.legend()
plt.show()