## Imports

In [7]:
import numpy as np

## Converting the text to vectors
First we will convert each text to feature vectors. Specifically we will use a feature vector $X$ where $X_{ij}$ corresponds to the number of occurence of the $j'th$ word (based on our word mapping) in the $i'th$ text

For the word mapping, we will first use the top 10,000 words in the english dictionary then add additional words as we scan the texts up to a certain limit. 

### Parse initial words
First a list of stop words we're parsed. This stop words will be removed from consideration

In [8]:
data_train_len = 100
data_test_len  = 100

max_words_to_consider: int = 5
max_word_len:       int = 15
initial_words_path: str = "dataset\google-10000-english.txt"
stop_words_path   : str = "dataset\stop_words_english.txt"

def parse_initial_words(destination: dict, path: str, add_undefined_token: bool=False, filter_by: dict = {}, limit_len: int = -1) -> None:
    if add_undefined_token:
        destination["$$$"] = 0
    
    print("Parsing words")
    with open(path, 'r', encoding='utf-8') as file:
        for line in file:
            word: str = line.strip()
            if (limit_len > -1 and len(destination) == limit_len):
                continue
            
            if (len(word) <= 1 or len(word) >= max_word_len):
                continue
            
            if (word in filter_by):
                continue

            destination[word] = len(destination)

    print("parsing succesfull")

words_to_consider: dict = {}
stop_words       : dict = {}

parse_initial_words(stop_words, stop_words_path, False)
#parse_initial_words(words_to_consider, initial_words_path, True, stop_words, max_words_to_consider)

print(len(words_to_consider))

Parsing words
parsing succesfull
0


  initial_words_path: str = "dataset\google-10000-english.txt"
  stop_words_path   : str = "dataset\stop_words_english.txt"


### Initialize all the pairs
Initialize the working pairs. Pairs are denoted as [first, second] where first = 1 and second = -1

In [9]:
import numpy as np

train_path: str = "train/"
test_path:  str = "test/"
c: str = ["anger", "fear", "joy", "sadness"]
total_class = len(c)
total_pairs: int = int((total_class * (total_class - 1))/2)
pairs: np.int8 = np.zeros(shape=(total_pairs, 2), dtype=np.int8)

at: int = 0
for i in range(total_class - 1):
    for j in range(i + 1, total_class):
        pairs[at, 0] = i
        pairs[at, 1] = j
        at += 1

### Parse training and test data to vectors
Read content of each training data per classification line by line then convert them to tensor.
Incrase the length of words_to_consider as we read up to a certain limit

In [10]:
import re

v_train: np.int32 = np.zeros(shape=(total_class, data_train_len, max_words_to_consider))
v_test:  np.int32 = np.zeros(shape=(total_class, data_test_len, max_words_to_consider))

def remove_trailing_special_characters(word):
    return re.sub(r'[^a-zA-Z]+$', '', word)

# Checks text word by word. If a word not in word_to_consider then we add it
# unless we already exceeded the max words to consider, or the word is not a
# bad candidate. We then update the occurence of the word in its corresponding matrix
def process_text(destination: np.int32, line: str, i: int, line_idx: int, add_new:bool=True):
    D: int= len(words_to_consider)

    for word in line.split():
        word: str = remove_trailing_special_characters(word)

        if word not in words_to_consider:
            if len(word) <= 1 or len(word) >= max_word_len:
                continue

            if D >= max_words_to_consider:
                continue

            if not add_new:
                continue
             
            words_to_consider[word] = D
            D += 1

        destination[i, line_idx, words_to_consider[word]] += 1

def conv_data_to_vectors(destination: np.int32, source_path: str, data_len: np.int32, name: str, add_new: bool=True):
    print(f'Converting {name} data to vectors...')
    for i in range(total_class):
        data_path: str = source_path + c[i] + ".txt"
        
        with open(data_path, 'r', encoding='utf-8') as file:
            line_idx: int = 0
            for line in file:
                if (line_idx == data_len):
                    break
                
                process_text(destination, line, i, line_idx, add_new)
                line_idx += 1

conv_data_to_vectors(v_train, train_path, data_train_len, "train")
conv_data_to_vectors(v_test, test_path, data_test_len, "test", add_new=False)

print("Conversion complete...")
print("Total words to consider: ", len(words_to_consider))
print(words_to_consider)

Converting train data to vectors...
Converting test data to vectors...
Conversion complete...
Total words to consider:  5
{'feel': 0, 'like': 1, 'jerk': 2, 'because': 3, 'the': 4}


### Train each pair
Train each pair using the modified SMO class

In [11]:
%run SMO.py

In [12]:
class_pairs: list = []


# A mapping technique such that we can store our class_pairs in a 1 dimensional array. 
def map_to_pos(size: int, i: int, j: int):
    return ((2*size - i - 3)*i + 2*(j - 1))/2

target: np.float64 = np.concatenate((np.full(shape=data_train_len, fill_value=1), np.full(shape=data_train_len, fill_value=-1)), axis=0)

# Loop through all the pairs and create a class
for i in range(total_class):
    for j in range(i + 1, total_class):
        class_pairs.append(SMO_GAUSSIAN(np.concatenate((v_train[0], v_train[1]), axis=0),
                                         target, data_train_len*2, max_words_to_consider, c=.5, log=False))

# call the class from each pair 
for i in range(total_pairs):
    print("computing for pair: ", i)
    class_pairs[i].smo_train()

# store the resulting alpha and beta of each class
alpha_res: np.float64 = np.zeros(shape=(total_pairs, data_test_len*2), dtype=np.float64)
beta_res : np.float64 = np.zeros(shape=(total_pairs), dtype=np.float64)

for i in range(total_pairs):
    alpha_res[i] = class_pairs[i].alphs
    beta_res[i]  = class_pairs[i].B

print(alpha_res)
print(beta_res)

# save result


total iterations:  0
total iterations:  2
total iterations:  4
total iterations:  6
total iterations:  8
total iterations:  10
total iterations:  12
total iterations:  14
total iterations:  16
total iterations:  18
total iterations:  20
total iterations:  22
total iterations:  24
total iterations:  26
total iterations:  28
total iterations:  30
total iterations:  32
total iterations:  34
total iterations:  36
total iterations:  38
total iterations:  40
total iterations:  42
total iterations:  44
total iterations:  46
total iterations:  48
total iterations:  50
total iterations:  52
total iterations:  54
total iterations:  56
total iterations:  58
total iterations:  60
total iterations:  62
total iterations:  64
total iterations:  66
total iterations:  68
total iterations:  70
total iterations:  72
total iterations:  74
total iterations:  76
total iterations:  78
total iterations:  80
total iterations:  82
total iterations:  84
total iterations:  86
total iterations:  88
total iteration

TypeError: array() got an unexpected keyword argument 'shape'

### Save the progress
Save the alphas and betas to a file

### Test each resulting Pair
Given the alphas and betas/treshold of each pair, we will then test each pair on their corresponding test data

In [None]:
from numba import jit, prange

# uses jit and parallelization for faster computation of predict
@jit(nopython=True, parallel=True)
def predict(alphs, point, B, sigma, x, y):
    correct = 0

    for i in prange(len(x)):
        norms = np.zeros(len(point))
        for j in range(len(point)):
            norm = 0.0
            for k in range(len(point[j])):
                diff = point[j, k] - x[i, k]
                norm += diff * diff
            norms[j] = norm
        fx = (alphs * y) @ np.exp(-(norms) / (2 * sigma**2)) + B
        if y[i] == 1:
            correct += fx >= 0
        else:
            correct += fx < 0
    
    return correct / len(x)

In [None]:
"""train_res = smo.accuracy()
test_y: np.float64 = np.concatenate((np.full(shape=data_test_len, fill_value=1), np.full(shape=data_test_len, fill_value=-1)), axis=0)
test_res = predict(smo.alphs, smo.point, smo.B, smo.sigma, np.concatenate((v_test[0], v_test[1]), axis=0), test_y)"""

In [None]:
"""from matplotlib import pyplot as plt

plt.figure(figsize=(8, 6))
plt.bar(["test", "train"], [test_res, train_res], color='skyblue')

plt.xlabel('data')
plt.ylabel('accuracy')
plt.title(f'test vs train accuracy | {data_train_len} vs {data_test_len}')

plt.legend()
plt.show()"""

In [None]:
"""from matplotlib import pyplot as plt
from sklearn.preprocessing import StandardScaler

np.random.seed(200)

n = 200
M = n*2     # Overwrites the above M to make this wor
D = 2       # Ensures dimensions are 2

def gen_circle(n, center_x=0, center_y=0, radius=1, label=0):
    alpha = 2 * np.pi * np.random.rand(n)
    r = radius * np.sqrt(np.random.rand(n))
    x = r * np.cos(alpha) + center_x
    y = r * np.sin(alpha) + center_y
    label = np.ones(n) * label
    return [x, y, label]

def predict(model, x):
        res = np.zeros(len(x), dtype=np.float64)

        for i in range(len(x)):
            fx = (model.alphs * model.target) @ np.exp(-(np.linalg.norm(model.point - x[i], 2, axis=1) ** 2) 
                                                       / (2 * model.sigma ** 2)) + model.B
            res[i] = fx
        
        return res

C0 = gen_circle(n, center_x=1, center_y=0, radius=.8, label=1)
C1 = gen_circle(n, center_x=-1, center_y=0, radius=.8, label=-1)

x0 = np.append(C0[0], C1[0])
x1 = np.append(C0[1], C1[1])

X = np.c_[x0, x1]
Y = np.append(C0[2], C1[2])

scaler = StandardScaler()
train_x = scaler.fit_transform(X)

# Main function
point = train_x
target = Y.astype(np.int32)
model = SMO_GAUSSIAN(train_x, target, M, 2, c=1, log=True)
model.smo_train()

# Prediction in vector
train_y = predict(model, point)
#print('support vector: {} / {}'\
    #.format(len(model['alphs'][model['alphs'] > 1e-5]), len(model['alphs'])))

# Gathers the support vectors (non-zero alpha)
sv_threshold = 0
sv_idx = []
for idx, alpha in enumerate(model.alphs):
    if alpha > sv_threshold:
        #print('index = {}, alpha = {:.3f}, predict y={:.3f}'\
            #.format(idx, alpha, train_y[idx]))
        
        sv_idx.append(idx)

# Threshold
print(f'bias = {model.B}')
# Error rate
train_y_sign = np.sign(train_y)
error_rate = np.mean(train_y_sign != target)
print('training data error rate = {:.2f}'.format(error_rate))

# Draw the Plot
plt.plot(C0[0], C0[1], 'o', markerfacecolor='r', markeredgecolor='None', alpha=0.55)
plt.plot(C1[0], C1[1], 'o', markerfacecolor='b', markeredgecolor='None', alpha=0.55)

resolution = 50
dx = np.linspace(X[:, 0].min(), X[:, 0].max(), resolution)
dy = np.linspace(X[:, 1].min(), X[:, 1].max(), resolution)
dx, dy = np.meshgrid(dx, dy)
plot_x = np.c_[dx.flatten(), dy.flatten()]

transformed_plot_x = scaler.transform(plot_x)
dz = predict(model, transformed_plot_x)
dz = dz.reshape(dx.shape)

plt.contour(dx, dy, dz, alpha=1, colors=('b', 'k', 'r'), \
            levels=(-1, 0, 1), linestyles = ('--', '-', '--'))

label_cnt = 0
for i in sv_idx:
    if label_cnt == 0:
        plt.scatter(X[i, 0], X[i, 1], marker='*', color='k', \
                    s=120, label='Support vector')
        label_cnt += 1
        continue

    plt.scatter(X[i, 0], X[i, 1], marker='*', color='k', s=120)

plt.legend()
plt.show()"""