# Preprocessing Section
---

In [1]:
import hazm
import numpy as np
import pickle as pkl

### Reading CSV Data as Dictionary

In [2]:
import csv

# Load the training CSV into a list of dicts, one dict per row, keyed by the
# column headers.
data = []
# The encoding is given explicitly so that non-ASCII text is decoded the same
# way on every platform; newline="" is what the csv module asks for so that
# newlines embedded in quoted fields are handled by the reader itself.
with open("nlp_train.csv", "r", encoding="utf-8", newline="") as f:
    reader = csv.DictReader(f)
    for row in reader:
        # Drop the column whose header is empty (presumably an exported row
        # index -- TODO confirm); tolerate files that do not have it.
        row.pop("", None)
        data.append(row)

### Normalization Process

In [3]:
# Rewrite the "Text" field of every row in place by running it through the
# hazm normalizer: four explicit clean-up passes followed by the general
# normalize() call, applied in this fixed order.
normalizer = hazm.Normalizer()
cleanup_steps = (
    normalizer.remove_specials_chars,
    normalizer.remove_diacritics,
    normalizer.decrease_repeated_chars,
    normalizer.seperate_mi,
    normalizer.normalize,
)
for row in data:
    cleaned = row["Text"]
    for step in cleanup_steps:
        cleaned = step(cleaned)
    row["Text"] = cleaned



### Stemming and Tokenizing
__Stop words__ and __punctuation__ are excluded, and every token that starts with a __digit__ is unified into a single placeholder token.

In [4]:
# Read the stop-word list (one word per line, surrounding whitespace removed)
# into a set for fast membership tests. The encoding is explicit so the file
# is decoded identically on every platform.
with open("stop_words.txt", "r", encoding="utf-8") as f:
    stop_words = {word.strip() for word in f}

# Keep a pickled copy of the set; the context manager guarantees the file is
# flushed and closed instead of leaving the handle to the garbage collector.
with open("stop_words.pkl", "wb") as f:
    pkl.dump(stop_words, f)

# Characters regarded as punctuation. The literal is the same one the filter
# has always used; turning it into a set makes the check per character, so a
# token is dropped when it consists only of these characters (e.g. "!!" or
# "?!"), not merely when it happens to be a contiguous piece of the literal.
PUNCTUATION_CHARS = set(r"...[]\\;:,،()\?!{}<>#$\*-_")
# Placeholder that replaces every token starting with a digit.
DIGIT_TOKEN = r"%d"

stemmer = hazm.Stemmer()
# One entry per document: {"Tokens": {token: count}, "Category": label}.
tokenized_data = []
for row in data:
    occurance_dict = {}
    for token in hazm.word_tokenize(row["Text"]):
        token = stemmer.stem(token)
        # Skip tokens that stemmed to nothing and stop words.
        if not token or token in stop_words:
            continue
        # Skip tokens made up purely of punctuation characters.
        if all(char in PUNCTUATION_CHARS for char in token):
            continue
        # str.isdigit() is true for ASCII digits and for the Persian digits,
        # so a single check covers both.
        if token[0].isdigit():
            token = DIGIT_TOKEN
        occurance_dict[token] = occurance_dict.get(token, 0) + 1
    tokenized_data.append({"Tokens": occurance_dict, "Category": row["Category"]})

### Extracting Features
Collecting the unique tokens that occurred at least twice within a single document

In [9]:
# A token becomes a feature if it occurs at least twice in at least one
# document.
features = {
    token
    for row in tokenized_data
    for token, count in row["Tokens"].items()
    if count >= 2
}
# Sort so the column order of the document-term matrix is the same on every
# run; iterating a set of strings can yield a different order per process.
features = sorted(features)

In [11]:
def vectorize(sentence):
    """Turn one tokenized document into a count vector and a binary label.

    Args:
        sentence: dict with a "Tokens" mapping (token -> count) and a
            "Category" value.

    Returns:
        A tuple ``(vector, cat_value)``. ``vector`` is an int8 array with one
        entry per item of the module-level ``features`` list, holding that
        token's count in the document (0 when absent), capped at 127.
        ``cat_value`` is 0 when the category is "Sport" and 1 otherwise.
    """
    counts = sentence["Tokens"]
    # Counts are clipped to 127 so they fit into the int8 dtype used below.
    clipped = [min(counts.get(term, 0), 127) for term in features]
    if sentence["Category"] == "Sport":
        cat_value = 0
    else:
        cat_value = 1
    return np.array(clipped, dtype=np.int8), cat_value

### Creating Document-Term Matrix and Category Vector
Document-Term Matrix (dtm) is a matrix where each row is a document and each column is a feature.

Each cell is the number of occurrences of a feature in a document (capped at 127).

In [12]:
# Vectorize every document once, then split the (vector, label) pairs into
# the document-term matrix and the matching category vector.
vectorized_rows = [vectorize(document) for document in tokenized_data]
dtm = np.array([vector for vector, _ in vectorized_rows])
cat_vector = np.array([label for _, label in vectorized_rows])

### Saving Document-Term Matrix, Categories Vector and Features
Suitable for later use

In [13]:
# Persist the matrix, the labels and the feature list.
np.save("document-term_mat.npy", dtm)
np.save("category_vector.npy", cat_vector)
# The context manager closes (and therefore flushes) the pickle file
# deterministically instead of leaving an open handle behind.
with open("features.pkl", "wb") as features_file:
    pkl.dump(features, features_file)

### Fitting a PCA to the Document-Term Matrix

In [14]:
from sklearn.decomposition import PCA
# Reduce the document-term matrix to at most 200 components. PCA cannot
# produce more components than min(n_samples, n_features), so the target is
# clamped for small corpora. random_state pins the result when scikit-learn
# picks a randomized SVD solver, making the saved projection reproducible.
n_components = min(200, *dtm.shape)
pca = PCA(n_components=n_components, copy=False, random_state=0)
pca.fit(dtm)

### Saving the PCA

In [15]:
# Write the fitted PCA to disk; the context manager closes the file so the
# pickle is fully flushed before anything tries to read it back.
with open("pca.pkl", "wb") as pca_file:
    pkl.dump(pca, pca_file)

# Training Section
---
Using a Logistic Regression classifier

### Loading Document-Term Matrix and PCA

In [29]:
import numpy as np
import pickle as pkl

# Reload the artefacts written by the preprocessing section.
dtm = np.load("document-term_mat.npy")
cat_vector = np.load("category_vector.npy")
# NOTE: unpickling executes code embedded in the file -- only load pickles
# that this notebook produced itself.
with open("pca.pkl", "rb") as pca_file:
    pca = pkl.load(pca_file)
with open("features.pkl", "rb") as features_file:
    features = pkl.load(features_file)

### Shuffling Document-Term Matrix and Category Vector

In [16]:
# Shuffle rows and labels with the same permutation. The generator is seeded
# so the train/test split is identical on every run: training can resume from
# weights saved by an earlier run, and with an unseeded shuffle rows that are
# test rows now may have been training rows then.
SHUFFLE_SEED = 42
perm = np.random.default_rng(SHUFFLE_SEED).permutation(len(dtm))
dtm = dtm[perm]
cat_vector = cat_vector[perm]

#### Some parameters

In [17]:
# 80% of the rows are used for training; everything that is left is the test
# set. Deriving n_test by subtraction (instead of truncating 20% separately)
# guarantees that no row is silently dropped by the two roundings.
n_train = int(len(dtm) * 0.8)
n_test = len(dtm) - n_train
# Dimensionality of the PCA-projected inputs the model is trained on.
n_features = pca.n_components_

### Logistic Regression Functions

In [18]:
def sigmoid(x):
    """Logistic function 1 / (1 + exp(-x)), evaluated without overflow.

    Args:
        x: a scalar or array-like of real numbers.

    Returns:
        The element-wise sigmoid: a NumPy scalar for scalar input, an array
        of the same shape for array input.
    """
    x = np.asarray(x)
    # exp() is only ever evaluated on non-positive values, so it cannot
    # overflow no matter how large |x| is.
    decay = np.exp(-np.abs(x))
    # For x >= 0 use 1 / (1 + e^-x); for x < 0 use the algebraically equal
    # e^x / (1 + e^x).
    result = np.where(x >= 0, 1 / (1 + decay), decay / (1 + decay))
    # Indexing with () turns a 0-d result back into a NumPy scalar and leaves
    # real arrays untouched.
    return result[()]

In [19]:
def f(x, w, b):
    """Logistic-regression prediction: sigmoid of the affine score x.w + b.

    Args:
        x: input vector (or anything ``np.dot`` accepts against ``w``).
        w: weight vector.
        b: bias term.

    Returns:
        Whatever ``sigmoid`` returns for the score ``np.dot(x, w) + b``.
    """
    score = np.dot(x, w) + b
    return sigmoid(score)

In [20]:
def gradient(w, b):
    """Gradient of the mean logistic loss over the first ``n_train`` rows.

    Reads the module-level ``dtm``, ``cat_vector``, ``pca`` and ``n_train``.

    Args:
        w: weight vector, one entry per PCA component.
        b: bias term.

    Returns:
        A tuple ``(dj_db, dj_dw)``: the derivative with respect to the bias
        and the gradient with respect to the weights, both averaged over the
        training rows.
    """
    # Project all training rows with one transform call and do the sums with
    # array operations; this is the same arithmetic as accumulating row by
    # row and feature by feature, without the Python-level loops.
    x_train = pca.transform(dtm[:n_train])
    y_train = cat_vector[:n_train]

    # Prediction error for every training row.
    delta = f(x_train, w, b) - y_train

    dj_dw = x_train.T @ delta / n_train
    dj_db = delta.sum() / n_train

    return dj_db, dj_dw

In [43]:
def gradient_descent(alpha, epoch, load=False):
    """Run batch gradient descent and return the resulting parameters.

    Args:
        alpha: learning rate applied to both gradients.
        epoch: number of update steps to perform.
        load: when true, start from the arrays stored in
            "model_weights.npy" and "model_bias.npy"; otherwise start from
            zero weights (length ``n_features``) and a zero bias.

    Returns:
        A tuple ``(w, b)`` with the weights and the bias after the last step.
    """
    if not load:
        w, b = np.zeros(n_features), 0
    else:
        w = np.load("model_weights.npy")
        # The bias is stored as a one-element array.
        b = np.load("model_bias.npy")[0]

    for _ in range(epoch):
        grad_b, grad_w = gradient(w, b)
        w, b = w - alpha * grad_w, b - alpha * grad_b

    return w, b

### Training the Model
First parameter is the learning rate and the second is the number of iterations

In [47]:
# Continue from the previously saved model only when both files are present.
# On a fresh run (the model is saved further down, after training) they do
# not exist yet, and training starts from zero instead of failing.
from pathlib import Path

resume = all(Path(name).exists() for name in ("model_weights.npy", "model_bias.npy"))
w, b = gradient_descent(alpha=0.2, epoch=10, load=resume)

# Testing
---

In [None]:
# Evaluate on the n_test rows that follow the training rows.
test_rows = slice(n_train, n_train + n_test)
test_inputs = pca.transform(dtm[test_rows])

# Class 1 is predicted when the model output reaches 0.5. An explicit
# threshold (rather than truncating 2 * output to an int) stays correct when
# the sigmoid saturates to exactly 1.0, which would otherwise yield "2" and
# count a confident, correct prediction as a miss.
predicted = (f(test_inputs, w, b) >= 0.5).astype(int)
passed = int(np.sum(predicted == cat_vector[test_rows]))

print("Accuracy:", 100 * passed / n_test)

### Saving the model

In [49]:
# Store the trained parameters; the scalar bias is wrapped in a one-element
# array so that it can be written with np.save like the weights.
bias_as_array = np.array([b])
np.save("model_weights.npy", w)
np.save("model_bias.npy", bias_as_array)