In [1]:
import pandas as pd
import re
from collections import Counter

In [6]:
# Read ./data/train.csv into the `train` DataFrame used by the cells below.
train = pd.read_csv(filepath_or_buffer='./data/train.csv')

## Method 1
### Feature Extraction with Frequencies


$$
y_i = \sigma\left( w_1 \cdot \left( \sum_{j=1}^{k}  p_j \right) + w_2 \cdot \left( \sum_{j=1}^{k} n_j \right) + b \right)
$$

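$$ y_i \text{ is the predicted target for the i-th sample} $$
$$ \sigma \text{ is the sigmoid function} $$
$$ w_1 \text{ is the weight for the summed positive counts} $$
$$ w_2 \text{ is the weight for the summed negative counts} $$
$$ b \text{ is the bias term} $$
$$ k \text{ is the number of words in the sample} $$
$$ p_j \text{ is the number of times the j-th word of the sample occurs in positive-class training texts} $$
$$ n_j \text{ is the number of times the j-th word of the sample occurs in negative-class training texts} $$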



In [9]:
def process_text(text):
    """Split ``text`` into lowercase word tokens.

    The text is split on runs of whitespace and the punctuation characters
    listed in ``delimit_string``. Tokens that contain a digit are discarded,
    as are the empty strings that ``re.split`` yields when the text starts or
    ends with a delimiter (or is empty).

    Parameters
    ----------
    text : str
        Raw text to tokenize.

    Returns
    -------
    list of str
        Lowercased tokens in order of appearance; an empty list when the
        text contains no usable token.
    """
    delimit_string = '!@#$%^&*()-_+={}:;[]\\|\"\'<>.,?/~`'
    # re.escape makes every delimiter a literal inside the character class.
    pattern = f"[{re.escape(delimit_string)}\\s]+"
    split_text = re.split(pattern, text)
    # `word and ...` drops the '' entries re.split emits at the boundaries, so
    # the empty string never ends up counted as a vocabulary word.
    return [
        word.lower()
        for word in split_text
        if word and not re.search(r'\d', word)
    ]

In [14]:
# Tokenize every training text exactly once and collect the tokens per class.
# Iterating over the column values (instead of train.text[i] with a positional
# i) keeps this correct even when `train` does not have a default 0..n-1 index.
pos = []
neg = []
for text, target in zip(train['text'], train['target']):
    tokens = process_text(text)
    if target == 1:
        pos.extend(tokens)
    else:
        neg.extend(tokens)

# Per-class token frequencies.
count_pos = Counter(pos)
count_neg = Counter(neg)

# Every token falls in one of the two classes, so the vocabulary is the union
# of the counters' keys; no second tokenization pass is needed.
vocabulary = set(count_pos) | set(count_neg)

# Counter returns 0 for a missing key, so each word gets both counts.
vocabulary_counts = {
    word: {'pos': count_pos[word], 'neg': count_neg[word]}
    for word in vocabulary
}