In [105]:
import pandas as pd
import numpy as np
import re
from collections import Counter
import string

#nltk.download('wordnet')
#nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import WhitespaceTokenizer
# from nltk.tokenize import TweetTokenizer

In [6]:
train = pd.read_csv('./data/train.csv')

## Method -1 
### Feature Extraction with Frequencies


$$
y_i = \sigma\left( w_1 \cdot \left( \sum_{j=1}^{k}  p_j \right) + w_2 \cdot \left( \sum_{j=1}^{k} n_j \right) + b \right)
$$

$$ y_i \text{ is the i-th target} $$
$$ w_1 \text{ is the weight for positive class} $$
$$ w_2 \text{ is the weight for negative class} $$
$$ b \text{ is the bias term} $$
$$ p_j \text{ is the positive count for the j-th word in the sample} $$
$$ n_j \text{ is the negative count for the j-th word in the sample} $$

* Feature vector is of size 3 : [W1, W2, B]



#### Preprocessing tweets

1. Eliminate @mentions (any word beginning with '@')
2. Hashtags have special meaning, keep it as is (Usually more indicative of subject)
3. Remove punctuations and other special characters
4. Remove stop words

In [117]:
def process_tweet(tweet):
    
    
    lemmatizer = WordNetLemmatizer()
    stopwords_english = stopwords.words('english')
    
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    # remove mentions
    tweet = re.sub(r'@\w+', '', tweet)

    tweet = re.sub(f"[^ {string.ascii_letters}]", ' ', tweet)
    tweet = re.sub(f"[{string.whitespace}]",' ',tweet)
    
    tokenizer = WhitespaceTokenizer()
    tweet_tokens = tokenizer.tokenize(tweet)

    tweets_clean = []
    for word in tweet_tokens:
        if word not in stopwords_english:  # remove punctuation
            # tweets_clean.append(word)
            stem_word = lemmatizer.lemmatize(word.lower())  # stemming word
            tweets_clean.append(stem_word)

    return tweets_clean
    

In [118]:
vocabulary = set()

for i in range(len(train)):
    word_list = process_tweet(train.text[i])
    if word_list:
        vocabulary.update(word_list)

pos = []
neg = []
for index in train.index:
    text = train.loc[index,'text']
    if train.loc[index,'target'] == 1:
        pos.extend(process_tweet(text))
    else:
        neg.extend(process_tweet(text))
        
count_pos = Counter(pos)
count_neg = Counter(neg)

vocabulary_counts = {key:{'pos':count_pos[key],'neg':count_neg[key]} for key in vocabulary}

In [119]:
vocabulary_counts

{'claire': {'pos': 1, 'neg': 0},
 'seeweed': {'pos': 1, 'neg': 0},
 'lgl': {'pos': 3, 'neg': 1},
 'garza': {'pos': 0, 'neg': 1},
 'sustainable': {'pos': 0, 'neg': 1},
 'shtap': {'pos': 0, 'neg': 1},
 'billiton': {'pos': 0, 'neg': 1},
 'monsoon': {'pos': 4, 'neg': 0},
 'skip': {'pos': 1, 'neg': 0},
 'qiang': {'pos': 1, 'neg': 0},
 'turbine': {'pos': 1, 'neg': 3},
 'duckvillelol': {'pos': 0, 'neg': 1},
 'safely': {'pos': 1, 'neg': 2},
 'bloody': {'pos': 7, 'neg': 38},
 'wraith': {'pos': 1, 'neg': 0},
 'petty': {'pos': 1, 'neg': 0},
 'qld': {'pos': 1, 'neg': 0},
 'poverty': {'pos': 2, 'neg': 3},
 'superstition': {'pos': 1, 'neg': 1},
 'treblinka': {'pos': 1, 'neg': 0},
 'alwx': {'pos': 1, 'neg': 0},
 'eternity': {'pos': 0, 'neg': 1},
 'attempted': {'pos': 0, 'neg': 1},
 'bp': {'pos': 3, 'neg': 1},
 'beaumont': {'pos': 1, 'neg': 0},
 'patched': {'pos': 0, 'neg': 3},
 'uk': {'pos': 10, 'neg': 8},
 'gigawatts': {'pos': 0, 'neg': 1},
 'cri': {'pos': 0, 'neg': 1},
 'hitchhiking': {'pos': 1, 'n