In [1]:
import pandas as pd
import numpy as np
import re
from gensim.models import Word2Vec
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to C:\Users\Jack
[nltk_data]     D\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
test = pd.read_csv("data/Corona_NLP_test.csv",encoding='latin1')
train = pd.read_csv("data/Corona_NLP_train.csv",encoding='latin1')
# We can unmerge later, more important to process all data at once. too lazy to duplicate all the code for the test set too
train = train.append(test)

## Text preprocessing
Remove non-letter characters

In [3]:
def sub(text):
    # removes strangely formatted apostrophes
    temp = re.sub("â\x92", '', text )
    temp = re.sub('[^a-zA-Z]', ' ', temp )
    temp = re.sub(r'\s+', ' ', temp)
    return temp

train['LowerTweet'] = train['LowerTweet'].apply(sub)
train

Unnamed: 0,OriginalTweet,Sentiment,LowerTweet,BooleanSentiment
0,advice Talk to your neighbours family to excha...,Positive,advice talk to your neighbours family to excha...,1
1,Coronavirus Australia: Woolworths to give elde...,Positive,coronavirus australia woolworths to give elder...,1
2,My food stock is not the only one which is emp...,Positive,my food stock is not the only one which is emp...,1
3,"Me, ready to go at supermarket during the #COV...",Extremely Negative,me ready to go at supermarket during the covid...,0
4,As news of the regionÂs first confirmed COVID...,Positive,as news of the regions first confirmed covid c...,1
...,...,...,...,...
3174,"@RicePolitics @MDCounties Craig, will you call...",Negative,ricepolitics mdcounties craig will you call o...,0
3175,Meanwhile In A Supermarket in Israel -- People...,Positive,meanwhile in a supermarket in israel people da...,1
3176,Did you panic buy a lot of non-perishable item...,Negative,did you panic buy a lot of non perishable item...,0
3177,Gov need to do somethings instead of biar je r...,Extremely Negative,gov need to do somethings instead of biar je r...,0


## Tokenization and Stemming

Following [this tutorial](https://medium.com/swlh/sentiment-classification-using-word-embeddings-word2vec-aedf28fbb8ca) we can do some additional manipulation such as stemming and tokenization to create an appropriate word vectors

In [4]:
from gensim.parsing.porter import PorterStemmer
from gensim.utils import simple_preprocess

# Tokenize the text column to get the new column 'tokenized_text'
train['tokenized_text'] = [simple_preprocess(line, deacc=True) for line in train['LowerTweet']] 
print(train['tokenized_text'].head(10))

porter_stemmer = PorterStemmer()
# Get the stemmed_tokens
train['stemmed_tokens'] = [[porter_stemmer.stem(word) for word in tokens] for tokens in train['tokenized_text'] ]
train['stemmed_tokens'].head(10)

0    [advice, talk, to, your, neighbours, family, t...
1    [coronavirus, australia, woolworths, to, give,...
2    [my, food, stock, is, not, the, only, one, whi...
3    [me, ready, to, go, at, supermarket, during, t...
4    [as, news, of, the, regions, first, confirmed,...
5    [cashier, at, grocery, store, was, sharing, hi...
6    [due, to, covid, our, retail, store, and, clas...
7    [for, corona, prevention, we, should, stop, to...
8    [due, to, the, covid, situation, we, have, inc...
9    [horningsea, is, caring, community, lets, all,...
Name: tokenized_text, dtype: object


0    [advic, talk, to, your, neighbour, famili, to,...
1    [coronaviru, australia, woolworth, to, give, e...
2    [my, food, stock, is, not, the, onli, on, whic...
3    [me, readi, to, go, at, supermarket, dure, the...
4    [as, new, of, the, region, first, confirm, cov...
5    [cashier, at, groceri, store, wa, share, hi, i...
6    [due, to, covid, our, retail, store, and, clas...
7    [for, corona, prevent, we, should, stop, to, b...
8    [due, to, the, covid, situat, we, have, increa...
9    [horningsea, is, care, commun, let, all, look,...
Name: stemmed_tokens, dtype: object

In [5]:
stemmed_tokens = pd.Series(train['stemmed_tokens']).values
# Train the Word2Vec Model
w2v_model = Word2Vec(stemmed_tokens)#,  vector_size=4000)

In [6]:
vocabulary = w2v_model.wv.key_to_index
len(vocabulary)

7774

In [7]:
def vectorize(x):
    global vocabulary
    vector = []
    for word in x:
        if word in vocabulary:
            vector.append(vocabulary[word])
    return vector

train['vector'] = train['stemmed_tokens'].apply(vectorize)

train['NumberOfWords'] = train['vector'].apply(lambda x : len(x))

In [8]:
train

Unnamed: 0,OriginalTweet,Sentiment,LowerTweet,BooleanSentiment,tokenized_text,stemmed_tokens,vector,NumberOfWords
0,advice Talk to your neighbours family to excha...,Positive,advice talk to your neighbours family to excha...,1,"[advice, talk, to, your, neighbours, family, t...","[advic, talk, to, your, neighbour, famili, to,...","[593, 452, 1, 32, 1470, 184, 1, 2061, 805, 388...",37
1,Coronavirus Australia: Woolworths to give elde...,Positive,coronavirus australia woolworths to give elder...,1,"[coronavirus, australia, woolworths, to, give,...","[coronaviru, australia, woolworth, to, give, e...","[8, 674, 1499, 1, 220, 315, 962, 1324, 33, 209...",16
2,My food stock is not the only one which is emp...,Positive,my food stock is not the only one which is emp...,1,"[my, food, stock, is, not, the, only, one, whi...","[my, food, stock, is, not, the, onli, on, whic...","[38, 16, 60, 10, 34, 0, 118, 12, 203, 10, 183,...",42
3,"Me, ready to go at supermarket during the #COV...",Extremely Negative,me ready to go at supermarket during the covid...,0,"[me, ready, to, go, at, supermarket, during, t...","[me, readi, to, go, at, supermarket, dure, the...","[88, 688, 1, 40, 19, 21, 66, 0, 3, 163, 34, 10...",36
4,As news of the regionÂs first confirmed COVID...,Positive,as news of the regions first confirmed covid c...,1,"[as, news, of, the, regions, first, confirmed,...","[as, new, of, the, region, first, confirm, cov...","[27, 80, 4, 0, 1357, 210, 864, 3, 251, 862, 45...",37
...,...,...,...,...,...,...,...,...
3174,"@RicePolitics @MDCounties Craig, will you call...",Negative,ricepolitics mdcounties craig will you call o...,0,"[ricepolitics, mdcounties, craig, will, you, c...","[ricepolit, mdcounti, craig, will, you, call, ...","[6046, 36, 13, 178, 12, 0, 508, 7424, 1, 4070,...",26
3175,Meanwhile In A Supermarket in Israel -- People...,Positive,meanwhile in a supermarket in israel people da...,1,"[meanwhile, in, supermarket, in, israel, peopl...","[meanwhil, in, supermarket, in, israel, peopl,...","[1343, 7, 21, 7, 3419, 28, 3432, 2, 2866, 394,...",16
3176,Did you panic buy a lot of non-perishable item...,Negative,did you panic buy a lot of non perishable item...,0,"[did, you, panic, buy, lot, of, non, perishabl...","[did, you, panic, bui, lot, of, non, perish, i...","[341, 13, 61, 53, 267, 4, 517, 1471, 176, 6220...",35
3177,Gov need to do somethings instead of biar je r...,Extremely Negative,gov need to do somethings instead of biar je r...,0,"[gov, need, to, do, somethings, instead, of, b...","[gov, need, to, do, someth, instead, of, biar,...","[771, 50, 1, 56, 433, 534, 4, 3795, 2553, 148,...",21


In [9]:
# Uncomment to save data
# train.to_csv('data/processed_data.csv')

## Generate custom example
Just for fun :P

In [18]:
tweet = "Covid is not great I wish it was good"
tweet = tweet.lower()
vector = []
for word in tweet.split(' '):
    if word in vocabulary:
        vector.append(vocabulary[word])
print(vector)

[3, 10, 34, 248, 881, 14, 107]
