In [13]:
import pandas as pd
import requests, zipfile, io, re, nltk
from datetime import datetime
import tensorflow as tf
from keras import models, layers
from nltk.tokenize import word_tokenize, sent_tokenize, TweetTokenizer, wordpunct_tokenize, RegexpTokenizer
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.stem import PorterStemmer, WordNetLemmatizer

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\techn\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\techn\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\techn\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\techn\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\techn\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

### Importing Data

In [2]:
raw_df = pd.read_csv('training.1600000.processed.noemoticon.csv', encoding='latin-1', header=None,
                         names=['polarity', 'id', 'date', 'query', 'user', 'tweet'])
raw_df

Unnamed: 0,polarity,id,date,query,user,tweet
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
...,...,...,...,...,...,...
1599995,4,2193601966,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,AmandaMarie1028,Just woke up. Having no school is the best fee...
1599996,4,2193601969,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,TheWDBoards,TheWDB.com - Very cool to hear old Walt interv...
1599997,4,2193601991,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,bpbabe,Are you ready for your MoJo Makeover? Ask me f...
1599998,4,2193602064,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,tinydiamondz,Happy 38th Birthday to my boo of alll time!!! ...


### Removing Unnecessary Data

In [3]:
df = raw_df.drop(columns=['id', 'query', 'polarity', 'user', 'date'])
# df['datetime'] = raw_df['date'].apply(lambda x: pd.to_datetime(x.replace('PDT ', '')))
df

Unnamed: 0,tweet
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,is upset that he can't update his Facebook by ...
2,@Kenichan I dived many times for the ball. Man...
3,my whole body feels itchy and like its on fire
4,"@nationwideclass no, it's not behaving at all...."
...,...
1599995,Just woke up. Having no school is the best fee...
1599996,TheWDB.com - Very cool to hear old Walt interv...
1599997,Are you ready for your MoJo Makeover? Ask me f...
1599998,Happy 38th Birthday to my boo of alll time!!! ...


In [4]:
y = raw_df['polarity']
print(f"Unique Elements of y: {pd.unique(y)}")
# Change y from [0, 4] to [0, 1]
y = y.apply(lambda x: 1 if x==4 else 0)
y

Unique Elements of y: [0 4]


0          0
1          0
2          0
3          0
4          0
          ..
1599995    1
1599996    1
1599997    1
1599998    1
1599999    1
Name: polarity, Length: 1600000, dtype: int64

# Data Preprocessing

In [36]:
processed_df = df.copy(deep=True)

# Remove URLs and User Mentions. All Twitter handles must be within 4 to 15 characters
processed_df['tweet'] = processed_df['tweet'].apply(lambda x: re.sub(r"http\S+|@\w{4,15}", "", x))
processed_df

Unnamed: 0,tweet
0,"- Awww, that's a bummer. You shoulda got Da..."
1,is upset that he can't update his Facebook by ...
2,I dived many times for the ball. Managed to s...
3,my whole body feels itchy and like its on fire
4,"no, it's not behaving at all. i'm mad. why am..."
...,...
1599995,Just woke up. Having no school is the best fee...
1599996,TheWDB.com - Very cool to hear old Walt interv...
1599997,Are you ready for your MoJo Makeover? Ask me f...
1599998,Happy 38th Birthday to my boo of alll time!!! ...


In [37]:
# Tokenize Tweets into Sentences and Extract Part-Of-Speech Tags
processed_df['sentence_tokens'] = processed_df['tweet'].apply(lambda x: sent_tokenize(x))
tokenizer = RegexpTokenizer(r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)|\w+|[^\w\s]+")
processed_df['pos_tags'] = processed_df['sentence_tokens'].apply(lambda x: [nltk.pos_tag(tokenizer.tokenize(sent)) for sent in x])
processed_df

Unnamed: 0,tweet,sentence_tokens,pos_tags
0,"- Awww, that's a bummer. You shoulda got Da...","[ - Awww, that's a bummer., You shoulda got D...","[[(-, :), (Awww, NN), (,, ,), (that, IN), (', ..."
1,is upset that he can't update his Facebook by ...,[is upset that he can't update his Facebook by...,"[[(is, VBZ), (upset, JJ), (that, IN), (he, PRP..."
2,I dived many times for the ball. Managed to s...,"[ I dived many times for the ball., Managed to...","[[(I, PRP), (dived, VBD), (many, JJ), (times, ..."
3,my whole body feels itchy and like its on fire,[my whole body feels itchy and like its on fire],"[[(my, PRP$), (whole, JJ), (body, NN), (feels,..."
4,"no, it's not behaving at all. i'm mad. why am...","[ no, it's not behaving at all., i'm mad., why...","[[(no, DT), (,, ,), (it, PRP), (', ''), (s, VB..."
...,...,...,...
1599995,Just woke up. Having no school is the best fee...,"[Just woke up., Having no school is the best f...","[[(Just, RB), (woke, VBD), (up, RP), (., .)], ..."
1599996,TheWDB.com - Very cool to hear old Walt interv...,[TheWDB.com - Very cool to hear old Walt inter...,"[[(TheWDB, NNP), (., .), (com, NN), (-, :), (V..."
1599997,Are you ready for your MoJo Makeover? Ask me f...,"[Are you ready for your MoJo Makeover?, Ask me...","[[(Are, NNP), (you, PRP), (ready, JJ), (for, I..."
1599998,Happy 38th Birthday to my boo of alll time!!! ...,[Happy 38th Birthday to my boo of alll time!!!...,"[[(Happy, JJ), (38th, CD), (Birthday, NN), (to..."


In [38]:
# pattern = regex.compile(r"(.)/\1{2,}")
# pattern.sub(r"\1\1\1", text)
# Tokenize Tweets into Words
processed_df['word_tokens'] = processed_df['tweet'].apply(lambda x: tokenizer.tokenize(x))
processed_df

Unnamed: 0,tweet,sentence_tokens,pos_tags,word_tokens
0,"- Awww, that's a bummer. You shoulda got Da...","[ - Awww, that's a bummer., You shoulda got D...","[[(-, :), (Awww, NN), (,, ,), (that, IN), (', ...","[-, Awww, ,, that, ', s, a, bummer, ., You, sh..."
1,is upset that he can't update his Facebook by ...,[is upset that he can't update his Facebook by...,"[[(is, VBZ), (upset, JJ), (that, IN), (he, PRP...","[is, upset, that, he, can, ', t, update, his, ..."
2,I dived many times for the ball. Managed to s...,"[ I dived many times for the ball., Managed to...","[[(I, PRP), (dived, VBD), (many, JJ), (times, ...","[I, dived, many, times, for, the, ball, ., Man..."
3,my whole body feels itchy and like its on fire,[my whole body feels itchy and like its on fire],"[[(my, PRP$), (whole, JJ), (body, NN), (feels,...","[my, whole, body, feels, itchy, and, like, its..."
4,"no, it's not behaving at all. i'm mad. why am...","[ no, it's not behaving at all., i'm mad., why...","[[(no, DT), (,, ,), (it, PRP), (', ''), (s, VB...","[no, ,, it, ', s, not, behaving, at, all, ., i..."
...,...,...,...,...
1599995,Just woke up. Having no school is the best fee...,"[Just woke up., Having no school is the best f...","[[(Just, RB), (woke, VBD), (up, RP), (., .)], ...","[Just, woke, up, ., Having, no, school, is, th..."
1599996,TheWDB.com - Very cool to hear old Walt interv...,[TheWDB.com - Very cool to hear old Walt inter...,"[[(TheWDB, NNP), (., .), (com, NN), (-, :), (V...","[TheWDB, ., com, -, Very, cool, to, hear, old,..."
1599997,Are you ready for your MoJo Makeover? Ask me f...,"[Are you ready for your MoJo Makeover?, Ask me...","[[(Are, NNP), (you, PRP), (ready, JJ), (for, I...","[Are, you, ready, for, your, MoJo, Makeover, ?..."
1599998,Happy 38th Birthday to my boo of alll time!!! ...,[Happy 38th Birthday to my boo of alll time!!!...,"[[(Happy, JJ), (38th, CD), (Birthday, NN), (to...","[Happy, 38th, Birthday, to, my, boo, of, alll,..."


In [39]:
# Generate stop words
print("Stop Words: ", stopwords.words('english'))
stop_words = set(stopwords.words('english'))
# Remove stop words from tokenized tweets
processed_df['word_tokens_no_stop_words'] = processed_df['word_tokens'].apply(lambda x: [w for w in x if not w.lower() in stop_words])

Stop Words:  ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 's

In [40]:
# Stemming words in Tweet
ps = PorterStemmer()
processed_df['word_tokens_no_stop_stemmed'] = processed_df['word_tokens_no_stop_words'].apply(lambda x: [ps.stem(w) for w in x])

In [82]:
processed_df['word_tokens_no_stop_words'][0]

['-',
 'Awww',
 ',',
 "'",
 'bummer',
 '.',
 'shoulda',
 'got',
 'David',
 'Carr',
 'Third',
 'Day',
 '.',
 ';']

In [105]:
# WordNet POS tags are: NOUN = 'n', ADJ = 's', VERB = 'v', ADV = 'r', ADJ_SAT = 'a'
# Descriptions (c) https://web.stanford.edu/~jurafsky/slp3/10.pdf
tag_map = {
        'CC':None, # coordin. conjunction (and, but, or)  
        'CD':wn.NOUN, # cardinal number (one, two)             
        'DT':None, # determiner (a, the)                    
        'EX':wn.ADV, # existential ‘there’ (there)           
        'FW':None, # foreign word (mea culpa)             
        'IN':wn.ADV, # preposition/sub-conj (of, in, by)   
        'JJ':[wn.ADJ, wn.ADJ_SAT], # adjective (yellow)                  
        'JJR':[wn.ADJ, wn.ADJ_SAT], # adj., comparative (bigger)          
        'JJS':[wn.ADJ, wn.ADJ_SAT], # adj., superlative (wildest)           
        'LS':None, # list item marker (1, 2, One)          
        'MD':None, # modal (can, should)                    
        'NN':wn.NOUN, # noun, sing. or mass (llama)          
        'NNS':wn.NOUN, # noun, plural (llamas)                  
        'NNP':wn.NOUN, # proper noun, sing. (IBM)              
        'NNPS':wn.NOUN, # proper noun, plural (Carolinas)
        'PDT':[wn.ADJ, wn.ADJ_SAT], # predeterminer (all, both)            
        'POS':None, # possessive ending (’s )               
        'PRP':None, # personal pronoun (I, you, he)     
        'PRP$':None, # possessive pronoun (your, one’s)    
        'RB':wn.ADV, # adverb (quickly, never)            
        'RBR':wn.ADV, # adverb, comparative (faster)        
        'RBS':wn.ADV, # adverb, superlative (fastest)     
        'RP':[wn.ADJ, wn.ADJ_SAT], # particle (up, off)
        'SYM':None, # symbol (+,%, &)
        'TO':None, # “to” (to)
        'UH':None, # interjection (ah, oops)
        'VB':wn.VERB, # verb base form (eat)
        'VBD':wn.VERB, # verb past tense (ate)
        'VBG':wn.VERB, # verb gerund (eating)
        'VBN':wn.VERB, # verb past participle (eaten)
        'VBP':wn.VERB, # verb non-3sg pres (eat)
        'VBZ':wn.VERB, # verb 3sg pres (eats)
        'WDT':None, # wh-determiner (which, that)
        'WP':None, # wh-pronoun (what, who)
        'WP$':None, # possessive (wh- whose)
        'WRB':None, # wh-adverb (how, where)
        '$':None, #  dollar sign ($)
        '#':None, # pound sign (#)
        '“':None, # left quote (‘ or “)
        '”':None, # right quote (’ or ”)
        "''":None,
        '(':None, # left parenthesis ([, (, {, <)
        ')':None, # right parenthesis (], ), }, >)
        ',':None, # comma (,)
        '.':None, # sentence-final punc (. ! ?)
        ':':None # mid-sentence punc (: ; ... – -)
    }
processed_df['word_tokens_no_stop_adjusted_pos'] = processed_df[['pos_tags', 'word_tokens_no_stop_words']].apply(
    lambda x: [w[1] for s in x['pos_tags'] for w in s], axis=1)
processed_df

Unnamed: 0,tweet,sentence_tokens,pos_tags,word_tokens,word_tokens_no_stop_words,word_tokens_no_stop_stemmed,word_tokens_no_stop_adjusted_pos
0,"- Awww, that's a bummer. You shoulda got Da...","[ - Awww, that's a bummer., You shoulda got D...","[[(-, :), (Awww, NN), (,, ,), (that, IN), (', ...","[-, Awww, ,, that, ', s, a, bummer, ., You, sh...","[-, Awww, ,, ', bummer, ., shoulda, got, David...","[-, awww, ,, ', bummer, ., shoulda, got, david...","[:, NN, ,, IN, '', VB, DT, NN, ., PRP, VBP, VB..."
1,is upset that he can't update his Facebook by ...,[is upset that he can't update his Facebook by...,"[[(is, VBZ), (upset, JJ), (that, IN), (he, PRP...","[is, upset, that, he, can, ', t, update, his, ...","[upset, ', update, Facebook, texting, ..., mig...","[upset, ', updat, facebook, text, ..., might, ...","[VBZ, JJ, IN, PRP, MD, '', VB, VB, PRP$, NNP, ..."
2,I dived many times for the ball. Managed to s...,"[ I dived many times for the ball., Managed to...","[[(I, PRP), (dived, VBD), (many, JJ), (times, ...","[I, dived, many, times, for, the, ball, ., Man...","[dived, many, times, ball, ., Managed, save, 5...","[dive, mani, time, ball, ., manag, save, 50, %...","[PRP, VBD, JJ, NNS, IN, DT, NN, ., VBN, TO, VB..."
3,my whole body feels itchy and like its on fire,[my whole body feels itchy and like its on fire],"[[(my, PRP$), (whole, JJ), (body, NN), (feels,...","[my, whole, body, feels, itchy, and, like, its...","[whole, body, feels, itchy, like, fire]","[whole, bodi, feel, itchi, like, fire]","[PRP$, JJ, NN, NNS, VBP, CC, VBP, PRP$, IN, NN]"
4,"no, it's not behaving at all. i'm mad. why am...","[ no, it's not behaving at all., i'm mad., why...","[[(no, DT), (,, ,), (it, PRP), (', ''), (s, VB...","[no, ,, it, ', s, not, behaving, at, all, ., i...","[,, ', behaving, ., ', mad, ., ?, ', see, .]","[,, ', behav, ., ', mad, ., ?, ', see, .]","[DT, ,, PRP, '', VBZ, RB, VBG, IN, DT, ., NN, ..."
...,...,...,...,...,...,...,...
1599995,Just woke up. Having no school is the best fee...,"[Just woke up., Having no school is the best f...","[[(Just, RB), (woke, VBD), (up, RP), (., .)], ...","[Just, woke, up, ., Having, no, school, is, th...","[woke, ., school, best, feeling, ever]","[woke, ., school, best, feel, ever]","[RB, VBD, RP, ., VBG, DT, NN, VBZ, DT, JJS, NN..."
1599996,TheWDB.com - Very cool to hear old Walt interv...,[TheWDB.com - Very cool to hear old Walt inter...,"[[(TheWDB, NNP), (., .), (com, NN), (-, :), (V...","[TheWDB, ., com, -, Very, cool, to, hear, old,...","[TheWDB, ., com, -, cool, hear, old, Walt, int...","[thewdb, ., com, -, cool, hear, old, walt, int...","[NNP, ., NN, :, RB, JJ, TO, VB, JJ, NNP, NN, ...."
1599997,Are you ready for your MoJo Makeover? Ask me f...,"[Are you ready for your MoJo Makeover?, Ask me...","[[(Are, NNP), (you, PRP), (ready, JJ), (for, I...","[Are, you, ready, for, your, MoJo, Makeover, ?...","[ready, MoJo, Makeover, ?, Ask, details]","[readi, mojo, makeov, ?, ask, detail]","[NNP, PRP, JJ, IN, PRP$, NNP, NNP, ., VB, PRP,..."
1599998,Happy 38th Birthday to my boo of alll time!!! ...,[Happy 38th Birthday to my boo of alll time!!!...,"[[(Happy, JJ), (38th, CD), (Birthday, NN), (to...","[Happy, 38th, Birthday, to, my, boo, of, alll,...","[Happy, 38th, Birthday, boo, alll, time, !!!, ...","[happi, 38th, birthday, boo, alll, time, !!!, ...","[JJ, CD, NN, TO, PRP$, NN, IN, JJ, NN, NN, NNP..."


In [None]:
# Lemmatizing words in Tweet
lemmatizer = WordNetLemmatizer()
# processed_df['word_tokens_no_stop_lemmatized'] = processed_df['word_tokens_no_stop_words'].apply(lambda x: [lemmatizer.lemmatize(w) for w in x])

In [None]:
lemmatizer.lemmatize("cannot")

In [None]:
processed_df

In [None]:
processed_df.iloc[4]['pos_tags']

# GloVe

### Download GloVe Twitter Pre-Trained Vectors

In [None]:
r = requests.get('http://nlp.stanford.edu/data/glove.twitter.27B.zip', stream=True)
z = zipfile.ZipFile(io.BytesIO(r.content))
z.extractall("GloVe.Twitter.27B")