In [1]:
import pandas as pd
import requests, zipfile, io, re, nltk
from datetime import datetime
import tensorflow as tf
from keras import models, layers
from keras.preprocessing.text import Tokenizer 
from nltk.tokenize import word_tokenize, sent_tokenize, TweetTokenizer, wordpunct_tokenize, RegexpTokenizer
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.stem import PorterStemmer, WordNetLemmatizer
import numpy as np

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\techn\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\techn\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\techn\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\techn\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\techn\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

### Importing Data

In [2]:
raw_df = pd.read_csv('training.1600000.processed.noemoticon.csv', encoding='latin-1', header=None,
                         names=['polarity', 'id', 'date', 'query', 'user', 'tweet'])
raw_df

Unnamed: 0,polarity,id,date,query,user,tweet
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
...,...,...,...,...,...,...
1599995,4,2193601966,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,AmandaMarie1028,Just woke up. Having no school is the best fee...
1599996,4,2193601969,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,TheWDBoards,TheWDB.com - Very cool to hear old Walt interv...
1599997,4,2193601991,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,bpbabe,Are you ready for your MoJo Makeover? Ask me f...
1599998,4,2193602064,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,tinydiamondz,Happy 38th Birthday to my boo of alll time!!! ...


### Removing Unnecessary Data

In [3]:
df = raw_df.drop(columns=['id', 'query', 'polarity', 'user', 'date'])
# df['datetime'] = raw_df['date'].apply(lambda x: pd.to_datetime(x.replace('PDT ', '')))
df

Unnamed: 0,tweet
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,is upset that he can't update his Facebook by ...
2,@Kenichan I dived many times for the ball. Man...
3,my whole body feels itchy and like its on fire
4,"@nationwideclass no, it's not behaving at all...."
...,...
1599995,Just woke up. Having no school is the best fee...
1599996,TheWDB.com - Very cool to hear old Walt interv...
1599997,Are you ready for your MoJo Makeover? Ask me f...
1599998,Happy 38th Birthday to my boo of alll time!!! ...


In [4]:
y = raw_df['polarity']
print(f"Unique Elements of y: {pd.unique(y)}")
# Change y from [0, 4] to [0, 1]
y = y.apply(lambda x: 1 if x==4 else 0)
y

Unique Elements of y: [0 4]


0          0
1          0
2          0
3          0
4          0
          ..
1599995    1
1599996    1
1599997    1
1599998    1
1599999    1
Name: polarity, Length: 1600000, dtype: int64

# Data Preprocessing

In [7]:
processed_df = df.copy(deep=True)
tokenizer = RegexpTokenizer(r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)|\w+|[^\w\s]+")


# Remove URLs and User Mentions. All Twitter handles must be within 4 to 15 characters
processed_df['tweet'] = processed_df['tweet'].apply(lambda x: re.sub(r"http\S+|@\w{4,15}", "", x))
processed_df

Unnamed: 0,tweet
0,"- Awww, that's a bummer. You shoulda got Da..."
1,is upset that he can't update his Facebook by ...
2,I dived many times for the ball. Managed to s...
3,my whole body feels itchy and like its on fire
4,"no, it's not behaving at all. i'm mad. why am..."
...,...
1599995,Just woke up. Having no school is the best fee...
1599996,TheWDB.com - Very cool to hear old Walt interv...
1599997,Are you ready for your MoJo Makeover? Ask me f...
1599998,Happy 38th Birthday to my boo of alll time!!! ...


In [6]:
# Tokenize Tweets into Sentences
processed_df['sentence_tokens'] = processed_df['tweet'].apply(lambda x: sent_tokenize(x))

In [None]:
# Extract Part-Of-Speech Tags
processed_df['pos_tags'] = processed_df['sentence_tokens'].apply(lambda x: [nltk.pos_tag(tokenizer.tokenize(sent)) for sent in x])
processed_df

In [8]:
# pattern = regex.compile(r"(.)/\1{2,}")
# pattern.sub(r"\1\1\1", text)
# Tokenize Tweets into Words
processed_df['word_tokens'] = processed_df['tweet'].apply(lambda x: tokenizer.tokenize(x))
processed_df

Unnamed: 0,tweet,word_tokens
0,"- Awww, that's a bummer. You shoulda got Da...","[-, Awww, ,, that, ', s, a, bummer, ., You, sh..."
1,is upset that he can't update his Facebook by ...,"[is, upset, that, he, can, ', t, update, his, ..."
2,I dived many times for the ball. Managed to s...,"[I, dived, many, times, for, the, ball, ., Man..."
3,my whole body feels itchy and like its on fire,"[my, whole, body, feels, itchy, and, like, its..."
4,"no, it's not behaving at all. i'm mad. why am...","[no, ,, it, ', s, not, behaving, at, all, ., i..."
...,...,...
1599995,Just woke up. Having no school is the best fee...,"[Just, woke, up, ., Having, no, school, is, th..."
1599996,TheWDB.com - Very cool to hear old Walt interv...,"[TheWDB, ., com, -, Very, cool, to, hear, old,..."
1599997,Are you ready for your MoJo Makeover? Ask me f...,"[Are, you, ready, for, your, MoJo, Makeover, ?..."
1599998,Happy 38th Birthday to my boo of alll time!!! ...,"[Happy, 38th, Birthday, to, my, boo, of, alll,..."


In [9]:
# Generate stop words
print("Stop Words: ", stopwords.words('english'))
stop_words = set(stopwords.words('english'))
# Remove stop words from tokenized tweets
processed_df['word_tokens_no_stop_words'] = processed_df['word_tokens'].apply(lambda x: [w for w in x if not w.lower() in stop_words])
processed_df

Stop Words:  ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 's

Unnamed: 0,tweet,word_tokens,word_tokens_no_stop_words
0,"- Awww, that's a bummer. You shoulda got Da...","[-, Awww, ,, that, ', s, a, bummer, ., You, sh...","[-, Awww, ,, ', bummer, ., shoulda, got, David..."
1,is upset that he can't update his Facebook by ...,"[is, upset, that, he, can, ', t, update, his, ...","[upset, ', update, Facebook, texting, ..., mig..."
2,I dived many times for the ball. Managed to s...,"[I, dived, many, times, for, the, ball, ., Man...","[dived, many, times, ball, ., Managed, save, 5..."
3,my whole body feels itchy and like its on fire,"[my, whole, body, feels, itchy, and, like, its...","[whole, body, feels, itchy, like, fire]"
4,"no, it's not behaving at all. i'm mad. why am...","[no, ,, it, ', s, not, behaving, at, all, ., i...","[,, ', behaving, ., ', mad, ., ?, ', see, .]"
...,...,...,...
1599995,Just woke up. Having no school is the best fee...,"[Just, woke, up, ., Having, no, school, is, th...","[woke, ., school, best, feeling, ever]"
1599996,TheWDB.com - Very cool to hear old Walt interv...,"[TheWDB, ., com, -, Very, cool, to, hear, old,...","[TheWDB, ., com, -, cool, hear, old, Walt, int..."
1599997,Are you ready for your MoJo Makeover? Ask me f...,"[Are, you, ready, for, your, MoJo, Makeover, ?...","[ready, MoJo, Makeover, ?, Ask, details]"
1599998,Happy 38th Birthday to my boo of alll time!!! ...,"[Happy, 38th, Birthday, to, my, boo, of, alll,...","[Happy, 38th, Birthday, boo, alll, time, !!!, ..."


In [None]:
# Source: https://github.com/nltk/nltk/blob/develop/nltk/stem/wordnet.py
def penn2morphy(penntag) -> str:
    """
    Converts tags from Penn format (input: single string) to Morphy.
    """
    morphy_tag = {'NN':'n', 'JJ':'a', 'VB':'v', 'RB':'r'}
    try:
        return morphy_tag[penntag[:2]]
    except:
        return None

processed_df['pos_tags_adjusted_no_stop_words'] = processed_df['pos_tags'].apply(
    lambda x: [(w[0], penn2morphy(w[1])) for s in x for w in s if not w[0].lower() in stop_words])
processed_df

In [10]:
# Stemming words in Tweet
ps = PorterStemmer()
processed_df['word_tokens_no_stop_stemmed'] = processed_df['word_tokens_no_stop_words'].apply(lambda x: [ps.stem(w) for w in x if not w == ''])
processed_df

Unnamed: 0,tweet,word_tokens,word_tokens_no_stop_words,word_tokens_no_stop_stemmed
0,"- Awww, that's a bummer. You shoulda got Da...","[-, Awww, ,, that, ', s, a, bummer, ., You, sh...","[-, Awww, ,, ', bummer, ., shoulda, got, David...","[-, awww, ,, ', bummer, ., shoulda, got, david..."
1,is upset that he can't update his Facebook by ...,"[is, upset, that, he, can, ', t, update, his, ...","[upset, ', update, Facebook, texting, ..., mig...","[upset, ', updat, facebook, text, ..., might, ..."
2,I dived many times for the ball. Managed to s...,"[I, dived, many, times, for, the, ball, ., Man...","[dived, many, times, ball, ., Managed, save, 5...","[dive, mani, time, ball, ., manag, save, 50, %..."
3,my whole body feels itchy and like its on fire,"[my, whole, body, feels, itchy, and, like, its...","[whole, body, feels, itchy, like, fire]","[whole, bodi, feel, itchi, like, fire]"
4,"no, it's not behaving at all. i'm mad. why am...","[no, ,, it, ', s, not, behaving, at, all, ., i...","[,, ', behaving, ., ', mad, ., ?, ', see, .]","[,, ', behav, ., ', mad, ., ?, ', see, .]"
...,...,...,...,...
1599995,Just woke up. Having no school is the best fee...,"[Just, woke, up, ., Having, no, school, is, th...","[woke, ., school, best, feeling, ever]","[woke, ., school, best, feel, ever]"
1599996,TheWDB.com - Very cool to hear old Walt interv...,"[TheWDB, ., com, -, Very, cool, to, hear, old,...","[TheWDB, ., com, -, cool, hear, old, Walt, int...","[thewdb, ., com, -, cool, hear, old, walt, int..."
1599997,Are you ready for your MoJo Makeover? Ask me f...,"[Are, you, ready, for, your, MoJo, Makeover, ?...","[ready, MoJo, Makeover, ?, Ask, details]","[readi, mojo, makeov, ?, ask, detail]"
1599998,Happy 38th Birthday to my boo of alll time!!! ...,"[Happy, 38th, Birthday, to, my, boo, of, alll,...","[Happy, 38th, Birthday, boo, alll, time, !!!, ...","[happi, 38th, birthday, boo, alll, time, !!!, ..."


In [None]:
# Lemmatizing words in Tweet
lemmatizer = WordNetLemmatizer()
processed_df['word_tokens_no_stop_lemmatized'] = processed_df['pos_tags_adjusted_no_stop_words'].apply(lambda x: [lemmatizer.lemmatize(word=w[0].lower(), pos=w[1]) if w[1] != None else w[0].lower() for w in x])
processed_df

# GloVe

### Download GloVe Twitter Pre-Trained Vectors

In [None]:
r = requests.get('http://nlp.stanford.edu/data/glove.twitter.27B.zip', stream=True)
z = zipfile.ZipFile(io.BytesIO(r.content))
z.extractall("GloVe.Twitter.27B")

In [11]:
def summarize_diagnostics(history):
    # plot loss
    plt.subplot(211)
    plt.tight_layout()
    plt.title('Cross Entropy Loss')
    plt.plot(history.history['loss'], color='blue', label='train')
    plt.plot(history.history['val_loss'], color='orange', label='test')
    # plot accuracy
    plt.subplot(212)
    plt.title('Classification Accuracy')
    plt.plot(history.history['accuracy'], color='blue', label='train')
    plt.plot(history.history['val_accuracy'], color='orange', label='test')
    plt.legend()
    plt.show()

In [12]:
max_words = 5000
k_tokenizer = Tokenizer(num_words=max_words)
k_tokenizer.fit_on_texts(processed_df['word_tokens_no_stop_stemmed'])
X = processed_df['word_tokens_no_stop_stemmed'].apply(k_tokenizer.texts_to_sequences)
X

0          [[], [394], [], [2], [1103], [], [3227], [22],...
1          [[637], [2], [245], [465], [388], [], [228], [...
2          [[3837], [251], [21], [885], [], [747], [543],...
3                  [[370], [688], [32], [2669], [16], [917]]
4          [[], [2], [4213], [], [2], [495], [], [], [2],...
                                 ...                        
1599995                [[278], [], [95], [125], [32], [167]]
1599996    [[], [], [253], [], [143], [202], [178], [], [...
1599997                   [[156], [], [], [], [320], [1617]]
1599998    [[73], [], [206], [415], [4559], [21], [], [],...
1599999                                           [[73], []]
Name: word_tokens_no_stop_stemmed, Length: 1600000, dtype: object

In [14]:
max_words = 1000
k_tokenizer = Tokenizer(num_words=max_words)
k_tokenizer.fit_on_texts(processed_df['word_tokens_no_stop_stemmed'])
X1 = processed_df['word_tokens_no_stop_stemmed'].apply(k_tokenizer.texts_to_matrix)
X1

0          [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...
1          [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...
2          [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...
3          [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...
4          [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...
                                 ...                        
1599995    [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...
1599996    [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...
1599997    [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...
1599998    [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...
1599999    [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...
Name: word_tokens_no_stop_stemmed, Length: 1600000, dtype: object

In [27]:
import pickle
with open("matrix.pckle",'wb') as f:
    pickle.dump(X1, f)

MemoryError: 

In [None]:
network = models.Sequential()
network.add(layers.Dense(512, activation='relu', input_shape=(28*28,)))
network.add(layers.Dense(64, activation='relu'))
network.add(layers.Dense(1, activation='sigmoid'))
network.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])