In [1]:
import pandas as pd
import numpy as np
import keras

Using TensorFlow backend.


In [19]:
# Load the raw tweet CSV. header=None makes pandas treat every line as data and
# label the columns with integers; the file is decoded as ISO-8859-1.
tweetdata = pd.read_csv("training.1600000.processed.noemoticon.csv", header=None, encoding="ISO-8859-1")

In [18]:
len(tweetdata)

1600000

In [26]:
# Replace the integer column labels with descriptive names; later cells
# address the 'polarity' and 'text' columns by these names.
tweetdata.columns=['polarity', 'id', 'date', 'source', 'user', 'text']

In [20]:
tweetdata.head()

Unnamed: 0,0,1,2,3,4,5
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


### Create Hashtag Tweets Dataset

In [21]:
# Positional row numbers of every tweet whose text contains a '#'.
# The text column is selected by position (sixth column) instead of by the
# label 5, so this cell works both before and after the columns are renamed.
# The substring test is vectorised (regex=False: '#' is matched literally)
# and missing text is treated as "no hashtag" (na=False).
hashtag_ids = np.flatnonzero(
    tweetdata.iloc[:, 5].str.contains('#', regex=False, na=False).values
).tolist()

In [22]:
len(hashtag_ids)

36812

In [32]:
len(hashtag_ids)/len(tweetdata)

0.021213531494140625

In [23]:
hashtag_tweets = tweetdata.iloc[hashtag_ids]

In [24]:
len(hashtag_tweets)

36812

In [25]:
# hashtag_tweets.to_csv('hashtag_tweets.36812.csv')

# Cleaning

Note to self: start with a naive, convenient implementation. When hosting this as a service, look to improve efficiency with faster methods.

A lot of the following is determined by domain, topic, data type, data source, experience, and language.


Remove:
- integers
- punctuation
- URLs
- HTML tags
- other characters

In [None]:
# @\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+

In [27]:
# 1/21/19 https://github.com/hundredblocks/concrete_NLP_tutorial/blob/master/NLP_notebook.ipynb
def standardize_text(df, text_field):
    """Clean the text in ``df[text_field]`` in place and return ``df``.

    The cleaning steps run in this order:

    1. remove URLs (``http`` followed by any run of non-whitespace);
    2. remove any remaining literal ``http``;
    3. remove @mentions (``@`` followed by non-whitespace);
    4. replace every character that is not a letter, a digit, one of
       ``( ) , ! ? @ ' ` " _`` or a newline with a single space;
    5. spell any remaining ``@`` as ``at``;
    6. lower-case the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text column. The column is overwritten.
    text_field : str
        Name of the string column to clean.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    # regex= is spelled out on every call: pandas 2.0 changed the default of
    # Series.str.replace to regex=False, which would make the patterns below
    # match only as literal text and silently skip the cleaning.
    text = df[text_field]
    text = text.str.replace(r"http\S+", "", regex=True)
    text = text.str.replace(r"http", "", regex=False)
    text = text.str.replace(r"@\S+", "", regex=True)
    text = text.str.replace(r"[^A-Za-z0-9(),!?@\'\`\"\_\n]", " ", regex=True)
    text = text.str.replace(r"@", "at", regex=False)
    df[text_field] = text.str.lower()
    return df

In [28]:
# Clean the 'text' column. standardize_text overwrites the column on the frame
# it receives and returns that same frame, so the name is simply rebound.
tweetdata = standardize_text(tweetdata, 'text')

In [31]:
# tweetdata.to_csv("1600000.cleaned.url.at.csv")

# Preprocessing

In [32]:
tweetdata.groupby('polarity').count()

Unnamed: 0_level_0,id,date,source,user,text
polarity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,800000,800000,800000,800000,800000
4,800000,800000,800000,800000,800000


# Embedding

In [6]:
# Reload the cleaned tweets from disk, using the first CSV column as the index.
# NOTE(review): the to_csv call that writes this file is commented out in the
# cleaning section, so the file must already exist for a fresh run to succeed.
tweetdata = pd.read_csv("1600000.cleaned.url.at.csv", index_col=0)

  mask |= (ar1 == a)


In [8]:
from nltk.tokenize import RegexpTokenizer

# Tokens are maximal runs of word characters (\w+), so punctuation such as
# apostrophes acts as a separator ("that's" becomes "that", "s").
tokenizer = RegexpTokenizer(r'\w+')

# Store each tweet's token list in a new 'tokens' column and preview the frame.
tweetdata["tokens"] = tweetdata["text"].apply(tokenizer.tokenize)
tweetdata.head()

Unnamed: 0,polarity,id,date,source,user,text,tokens
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"awww, that's a bummer you shoulda got da...","[awww, that, s, a, bummer, you, shoulda, got, ..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his facebook by ...,"[is, upset, that, he, can, t, update, his, fac..."
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,i dived many times for the ball managed to s...,"[i, dived, many, times, for, the, ball, manage..."
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,"[my, whole, body, feels, itchy, and, like, its..."
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"no, it's not behaving at all i'm mad why am...","[no, it, s, not, behaving, at, all, i, m, mad,..."


In [9]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

# Corpus statistics: flatten the per-tweet token lists into one word list,
# record each tweet's token count, and build the sorted set of distinct words.
all_words = []
for tokens in tweetdata["tokens"]:
    all_words.extend(tokens)
sentence_lengths = list(map(len, tweetdata["tokens"]))
VOCAB = sorted(set(all_words))
print("%s words total, with a vocabulary size of %s" % (len(all_words), len(VOCAB)))
print("Max sentence length is %s" % max(sentence_lengths))

21011497 words total, with a vocabulary size of 283657
Max sentence length is 53


In [11]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

def cv(data):
    """Fit a default ``CountVectorizer`` on ``data`` and vectorize it.

    Returns a pair ``(emb, count_vectorizer)``: the matrix produced by
    ``fit_transform(data)`` and the fitted vectorizer, so the same
    vocabulary can later be applied to other data with ``transform``.
    """
    vectorizer = CountVectorizer()
    return vectorizer.fit_transform(data), vectorizer

# Features are the cleaned tweet strings; targets are the polarity values.
list_labels = tweetdata["polarity"].tolist()
list_corpus = tweetdata["text"].tolist()

# Hold out 20% of the rows for testing, with a fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    list_corpus,
    list_labels,
    test_size=0.2,
    random_state=40,
)

# The vectorizer is fitted on the training split only and then reused,
# unchanged, to encode the test split.
X_train_counts, count_vectorizer = cv(X_train)
X_test_counts = count_vectorizer.transform(X_test)

In [15]:
from sklearn.decomposition import PCA, TruncatedSVD
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches


def plot_LSA(test_data, test_labels, savepath="PCA_demo.csv", plot=True):
    """Scatter-plot ``test_data`` reduced to two truncated-SVD (LSA) components.

    Parameters
    ----------
    test_data
        Feature matrix handed to ``TruncatedSVD`` (the notebook passes the
        count-vectorizer output).
    test_labels
        One numeric label per row; passed directly to ``plt.scatter`` as the
        colour values.
    savepath
        Unused. Kept so existing calls that pass it keep working.
    plot
        When False the decomposition is still computed but nothing is drawn.

    Returns
    -------
    None
        The points and legend are drawn on the current pyplot figure.
    """
    # Imported here because the notebook only imports matplotlib.pyplot and
    # matplotlib.patches under aliases, which never binds the bare name
    # `matplotlib`; `matplotlib.colors...` would raise NameError on a fresh kernel.
    from matplotlib.colors import ListedColormap

    lsa = TruncatedSVD(n_components=2)
    lsa.fit(test_data)
    lsa_scores = lsa.transform(test_data)

    if not plot:
        return

    colors = ['orange', 'blue', 'blue']
    plt.scatter(lsa_scores[:, 0], lsa_scores[:, 1], s=8, alpha=.8,
                c=test_labels, cmap=ListedColormap(colors))
    # NOTE(review): with this colormap the lowest label value should render
    # orange (legend: 'Positive') and the highest blue (legend: 'Negative').
    # Confirm that matches the dataset's polarity coding.
    orange_patch = mpatches.Patch(color='orange', label='Positive')
    blue_patch = mpatches.Patch(color='blue', label='Negative')
    plt.legend(handles=[orange_patch, blue_patch], prop={'size': 30})


# Open a 16x16-inch figure, draw the 2-component LSA projection of the
# training count vectors coloured by label, then render it.
fig = plt.figure(figsize=(16, 16))          
plot_LSA(X_train_counts, y_train)
plt.show()

<Figure size 1600x1600 with 1 Axes>

In [55]:
# Hashtag REGEX