In [7]:
#Authors: Adam Lewandowski, Ivan Sladkov, Patrick English
import numpy as np
import tensorflow as tf
import pandas as pd
import string
import nltk
import os

from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import TweetTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import HashingVectorizer

In [8]:
# The CSV ships without a header row, so pandas labels the columns 0..5.
raw_tweets_df = pd.read_csv(
    "data/training.1600000.processed.noemoticon.csv",
    header=None,
    encoding="ISO-8859-1",
)
raw_tweets_df

Unnamed: 0,0,1,2,3,4,5
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
...,...,...,...,...,...,...
1599995,4,2193601966,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,AmandaMarie1028,Just woke up. Having no school is the best fee...
1599996,4,2193601969,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,TheWDBoards,TheWDB.com - Very cool to hear old Walt interv...
1599997,4,2193601991,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,bpbabe,Are you ready for your MoJo Makeover? Ask me f...
1599998,4,2193602064,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,tinydiamondz,Happy 38th Birthday to my boo of alll time!!! ...


In [9]:
#Baseline processing
tknzr = TweetTokenizer()

# Share of the corpus to tokenize. Processing everything can take a while:
# use 1.0 for the full data set, or lower the value if you don't want to wait.
SAMPLE_FRACTION = 0.1
slice_ = int(len(raw_tweets_df) * SAMPLE_FRACTION)

tokenized_tweets = (
    raw_tweets_df[5][:slice_]
    # Lower-case and drop every non-ASCII character. Decode straight back to
    # str so the tokenizer is handed text rather than a bytes object.
    .apply(lambda x: x.lower().encode('ascii', 'ignore').decode('ascii'))
    # One list of tokens per tweet.
    .apply(lambda x: tknzr.tokenize(x))
)

In [10]:
def remove_punctuation(tweets):
    """Strip stand-alone punctuation tokens from every tweet.

    Parameters
    ----------
    tweets : pandas.Series
        Each element is a list of string tokens.

    Returns
    -------
    pandas.Series
        New Series of new lists with every token equal to one of
        ``. ? ! , ' " #`` or the empty string removed. The input lists are
        left untouched.

    Notes
    -----
    Only tokens that *are* a punctuation mark are dropped; tokens that merely
    contain one (for example ``"#tag"``) are kept.
    """
    unwanted = frozenset(['.', '?', '!', ',', '\'', '"', '#', ''])

    def strip_tokens(tokens):
        # Build a fresh list: removes every occurrence (not just the first)
        # and never mutates the caller's list.
        return [token for token in tokens if token not in unwanted]

    return tweets.apply(strip_tokens)

def remove_links(tweets):
    """Placeholder for link removal: currently returns ``tweets`` unchanged."""
    unchanged = tweets
    return unchanged

def remove_stop_words(tweets):
    """Drop NLTK's English stop words from every token list.

    ``tweets`` is a Series whose elements are lists of tokens; the result is a
    new Series of filtered lists.
    """
    from nltk.corpus import stopwords

    english_stop_words = set(stopwords.words('english'))

    def keep_content_words(tokens):
        return [token for token in tokens if token not in english_stop_words]

    return tweets.copy().apply(keep_content_words)

def merge_tweets(tweets):
    """Join each tweet's tokens into a single space-separated string."""
    def join_tokens(tokens):
        return ' '.join(tokens)

    return tweets.copy().apply(join_tokens)

def pos_tag(tweets):
    """Run ``nltk.pos_tag`` over every token list in ``tweets``.

    Returns a new Series holding whatever ``nltk.pos_tag`` produces for each
    tweet.
    """
    def tag_tokens(tokens):
        return nltk.pos_tag(tokens)

    tagged = tweets.copy().apply(tag_tokens)
    return tagged

def merge_lists(tweets):
    """Flatten a Series of token lists into one flat list, keeping order."""
    flattened = []
    for tokens in tweets.tolist():
        flattened.extend(tokens)
    return flattened

#TBD
def emoticon_transcoder(tweets):
    """Not implemented yet: hands ``tweets`` back exactly as received."""
    untouched = tweets
    return untouched

def remove_pos_tags(tweets):
    """Keep only the first element of every item in each tweet.

    Each tweet is a sequence of indexable items (e.g. ``(word, tag)`` pairs);
    the result is a new Series of lists holding element ``0`` of each item.
    """
    def words_only(tagged_tokens):
        words = []
        for pair in tagged_tokens:
            words.append(pair[0])
        return words

    return tweets.copy().apply(words_only)

def identity_tokenizer(text):
    """Return ``text`` as-is.

    Passed as the ``tokenizer`` of the scikit-learn vectorizers in this
    notebook, whose input is already a list of tokens.
    """
    already_tokenized = text
    return already_tokenized

def scikit_vectorize(tweets, min_freq):
    """Build a dense TF-IDF matrix from pre-tokenized tweets.

    Parameters
    ----------
    tweets : pandas.Series
        Each element is a list of tokens; the lists are used as-is
        (no lower-casing, no re-tokenizing).
    min_freq : int or float
        Forwarded to ``TfidfVectorizer(min_df=...)``.

    Returns
    -------
    pandas.DataFrame
        One row per tweet, one float32 column per retained token, with the
        token as column label.
    """
    vectorizer = TfidfVectorizer(lowercase=False, tokenizer=identity_tokenizer, min_df=min_freq, dtype=np.float32)
    vectors = vectorizer.fit_transform(tweets.copy())

    # get_feature_names() no longer exists in recent scikit-learn releases;
    # prefer the replacement and fall back only on old installations.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = list(vectorizer.get_feature_names_out())
    else:
        feature_names = vectorizer.get_feature_names()

    # Hand the ndarray to pandas directly. Going through .tolist() first would
    # box every cell as a Python float, multiplying the memory needed.
    return pd.DataFrame(vectors.toarray(), columns=feature_names)

def scikit_hash_vectorize(tweets, features):
    """Build a dense feature-hashing matrix from pre-tokenized tweets.

    Parameters
    ----------
    tweets : pandas.Series
        Each element is a list of tokens; the lists are used as-is
        (no lower-casing, no re-tokenizing).
    features : int
        Forwarded to ``HashingVectorizer(n_features=...)``; the number of
        columns in the result.

    Returns
    -------
    pandas.DataFrame
        One row per tweet and ``features`` float32 columns labelled
        ``0 .. features - 1``.
    """
    vectorizer = HashingVectorizer(lowercase=False, n_features=features, tokenizer=identity_tokenizer, dtype=np.float32)
    vectors = vectorizer.fit_transform(tweets.copy())
    # Hand the ndarray to pandas directly. Going through .tolist() first would
    # box every cell as a Python float, multiplying the memory needed.
    return pd.DataFrame(vectors.toarray())

def str_list_to_list(tweets):
    """Parse the ``"tweets"`` column from list literals back into lists.

    ``tweets`` is a DataFrame whose ``"tweets"`` column holds strings such as
    ``"['a', 'b']"``. A copy of the frame is returned with that column
    evaluated by ``ast.literal_eval``; the input frame is not modified.
    """
    import ast

    parsed_column = tweets["tweets"].apply(ast.literal_eval)
    return tweets.assign(tweets=parsed_column)

def transcode_emoticons(tweets):
    """Collapse emoticon-like tokens onto four canonical emoticons.

    A token is rewritten when it is ``"x-d"``/``"xd"`` (-> ``":)"``) or when
    it contains an "eye" and a "jaw" character:

    * eye + positive jaw  -> ``":)"``
    * eye + negative jaw  -> ``":("``
    * eye + ``o``         -> ``":o"``
    * eye + ``p``         -> ``":p"``

    The categories are tried in that order and the first match wins. All
    other tokens are kept as they are.

    Parameters
    ----------
    tweets : pandas.Series
        Each element is a list of (lower-cased) string tokens.

    Returns
    -------
    pandas.Series
        New Series of new lists; the input lists are not modified.

    Notes
    -----
    The match is a loose heuristic: any token containing both kinds of
    character qualifies, not only well-formed emoticons.
    """
    eyes = [";", ":", "=", "8-"]
    positive_jaw = [")", "]", "}", ">", "3", "d", "*"]
    negative_jaw = ["(", "[", "{", "<", "/", "\\", "c"]
    surprised_jaw = ["o"]
    tongue_jaw = ["p"]

    # Checked in order; mirrors the priority positive > negative > o > p.
    jaw_table = (
        (positive_jaw, ":)"),
        (negative_jaw, ":("),
        (surprised_jaw, ":o"),
        (tongue_jaw, ":p"),
    )

    def transcode_word(word):
        if word == "x-d" or word == "xd":
            return ":)"
        # Substring test, so the two-character eye "8-" can match as well
        # (a per-character test can never equal a two-character entry).
        if not any(eye in word for eye in eyes):
            return word
        for jaws, replacement in jaw_table:
            if any(char in jaws for char in word):
                return replacement
        return word

    # Build new lists instead of assigning into the caller's lists:
    # Series.copy() does not copy the list objects it holds.
    return tweets.apply(lambda tweet: [transcode_word(word) for word in tweet])

def miniscule_transcoder(tweets):
    """Normalise common informal spellings and noisy tokens.

    Rules, tried in this order (first match wins):

    1. contains ``"haha"`` or ``"hehe"``  -> ``"ha"``
    2. starts with ``"http"``             -> ``"httpLink"``
    3. exactly ``im`` / ``u`` / ``cant`` / ``thats`` / ``ppl``
       -> ``i'm`` / ``you`` / ``can't`` / ``that's`` / ``people``
    4. contains ``"aww"``                 -> ``"aww"``
    5. starts with ``"@"``                -> ``"@"``
    6. contains ``".."``                  -> ``"..."``

    Parameters
    ----------
    tweets : pandas.Series
        Each element is a list of string tokens.

    Returns
    -------
    pandas.Series
        New Series of new lists; the input lists are not modified.
    """
    # Whole-token replacements. None of these keys can satisfy any of the
    # substring/prefix rules, so where they sit in the rule order is immaterial.
    exact = {
        "im": "i'm",
        "u": "you",
        "cant": "can't",
        "thats": "that's",
        "ppl": "people",
    }

    def transcode_word(word):
        if "haha" in word or "hehe" in word:
            return "ha"
        if word.startswith("http"):
            return "httpLink"
        if word in exact:
            return exact[word]
        if "aww" in word:
            return "aww"
        if word.startswith("@"):
            return "@"
        if ".." in word:
            return "..."
        return word

    # Build new lists instead of assigning into the caller's lists:
    # Series.copy() does not copy the list objects it holds.
    return tweets.apply(lambda tweet: [transcode_word(word) for word in tweet])

def lemmatize(tweets):
    """Lemmatize POS-tagged tweets with NLTK's WordNet lemmatizer.

    Each tweet is a sequence of ``(word, tag)`` pairs (tags are presumably the
    Treebank-style tags produced by ``nltk.pos_tag`` -- confirm against the
    caller). The result is a new Series of lists of ``(lemma, tag)`` pairs,
    the tag being carried over unchanged.
    """
    from nltk.corpus import wordnet

    lemmatizer = WordNetLemmatizer()

    def wordnet_pos(tag):
        # Map the tag's leading letter to a WordNet POS constant.
        if tag.startswith('J'):
            return wordnet.ADJ
        if tag.startswith('V'):
            return wordnet.VERB
        if tag.startswith('R'):
            return wordnet.ADV
        # 'N...' tags and anything unrecognised are treated as nouns.
        return wordnet.NOUN

    def lemmatize_tweet(tagged_tokens):
        lemmas = []
        for word, tag in ((pair[0], pair[1]) for pair in tagged_tokens):
            lemmas.append((lemmatizer.lemmatize(word, wordnet_pos(tag)), tag))
        return lemmas

    return tweets.copy().apply(lemmatize_tweet)

def remove_punct(tokens):
    """Remove every ``.``, ``?``, ``!`` and ``,`` token from ``tokens``.

    Parameters
    ----------
    tokens : list
        Token list; it is modified in place.

    Returns
    -------
    list
        The same list object, for convenient chaining.
    """
    unwanted = ('.', '?', '!', ',')
    # Slice assignment keeps the in-place contract while dropping *all*
    # occurrences; list.remove() would only delete the first one of each.
    tokens[:] = [token for token in tokens if token not in unwanted]
    return tokens
    
def remove_one_char_trash(tweets):
    """Discard tokens shorter than two characters from every tweet."""
    def drop_short(tokens):
        return list(filter(lambda token: len(token) > 1, tokens))

    return tweets.apply(drop_short)

In [11]:
#Pre-processing
# Each stage feeds the next, so the order below must be kept.
# Comment out a `.pipe(...)` line to skip that stage.
tweets = (
    remove_punctuation(tokenized_tweets)
    .pipe(remove_one_char_trash)
    .pipe(remove_stop_words)
    .pipe(pos_tag)
    .pipe(lemmatize)
    .pipe(remove_pos_tags)
    .pipe(miniscule_transcoder)
    .pipe(transcode_emoticons)
)

In [12]:
from collections import Counter

# Frequency table over every token of every processed tweet; show the top 500.
token_counter = Counter(merge_lists(tweets))
counts = token_counter.most_common(500)
print(counts)

[('@', 59727), ('...', 42591), ("i'm", 20124), ('go', 19523), ('get', 18863), ('work', 12768), ('day', 10853), ("can't", 8833), ('miss', 8756), ('like', 8403), ('today', 8062), ('want', 7684), ('back', 7358), ('good', 6856), ('feel', 6628), ('time', 6554), ('think', 6102), ('really', 6074), ('still', 6037), ('one', 5529), ('know', 5416), ('need', 5358), ('well', 5269), ('make', 5255), ('wish', 5249), ('sleep', 5240), ('sad', 5190), ('home', 5002), ('bad', 4966), ('last', 4881), ('see', 4880), ('night', 4728), ('httpLink', 4648), ('oh', 4440), ('come', 4407), ('lol', 4354), ('sorry', 4078), ('tomorrow', 4062), ('much', 3987), ('love', 3776), ('morning', 3704), ('hate', 3685), ('look', 3602), ('watch', 3588), ('week', 3538), ('school', 3537), ('ha', 3354), ('leave', 3326), ('take', 3260), ('sick', 3085), ('hope', 3079), ('say', 3030), ('try', 3029), ('though', 3022), ('bed', 2952), ('find', 2869), ('twitter', 2869), ('hour', 2841), ('right', 2829), ('new', 2825), ('thing', 2690), ('could

In [13]:
# Pair the label column (column 0 of the raw data) with the processed tokens.
# For a full-corpus run, start from raw_tweets_df[[0]] instead of the slice.
tw = (
    raw_tweets_df[:slice_][[0]]
    .rename(columns={0: "target"})
    .assign(tweets=tweets)
)
print(tw)

        target                                             tweets
0            0  [@, httpLink, aww, that's, bummer, shoulda, ge...
1            0  [upset, can't, update, facebook, texting, ...,...
2            0  [@, dive, many, time, ball, manage, save, 50, ...
3            0             [whole, body, feel, itchy, like, fire]
4            0                  [@, behave, i'm, mad, can't, see]
...        ...                                                ...
159995       0                       [home, bore, thinking, life]
159996       0  [@, wish, could, tell, stop, tweeting, cause, ...
159997       0  [london, mo, ridiculously, hot, leave, short, ...
159998       0  [one, day, san, jose, we're, officially, vacat...
159999       0  [@, kind, right, though, ember, ..., sometimes...

[160000 rows x 2 columns]


In [14]:
processed_path = "data/processed_tweets.csv"

# Clear out a previous export, if any, before writing the new one.
if os.path.exists(processed_path):
    os.remove(processed_path)

tw.to_csv(processed_path, index=False, encoding="ISO-8859-1")


In [15]:
# Read the export back; the token lists come back as strings, so parse them
# into real lists again.
tweets = str_list_to_list(
    pd.read_csv("data/processed_tweets.csv", encoding="ISO-8859-1")
)
print(tweets)

        target                                             tweets
0            0  [@, httpLink, aww, that's, bummer, shoulda, ge...
1            0  [upset, can't, update, facebook, texting, ...,...
2            0  [@, dive, many, time, ball, manage, save, 50, ...
3            0             [whole, body, feel, itchy, like, fire]
4            0                  [@, behave, i'm, mad, can't, see]
...        ...                                                ...
159995       0                       [home, bore, thinking, life]
159996       0  [@, wish, could, tell, stop, tweeting, cause, ...
159997       0  [london, mo, ridiculously, hot, leave, short, ...
159998       0  [one, day, san, jose, we're, officially, vacat...
159999       0  [@, kind, right, though, ember, ..., sometimes...

[160000 rows x 2 columns]


In [16]:
# Run either the hashing cell or the TF-IDF cell. Running both is possible,
# but you may run out of RAM.
hash_vector_path = "data/vec.csv"

tweetsVector = scikit_hash_vectorize(tweets["tweets"], 256)

# Clear out a previous export, if any, before writing the new one.
if os.path.exists(hash_vector_path):
    os.remove(hash_vector_path)
tweetsVector.to_csv(hash_vector_path, index=False, encoding="ISO-8859-1")
print(tweetsVector)



        0    1    2    3    4         5    6         7    8    9    ...  246  \
0       0.0  0.0  0.0  0.0  0.0  0.288675  0.0  0.000000  0.0  0.0  ...  0.0   
1       0.0  0.0  0.0  0.0  0.0  0.000000  0.0  0.000000  0.0  0.0  ...  0.0   
2       0.0  0.0  0.0  0.0  0.0  0.000000  0.0  0.000000  0.0  0.0  ...  0.0   
3       0.0  0.0  0.0  0.0  0.0  0.000000  0.0  0.000000  0.0  0.0  ...  0.0   
4       0.0  0.0  0.0  0.0  0.0  0.000000  0.0  0.000000  0.0  0.0  ...  0.0   
...     ...  ...  ...  ...  ...       ...  ...       ...  ...  ...  ...  ...   
159995  0.0  0.0  0.0  0.0  0.0  0.000000  0.0  0.000000  0.0  0.0  ...  0.0   
159996  0.0  0.0  0.0  0.0  0.0  0.000000  0.0  0.000000  0.0  0.0  ...  0.0   
159997  0.0  0.0  0.0  0.0  0.0  0.000000  0.0  0.000000  0.0  0.0  ...  0.0   
159998  0.0  0.0  0.0  0.0  0.0  0.000000  0.0  0.316228  0.0  0.0  ...  0.0   
159999  0.0  0.0  0.0  0.0  0.0  0.000000  0.0  0.000000  0.0  0.0  ...  0.0   

        247  248  249  250      251  25

In [17]:
# The second argument (passed on as the vectorizer's min_df) needs
# experimenting with: lower is better, but needs more RAM.
# Running out of RAM kills the jupyter-notebook kernel.
tfidf_vector_path = "data/vecTfidf.csv"

tweetsTfidfVector = scikit_vectorize(tweets["tweets"], 5000)

# Clear out a previous export, if any, before writing the new one.
if os.path.exists(tfidf_vector_path):
    os.remove(tfidf_vector_path)
tweetsTfidfVector.to_csv(tfidf_vector_path, index=False, encoding="ISO-8859-1")
print(tweetsTfidfVector)

             ...         @  back     can't       day      feel       get  \
0       0.000000  0.380508   0.0  0.000000  0.703138  0.000000  0.600675   
1       0.416314  0.000000   0.0  0.635806  0.000000  0.000000  0.000000   
2       0.000000  0.358120   0.0  0.000000  0.000000  0.000000  0.000000   
3       0.000000  0.000000   0.0  0.000000  0.000000  0.726113  0.000000   
4       0.000000  0.373908   0.0  0.722608  0.000000  0.000000  0.000000   
...          ...       ...   ...       ...       ...       ...       ...   
159995  0.000000  0.000000   0.0  0.000000  0.000000  0.000000  0.000000   
159996  0.000000  0.349693   0.0  0.000000  0.000000  0.000000  0.000000   
159997  0.000000  0.000000   0.0  0.000000  0.000000  0.000000  0.000000   
159998  0.000000  0.000000   0.0  0.000000  0.566589  0.000000  0.484024   
159999  0.930032  0.367479   0.0  0.000000  0.000000  0.000000  0.000000   

              go  good       i'm  ...  really  sad  still  think      time  \
0       0