In [2]:
#Authors: Adam Lewandowski, Ivan Sladkov, Patrick English
import numpy as np
import tensorflow as tf
import pandas as pd
import string
import nltk
import os

from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import TweetTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import HashingVectorizer

In [36]:
raw_tweets_df = pd.read_csv("data/training.1600000.processed.noemoticon.csv", encoding = "ISO-8859-1", header=None)
raw_tweets_df

Unnamed: 0,0,1,2,3,4,5
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
...,...,...,...,...,...,...
1599995,4,2193601966,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,AmandaMarie1028,Just woke up. Having no school is the best fee...
1599996,4,2193601969,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,TheWDBoards,TheWDB.com - Very cool to hear old Walt interv...
1599997,4,2193601991,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,bpbabe,Are you ready for your MoJo Makeover? Ask me f...
1599998,4,2193602064,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,tinydiamondz,Happy 38th Birthday to my boo of alll time!!! ...


In [37]:
print(raw_tweets_df[1])
#Baseline processing
tknzr = TweetTokenizer()

#This can take a while
tokenized_tweets = raw_tweets_df[5].copy().apply(lambda x : x.lower())

# If you don't want to wait
#tokenized_tweets = raw_tweets_df[5][:3200].copy().apply(lambda x : x.lower().encode('ascii', 'ignore'))

tokenized_tweets = tokenized_tweets.apply(lambda x : tknzr.tokenize(x))

0          1467810369
1          1467810672
2          1467810917
3          1467811184
4          1467811193
              ...    
1599995    2193601966
1599996    2193601969
1599997    2193601991
1599998    2193602064
1599999    2193602129
Name: 1, Length: 1600000, dtype: int64


In [3]:
#Should things like ... stay? 
def remove_punctuation(tweets):
    t = tweets.copy()
    def remove_punct(tokens):
        if '.' in tokens:
            tokens.remove('.')
        if '?' in tokens:
            tokens.remove('?')
        if '!' in tokens:
            tokens.remove('!')
        if ',' in tokens:
            tokens.remove(',')
        if '\'' in tokens:
            tokens.remove('\'')
        if '"' in tokens:
            tokens.remove('"')
        if '#' in tokens:
            tokens.remove('#')
        if '' in tokens:
            tokens.remove('#')
            
        return tokens
    return t.apply(lambda tweet : remove_punct(tweet))

def remove_links(tweets):
    return tweets

def remove_stop_words(tweets):
    t = tweets.copy()
    from nltk.corpus import stopwords
    sw = set(stopwords.words('english')) 
    return t.apply(lambda tweet:[w for w in tweet if not w in sw])

def merge_tweets(tweets):
    t = tweets.copy()
    return t.apply(lambda tweet: ' '.join(tweet))

def pos_tag(tweets):
    t = tweets.copy()
    return t.apply(lambda tweet: nltk.pos_tag(tweet))

def merge_lists(tweets):
    list_ = tweets.tolist()
    return [j for i in list_ for j in i]

#TBD
def emoticon_transcoder(tweets):
    return tweets

def remove_pos_tags(tweets):
    t = tweets.copy()
    return t.apply(lambda tweet: [w[0] for w in tweet])

def identity_tokenizer(text):
    return text

def scikit_vectorize(tweets, min_freq):
    t = tweets.copy()
    vectorizer = TfidfVectorizer(lowercase=False, tokenizer=identity_tokenizer, min_df=min_freq, dtype=np.float32)
    vectors = vectorizer.fit_transform(t)
    feature_names = vectorizer.get_feature_names()
    dense = vectors.todense()
    denselist = dense.tolist()
    return pd.DataFrame(denselist, columns=feature_names)

def scikit_hash_vectorize(tweets):
    t = tweets.copy()
    vectorizer = HashingVectorizer(lowercase=False, n_features=2**8, tokenizer=identity_tokenizer, dtype=np.float32)
    vectors = vectorizer.fit_transform(t)
    dense = vectors.todense()
    denselist = dense.tolist()
    return pd.DataFrame(denselist)

def str_list_to_list(tweets):
    t = tweets.copy()
    import ast 
    t["tweets"] = t["tweets"].apply(lambda x: ast.literal_eval(x))
    return t

def transcode_emoticons(tweets):
    t = tweets.copy()
    eyes = [";",":","=", "8-"]
    positive_jaw = [")", "]","}",">","3","d","*"] 
    negative_jaw = ["(","[","{","<","/","\\","c"]
    surprised_jaw = ["o"]
    tounge_jaw = ["p"]
    def transcode_emoticon(tweet):
        for i, word in enumerate(tweet):
            eye_present = any(eye in eyes for eye in word)
            if word == "x-d" or word == "xd":
                tweet[i] = ":)"
            elif eye_present and any(jaw in positive_jaw for jaw in word):
                tweet[i] = ":)"
            elif eye_present and any(jaw in negative_jaw for jaw in word):
                tweet[i] = ":("
            elif eye_present and any(jaw in surprised_jaw for jaw in word):
                tweet[i] = ":o"
            elif eye_present and any(jaw in tounge_jaw for jaw in word):
                tweet[i] = ":p"
        return tweet
    return t.apply(lambda tweet: transcode_emoticon(tweet))

def miniscule_transcoder(tweets):
    t = tweets.copy()
    def transcode_tweet(tweet):
        for i, word in enumerate(tweet):
            if "haha" in word or "hehe" in word:
                tweet[i] = "ha"
            elif word.startswith("http"):
                tweet[i] = "httpLink"
            elif word == "im":
                tweet[i] = "i'm"
            elif word =="u":
                tweet[i] = "you"
            elif word == "cant":
                tweet[i] = "can't"
            elif word == "thats":
                tweet[i] = "that's"
            elif "aww" in word:
                tweet[i] = "aww"
            elif word.startswith("@"):
                tweet[i] ="@"
            elif word =="ppl":
                tweet[i] = "people"
            elif ".." in word:
                tweet[i] = "..."
                
        return tweet
    return t.apply(lambda tweet : transcode_tweet(tweet))

def lemmatize(tweets):
    t = tweets.copy()
    
    def get_wordnet_pos(treebank_tag):
        from nltk.corpus import wordnet
        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        elif treebank_tag.startswith('V'):
            return wordnet.VERB
        elif treebank_tag.startswith('N'):
            return wordnet.NOUN
        elif treebank_tag.startswith('R'):
            return wordnet.ADV
        else:
            return wordnet.NOUN
    lem = WordNetLemmatizer()
    def lemmatize_word(w):
        transcoded_tag = get_wordnet_pos(w[1])
        wl = lem.lemmatize(w[0], transcoded_tag)
        return (wl,w[1])
    return t.apply(lambda tweet:[lemmatize_word(w) for w in tweet])

def remove_punct(tokens):
        if '.' in tokens:
            tokens.remove('.')
        if '?' in tokens:
            tokens.remove('?')
        if '!' in tokens:
            tokens.remove('!')
        if ',' in tokens:
            tokens.remove(',')
        return tokens
    
def remove_one_char_trash(tweets):
    return tweets.apply(lambda tweet: [s for s in tweet if len(s) >= 2])

In [39]:
#Pre-processing
#comment lines to skip some steps
#You need to run things in this order
tweets = remove_punctuation(tokenized_tweets)
tweets = remove_one_char_trash(tweets)
tweets = remove_stop_words(tweets)
tweets = pos_tag(tweets)
tweets = lemmatize(tweets)
tweets = remove_pos_tags(tweets)
tweets = miniscule_transcoder(tweets)
tweets = transcode_emoticons(tweets)

#tweets = merge_tokens(tweets)

In [40]:
#sth = ' '.join(tweets)
from collections import Counter
#sth2 = Counter(' '.join(merge_tweets(tweets).tolist()).most_common(100))
#print(transcode_emoticons(tweets))
#print(tweets)
counts = Counter(merge_lists(tweets)).most_common(500)
print(counts)

[('@', 785726), ('...', 378197), ("i'm", 181669), ('get', 171575), ('go', 164347), ('day', 103777), ('good', 99541), ('work', 86811), ('like', 82374), ('love', 75246), ('httpLink', 71581), ('today', 67368), ('time', 65426), ("can't", 63435), ('ha', 60907), ('lol', 58693), ('think', 58331), ('know', 57634), ('back', 57093), ('see', 57057), ('well', 56928), ('want', 56622), ('miss', 56523), ('one', 56378), ('make', 55760), ('really', 49981), ('feel', 45813), ('come', 44888), ('night', 44282), ('still', 43545), ('need', 43230), ('new', 42657), ('watch', 41987), ('home', 40358), ('thanks', 40134), ('look', 39436), ('oh', 39378), ('much', 36959), ('last', 35969), ('say', 35076), ('morning', 34855), ('hope', 34455), ('twitter', 34370), ('great', 34178), ('tomorrow', 33750), ('wish', 33341), ('sleep', 32738), ('wait', 32362), ('take', 32307), ('bad', 31961), ("that's", 31397), ('sad', 29602), ('fun', 28559), ('right', 28000), ('week', 27778), ('happy', 27336), ('try', 27004), ('would', 26773)

In [41]:
tw = raw_tweets_df.drop(columns=[1,2,3,4,5])
tw["target"] =tw[0]
tw = tw.drop(columns=[0])
tw["tweets"] = tweets
print(tw)
print(tokenized_tweets)

         target                                             tweets
0             0  [@, httpLink, aww, that's, bummer, shoulda, ge...
1             0  [upset, can't, update, facebook, texting, ...,...
2             0  [@, dive, many, time, ball, manage, save, 50, ...
3             0             [whole, body, feel, itchy, like, fire]
4             0                  [@, behave, i'm, mad, can't, see]
...         ...                                                ...
1599995       4                [woke, school, best, feeling, ever]
1599996       4  [thewdb.com, cool, hear, old, walt, interview,...
1599997       4               [ready, mojo, makeover, ask, detail]
1599998       4  [happy, 38th, birthday, boo, alll, time, tupac...
1599999       4                  [happy, #charitytuesday, @, @, @]

[1600000 rows x 2 columns]
0          [@switchfoot, http://twitpic.com/2y1zl, -, aww...
1          [is, upset, that, he, can't, update, his, face...
2          [@kenichan, i, dived, many, times, 

In [42]:
if os.path.exists("processed_tweets.csv"):
    os.remove("processed_tweets.csv")

tw.to_csv("processed_tweets.csv", encoding = "ISO-8859-1", index=False)


In [4]:
tweets = pd.read_csv("processed_tweets.csv", encoding = "ISO-8859-1")
tweets = str_list_to_list(tweets)

In [5]:
print(tweets)
#tweetsVector = scikit_vectorize(tweets, 2)

#print(tweets.array)
#tweetsVector
#print(tweetsVector)

         target                                             tweets
0             0  [@, httpLink, aww, that's, bummer, shoulda, ge...
1             0  [upset, can't, update, facebook, texting, ...,...
2             0  [@, dive, many, time, ball, manage, save, 50, ...
3             0             [whole, body, feel, itchy, like, fire]
4             0                  [@, behave, i'm, mad, can't, see]
...         ...                                                ...
1599995       4                [woke, school, best, feeling, ever]
1599996       4  [thewdb.com, cool, hear, old, walt, interview,...
1599997       4               [ready, mojo, makeover, ask, detail]
1599998       4  [happy, 38th, birthday, boo, alll, time, tupac...
1599999       4                  [happy, #charitytuesday, @, @, @]

[1600000 rows x 2 columns]


In [6]:
tweetsVector = scikit_hash_vectorize(tweets["tweets"])



In [7]:
if os.path.exists("vec.csv"):
    os.remove("vec.csv")

tweetsVector.to_csv("vec.csv", encoding = "ISO-8859-1", index=False)

In [8]:
#print(len(tweetsVector.iloc[0]))
print(type(tweets))
print(tweets["tweets"])
print(raw_tweets_df)

<class 'pandas.core.frame.DataFrame'>
0          [@, httpLink, aww, that's, bummer, shoulda, ge...
1          [upset, can't, update, facebook, texting, ...,...
2          [@, dive, many, time, ball, manage, save, 50, ...
3                     [whole, body, feel, itchy, like, fire]
4                          [@, behave, i'm, mad, can't, see]
                                 ...                        
1599995                  [woke, school, best, feeling, ever]
1599996    [thewdb.com, cool, hear, old, walt, interview,...
1599997                 [ready, mojo, makeover, ask, detail]
1599998    [happy, 38th, birthday, boo, alll, time, tupac...
1599999                    [happy, #charitytuesday, @, @, @]
Name: tweets, Length: 1600000, dtype: object


NameError: name 'raw_tweets_df' is not defined

In [9]:
print(tweetsVector)

         0    1    2    3    4         5         6    7    8    9    ...  246  \
0        0.0  0.0  0.0  0.0  0.0  0.288675  0.000000  0.0  0.0  0.0  ...  0.0   
1        0.0  0.0  0.0  0.0  0.0  0.000000  0.000000  0.0  0.0  0.0  ...  0.0   
2        0.0  0.0  0.0  0.0  0.0  0.000000  0.000000  0.0  0.0  0.0  ...  0.0   
3        0.0  0.0  0.0  0.0  0.0  0.000000  0.000000  0.0  0.0  0.0  ...  0.0   
4        0.0  0.0  0.0  0.0  0.0  0.000000  0.000000  0.0  0.0  0.0  ...  0.0   
...      ...  ...  ...  ...  ...       ...       ...  ...  ...  ...  ...  ...   
1599995  0.0  0.0  0.0  0.0  0.0  0.000000  0.000000  0.0  0.0  0.0  ...  0.0   
1599996  0.0  0.0  0.0  0.0  0.0  0.000000 -0.377964  0.0  0.0  0.0  ...  0.0   
1599997  0.0  0.0  0.0  0.0  0.0  0.000000  0.000000  0.0  0.0  0.0  ...  0.0   
1599998  0.0  0.0  0.0  0.0  0.0  0.000000  0.000000  0.0  0.0  0.0  ...  0.0   
1599999  0.0  0.0  0.0  0.0  0.0  0.000000  0.000000  0.0  0.0  0.0  ...  0.0   

         247       248  249