In [44]:
#Authors: Adam Lewandowski, Ivan Sladkov, Patrick English
import numpy as np
import tensorflow as tf
import pandas as pd
import string
import nltk

#from spellchecker import SpellChecker
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import TweetTokenizer
from nltk.stem.wordnet import WordNetLemmatizer

In [45]:
raw_tweets_df = pd.read_csv("data/training.1600000.processed.noemoticon.csv", encoding = "ISO-8859-1", header=None)
raw_tweets_df

Unnamed: 0,0,1,2,3,4,5
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
...,...,...,...,...,...,...
1599995,4,2193601966,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,AmandaMarie1028,Just woke up. Having no school is the best fee...
1599996,4,2193601969,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,TheWDBoards,TheWDB.com - Very cool to hear old Walt interv...
1599997,4,2193601991,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,bpbabe,Are you ready for your MoJo Makeover? Ask me f...
1599998,4,2193602064,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,tinydiamondz,Happy 38th Birthday to my boo of alll time!!! ...


In [46]:
#Baseline processing
tknzr = TweetTokenizer()
#tweets = raw_tweets_df[5].copy().apply(lambda x : x.lower())

# If you don't want to wait
tokenized_tweets = raw_tweets_df[5][:1600].copy().apply(lambda x : x.lower())

#This can take a while
tokenized_tweets = tokenized_tweets.apply(lambda x : tknzr.tokenize(x))

In [47]:
#Should things like ... stay? 
def remove_punctuation(tweets):
    t = tweets.copy()
    def remove_punct(tokens):
        if '.' in tokens:
            tokens.remove('.')
        elif '?' in tokens:
            tokens.remove('?')
        elif '!' in tokens:
            tokens.remove('!')
        elif ',' in tokens:
            tokens.remove(',')
        return tokens
    return t.apply(lambda tweet : remove_punct(tweet))

def remove_stop_words(tweets):
    t = tweets.copy()
    from nltk.corpus import stopwords
    sw = set(stopwords.words('english')) 
    return t.apply(lambda tweet:[w for w in tweet if not w in sw])

#TODO add spell correction
def correct_spelling(tweets):
    return tweets

def pos_tag(tweets):
    t = tweets.copy()
    return t.apply(lambda tweet: nltk.pos_tag(tweet))

#TBD
def emoticon_transcoder(tweets):
    return tweets

def remove_pos_tags(tweets):
    t = tweets.copy()
    return t.apply(lambda tweet: [w[0] for w in tweet])

def identity_tokenizer(text):
    return text

def scikit_vectorize(tweets):
    t = tweets.copy()
    vectorizer = TfidfVectorizer(lowercase=False, tokenizer=identity_tokenizer, min_df=2)
    vectors = vectorizer.fit_transform(t)
    feature_names = vectorizer.get_feature_names()
    dense = vectors.todense()
    denselist = dense.tolist()
    return pd.DataFrame(denselist, columns=feature_names)

def merge_tokens(tweets):
    t = tweets.copy()
    return t.apply(lambda tweet: ' '.join(tweet))



def lemmatize(tweets):
    t = tweets.copy()
    
    def get_wordnet_pos(treebank_tag):
        from nltk.corpus import wordnet
        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        elif treebank_tag.startswith('V'):
            return wordnet.VERB
        elif treebank_tag.startswith('N'):
            return wordnet.NOUN
        elif treebank_tag.startswith('R'):
            return wordnet.ADV
        else:
            return wordnet.NOUN
    lem = WordNetLemmatizer()
    def lemmatize_word(w):
        transcoded_tag = get_wordnet_pos(w[1])
        wl = lem.lemmatize(w[0], transcoded_tag)
        return (wl,w[1])
    return t.apply(lambda tweet:[lemmatize_word(w) for w in tweet])

In [48]:
#Pre-processing
#comment lines to skip some steps
#You need to run things in this order
tweets = remove_punctuation(tokenized_tweets)
tweets = remove_stop_words(tweets)
tweets = pos_tag(tweets)
tweets = lemmatize(tweets)
tweets = remove_pos_tags(tweets)
#tweets = merge_tokens(tweets)

In [49]:
tweetsVector = scikit_vectorize(tweets.array)
print(tweetsVector)

             !    "    #  #24  #fail  #fb    $         %    &    '  ...   yo  \
0     0.000000  0.0  0.0  0.0    0.0  0.0  0.0  0.000000  0.0  0.0  ...  0.0   
1     0.139964  0.0  0.0  0.0    0.0  0.0  0.0  0.000000  0.0  0.0  ...  0.0   
2     0.000000  0.0  0.0  0.0    0.0  0.0  0.0  0.342562  0.0  0.0  ...  0.0   
3     0.000000  0.0  0.0  0.0    0.0  0.0  0.0  0.000000  0.0  0.0  ...  0.0   
4     0.000000  0.0  0.0  0.0    0.0  0.0  0.0  0.000000  0.0  0.0  ...  0.0   
...        ...  ...  ...  ...    ...  ...  ...       ...  ...  ...  ...  ...   
1595  0.000000  0.0  0.0  0.0    0.0  0.0  0.0  0.000000  0.0  0.0  ...  0.0   
1596  0.000000  0.0  0.0  0.0    0.0  0.0  0.0  0.000000  0.0  0.0  ...  0.0   
1597  0.000000  0.0  0.0  0.0    0.0  0.0  0.0  0.000000  0.0  0.0  ...  0.0   
1598  0.000000  0.0  0.0  0.0    0.0  0.0  0.0  0.000000  0.0  0.0  ...  0.0   
1599  0.000000  0.0  0.0  0.0    0.0  0.0  0.0  0.000000  0.0  0.0  ...  0.0   

      youtube   yr  yucky  yup    ~   ½