In [1]:
# import data and drop unnecessary columns
import pandas as pd
import re
import pandas as pd 
import numpy as np 

# engine='python' is deliberate: without it the read fails with a Unicode error.
raw = pd.read_csv(
    'Apple-Twitter-Sentiment-DFE.csv',
    header=None,
    sep=',',
    engine='python',
)

# Keep only column 5 (sentiment label) and column 11 (tweet text).
# Because header=None, row 0 is presumably the file's own header line, so it is
# skipped with [1:] and the remaining rows get a fresh 0-based index.
df = pd.DataFrame({
    'sentiment': np.array(raw.iloc[:, 5])[1:],
    'text': np.array(raw.iloc[:, 11])[1:],
})
df

Unnamed: 0,sentiment,text
0,3,#AAPL:The 10 best Steve Jobs emails ever...htt...
1,3,RT @JPDesloges: Why AAPL Stock Had a Mini-Flas...
2,3,My cat only chews @apple cords. Such an #Apple...
3,3,I agree with @jimcramer that the #IndividualIn...
4,3,Nobody expects the Spanish Inquisition #AAPL
...,...,...
3881,3,(Via FC) Apple Is Warming Up To Social Media -...
3882,3,RT @MMLXIV: there is no avocado emoji may I as...
3883,5,@marcbulandr I could not agree more. Between @...
3884,1,My iPhone 5's photos are no longer downloading...


In [2]:
# change sentiment scores to be consistent with the train set
# Drop rows labelled 'not_relevant' or neutral ('3'); labels are still strings here.
keep_mask = (df["sentiment"] != 'not_relevant') & (df["sentiment"] != '3')

# reset_index(drop=True) returns a fresh frame with a clean 0..n-1 index, so the
# column assignment below never writes into a slice of the original frame.
df = df.loc[keep_mask].reset_index(drop=True)

# Convert the remaining labels to numbers and remap them in one pass:
# 1 -> 0 and 5 -> 1. Assigning the result back (instead of calling
# replace(..., inplace=True) on df['sentiment']) avoids chained assignment,
# which does not update the frame under pandas Copy-on-Write.
df['sentiment'] = pd.to_numeric(df['sentiment']).replace({1: 0, 5: 1})
df

Unnamed: 0,sentiment,text
0,1,Top 3 all @Apple #tablets. Damn right! http://...
1,1,CNBCTV: #Apple's margins better than expected?...
2,0,WTF MY BATTERY WAS 31% ONE SECOND AGO AND NOW ...
3,1,RT @peterpham: Bought my @AugustSmartLock at t...
4,0,@apple Contact sync between Yosemite and iOS8 ...
...,...,...
1637,1,RT @shannonmmiller: Love the @Apple is support...
1638,0,hey @apple is it normal for my laptop charger ...
1639,1,@marcbulandr I could not agree more. Between @...
1640,0,My iPhone 5's photos are no longer downloading...


In [3]:
# further cleaning steps (the same ones applied to the training corpus)

In [4]:
def remove_mentions(data):
    """Replace every '@' and the word characters that follow it with one space.

    Parameters
    ----------
    data : str
        Text to clean.

    Returns
    -------
    str
        ``data`` with each ``@...`` run replaced by a single space. A lone
        '@' with no word characters after it is replaced as well.
    """
    # Raw string so that \w reaches the regex engine instead of being an
    # invalid string escape (a SyntaxWarning on recent Python versions).
    return re.sub(r'@\w*', ' ', data)

# Start the cleaned column from the raw text with @mentions blanked out.
df['text_cleaned'] = df['text'].apply(remove_mentions)

def remove_URLs(data):
    """Replace every URL in the text with one space.

    A URL is any run of non-whitespace characters that starts with ``www.``,
    ``http://`` or ``https://``.

    Parameters
    ----------
    data : str
        Text to clean.

    Returns
    -------
    str
        ``data`` with each URL replaced by a single space.
    """
    # Raw string so that \. and \S reach the regex engine instead of being
    # invalid string escapes (a SyntaxWarning on recent Python versions).
    return re.sub(r'(?:www\.|https?://)\S+', ' ', data)

# Strip URLs from the already mention-free text, then preview the result.
df['text_cleaned'] = df['text_cleaned'].apply(remove_URLs)
df.head()


Unnamed: 0,sentiment,text,text_cleaned
0,1,Top 3 all @Apple #tablets. Damn right! http://...,Top 3 all #tablets. Damn right!
1,1,CNBCTV: #Apple's margins better than expected?...,CNBCTV: #Apple's margins better than expected?...
2,0,WTF MY BATTERY WAS 31% ONE SECOND AGO AND NOW ...,WTF MY BATTERY WAS 31% ONE SECOND AGO AND NOW ...
3,1,RT @peterpham: Bought my @AugustSmartLock at t...,RT : Bought my at the store..pretty good ...
4,0,@apple Contact sync between Yosemite and iOS8 ...,Contact sync between Yosemite and iOS8 is se...


In [5]:
# Replace every character that is not an ASCII letter or '#' with a space.
# regex=True must be explicit: since pandas 2.0, Series.str.replace treats the
# pattern as a literal string by default, which would leave the text unchanged.
df['text_cleaned'] = df['text_cleaned'].str.replace('[^a-zA-Z#]', ' ', regex=True)

df.head(10)

Unnamed: 0,sentiment,text,text_cleaned
0,1,Top 3 all @Apple #tablets. Damn right! http://...,Top all #tablets Damn right
1,1,CNBCTV: #Apple's margins better than expected?...,CNBCTV #Apple s margins better than expected ...
2,0,WTF MY BATTERY WAS 31% ONE SECOND AGO AND NOW ...,WTF MY BATTERY WAS ONE SECOND AGO AND NOW ...
3,1,RT @peterpham: Bought my @AugustSmartLock at t...,RT Bought my at the store pretty good ...
4,0,@apple Contact sync between Yosemite and iOS8 ...,Contact sync between Yosemite and iOS is se...
5,0,WARNING IF YOU BUY AN IPHONE 5S UNLOCKED FROM ...,WARNING IF YOU BUY AN IPHONE S UNLOCKED FROM ...
6,0,"@Apple, For the love of GAWD, CENTER the '1'on...",For the love of GAWD CENTER the on the ...
7,0,i get the storage almost full notification lit...,i get the storage almost full notification lit...
8,0,I had to do made the #switch from iPhone 6 to ...,I had to do made the #switch from iPhone to ...
9,0,@ me RT @101Baemations: Can't stand those ppl ...,me RT Can t stand those ppl with sticke...


In [6]:
from nltk.corpus import stopwords
# English stop-word list from NLTK; `stop` stays a list as before.
stop = stopwords.words('english')
# Set copy for constant-time membership tests; the list would be scanned once
# per word of every tweet.
stop_lookup = set(stop)

# Lower-case each tweet, split on whitespace, drop stop words, and re-join
# the remaining words with single spaces.
df["text_cleaned"] = df["text_cleaned"].str.lower().apply(
    lambda x: ' '.join(word for word in x.split() if word not in stop_lookup)
)

df.head(10)


Unnamed: 0,sentiment,text,text_cleaned
0,1,Top 3 all @Apple #tablets. Damn right! http://...,top #tablets damn right
1,1,CNBCTV: #Apple's margins better than expected?...,cnbctv #apple margins better expected #aapl
2,0,WTF MY BATTERY WAS 31% ONE SECOND AGO AND NOW ...,wtf battery one second ago wtf
3,1,RT @peterpham: Bought my @AugustSmartLock at t...,rt bought store pretty good logo match wait in...
4,0,@apple Contact sync between Yosemite and iOS8 ...,contact sync yosemite ios seriously screwed us...
5,0,WARNING IF YOU BUY AN IPHONE 5S UNLOCKED FROM ...,warning buy iphone unlocked iphone cannot use ...
6,0,"@Apple, For the love of GAWD, CENTER the '1'on...",love gawd center damn calendar app fixed back ...
7,0,i get the storage almost full notification lit...,get storage almost full notification literally...
8,0,I had to do made the #switch from iPhone 6 to ...,made #switch iphone galaxy note edge keep
9,0,@ me RT @101Baemations: Can't stand those ppl ...,rt stand ppl stickers everywhere prob bought i...


In [7]:
from nltk.tokenize import TweetTokenizer
# Tokenise each cleaned tweet into a list of tokens with NLTK's TweetTokenizer.
tknzr = TweetTokenizer()
tokenized_tweet = df['text_cleaned'].apply(tknzr.tokenize)

tokenized_tweet.head(10)

0                         [top, #tablets, damn, right]
1    [cnbctv, #apple, margins, better, expected, #a...
2                [wtf, battery, one, second, ago, wtf]
3    [rt, bought, store, pretty, good, logo, match,...
4    [contact, sync, yosemite, ios, seriously, scre...
6    [love, gawd, center, damn, calendar, app, fixe...
7    [get, storage, almost, full, notification, lit...
8    [made, #switch, iphone, galaxy, note, edge, keep]
9    [rt, stand, ppl, stickers, everywhere, prob, b...
Name: text_cleaned, dtype: object

In [8]:
from nltk import PorterStemmer

# Reduce every token to its Porter stem, keeping one list of stems per tweet.
ps = PorterStemmer()
tokenized_tweet = tokenized_tweet.apply(lambda tokens: list(map(ps.stem, tokens)))

tokenized_tweet.head()


0                          [top, #tablet, damn, right]
1       [cnbctv, #appl, margin, better, expect, #aapl]
2                [wtf, batteri, one, second, ago, wtf]
3    [rt, bought, store, pretti, good, logo, match,...
4    [contact, sync, yosemit, io, serious, screw, u...
Name: text_cleaned, dtype: object

In [9]:
# Join each tweet's stems back into one space-separated string.
# A single vectorised apply replaces the per-row loop, which assigned by integer
# label and therefore only worked while the index was exactly 0..n-1.
# NOTE: as before, `tokenized_tweet` holds strings after this cell, so the cell
# must not be re-run without re-running the tokenising/stemming cells first.
tokenized_tweet = tokenized_tweet.apply(' '.join)

df['text_cleaned'] = tokenized_tweet
df.head()


Unnamed: 0,sentiment,text,text_cleaned
0,1,Top 3 all @Apple #tablets. Damn right! http://...,top #tablet damn right
1,1,CNBCTV: #Apple's margins better than expected?...,cnbctv #appl margin better expect #aapl
2,0,WTF MY BATTERY WAS 31% ONE SECOND AGO AND NOW ...,wtf batteri one second ago wtf
3,1,RT @peterpham: Bought my @AugustSmartLock at t...,rt bought store pretti good logo match wait in...
4,0,@apple Contact sync between Yosemite and iOS8 ...,contact sync yosemit io serious screw use much...


In [10]:

# Write the cleaned frame to disk; index=False leaves the row index out of the file.
df.to_csv(path_or_buf='data_development_cleaned.csv', index=False)
