In [1]:
# import standard libraries
import numpy as np
import pandas as pd
import os
import random
from sklearn import preprocessing

In [2]:
# Switch the working directory to the raw-data folder, two levels up from
# wherever the kernel currently is. NOTE: the path is relative to the current
# working directory, so running this cell a second time moves up again.
raw_data_dir = os.path.join(os.getcwd(), "..", "..", "data", "raw")
os.chdir(raw_data_dir)

In [3]:
# Load the raw tweets; the file is read as ISO-8859-1 rather than the
# default encoding.
raw_csv_name = "tweet_product_company.csv"
df = pd.read_csv(raw_csv_name, encoding="ISO-8859-1")

In [4]:
# Replace the original headers with short names (assigned positionally,
# so the frame must have exactly three columns).
short_column_names = ["tweet", "product", "emotion"]
df.columns = short_column_names

In [5]:
# Preview the first ten rows
df.head(n=10)

Unnamed: 0,tweet,product,emotion
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion
5,@teachntech00 New iPad Apps For #SpeechTherapy...,,No emotion toward brand or product
6,,,No emotion toward brand or product
7,"#SXSW is just starting, #CTIA is around the co...",Android,Positive emotion
8,Beautifully smart and simple idea RT @madebyma...,iPad or iPhone App,Positive emotion
9,Counting down the days to #sxsw plus strong Ca...,Apple,Positive emotion


In [6]:
# Preview the last ten rows
df.tail(n=10)

Unnamed: 0,tweet,product,emotion
9083,"Google says the future is all around you! (ie,...",,No emotion toward brand or product
9084,"Google says the future is location, location, ...",,No emotion toward brand or product
9085,I've always used Camera+ for my iPhone b/c it ...,iPad or iPhone App,Positive emotion
9086,Google says: want to give a lightning talk to ...,,No emotion toward brand or product
9087,"@mention Yup, but I don't have a third app yet...",,No emotion toward brand or product
9088,Ipad everywhere. #SXSW {link},iPad,Positive emotion
9089,"Wave, buzz... RT @mention We interrupt your re...",,No emotion toward brand or product
9090,"Google's Zeiger, a physician never reported po...",,No emotion toward brand or product
9091,Some Verizon iPhone customers complained their...,,No emotion toward brand or product
9092,Ï¡Ïàü_ÊÎÒ£Áââ_£â_ÛâRT @...,,No emotion toward brand or product


In [7]:
# Count the missing values in each column
df.isna().sum()

tweet         1
product    5802
emotion       0
dtype: int64

In [8]:
# Drop the product column: not necessary for sentiment analysis
df = df.drop(columns=["product"])

In [9]:
# confirm which columns remain after dropping "product"
df.columns

Index(['tweet', 'emotion'], dtype='object')

In [10]:
# Count the missing values in the remaining columns
df.isna().sum()

tweet      1
emotion    0
dtype: int64

In [11]:
# Drop every row that contains at least one null value
df = df.dropna(axis="index", how="any")

In [12]:
# reset the index so row labels are consecutive again after dropping rows;
# drop=True discards the old index instead of keeping it as a column
df = df.reset_index(drop=True)

In [13]:
# Verify that no missing values are left
df.isna().sum()

tweet      0
emotion    0
dtype: int64

In [14]:
# get the shape (rows, columns) after removing rows with nulls
df.shape

(9092, 2)

In [15]:
# Remove fully duplicated rows, then renumber the index from zero
df = df.drop_duplicates().reset_index(drop=True)

In [16]:
# get the shape (rows, columns) after removing duplicate rows
df.shape

(9070, 2)

In [18]:
# Collect the distinct emotion labels (np.unique returns them sorted)
old_emotions = np.unique(df["emotion"]).tolist()
old_emotions

["I can't tell",
 'Negative emotion',
 'No emotion toward brand or product',
 'Positive emotion']

In [19]:
# Preview the first five rows of the cleaned frame
df.head(n=5)

Unnamed: 0,tweet,emotion
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Positive emotion


In [20]:
# Collapse the two non-committal labels into a single neutral class:
# "I can't tell" -> "Neutral emotion"
# "No emotion toward brand or product" -> "Neutral emotion"
#
# The labels are spelled out instead of being looked up by position in
# old_emotions, so this cell stays correct even if old_emotions is recomputed
# after the relabelling (its positions 0 and 2 would then be other labels).
#
# A single .loc assignment replaces the row-by-row chained assignment
# df["emotion"][index] = ..., which triggers SettingWithCopyWarning and does
# not write back to df when pandas copy-on-write is enabled.
neutral_source_labels = ["I can't tell", "No emotion toward brand or product"]
neutral_mask = df["emotion"].isin(neutral_source_labels)
df.loc[neutral_mask, "emotion"] = "Neutral emotion"

In [21]:
# Encode the emotion labels as integers. The intended mapping, consistent
# with the preview shown after this cell, is:
#   0 -> Negative emotion
#   1 -> Neutral emotion
#   2 -> Positive emotion
# The encoder is kept in lb_make so the codes can be mapped back to labels.
lb_make = preprocessing.LabelEncoder()
encoded_emotions = lb_make.fit_transform(df['emotion'])
df['emotion'] = encoded_emotions

In [24]:
# Preview the first ten rows after label encoding
df.head(n=10)

Unnamed: 0,tweet,emotion
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,0
1,@jessedee Know about @fludapp ? Awesome iPad/i...,2
2,@swonderlin Can not wait for #iPad 2 also. The...,2
3,@sxsw I hope this year's festival isn't as cra...,0
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,2
5,@teachntech00 New iPad Apps For #SpeechTherapy...,1
6,"#SXSW is just starting, #CTIA is around the co...",2
7,Beautifully smart and simple idea RT @madebyma...,2
8,Counting down the days to #sxsw plus strong Ca...,2
9,Excited to meet the @samsungmobileus at #sxsw ...,2


In [25]:
# Save the dataframe for the next step: text preprocessing.
# The target is the "preprocessed" folder next to the current working
# directory (assumes the cwd is still the raw-data folder set earlier).
# abspath collapses the ".." component so makedirs gets a clean path.
preprocessed_dir = os.path.abspath(os.path.join(os.getcwd(), "..", "preprocessed"))
# Create the folder on first run; os.chdir would otherwise raise
# FileNotFoundError when it does not exist yet.
os.makedirs(preprocessed_dir, exist_ok=True)
os.chdir(preprocessed_dir)
# index=False: do not write the row index as an extra CSV column
df.to_csv("2.0-sh-data-preprocessed.csv", index=False)