In [61]:
import pandas as pd
import numpy as np
import re
import warnings
from pandas.api.types import is_string_dtype, is_numeric_dtype
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer, word_tokenize
import string
from nltk.probability import FreqDist
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

In [62]:
df_main = pd.read_csv("data/main.csv", encoding='latin-1')
df_main.drop(df_main.index[-1], inplace=True)
df_main

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion
...,...,...,...
9087,"@mention Yup, but I don't have a third app yet...",,No emotion toward brand or product
9088,Ipad everywhere. #SXSW {link},iPad,Positive emotion
9089,"Wave, buzz... RT @mention We interrupt your re...",,No emotion toward brand or product
9090,"Google's Zeiger, a physician never reported po...",,No emotion toward brand or product


In [63]:
# there are 5801 Null values in the 'emotion_in_tweet_is_directed_at' column
df_main['emotion_in_tweet_is_directed_at'].isnull().sum()

5801

In [64]:
# This class will parse the tweets and accurately fill the 'emotion_in_tweet_is_directed_at' column, from the list
class DataFrameProcessor:
    def __init__(self):
        self.product_list = ['ipad', 'apple', 'iphone', 'google', 'android']

    def process_dataframe(self, df):
        df = df.applymap(lambda s:s.lower() if type(s) == str else s)
        for product in self.product_list:
            df[product] = df['tweet_text'].apply(lambda x: 1 if (isinstance(x, str) and product in x) else 0)
        df['emotion_in_tweet_is_directed_at'] = df.apply(self.update_product_column, axis=1)
        df['emotion_in_tweet_is_directed_at'] = df['emotion_in_tweet_is_directed_at'].apply(self.remove_duplicates)
        df['emotion_in_tweet_is_directed_at'] = df['emotion_in_tweet_is_directed_at'].apply(self.process_emotion_column)
        return df

    def update_product_column(self, row):
        current_value = row['emotion_in_tweet_is_directed_at']
        if pd.isna(current_value):
            current_value = ''
        for product in self.product_list:
            if row[product] == 1:
                if current_value:
                    current_value = current_value + ', ' + product
                else:
                    current_value = product
        return current_value

    def remove_duplicates(self, s):
        words = s.split(', ')
        unique_words = set(words)
        return ', '.join(unique_words)

    def process_emotion_column(self, s):
        words = s.split(', ')
        unique_words = list(set([word for word in words if word in self.product_list]))
        return ', '.join(unique_words)

In [65]:
df_Filled =df_main.copy()
processor = DataFrameProcessor()
df_Filled = processor.process_dataframe(df_Filled)
df_Filled['emotion_in_tweet_is_directed_at'].isnull().sum()

0

In [66]:
df_Filled

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product,ipad,apple,iphone,google,android
0,.@wesley83 i have a 3g iphone. after 3 hrs twe...,iphone,negative emotion,0,0,1,0,0
1,@jessedee know about @fludapp ? awesome ipad/i...,"ipad, iphone",positive emotion,1,0,1,0,0
2,@swonderlin can not wait for #ipad 2 also. the...,ipad,positive emotion,1,0,0,0,0
3,@sxsw i hope this year's festival isn't as cra...,iphone,negative emotion,0,0,1,0,0
4,@sxtxstate great stuff on fri #sxsw: marissa m...,google,positive emotion,0,0,0,1,0
...,...,...,...,...,...,...,...,...
9087,"@mention yup, but i don't have a third app yet...",android,no emotion toward brand or product,0,0,0,0,1
9088,ipad everywhere. #sxsw {link},ipad,positive emotion,1,0,0,0,0
9089,"wave, buzz... rt @mention we interrupt your re...",google,no emotion toward brand or product,0,0,0,1,0
9090,"google's zeiger, a physician never reported po...",google,no emotion toward brand or product,0,0,0,1,0


In [67]:
value_counts_df_origin = df_main['emotion_in_tweet_is_directed_at'].value_counts().reset_index()
value_counts_df_origin.columns = ['emotion', 'count']
value_counts_df_origin

Unnamed: 0,emotion,count
0,iPad,946
1,Apple,661
2,iPad or iPhone App,470
3,Google,430
4,iPhone,297
5,Other Google product or service,293
6,Android App,81
7,Android,78
8,Other Apple product or service,35


In [68]:
value_counts_df_origin['count'].sum()

3291

In [69]:
df_main['is_there_an_emotion_directed_at_a_brand_or_product'].value_counts()

No emotion toward brand or product    5388
Positive emotion                      2978
Negative emotion                       570
I can't tell                           156
Name: is_there_an_emotion_directed_at_a_brand_or_product, dtype: int64

In [70]:
df = df_main.copy()
df

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion
...,...,...,...
9087,"@mention Yup, but I don't have a third app yet...",,No emotion toward brand or product
9088,Ipad everywhere. #SXSW {link},iPad,Positive emotion
9089,"Wave, buzz... RT @mention We interrupt your re...",,No emotion toward brand or product
9090,"Google's Zeiger, a physician never reported po...",,No emotion toward brand or product


In [71]:
# For visualiztion and EDA, let's make ne dataframes containing only positive, negative, or neutral
df_pos = df[df['is_there_an_emotion_directed_at_a_brand_or_product'] == 'Positive emotion'].copy()
df_pos.to_csv('positive.csv', index=False)
df_neg = df[df['is_there_an_emotion_directed_at_a_brand_or_product'] == 'Negative emotion'].copy()
df_neg.to_csv('negative.csv', index=False)
df_neut = df[df['is_there_an_emotion_directed_at_a_brand_or_product'] == 'No emotion toward brand or product'].copy()
df_neut.to_csv('neutral.csv', index=False)

In [72]:
df_pos = pd.read_csv("data/positive.csv", encoding='latin-1')
df_neg = pd.read_csv("data/negative.csv", encoding='latin-1')
df_neut = pd.read_csv("data/neutral.csv", encoding='latin-1')

In [73]:
df_pos

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
1,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
2,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion
3,"#SXSW is just starting, #CTIA is around the co...",Android,Positive emotion
4,Beautifully smart and simple idea RT @madebyma...,iPad or iPhone App,Positive emotion
...,...,...,...
2973,@mention your iPhone 4 cases are Rad and Ready...,iPhone,Positive emotion
2974,@mention your PR guy just convinced me to swit...,iPhone,Positive emotion
2975,&quot;papyrus...sort of like the ipad&quot; - ...,iPad,Positive emotion
2976,I've always used Camera+ for my iPhone b/c it ...,iPad or iPhone App,Positive emotion


In [74]:
df_neg

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
2,I just noticed DST is coming this weekend. How...,iPhone,Negative emotion
3,@mention - False Alarm: Google Circles Not Co...,Google,Negative emotion
4,Again? RT @mention Line at the Apple store is ...,,Negative emotion
...,...,...,...
565,Google guy at #sxsw talk is explaining how he ...,,Negative emotion
566,I think my effing hubby is in line for an #iPa...,iPad,Negative emotion
567,I'm pretty sure the panelist that thinks &quot...,Apple,Negative emotion
568,Hey is anyone doing #sxsw signing up for the g...,,Negative emotion


In [75]:
df_neut

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,@teachntech00 New iPad Apps For #SpeechTherapy...,,No emotion toward brand or product
1,,,No emotion toward brand or product
2,Holler Gram for iPad on the iTunes App Store -...,,No emotion toward brand or product
3,"Attn: All #SXSW frineds, @mention Register fo...",,No emotion toward brand or product
4,Anyone at #sxsw want to sell their old iPad?,,No emotion toward brand or product
...,...,...,...
5384,"@mention Yup, but I don't have a third app yet...",,No emotion toward brand or product
5385,"Wave, buzz... RT @mention We interrupt your re...",,No emotion toward brand or product
5386,"Google's Zeiger, a physician never reported po...",,No emotion toward brand or product
5387,Some Verizon iPhone customers complained their...,,No emotion toward brand or product


# NLP

In [76]:
sample_document = df.iloc[1].tweet_text
sample_document

"@jessedee Know about @fludapp ? Awesome iPad/iPhone app that you'll likely appreciate for its design. Also, they're giving free Ts at #SXSW"

In [77]:
sample_document.split(' ')

['@jessedee',
 'Know',
 'about',
 '@fludapp',
 '?',
 'Awesome',
 'iPad/iPhone',
 'app',
 'that',
 "you'll",
 'likely',
 'appreciate',
 'for',
 'its',
 'design.',
 'Also,',
 "they're",
 'giving',
 'free',
 'Ts',
 'at',
 '#SXSW']

In [78]:
re.findall(r"([a-zA-Z]+(?:'[a-z]+)?)", "I'd")

["I'd"]

In [79]:
pattern = "([a-zA-Z]+(?:'[a-z]+)?)"
tokenizer = RegexpTokenizer(pattern)
sample_doc = tokenizer.tokenize(sample_document)

In [80]:
sample_document

"@jessedee Know about @fludapp ? Awesome iPad/iPhone app that you'll likely appreciate for its design. Also, they're giving free Ts at #SXSW"

In [81]:
sw = stopwords.words('english')

In [82]:
sample_doc = [token.lower() for token in sample_doc]
sample_doc = [token for token in sample_doc if token not in sw]

In [83]:
sample_doc

['jessedee',
 'know',
 'fludapp',
 'awesome',
 'ipad',
 'iphone',
 'app',
 'likely',
 'appreciate',
 'design',
 'also',
 "they're",
 'giving',
 'free',
 'ts',
 'sxsw']

In [84]:
def clean_text(text):
    # Make text lowercase
    text = text.lower()
    # Remove punctuation
    text = "".join([word for word in text if word not in string.punctuation])
    # Tokenize text
    tokens = word_tokenize(text)
    # Remove stopwords
    text = [word for word in tokens if word not in stopwords.words('english')]
    return text


In [85]:
df['tweet_text'] = df['tweet_text'].astype(str)
df['tweet_text'] = df['tweet_text'].str.replace('/', ' ')

In [86]:
df['tweet_text'] = df['tweet_text'].apply(lambda x: clean_text(x))

In [87]:
df

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,"[wesley83, 3g, iphone, 3, hrs, tweeting, risea...",iPhone,Negative emotion
1,"[jessedee, know, fludapp, awesome, ipad, iphon...",iPad or iPhone App,Positive emotion
2,"[swonderlin, wait, ipad, 2, also, sale, sxsw]",iPad,Positive emotion
3,"[sxsw, hope, years, festival, isnt, crashy, ye...",iPad or iPhone App,Negative emotion
4,"[sxtxstate, great, stuff, fri, sxsw, marissa, ...",Google,Positive emotion
...,...,...,...
9087,"[mention, yup, dont, third, app, yet, im, andr...",,No emotion toward brand or product
9088,"[ipad, everywhere, sxsw, link]",iPad,Positive emotion
9089,"[wave, buzz, rt, mention, interrupt, regularly...",,No emotion toward brand or product
9090,"[googles, zeiger, physician, never, reported, ...",,No emotion toward brand or product


In [88]:
df1 = df.copy()

In [89]:
df1

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,"[wesley83, 3g, iphone, 3, hrs, tweeting, risea...",iPhone,Negative emotion
1,"[jessedee, know, fludapp, awesome, ipad, iphon...",iPad or iPhone App,Positive emotion
2,"[swonderlin, wait, ipad, 2, also, sale, sxsw]",iPad,Positive emotion
3,"[sxsw, hope, years, festival, isnt, crashy, ye...",iPad or iPhone App,Negative emotion
4,"[sxtxstate, great, stuff, fri, sxsw, marissa, ...",Google,Positive emotion
...,...,...,...
9087,"[mention, yup, dont, third, app, yet, im, andr...",,No emotion toward brand or product
9088,"[ipad, everywhere, sxsw, link]",iPad,Positive emotion
9089,"[wave, buzz, rt, mention, interrupt, regularly...",,No emotion toward brand or product
9090,"[googles, zeiger, physician, never, reported, ...",,No emotion toward brand or product


In [90]:
df1['tweet_text'] = df1['tweet_text'].apply(lambda x: ' '.join(word.strip("',[]") for word in str(x).split()))


In [91]:
# df1.drop(df1.index[-1], inplace=True)
df1

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,wesley83 3g iphone 3 hrs tweeting riseaustin d...,iPhone,Negative emotion
1,jessedee know fludapp awesome ipad iphone app ...,iPad or iPhone App,Positive emotion
2,swonderlin wait ipad 2 also sale sxsw,iPad,Positive emotion
3,sxsw hope years festival isnt crashy years iph...,iPad or iPhone App,Negative emotion
4,sxtxstate great stuff fri sxsw marissa mayer g...,Google,Positive emotion
...,...,...,...
9087,mention yup dont third app yet im android sugg...,,No emotion toward brand or product
9088,ipad everywhere sxsw link,iPad,Positive emotion
9089,wave buzz rt mention interrupt regularly sched...,,No emotion toward brand or product
9090,googles zeiger physician never reported potent...,,No emotion toward brand or product


In [92]:
df1
df1.to_csv('df_CLEAN.csv', index=False)

In [93]:
# This function withh output the most frequent words, and appearance %
def get_top_words(df, column_name, top_n=25):
    df[column_name] = df[column_name].astype(str)
    all_text = ' '.join(df[column_name].values)
    words = all_text.split()

    # Create a DataFrame to store the word counts
    word_counts = pd.DataFrame({'word': words}).value_counts().reset_index()
    word_counts.columns = ['word', 'count']

    # Calculate the frequency percentage
    word_counts['frequency_percentage'] = (word_counts['count'] / len(words)) * 100

    # Select the top n words
    top_words = word_counts.head(top_n)

    return top_words[['word', 'count', 'frequency_percentage']]

top_words_df = get_top_words(df1, 'tweet_text', top_n=25)
print(top_words_df)



       word  count  frequency_percentage
0      sxsw   9429              8.342107
1   mention   6884              6.090472
2      link   4283              3.789293
3        rt   2929              2.591370
4      ipad   2454              2.171124
5    google   2367              2.094153
6     apple   2146              1.898628
7    iphone   1553              1.373984
8     store   1467              1.297897
9       new   1089              0.963470
10        2   1067              0.944006
11   austin    954              0.844031
12      app    806              0.713091
13      amp    722              0.638774
14   launch    650              0.575074
15   social    641              0.567111
16  circles    622              0.550301
17    popup    599              0.529952
18  android    578              0.511373
19    today    564              0.498987
20  network    463              0.409629
21    ipad2    459              0.406090
22      via    432              0.382203
23     line    4

In [94]:
# Now that the data is cleaned, let's seperate positive, negative, and neutral again
df_pos_CLEAN = df1[df1['is_there_an_emotion_directed_at_a_brand_or_product'] == 'Positive emotion'].copy()
df_pos_CLEAN.to_csv('positive_CLEAN.csv', index=False)
df_neg_CLEAN = df1[df1['is_there_an_emotion_directed_at_a_brand_or_product'] == 'Negative emotion'].copy()
df_neg_CLEAN.to_csv('negative_CLEAN.csv', index=False)
df_neut_CLEAN = df1[df1['is_there_an_emotion_directed_at_a_brand_or_product'] == 'No emotion toward brand or product'].copy()
df_neut_CLEAN.to_csv('neutral_CLEAN.csv', index=False)

In [95]:
df_pos_CLEAN = pd.read_csv("data/positive_CLEAN.csv", encoding='latin-1')
df_neg_CLEAN = pd.read_csv("data/negative.csv", encoding='latin-1')
df_neut_CLEAN = pd.read_csv("data/neutral_CLEAN.csv", encoding='latin-1')

In [96]:
top_words_df_pos = get_top_words(df_pos_CLEAN, 'tweet_text', top_n=25)
print(top_words_df)
# POSITIVE

       word  count  frequency_percentage
0      sxsw   9429              8.342107
1   mention   6884              6.090472
2      link   4283              3.789293
3        rt   2929              2.591370
4      ipad   2454              2.171124
5    google   2367              2.094153
6     apple   2146              1.898628
7    iphone   1553              1.373984
8     store   1467              1.297897
9       new   1089              0.963470
10        2   1067              0.944006
11   austin    954              0.844031
12      app    806              0.713091
13      amp    722              0.638774
14   launch    650              0.575074
15   social    641              0.567111
16  circles    622              0.550301
17    popup    599              0.529952
18  android    578              0.511373
19    today    564              0.498987
20  network    463              0.409629
21    ipad2    459              0.406090
22      via    432              0.382203
23     line    4

In [97]:
top_words_df_neg = get_top_words(df_neg_CLEAN, 'tweet_text', top_n=25)
print(top_words_df)
# NEGATIVE

       word  count  frequency_percentage
0      sxsw   9429              8.342107
1   mention   6884              6.090472
2      link   4283              3.789293
3        rt   2929              2.591370
4      ipad   2454              2.171124
5    google   2367              2.094153
6     apple   2146              1.898628
7    iphone   1553              1.373984
8     store   1467              1.297897
9       new   1089              0.963470
10        2   1067              0.944006
11   austin    954              0.844031
12      app    806              0.713091
13      amp    722              0.638774
14   launch    650              0.575074
15   social    641              0.567111
16  circles    622              0.550301
17    popup    599              0.529952
18  android    578              0.511373
19    today    564              0.498987
20  network    463              0.409629
21    ipad2    459              0.406090
22      via    432              0.382203
23     line    4

In [98]:
top_words_df_neut = get_top_words(df_neut_CLEAN, 'tweet_text', top_n=25)
print(top_words_df)
# NEUTRAL

       word  count  frequency_percentage
0      sxsw   9429              8.342107
1   mention   6884              6.090472
2      link   4283              3.789293
3        rt   2929              2.591370
4      ipad   2454              2.171124
5    google   2367              2.094153
6     apple   2146              1.898628
7    iphone   1553              1.373984
8     store   1467              1.297897
9       new   1089              0.963470
10        2   1067              0.944006
11   austin    954              0.844031
12      app    806              0.713091
13      amp    722              0.638774
14   launch    650              0.575074
15   social    641              0.567111
16  circles    622              0.550301
17    popup    599              0.529952
18  android    578              0.511373
19    today    564              0.498987
20  network    463              0.409629
21    ipad2    459              0.406090
22      via    432              0.382203
23     line    4

In [99]:
# Some of the tweets are exact duplicates, let's take a look
duplicate_rows = df1[df1.duplicated('tweet_text', keep=False)]
duplicate_rows

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
7,sxsw starting ctia around corner googleio hop ...,Android,Positive emotion
9,counting days sxsw plus strong canadian dollar...,Apple,Positive emotion
17,noticed dst coming weekend many iphone users h...,iPhone,Negative emotion
20,need buy ipad2 im austin sxsw sure ill need q ...,iPad,Positive emotion
21,oh god sxsw app ipad pure unadulterated awesom...,iPad or iPhone App,Positive emotion
...,...,...,...
8521,cc mention rt mention new ubersocial iphone ap...,,No emotion toward brand or product
8747,need buy ipad2 im austin sxsw sure ill need q ...,iPad,Positive emotion
8868,sxsw crowd swarms ipad 2 launch link via mention,iPad,Positive emotion
8869,sxsw crowd swarms ipad 2 launch link via mention,iPad,Positive emotion


In [100]:
# This code will show the number of times each duplicate tweet appears
string_counts = duplicate_rows['tweet_text'].value_counts().reset_index()
string_counts.columns = ['isolated_string', 'count']
string_counts

Unnamed: 0,isolated_string,count
0,rt mention google launch major new social netw...,13
1,rt mention marissa mayer google connect digita...,10
2,google launch major new social network called ...,8
3,win free ipad 2 webdoccom sxsw rt,7
4,rt mention rumor apple opening temporary store...,4
...,...,...
126,rt mention left white iphone 4g cab austin sxs...,2
127,really enjoying changes gowalla 30 android loo...,2
128,google launch major new social network called ...,2
129,rt mention full sxsw touchingstories presentat...,2


# Preparing for Binary Classification

In [101]:
binary_df_filtered = df1[df1['is_there_an_emotion_directed_at_a_brand_or_product'] != 'No emotion toward brand or product'].copy()

In [102]:
binary_emotion_map = {'Positive emotion': 0, 'Negative emotion': 1}
binary_df_filtered['target'] = binary_df_filtered['is_there_an_emotion_directed_at_a_brand_or_product'].map(binary_emotion_map)
binary_df_filtered['tweet_text'] = binary_df_filtered['tweet_text'].str.replace('quot', '')
binary_df_filtered

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product,target
0,wesley83 3g iphone 3 hrs tweeting riseaustin d...,iPhone,Negative emotion,1.0
1,jessedee know fludapp awesome ipad iphone app ...,iPad or iPhone App,Positive emotion,0.0
2,swonderlin wait ipad 2 also sale sxsw,iPad,Positive emotion,0.0
3,sxsw hope years festival isnt crashy years iph...,iPad or iPhone App,Negative emotion,1.0
4,sxtxstate great stuff fri sxsw marissa mayer g...,Google,Positive emotion,0.0
...,...,...,...,...
9077,mention pr guy convinced switch back iphone gr...,iPhone,Positive emotion,0.0
9079,papyrussort like ipad nice lol sxsw lavelle,iPad,Positive emotion,0.0
9080,diller says google tv might run playstation xb...,Other Google product or service,Negative emotion,1.0
9085,ive always used camera iphone b c image stabil...,iPad or iPhone App,Positive emotion,0.0


# Preparing for MultiClass Classification

In [103]:
MultiClass_df_filtered = df1.copy()

In [104]:
MultiClass_emotion_map = {'Positive emotion': 0, 'Negative emotion': 1, 'No emotion toward brand or product': 2}
MultiClass_df_filtered['target'] = MultiClass_df_filtered['is_there_an_emotion_directed_at_a_brand_or_product'].map(MultiClass_emotion_map)
MultiClass_df_filtered['tweet_text'] = MultiClass_df_filtered['tweet_text'].str.replace('quot', '')

In [105]:
binary_df_filtered.drop('emotion_in_tweet_is_directed_at', axis=1, inplace=True)
MultiClass_df_filtered.drop('emotion_in_tweet_is_directed_at', axis=1, inplace=True)

In [106]:
binary_df_filtered['tweet_text'] = binary_df_filtered['tweet_text'].str.replace('quot', '')
binary_df_filtered

Unnamed: 0,tweet_text,is_there_an_emotion_directed_at_a_brand_or_product,target
0,wesley83 3g iphone 3 hrs tweeting riseaustin d...,Negative emotion,1.0
1,jessedee know fludapp awesome ipad iphone app ...,Positive emotion,0.0
2,swonderlin wait ipad 2 also sale sxsw,Positive emotion,0.0
3,sxsw hope years festival isnt crashy years iph...,Negative emotion,1.0
4,sxtxstate great stuff fri sxsw marissa mayer g...,Positive emotion,0.0
...,...,...,...
9077,mention pr guy convinced switch back iphone gr...,Positive emotion,0.0
9079,papyrussort like ipad nice lol sxsw lavelle,Positive emotion,0.0
9080,diller says google tv might run playstation xb...,Negative emotion,1.0
9085,ive always used camera iphone b c image stabil...,Positive emotion,0.0


In [107]:
MultiClass_df_filtered

Unnamed: 0,tweet_text,is_there_an_emotion_directed_at_a_brand_or_product,target
0,wesley83 3g iphone 3 hrs tweeting riseaustin d...,Negative emotion,1.0
1,jessedee know fludapp awesome ipad iphone app ...,Positive emotion,0.0
2,swonderlin wait ipad 2 also sale sxsw,Positive emotion,0.0
3,sxsw hope years festival isnt crashy years iph...,Negative emotion,1.0
4,sxtxstate great stuff fri sxsw marissa mayer g...,Positive emotion,0.0
...,...,...,...
9087,mention yup dont third app yet im android sugg...,No emotion toward brand or product,2.0
9088,ipad everywhere sxsw link,Positive emotion,0.0
9089,wave buzz rt mention interrupt regularly sched...,No emotion toward brand or product,2.0
9090,googles zeiger physician never reported potent...,No emotion toward brand or product,2.0


In [108]:
binary_df_filtered.dropna(subset=['tweet_text'], inplace=True)
binary_df_filtered.dropna(subset=['target'], inplace=True)
MultiClass_df_filtered.dropna(subset=['tweet_text'], inplace=True)
MultiClass_df_filtered.dropna(subset=['target'], inplace=True)

In [109]:
none_counts = binary_df_filtered.isna().sum()
none_counts

tweet_text                                            0
is_there_an_emotion_directed_at_a_brand_or_product    0
target                                                0
dtype: int64

In [112]:
binary_df_filtered.to_csv('binary_df_filtered.csv', index=False)
MultiClass_df_filtered.to_csv('MultiClass_df_filtered.csv', index=False)
# Now these dataframes are ready to train our models in the next notebook