# Business Problem

Decode public responses to the products showcased at South by Southwest (SXSW) and determine which product was viewed most favorably.

# Importing and Data Read in

In [181]:
import os
import re
from collections import Counter
from os import path

import matplotlib.pyplot as plt
import multidict as multidict
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import RegexpTokenizer
from PIL import Image
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from wordcloud import WordCloud

# Fetch the stop-word corpus used by the preprocessing step (no-op when already present).
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Owner\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [145]:
# Load the raw tweet export: about 9000 rows and three columns.
RAW_TWEETS_CSV = "data/twitter_data.csv"
df = pd.read_csv(RAW_TWEETS_CSV, encoding="unicode_escape")

In [146]:
# Discard the single tweet whose text is missing (row 6); the remaining rows keep
# their original index labels.
df = df.dropna(subset=["tweet_text"])

In [147]:
# Built once and reused by every call. The original rebuilt the tokenizer and the
# stemmer for each tweet and called stopwords.words("english") for every single
# token, which re-creates the stop-word list each time and scans it linearly.
_TOKENIZER = RegexpTokenizer(r"(?u)\w{3,}")
_STOP_WORDS = frozenset(stopwords.words("english"))
_STEMMER = PorterStemmer()


def preprocess_text(tweet):
    """Turn one raw tweet into a list of stemmed tokens.

    Steps, in order: lower-case the text, keep runs of three or more word
    characters, drop English stop words, then Porter-stem what is left.
    Stop words are removed before stemming, so they are matched in their
    unstemmed form.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    list of str
        Stems in their original order; duplicates are kept.
    """
    tokens = _TOKENIZER.tokenize(tweet.lower())
    return [_STEMMER.stem(token) for token in tokens if token not in _STOP_WORDS]

# Run the cleaning pipeline over every raw tweet; each cell of the new column holds
# the list returned by preprocess_text for that tweet.
df["preprocessed_text"] = list(map(preprocess_text, df["tweet_text"]))

In [148]:
def company(tweet):
    """Label a preprocessed tweet with the company its tokens mention.

    Apple markers are tested first, so a tweet naming both companies is
    labelled "apple". Returns None when neither set of markers is present.
    """
    apple_markers = ("appl", "iphon", "ipad")
    google_markers = ("googl", "android")

    if any(marker in tweet for marker in apple_markers):
        return "apple"
    if any(marker in tweet for marker in google_markers):
        return "google"
    return None

# Keyword-based company label for every tweet (None where no marker matched).
df["predict_company"] = list(map(company, df["preprocessed_text"]))

In [149]:
def directed_at(tweet):
    """Guess which product a preprocessed tweet is about.

    Markers are tried in a fixed priority order (app, ipad, iphone, android,
    google) and the label of the first one found is returned. Returns None
    when no marker is present.
    """
    prioritized_markers = (
        ("app", "app"),
        ("ipad", "ipad"),
        ("iphon", "iphone"),
        ("android", "android"),
        ("googl", "google"),
    )
    for marker, label in prioritized_markers:
        if marker in tweet:
            return label
    return None

# Keyword-based product label for every tweet (None where no marker matched).
df["predict_directed"] = list(map(directed_at, df["preprocessed_text"]))

In [150]:
# Tweets per predicted company; tweets that matched no company keyword are left out.
df["predict_company"].value_counts()

apple     5477
google    2783
Name: predict_company, dtype: int64

In [151]:
SENTIMENT_COL = "is_there_an_emotion_directed_at_a_brand_or_product"
# Minimum occurrences for a stem to be kept. The negative threshold is lower than
# the positive one, as in the original cell.
MIN_POSITIVE_COUNT = 50
MIN_NEGATIVE_COUNT = 10


def count_frequent_stems(token_lists, min_count):
    """Count stems across an iterable of token lists and keep the frequent ones.

    Parameters
    ----------
    token_lists : iterable of list of str
        One list of stems per tweet.
    min_count : int
        Stems seen fewer than this many times are dropped.

    Returns
    -------
    dict
        Maps each kept stem to its total count, in first-seen order.
    """
    counts = Counter(stem for tokens in token_lists for stem in tokens)
    return {stem: total for stem, total in counts.items() if total >= min_count}


positive_wordcount = count_frequent_stems(
    df.loc[df[SENTIMENT_COL] == "Positive emotion", "preprocessed_text"],
    MIN_POSITIVE_COUNT,
)
print(len(positive_wordcount))

negative_wordcount = count_frequent_stems(
    df.loc[df[SENTIMENT_COL] == "Negative emotion", "preprocessed_text"],
    MIN_NEGATIVE_COUNT,
)
print(len(negative_wordcount))

88
104


In [175]:
def emotion(tweet):
    """Classify a preprocessed tweet's sentiment from hand-picked keyword stems.

    Positive stems are tested before negative ones, so a tweet containing both
    kinds is labelled "Positive emotion". Returns None when no stem matches.
    """
    positive_stems = (
        "great", "awesom", "like", "appreci", "good", "beauti", "smart", "excit",
        "wait", "best", "love", "nice", "must", "enjoy", "fun", "rock", "cool",
    )  # maybe remove like
    negative_stems = (
        "dead", "insan", "headach", "long", "fail", "hate", "suck", "fascist",
        "fade", "crashi",
    )

    if any(stem in tweet for stem in positive_stems):
        return "Positive emotion"
    if any(stem in tweet for stem in negative_stems):
        return "Negative emotion"
    return None
    
# Keyword-based sentiment for every tweet, then the tally per predicted label
# (tweets with no keyword match get None and are left out of the tally).
df["predict_emotion"] = list(map(emotion, df["preprocessed_text"]))
df["predict_emotion"].value_counts()

Positive emotion    1732
Negative emotion     196
Name: predict_emotion, dtype: int64

In [176]:
def positive_normalize(emotion):
    """Return 1 when the label is exactly "Positive emotion", otherwise 0."""
    return 1 if emotion == "Positive emotion" else 0
def negative_normalize(emotion):
    """Return 1 when the label is exactly "Negative emotion", otherwise 0."""
    return 1 if emotion == "Negative emotion" else 0
        
# One-vs-rest 0/1 flags derived from the annotated sentiment labels ...
annotated_labels = df["is_there_an_emotion_directed_at_a_brand_or_product"]
df["positive"] = list(map(positive_normalize, annotated_labels))
df["negative"] = list(map(negative_normalize, annotated_labels))

# ... and the matching flags derived from the keyword-rule predictions.
predicted_labels = df["predict_emotion"]
df["predict_positive"] = list(map(positive_normalize, predicted_labels))
df["predict_negative"] = list(map(negative_normalize, predicted_labels))

In [183]:
# Score the keyword rule on the positive class: 1 = "Positive emotion", 0 = every other label.
print(classification_report(df["positive"], df["predict_positive"]))

              precision    recall  f1-score   support

           0       0.73      0.87      0.79      6114
           1       0.56      0.32      0.41      2978

    accuracy                           0.69      9092
   macro avg       0.64      0.60      0.60      9092
weighted avg       0.67      0.69      0.67      9092



In [185]:
# Recall on the positive class: share of annotated-positive tweets the keyword rule also flags.
recall_score(df["positive"], df["predict_positive"])

0.3247145735392881

In [187]:
# Precision on the positive class: share of rule-flagged tweets that are annotated positive.
precision_score(df["positive"], df["predict_positive"])

0.5583140877598153

In [184]:
# Score the keyword rule on the negative class: 1 = "Negative emotion", 0 = every other label.
print(classification_report(df["negative"], df["predict_negative"]))

              precision    recall  f1-score   support

           0       0.95      0.99      0.97      8522
           1       0.46      0.16      0.24       570

    accuracy                           0.94      9092
   macro avg       0.71      0.57      0.60      9092
weighted avg       0.92      0.94      0.92      9092



In [186]:
# Recall on the negative class: share of annotated-negative tweets the keyword rule also flags.
recall_score(df["negative"], df["predict_negative"])

0.15964912280701754

In [188]:
# Precision on the negative class: share of rule-flagged tweets that are annotated negative.
precision_score(df["negative"], df["predict_negative"])

0.4642857142857143

In [15]:
# Distribution of the annotated brand/product target; rows with no target are not counted.
df["emotion_in_tweet_is_directed_at"].value_counts()

iPad                               946
Apple                              661
iPad or iPhone App                 470
Google                             430
iPhone                             297
Other Google product or service    293
Android App                         81
Android                             78
Other Apple product or service      35
Name: emotion_in_tweet_is_directed_at, dtype: int64

In [16]:
# How many tweets have no annotated target (True = missing).
df["emotion_in_tweet_is_directed_at"].isna().value_counts()

True     5801
False    3291
Name: emotion_in_tweet_is_directed_at, dtype: int64

In [64]:
# Class balance of the annotated sentiment labels.
df["is_there_an_emotion_directed_at_a_brand_or_product"].value_counts()

No emotion toward brand or product    5388
Positive emotion                      2978
Negative emotion                       570
I can't tell                           156
Name: is_there_an_emotion_directed_at_a_brand_or_product, dtype: int64

In [100]:
# How many tweets received no keyword-based sentiment (True = no keyword matched).
# NOTE(review): the recorded output below shows 1,729 non-null predictions, while the
# value_counts recorded in the emotion() cell totals 1,928 (1,732 + 196). This cell
# appears to have last run against an earlier keyword list; re-run after Restart & Run All.
df["predict_emotion"].isna().value_counts()

True     7363
False    1729
Name: predict_emotion, dtype: int64

In [174]:
# Display the working frame with all derived columns (pandas elides the middle rows).
# NOTE(review): df.head() would show the same columns with less output.
df

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product,preprocessed_text,predict_company,predict_directed,predict_emotion,positive,negative,predict_positive,predict_negative
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion,"[wesley83, iphon, hr, tweet, rise_austin, dead...",apple,iphone,Negative emotion,0,1,0,1
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion,"[jessede, know, fludapp, awesom, ipad, iphon, ...",apple,app,Positive emotion,1,0,1,0
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion,"[swonderlin, wait, ipad, also, sale, sxsw]",apple,ipad,Positive emotion,1,0,1,0
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion,"[sxsw, hope, year, festiv, crashi, year, iphon...",apple,app,,0,1,0,0
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion,"[sxtxstate, great, stuff, fri, sxsw, marissa, ...",google,google,Positive emotion,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...
9088,Ipad everywhere. #SXSW {link},iPad,Positive emotion,"[ipad, everywher, sxsw, link]",apple,ipad,,1,0,0,0
9089,"Wave, buzz... RT @mention We interrupt your re...",,No emotion toward brand or product,"[wave, buzz, mention, interrupt, regularli, sc...",google,google,,0,0,0,0
9090,"Google's Zeiger, a physician never reported po...",,No emotion toward brand or product,"[googl, zeiger, physician, never, report, pote...",google,google,,0,0,0,0
9091,Some Verizon iPhone customers complained their...,,No emotion toward brand or product,"[verizon, iphon, custom, complain, time, fell,...",apple,iphone,,0,0,0,0


# APPENDIX

In [19]:
# Retweet check: count tweets whose text contains "RT" (a tweet reposted by another user).
# There were 2677 retweets. The count is held in a variable and not shown, because
# only the last expression of a cell is displayed and this cell ends with print calls.
retweet_count = sum("RT" in text for text in df["tweet_text"])

# Two tweets annotated as positive whose labels look debatable:
#   9088 - feels like it could be read either way
#   40   - is just remarking on the weather
for row_label in (9088, 40):
    print(df.loc[row_label, "tweet_text"])
    print(df.loc[row_label, "is_there_an_emotion_directed_at_a_brand_or_product"])

Ipad everywhere. #SXSW {link}
Positive emotion
@mention  - Great weather to greet you for #sxsw! Still need a sweater at night..Apple putting up &quot;flash store&quot; downtown to sell iPad2
Positive emotion


In [20]:
# Bag-of-words matrix over the raw tweet text: one row per tweet, one column per token.
# Each document keeps the trailing "," that the original cell appended.
corpus = [text + "," for text in df["tweet_text"]]

vec = CountVectorizer(token_pattern=r"([a-zA-Z]+(?:'[a-z]+)?)", stop_words=stopwords.words("english"))
X = vec.fit_transform(corpus)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2. Use its
# replacement when the installed version provides it and fall back otherwise, so
# the cell runs on both old and current releases.
if hasattr(vec, "get_feature_names_out"):
    feature_names = vec.get_feature_names_out()
else:
    feature_names = vec.get_feature_names()

# NOTE(review): df_cv gets a fresh 0..n-1 index, while df keeps its original labels
# after the null tweet was dropped; align explicitly before joining the two.
df_cv = pd.DataFrame(X.toarray(), columns=feature_names)
df_cv

Unnamed: 0,aapl,aaron,aarpbulletin,ab,abacus,abandoned,abba,abc,aber,abilities,...,zms,zomb,zombie,zombies,zomg,zone,zoom,zuckerberg,zynga,zzzs
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9087,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9088,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9089,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9090,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [114]:
# Stem frequencies over tweets with no clear sentiment. The two labels are processed
# one after the other so the dict keeps its first-seen key order.
NEUTRAL_LABELS = ("No emotion toward brand or product", "I can't tell")
MIN_NEUTRAL_COUNT = 50  # stems seen fewer times than this are dropped

neutral_counts = Counter()
for label in NEUTRAL_LABELS:
    has_label = df['is_there_an_emotion_directed_at_a_brand_or_product'] == label
    for tokens in df.loc[has_label, "preprocessed_text"]:
        neutral_counts.update(tokens)

neutral_wordcount = {
    stem: total for stem, total in neutral_counts.items() if total >= MIN_NEUTRAL_COUNT
}

print(len(neutral_wordcount))
neutral_wordcount

148


{'new': 688,
 'ipad': 1347,
 'app': 477,
 'sxsw': 5898,
 'itun': 71,
 'store': 918,
 'via': 290,
 'mention': 4624,
 'see': 150,
 'android': 365,
 'link': 3002,
 'anyon': 94,
 'want': 107,
 'sell': 55,
 'googl': 1784,
 'launch': 612,
 'major': 235,
 'social': 513,
 'network': 377,
 'call': 313,
 'circl': 529,
 'possibl': 202,
 'today': 449,
 'play': 53,
 'music': 113,
 'amp': 603,
 'com': 63,
 'appl': 1280,
 'mobil': 239,
 'updat': 73,
 'iphon': 879,
 'blackberri': 69,
 'get': 291,
 'friend': 57,
 'hey': 61,
 'think': 107,
 'take': 75,
 'make': 147,
 'case': 63,
 'use': 156,
 'one': 147,
 'video': 75,
 'includ': 73,
 'set': 128,
 'say': 120,
 'quot': 1062,
 'platform': 56,
 'help': 57,
 'sxswi': 234,
 'locat': 108,
 'futur': 69,
 'connect': 56,
 'digit': 67,
 'world': 54,
 'tweet': 102,
 'panel': 105,
 'check': 193,
 'guy': 105,
 'talk': 131,
 'know': 140,
 'search': 109,
 'pop': 377,
 'austin': 651,
 'would': 68,
 'interest': 51,
 'wait': 64,
 'buy': 68,
 'join': 55,
 'give': 77,
 'nee