# Business Problem

Decoding the responses of products displayed at South by South West (SXSW) and seeing which product was seen to be the most favorable.

# Importing and Data Read in

In [165]:
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer

import nltk
nltk.download("stopwords")
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Owner\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [80]:
#read in tweets from data, about 9000 rows and three columns
df = pd.read_csv("data/twitter_data.csv", encoding="unicode_escape")

In [84]:
#dropped one tweet that had a NA value (row 6)
df = df.loc[df["tweet_text"].notna()]

In [85]:
def preprocess_text(tweet):
    tweet = tweet.lower()
    tweet = RegexpTokenizer(r"(?u)\w{3,}").tokenize(tweet)
    tweet = [word for word in tweet if word not in stopwords.words("english")]
    tweet = [PorterStemmer().stem(word) for word in tweet]
    return tweet

df["preprocessed_text"] = [preprocess_text(tweet) for tweet in df["tweet_text"]]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["preprocessed_text"] = [preprocess_text(tweet) for tweet in df["tweet_text"]]


In [86]:
def company(tweet):
    if "appl" in tweet or "iphon" in tweet or "ipad" in tweet:
        return "apple"
    elif "googl" in tweet or "android" in tweet:
        return "google"

df["predict_company"] = [company(tweet) for tweet in df["preprocessed_text"]]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["predict_company"] = [company(tweet) for tweet in df["preprocessed_text"]]


In [87]:
def directed_at(tweet):
    if "app" in tweet:
        return "app"
    elif "ipad" in tweet:
        return "ipad"
    elif "iphon" in tweet:
        return "iphone"
    elif "android" in tweet:
        return "android"
    elif "googl" in tweet:
        return "google"

df["predict_directed"] = [directed_at(tweet) for tweet in df["preprocessed_text"]]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["predict_directed"] = [directed_at(tweet) for tweet in df["preprocessed_text"]]


In [290]:
df["predict_company"].value_counts()

apple     5477
google    2783
Name: predict_company, dtype: int64

In [284]:
def emotion(tweet):
    if ("great" in tweet or "awesome" in tweet or "like" in tweet or "appreci" in tweet or "good" in tweet 
            or "beauti" in tweet or "smart" in tweet or "excit" in tweet or "wait" in tweet or "best" in tweet
            or "love" in tweet or "nice" in tweet or "must" in tweet or "need" in tweet or "enjoy" in tweet
            or "rock" in tweet or "best" in tweet or "win" in tweet or "" in tweet): #maybe "launch" or "open" or "check"
        return "Positive emotion"
    elif ("dead" in tweet or "insan" in tweet or "headach" in tweet):
        return "Negative emotion"
df["predict_emotion"] = [emotion(tweet) for tweet in df["preprocessed_text"]]
df["predict_emotion"].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["predict_emotion"] = [emotion(tweet) for tweet in df["preprocessed_text"]]


Positive emotion    1865
Negative emotion      54
Name: predict_emotion, dtype: int64

In [287]:
print(df["tweet_text"][75])
print(df['preprocessed_text'][75])
print(df['is_there_an_emotion_directed_at_a_brand_or_product'][75])
#print(df["emotion_in_tweet_is_directed_at"][17])

Holla! RT @mention At google party. Best ever! Get your butt over here. #sxsw
['holla', 'mention', 'googl', 'parti', 'best', 'ever', 'get', 'butt', 'sxsw']
Positive emotion


In [13]:
df

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product,preprocessed_text,predict_company,predict_directed
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion,"[wesley83, iphon, hr, tweet, rise_austin, dead...",apple,iphone
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion,"[jessede, know, fludapp, awesom, ipad, iphon, ...",apple,app
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion,"[swonderlin, wait, ipad, also, sale, sxsw]",apple,ipad
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion,"[sxsw, hope, year, festiv, crashi, year, iphon...",apple,app
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion,"[sxtxstate, great, stuff, fri, sxsw, marissa, ...",google,
...,...,...,...,...,...,...
9088,Ipad everywhere. #SXSW {link},iPad,Positive emotion,"[ipad, everywher, sxsw, link]",apple,ipad
9089,"Wave, buzz... RT @mention We interrupt your re...",,No emotion toward brand or product,"[wave, buzz, mention, interrupt, regularli, sc...",google,
9090,"Google's Zeiger, a physician never reported po...",,No emotion toward brand or product,"[googl, zeiger, physician, never, report, pote...",google,
9091,Some Verizon iPhone customers complained their...,,No emotion toward brand or product,"[verizon, iphon, custom, complain, time, fell,...",apple,iphone


In [34]:
df["emotion_in_tweet_is_directed_at"].value_counts()

iPad                               946
Apple                              661
iPad or iPhone App                 470
Google                             430
iPhone                             297
Other Google product or service    293
Android App                         81
Android                             78
Other Apple product or service      35
Name: emotion_in_tweet_is_directed_at, dtype: int64

In [35]:
df["emotion_in_tweet_is_directed_at"].isna().value_counts()

True     5801
False    3291
Name: emotion_in_tweet_is_directed_at, dtype: int64

In [36]:
df["predict_directed"].isna().value_counts()

False    6810
True     2282
Name: predict_directed, dtype: int64

# APPENDIX

In [231]:
#checks how many tweets are retweets (a tweet was reposted by another user). 
#There were 2677 retweets
len([row for row in df["tweet_text"] if "RT" in row])

#I think this tweet is interesting as its marked as positive but feels like it could be either
print(df['tweet_text'][9088])
print(df['is_there_an_emotion_directed_at_a_brand_or_product'][9088])
#or like this tweet where its marked as positive but its just remarking on the weather
print(df["tweet_text"][40])
print(df['is_there_an_emotion_directed_at_a_brand_or_product'][40])

Ipad everywhere. #SXSW {link}
Positive emotion
@mention  - Great weather to greet you for #sxsw! Still need a sweater at night..Apple putting up &quot;flash store&quot; downtown to sell iPad2
Positive emotion


In [232]:
corpus = []
for row in df["tweet_text"]:
    corpus += [row + ","]

vec = CountVectorizer(token_pattern=r"([a-zA-Z]+(?:'[a-z]+)?)", stop_words=stopwords.words("english"))
X = vec.fit_transform(corpus)
df_cv = pd.DataFrame(X.toarray(), columns=vec.get_feature_names())
df_cv

Unnamed: 0,aapl,aaron,aarpbulletin,ab,abacus,abandoned,abba,abc,aber,abilities,...,zms,zomb,zombie,zombies,zomg,zone,zoom,zuckerberg,zynga,zzzs
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9087,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9088,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9089,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9090,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
