In [72]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import warnings
warnings.filterwarnings("ignore")

In [73]:
df = pd.read_csv("tweet_product_company.csv", encoding="latin-1")
df.head()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion


In [74]:
df.tail()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
9088,Ipad everywhere. #SXSW {link},iPad,Positive emotion
9089,"Wave, buzz... RT @mention We interrupt your re...",,No emotion toward brand or product
9090,"Google's Zeiger, a physician never reported po...",,No emotion toward brand or product
9091,Some Verizon iPhone customers complained their...,,No emotion toward brand or product
9092,Ï¡Ïàü_ÊÎÒ£Áââ_£â_ÛâRT @...,,No emotion toward brand or product


In [75]:
print(f"This dataset has {df.shape[0]} records and {df.shape[1]} columns.")

This dataset has 9093 records and 3 columns.


In [76]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9093 entries, 0 to 9092
Data columns (total 3 columns):
 #   Column                                              Non-Null Count  Dtype 
---  ------                                              --------------  ----- 
 0   tweet_text                                          9092 non-null   object
 1   emotion_in_tweet_is_directed_at                     3291 non-null   object
 2   is_there_an_emotion_directed_at_a_brand_or_product  9093 non-null   object
dtypes: object(3)
memory usage: 213.2+ KB


In [77]:
df.describe().T

Unnamed: 0,count,unique,top,freq
tweet_text,9092,9065,RT @mention Marissa Mayer: Google Will Connect...,5
emotion_in_tweet_is_directed_at,3291,9,iPad,946
is_there_an_emotion_directed_at_a_brand_or_product,9093,4,No emotion toward brand or product,5389


In [78]:
df.isna().sum() *100 /len(df)

tweet_text                                             0.010997
emotion_in_tweet_is_directed_at                       63.807324
is_there_an_emotion_directed_at_a_brand_or_product     0.000000
dtype: float64

In [81]:
df.emotion_in_tweet_is_directed_at.value_counts()

iPad                               946
Apple                              661
iPad or iPhone App                 470
Google                             430
iPhone                             297
Other Google product or service    293
Android App                         81
Android                             78
Other Apple product or service      35
Name: emotion_in_tweet_is_directed_at, dtype: int64

In [82]:
df.is_there_an_emotion_directed_at_a_brand_or_product.value_counts()

No emotion toward brand or product    5389
Positive emotion                      2978
Negative emotion                       570
I can't tell                           156
Name: is_there_an_emotion_directed_at_a_brand_or_product, dtype: int64

In [86]:
def standardize_tweet(text):
        """
        This function takes a tweet as input and performs several cleaning steps
        to return a standardized string.
        """
        if not isinstance(text, str):
            return "" # Return an empty string if the input is not text (e.g., a missing value)

        # Convert all text to lowercase to ensure consistency ('Apple' and 'apple' are treated the same).
        text = text.lower()

        # Remove all @mentions (e.g., "@mention", "@wesley83")
        text = re.sub(r'@\w+', '', text)

        # Remove all #hashtags (e.g., "#SXSW", "#googleio")
        text = re.sub(r'#\w+', '', text)

        # Remove URLs and the placeholder "{link}"
        text = re.sub(r'http\S+|{link}', '', text)

        # Remove special characters, punctuation, and numbers, keeping only letters and spaces.
        text = re.sub(r'[^a-z\s]', '', text)

        # Remove any extra whitespace (e.g., multiple spaces between words) that may result from the above steps.
        text = ' '.join(text.split())

        return text
    
df['standardized_tweet'] = df['tweet_text'].apply(standardize_tweet)   

In [87]:
df['standardized_tweet']

0       i have a g iphone after hrs tweeting at it was...
1       know about awesome ipadiphone app that youll l...
2       can not wait for also they should sale them do...
3       i hope this years festival isnt as crashy as t...
4       great stuff on fri marissa mayer google tim or...
                              ...                        
9088                                      ipad everywhere
9089    wave buzz rt we interrupt your regularly sched...
9090    googles zeiger a physician never reported pote...
9091    some verizon iphone customers complained their...
9092                    rt google tests checkin offers at
Name: standardized_tweet, Length: 9093, dtype: object

In [88]:
df.duplicated().sum()

22

In [89]:
df.drop_duplicates(inplace=True)
df.duplicated().sum()

0

In [90]:
def categorize_products(product_name):
    """
    Categorize product names into main brands
    """
    if pd.isna(product_name):
        return product_name
    
    product_lower = str(product_name).lower()
    
    # Google products
    if any(keyword in product_lower for keyword in ['google', 'other google product', 'android', 'android app']):
        return 'Google'
    
    # Apple products  
    elif any(keyword in product_lower for keyword in ['apple', 'ipad', 'iphone', 'other apple product']):
        return 'Apple'
    
    # Keep original if no match
    else:
        return product_name

# Apply the function
df['emotion_in_tweet_is_directed_at'] = df['emotion_in_tweet_is_directed_at'].apply(categorize_products)

# Check results
print("Standardized value counts:")
print(df['emotion_in_tweet_is_directed_at'].value_counts())

Standardized value counts:
Apple     2404
Google     878
Name: emotion_in_tweet_is_directed_at, dtype: int64


In [91]:
df1 = df.copy()
df1

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product,standardized_tweet
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,Apple,Negative emotion,i have a g iphone after hrs tweeting at it was...
1,@jessedee Know about @fludapp ? Awesome iPad/i...,Apple,Positive emotion,know about awesome ipadiphone app that youll l...
2,@swonderlin Can not wait for #iPad 2 also. The...,Apple,Positive emotion,can not wait for also they should sale them do...
3,@sxsw I hope this year's festival isn't as cra...,Apple,Negative emotion,i hope this years festival isnt as crashy as t...
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion,great stuff on fri marissa mayer google tim or...
...,...,...,...,...
9088,Ipad everywhere. #SXSW {link},Apple,Positive emotion,ipad everywhere
9089,"Wave, buzz... RT @mention We interrupt your re...",,No emotion toward brand or product,wave buzz rt we interrupt your regularly sched...
9090,"Google's Zeiger, a physician never reported po...",,No emotion toward brand or product,googles zeiger a physician never reported pote...
9091,Some Verizon iPhone customers complained their...,,No emotion toward brand or product,some verizon iphone customers complained their...


In [92]:
df1.dropna(subset=['tweet_text'], inplace=True)

In [93]:
df1['tweet_text'].isna().sum()

0

In [94]:
df1.shape

(9070, 4)

In [95]:
def standardize_tweet(text):
        """
        This function takes a tweet as input and performs several cleaning steps
        to return a standardized string.
        """
        if not isinstance(text, str):
            return "" # Return an empty string if the input is not text (e.g., a missing value)

        # Convert all text to lowercase to ensure consistency ('Apple' and 'apple' are treated the same).
        text = text.lower()

        # Remove all @mentions (e.g., "@mention", "@wesley83")
        text = re.sub(r'@\w+', '', text)

        # Remove all #hashtags (e.g., "#SXSW", "#googleio")
        text.replace('#', '')
        #text = re.sub(r'#\w+',r'\1', text)

        # Remove URLs and the placeholder "{link}"
        text = re.sub(r'http\S+|{link}', '', text)

        # Remove special characters, punctuation, and numbers, keeping only letters and spaces.
        text = re.sub(r'[^a-z\s]', '', text)

        # Remove any extra whitespace (e.g., multiple spaces between words) that may result from the above steps.
        text = ' '.join(text.split())

        return text
    
df1['standardized_tweet'] = df1['tweet_text'].apply(standardize_tweet) 

In [96]:
df1['standardized_tweet']

0       i have a g iphone after hrs tweeting at riseau...
1       know about awesome ipadiphone app that youll l...
2       can not wait for ipad also they should sale th...
3       i hope this years festival isnt as crashy as t...
4       great stuff on fri sxsw marissa mayer google t...
                              ...                        
9088                                 ipad everywhere sxsw
9089    wave buzz rt we interrupt your regularly sched...
9090    googles zeiger a physician never reported pote...
9091    some verizon iphone customers complained their...
9092               rt google tests checkin offers at sxsw
Name: standardized_tweet, Length: 9070, dtype: object

In [109]:
# --- Step 2: Define the keywords for classification ---
apple_keywords = ['apple', 'iphone', 'ipad', 'macbook', 'ios']
google_keywords = ['google', 'android']

# --- Step 3: Create the classification function ---
def classify_brand(row):
    """
    This function checks a tweet for brand keywords and classifies it.
    It returns 'Apple', 'Google', 'Both', or 'None'.
    """
    tweet = row['standardized_tweet']
    emotion_target = str(row['emotion_in_tweet_is_directed_at']).lower() if pd.notna(row['emotion_in_tweet_is_directed_at']) else ''
    
    # We handle cases where a row might be empty or not a string
    if not isinstance(tweet, str):
        return 'None'

    # Check if any keyword from our lists exists in the tweet
    apple_present = any(keyword in tweet.lower() for keyword in apple_keywords)
    google_present = any(keyword in tweet.lower() for keyword in google_keywords)

    # Apply the classification logic
    if apple_present and google_present:
        # --- Nested if-statement for the tie-breaker ---
        # This block only runs if both brands are mentioned in the tweet text.
        if any(keyword in emotion_target for keyword in ['ipad', 'iphone', 'apple']):
            return 'Apple'
        elif any(keyword in emotion_target for keyword in ['google', 'android']):
            return 'Google'
        else:
            return 'Both'
    elif apple_present:
        return 'Apple'
    elif google_present:
        return 'Google'
    else:
        if any(keyword in emotion_target for keyword in apple_keywords):
            return 'Apple'
        elif any(keyword in emotion_target for keyword in google_keywords):
            return 'Google'
        else:
            return 'None'
    
# --- Step 4: Apply the function to create the new column ---
# Use axis=1 to apply the function to each row (access to all columns)
df1['brand_classification'] = df1.apply(classify_brand, axis=1)

In [99]:
df1

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product,standardized_tweet,brand_classification
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,Apple,Negative emotion,i have a g iphone after hrs tweeting at riseau...,Apple
1,@jessedee Know about @fludapp ? Awesome iPad/i...,Apple,Positive emotion,know about awesome ipadiphone app that youll l...,Apple
2,@swonderlin Can not wait for #iPad 2 also. The...,Apple,Positive emotion,can not wait for ipad also they should sale th...,Apple
3,@sxsw I hope this year's festival isn't as cra...,Apple,Negative emotion,i hope this years festival isnt as crashy as t...,Apple
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion,great stuff on fri sxsw marissa mayer google t...,Google
...,...,...,...,...,...
9088,Ipad everywhere. #SXSW {link},Apple,Positive emotion,ipad everywhere sxsw,Apple
9089,"Wave, buzz... RT @mention We interrupt your re...",,No emotion toward brand or product,wave buzz rt we interrupt your regularly sched...,Google
9090,"Google's Zeiger, a physician never reported po...",,No emotion toward brand or product,googles zeiger a physician never reported pote...,Google
9091,Some Verizon iPhone customers complained their...,,No emotion toward brand or product,some verizon iphone customers complained their...,Apple


In [100]:
# Filter rows where brand classification column is not null and not empty
brand_rows = df1[df1['brand_classification'] == 'Both']
brand_rows['emotion_in_tweet_is_directed_at'].value_counts()

Series([], Name: emotion_in_tweet_is_directed_at, dtype: int64)

In [107]:
brand_rows = df1[df1['brand_classification'] == 'None']
brand_rows['emotion_in_tweet_is_directed_at'].value_counts()

Series([], Name: emotion_in_tweet_is_directed_at, dtype: int64)

In [106]:
brand_rows = df1[df1['brand_classification'] == 'Both']
brand_rows

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product,standardized_tweet,brand_classification
39,VatorNews - Google And Apple Force Print Media...,,No emotion toward brand or product,vatornews google and apple force print media t...,Both
41,HootSuite - HootSuite Mobile for #SXSW ~ Updat...,,No emotion toward brand or product,hootsuite hootsuite mobile for sxsw updates fo...,Both
68,Boooo! RT @mention Flipboard is developing an ...,,Negative emotion,boooo rt flipboard is developing an iphone ver...,Both
254,iPad 2 vs Android vs The World : Panel at #SXS...,,No emotion toward brand or product,ipad vs android vs the world panel at sxsw tod...,Both
366,Companies believing in #NFC Google with the Ne...,,No emotion toward brand or product,companies believing in nfc google with the nex...,Both
...,...,...,...,...,...
8818,"&quot;SXSW GO is now available iPhone, iPad, A...",,No emotion toward brand or product,quotsxsw go is now available iphone ipad andro...,Both
8825,"Free #sxsw Lemonade Stand. iPhone, Android, an...",,No emotion toward brand or product,free sxsw lemonade stand iphone android and wp...,Both
8929,Clearly #wp7dev won the mobile smackdown v iPh...,,No emotion toward brand or product,clearly wpdev won the mobile smackdown v iphon...,Both
8965,Apparently #Apple has cornered the live demo m...,,No emotion toward brand or product,apparently apple has cornered the live demo ma...,Both


In [110]:
df1 = df1.drop('emotion_in_tweet_is_directed_at', axis=1)
df1

Unnamed: 0,tweet_text,is_there_an_emotion_directed_at_a_brand_or_product,standardized_tweet,brand_classification
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,Negative emotion,i have a g iphone after hrs tweeting at riseau...,Apple
1,@jessedee Know about @fludapp ? Awesome iPad/i...,Positive emotion,know about awesome ipadiphone app that youll l...,Apple
2,@swonderlin Can not wait for #iPad 2 also. The...,Positive emotion,can not wait for ipad also they should sale th...,Apple
3,@sxsw I hope this year's festival isn't as cra...,Negative emotion,i hope this years festival isnt as crashy as t...,Apple
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Positive emotion,great stuff on fri sxsw marissa mayer google t...,Google
...,...,...,...,...
9088,Ipad everywhere. #SXSW {link},Positive emotion,ipad everywhere sxsw,Apple
9089,"Wave, buzz... RT @mention We interrupt your re...",No emotion toward brand or product,wave buzz rt we interrupt your regularly sched...,Google
9090,"Google's Zeiger, a physician never reported po...",No emotion toward brand or product,googles zeiger a physician never reported pote...,Google
9091,Some Verizon iPhone customers complained their...,No emotion toward brand or product,some verizon iphone customers complained their...,Apple


In [111]:
df1 = df1[df1["brand_classification"] != "None"]
df1["brand_classification"].value_counts()

Apple     5341
Google    2770
Both       199
Name: brand_classification, dtype: int64