# Sentiment Analysis of Tweets about Apple and Google Products

### Problem statement


This notebook builds an NLP model to classify sentiment in tweets directed at Apple and Google products.  


### Libraries

In [None]:
# import necessary libraries
import pandas as pd
import re
import nltk
#nltk.download("punkt")
#nltk.download("punkt_tab")
#nltk.download("wordnet")
#nltk.download("omw-1.4")
#nltk.download("stopwords")
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


### Step 1: Loading Data

In [32]:
# Reading the CSV file with correct encoding (ISO-8859-1 works for many text datasets)
df = pd.read_csv('Data\judge-1377884607_tweet_product_company.csv', encoding='Latin-1')

# Displaying the first 5 rows of the dataset
df.head()



Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion


In [33]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9093 entries, 0 to 9092
Data columns (total 3 columns):
 #   Column                                              Non-Null Count  Dtype 
---  ------                                              --------------  ----- 
 0   tweet_text                                          9092 non-null   object
 1   emotion_in_tweet_is_directed_at                     3291 non-null   object
 2   is_there_an_emotion_directed_at_a_brand_or_product  9093 non-null   object
dtypes: object(3)
memory usage: 213.2+ KB


### Step 2: Handle Missing Values

In [34]:
# Drop rows with missing text
df.dropna(subset=["tweet_text"], inplace=True)
df.fillna({'emotion_in_tweet_is_directed_at': 'Unknown'}, inplace=True)

# Reset index
df.reset_index(drop=True, inplace=True)


In [35]:
df.isna().sum()

tweet_text                                            0
emotion_in_tweet_is_directed_at                       0
is_there_an_emotion_directed_at_a_brand_or_product    0
dtype: int64

### Step 3: Basic Text Cleaning

In [36]:
def clean_tweet_text(text):
    # Remove URLs
    text = re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.MULTILINE)
    # Remove user mentions
    text = re.sub(r"@\w+", "", text)
    # Remove hashtags
    text = re.sub(r"#\w+", "", text)
    # Remove special characters and numbers
    text = re.sub(r"[^A-Za-z\s]", "", text)
    return text

df["clean_text"] = df["tweet_text"].apply(clean_tweet_text)


### Step 4: Tokenization

In [39]:
df["tokens"] = df["clean_text"].apply(word_tokenize)

### Step 5: Stopward Removal

In [41]:
stop_words = set(stopwords.words("english"))

df["tokens"] = df["tokens"].apply(lambda x: [word for word in x if word not in stop_words])

### Step 6: Lemmatization

In [42]:
lemmatizer = WordNetLemmatizer()
df["tokens"] = df["tokens"].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])

### Step 7: Join Tokens Back

In [43]:
df["processed_text"] = df["tokens"].apply(lambda x: " ".join(x))

### Step 8: Vectorization (TF-IDF Example)

In [44]:
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df["processed_text"])

print('TF - IDF shape', X.shape)

TF - IDF shape (9092, 5000)


### Step 9: Train-Test Split

In [49]:
X_train, X_test, y_train, y_test = train_test_split(X, df["is_there_an_emotion_directed_at_a_brand_or_product"], test_size=0.2, random_state=42)
print("Train size:", X_train.shape)
print("Test size:", X_test.shape)

Train size: (7273, 5000)
Test size: (1819, 5000)


## Refactoring the steps above into a Pipeline

In [61]:
import pandas as pd
import re
import nltk
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

# Download NLTK resources (leave commented if already downloaded)
# nltk.download("punkt")
# nltk.download("stopwords")
# nltk.download("wordnet")
# nltk.download("omw-1.4")

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

# -------------------------------
# Custom Preprocessor
# -------------------------------

class TextPreprocessor(BaseEstimator, TransformerMixin):
    def __init__(self, text_column):
        self.text_column = text_column

    def clean_text(self,text):
        text = re.sub(r"http\S+|www\S+|https\S+", "", text) # remove urls
        text = re.sub(r"@\w+", "", text) # remove mentions
        text = re.sub(r"#\w+", "", text) # remove hashtags
        text = re.sub(r"[^A-Za-z\s]", "", text) # remove special characters
        return text.strip()
    
    def tokenize_lemmatize(self, text):
        tokens = word_tokenize(text)
        tokens = [t for t in tokens if t not in stop_words]
        tokens = [lemmatizer.lemmatize(t) for t in tokens]
        return " ".join(tokens)
    
    def transform(self, X, y=None):
        X_filled = X.copy()
        
        # Fill missing values in the second column
        second_col = X_filled.columns[1]
        X_filled[second_col] = X_filled[second_col].fillna("Unknown")
        
        # Process the text column
        X_filled[self.text_column] = X_filled[self.text_column].apply(
            lambda t: self.tokenize_lemmatize(self.clean_text(t))
        )
        
        return X_filled  # return as DataFrame

    def fit(self, X, y=None):
        return self
    
# -------------------------------
# Load Dataset
# -------------------------------
    
df = pd.read_csv("Data\judge-1377884607_tweet_product_company.csv", encoding='Latin-1')

df.dropna(subset=["tweet_text"], inplace=True)

X = df["tweet_text"]

# -------------------------------
# Build Preprocessing Pipeline
# -------------------------------

preprocessing_pipeline = Pipeline([
    ("text_preprocessor", TextPreprocessor(text_column="tweet_text"))
])

# Apply Pipeline
df_preprocessed = preprocessing_pipeline.fit_transform(df)

# view processed tweets
print(df_preprocessed.head(10))


                                           tweet_text  \
0   I G iPhone After hr tweeting dead I need upgra...   
1   Know Awesome iPadiPhone app youll likely appre...   
2                             Can wait also They sale   
3    I hope year festival isnt crashy year iPhone app   
4   great stuff Fri Marissa Mayer Google Tim OReil...   
5   New iPad Apps For And Communication Are Showca...   
7   starting around corner hop skip jump good time...   
8     Beautifully smart simple idea RT wrote iPad app   
9   Counting day plus strong Canadian dollar mean ...   
10  Excited meet I show Sprint Galaxy S still runn...   

   emotion_in_tweet_is_directed_at  \
0                           iPhone   
1               iPad or iPhone App   
2                             iPad   
3               iPad or iPhone App   
4                           Google   
5                          Unknown   
7                          Android   
8               iPad or iPhone App   
9                            A