# Phase 4 Project - NLP
Julia Müller

Data Science Flex

In [1]:
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
#train test split and undersampling
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
#packages for preprocessing
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import TweetTokenizer
import string
from nltk import FreqDist
#packages for modeling and feature selection
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
#model evaluation
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, RocCurveDisplay, roc_curve, auc
from sklearn.metrics import precision_score, accuracy_score, recall_score, f1_score
#nltk.download('punkt')
#nltk.download('stopwords')
#nltk.download('wordnet')

## Loading and inspection of data

In [2]:
df = pd.read_csv("data/tweets.csv", encoding="latin-1")

In [3]:
df.head()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion


In [4]:
# Set display options to show all rows and increase the column width
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)
#simplify column names
df.columns = ['Tweet','Brand/Product','Emotion']
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9093 entries, 0 to 9092
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Tweet          9092 non-null   object
 1   Brand/Product  3291 non-null   object
 2   Emotion        9093 non-null   object
dtypes: object(3)
memory usage: 213.2+ KB


In [5]:
df.head(100)

Unnamed: 0,Tweet,Brand/Product,Emotion
0,".@wesley83 I have a 3G iPhone. After 3 hrs tweeting at #RISE_Austin, it was dead! I need to upgrade. Plugin stations at #SXSW.",iPhone,Negative emotion
1,"@jessedee Know about @fludapp ? Awesome iPad/iPhone app that you'll likely appreciate for its design. Also, they're giving free Ts at #SXSW",iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. They should sale them down at #SXSW.,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as crashy as this year's iPhone app. #sxsw,iPad or iPhone App,Negative emotion
4,"@sxtxstate great stuff on Fri #SXSW: Marissa Mayer (Google), Tim O'Reilly (tech books/conferences) &amp; Matt Mullenweg (Wordpress)",Google,Positive emotion
5,@teachntech00 New iPad Apps For #SpeechTherapy And Communication Are Showcased At The #SXSW Conference http://ht.ly/49n4M #iear #edchat #asd,,No emotion toward brand or product
6,,,No emotion toward brand or product
7,"#SXSW is just starting, #CTIA is around the corner and #googleio is only a hop skip and a jump from there, good time to be an #android fan",Android,Positive emotion
8,Beautifully smart and simple idea RT @madebymany @thenextweb wrote about our #hollergram iPad app for #sxsw! http://bit.ly/ieaVOB,iPad or iPhone App,Positive emotion
9,Counting down the days to #sxsw plus strong Canadian dollar means stock up on Apple gear,Apple,Positive emotion


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9093 entries, 0 to 9092
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Tweet          9092 non-null   object
 1   Brand/Product  3291 non-null   object
 2   Emotion        9093 non-null   object
dtypes: object(3)
memory usage: 213.2+ KB


My dataset has more than 9000 tweets and is split in 3 columns. The first column is the tweet, the second one is the information if the tweet is directed at a specific product (Apple or Google) and the third one is the sentiment towards the product.
The second column only contains 3200 data points so we don't know about every of the 9000 tweets at which product they are directed at. Also the 3rd column shows for the majority of tweets no emotion. 
My next steps are to summarize the different products into the two brands Apple or Google and to check if the missing values in the product column really don't include any information about a product.

## Data Clearning

First, I will rename the different products and map them to the brand Apple or Google.

In [7]:
df["Brand/Product"].value_counts()

iPad                               946
Apple                              661
iPad or iPhone App                 470
Google                             430
iPhone                             297
Other Google product or service    293
Android App                         81
Android                             78
Other Apple product or service      35
Name: Brand/Product, dtype: int64

In [8]:
product_mapping = {
    "iPad": "Apple",
    "iPad or iPhone App": "Apple",
    "iPhone": "Apple",
    "Other Apple product or service": "Apple",
    "Other Google product or service": "Google",
    "Android App": "Google",
    "Android": "Google"
}


df["Brand"] = df["Brand/Product"].replace(product_mapping)
print(df["Brand"].value_counts())
print(df.info())

Apple     2409
Google     882
Name: Brand, dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9093 entries, 0 to 9092
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Tweet          9092 non-null   object
 1   Brand/Product  3291 non-null   object
 2   Emotion        9093 non-null   object
 3   Brand          3291 non-null   object
dtypes: object(4)
memory usage: 284.3+ KB
None


Now, where this is cleaned up, I will look at the na values to see if there are no information connected to Apple or Google.

In [9]:
filtered_df = df[df['Brand'].isna()]

# Select the first 100 lines of column A from the filtered DataFrame
column_a_subset = filtered_df['Tweet']
column_a_subset[:100]

5           @teachntech00 New iPad Apps For #SpeechTherapy And Communication Are Showcased At The #SXSW Conference http://ht.ly/49n4M #iear #edchat #asd
6                                                                                                                                                    NaN
16                                                          Holler Gram for iPad on the iTunes App Store -  http://t.co/kfN3f5Q (via @marc_is_ken) #sxsw
32                                                   Attn: All  #SXSW frineds, @mention Register for #GDGTLive  and see Cobra iRadar for Android. {link}
33                                                                                                         Anyone at  #sxsw want to sell their old iPad?
34                                                                         Anyone at  #SXSW who bought the new iPad want to sell their older iPad to me?
35                                          At #sxsw.  Oooh. RT @mention Google to

It looks like there are indeed words in the tweets that will let us identify the brand from the comment. I will create a list of keywords and map them to the different brands

In [10]:
# Assign rows to Brand/Product for the unknown one

keywords = ['google', 'apple', 'ipad', 'android', 'iphone']

for index, row in df.iterrows():
    text = row['Tweet']
    if pd.isna(row['Brand/Product']) and isinstance(text, str):
        for keyword in keywords:
            if keyword in text.lower():
                df.at[index, 'Brand/Product'] = keyword
                break
# fill the rest with Unknown
df['Brand/Product'] = df['Brand/Product'].fillna('Unknown')

In [11]:
brand_keywords = {
    "Apple": ["ipad", "iphone", "itunes", "apple"],
    "Google": ["android", "google"]
}

# Iterate over the DataFrame and update the 'Brand' column for tweets with missing brand information
for index, row in df.iterrows():
    if pd.isna(row['Brand']):  # Check if the brand is NaN
        tweet = row['Tweet']
        if isinstance(tweet, str):  # Check if the tweet is a string
            tweet = tweet.lower()  # Transform the tweet to lowercase
            for brand, keywords in brand_keywords.items():
                for keyword in keywords:
                    if keyword in tweet:
                        df.at[index, 'Brand'] = brand
                        break  # Break the loop if a matching keyword is found
        else:
            df.at[index, 'Brand'] = 'unknown'  # Assign 'unknown' for NaN values in 'Brand' column

# Assign 'unknown' for any remaining NaN values in 'Brand' column
df['Brand'].fillna('unknown', inplace=True)


In [12]:
df["Brand"].value_counts()

Apple      5401
Google     2985
unknown     707
Name: Brand, dtype: int64

In [13]:
df["Brand/Product"].value_counts()

google                             1740
apple                              1195
ipad                               1069
iPad                                946
Unknown                             762
iphone                              710
Apple                               661
iPad or iPhone App                  470
Google                              430
android                             326
iPhone                              297
Other Google product or service     293
Android App                          81
Android                              78
Other Apple product or service       35
Name: Brand/Product, dtype: int64

We need to do a bit of clean up because of lower case and upper case values. I will map the different categories.

In [14]:
product_mapping = {
    "google": "Google",
    "apple": "Apple",
    "ipad": "iPad",
    "iphone": "iPhone",
    "android": "Android"
}
df["Brand/Product"] = df["Brand/Product"].replace(product_mapping)
print(df["Brand/Product"].value_counts())

Google                             2170
iPad                               2015
Apple                              1856
iPhone                             1007
Unknown                             762
iPad or iPhone App                  470
Android                             404
Other Google product or service     293
Android App                          81
Other Apple product or service       35
Name: Brand/Product, dtype: int64


Also, I want to check for duplicates or missing values and remove them

In [15]:
print("Before removal: ", df.duplicated().value_counts())
df.drop_duplicates(inplace=True)
print("After removal: ",df.duplicated().value_counts())

Before removal:  False    9071
True       22
dtype: int64
After removal:  False    9071
dtype: int64


In [16]:
#check for missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9071 entries, 0 to 9092
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Tweet          9070 non-null   object
 1   Brand/Product  9071 non-null   object
 2   Emotion        9071 non-null   object
 3   Brand          9071 non-null   object
dtypes: object(4)
memory usage: 354.3+ KB


In [17]:
na_tweets = df[df['Tweet'].isna()]

# Print the rows where 'Tweet' is NaN
print(na_tweets)

  Tweet Brand/Product                             Emotion    Brand
6   NaN       Unknown  No emotion toward brand or product  unknown


In [18]:
# Remove the rows where 'Tweet' is NaN
df = df.dropna(subset=['Tweet'])
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9070 entries, 0 to 9092
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Tweet          9070 non-null   object
 1   Brand/Product  9070 non-null   object
 2   Emotion        9070 non-null   object
 3   Brand          9070 non-null   object
dtypes: object(4)
memory usage: 354.3+ KB


Next, I want to clean up the Emotion column. There are 4 different options: Positive emotion, negative emotion, no emotion and can't tell. The "no emotion" option is the most common one and since I want to create a binary classifier, I will leave positive as positive and combine the neutral and negative ones as non-positive. Also,  I will  drop the can't tell rows as they are only a very small fraction of the dataset. 

In [19]:
df["Emotion"].value_counts()

No emotion toward brand or product    5375
Positive emotion                      2970
Negative emotion                       569
I can't tell                           156
Name: Emotion, dtype: int64

In [20]:
#emotions dictionary for mapping
emotions = {
    "No emotion toward brand or product": "Non-positive",
    "Positive emotion": "Positive",
    "Negative emotion": "Non-positive"
}
#mapping old labels to new ones
df["Emotion"] = df["Emotion"].map(emotions)
#check for nas and drop them (can't tell)
print(df['Emotion'].isnull().sum())
# Drop NaN in the emotion column
df.dropna(subset = ["Emotion"], inplace = True)
#check for distribution
df["Emotion"].value_counts(normalize=True)

156


Non-positive    0.666816
Positive        0.333184
Name: Emotion, dtype: float64

In [21]:
#changing to numerical
df["Emotion"] = df["Emotion"].map({'Non-positive': 0, 'Positive': 1})

For my future target variable, I can note that I have a class imbalance. 66% of the cases are not positive, so I will perform different oversampling or undersampling techniques after the train test split to avoid data leakage.

## Exploratory Data Analysis
Before I start with splitting my dataset and doing the preprocessing, I want to get more familiar with the most frequent words.  In this section, I will see how my tokens look like so that I can remove certain stop words in my model.

## Basic preprocessing

Now that I have my dataframe cleaned up, I will start with preparing the tweet texts. Here are the decisions, I have taken:

Stop word removal: I will remove some basic stop words and use TF-IDF to apply weighting

Stemming or Lemmatization: In my basic model, I will try stemming and later try another model with lemmatization.

Tokenization: I will use a specific Tweet Tokenizer that handles hashtags and mentions

### Step 1: Tokenization using TweetTokenizer

In [22]:
#initialising Tokenizer 
tknzr = TweetTokenizer(strip_handles=True, preserve_case=False)
df['Tokens'] = df['Tweet'].apply(tknzr.tokenize)

#writing a function to get the 20 most common words
def get_most_common_words(df, column_name, N=20):
    # Flatten the list of tokens into a single list
    all_tokens = [token for tokens in df[column_name] for token in tokens]

    # Calculate the frequency distribution
    freq_dist = FreqDist(all_tokens)

    # Get the top N common words
    most_common_words = freq_dist.most_common(N)

    return most_common_words

# applying the function
top_words = get_most_common_words(df, 'Tokens', N=20)
print(top_words)

[('#sxsw', 8916), ('.', 5771), ('the', 4346), ('link', 4249), ('}', 4234), ('{', 4231), ('to', 3529), (',', 3459), ('at', 3045), ('rt', 2918), ('for', 2503), ('ipad', 2367), ('!', 2338), ('a', 2295), ('google', 2082), ('in', 1887), (':', 1793), ('apple', 1778), ('of', 1676), ('is', 1668)]


In the list of most common words, there are a lot of common words / stopwords included that I will get rid of. Also, I will include "sxsw" which is an acronym for the Southwest Bank and "rt" which probably stands for retweet to my list of stopwords.I will first remove the stopwords and then see what else I can remove.

### Step 2: Removing stopwords

In [23]:
# Get the set of English stopwords
stop_words = set(stopwords.words('english'))
additional_stopwords = ["#sxsw", "sxsw", "sxswi", "#sxswi", "rt"]
stop_words.update(additional_stopwords)

# Function to remove stopwords from a list of tokens
def remove_stopwords(tokens):
    return [token for token in tokens if token not in stop_words]

# Apply the remove_stopwords function to the 'tokens' column
df['Tokens'] = df['Tokens'].apply(remove_stopwords)
#get the top 20 words
top_words = get_most_common_words(df, 'Tokens', N=20)
print(top_words)

[('.', 5771), ('link', 4249), ('}', 4234), ('{', 4231), (',', 3459), ('ipad', 2367), ('!', 2338), ('google', 2082), (':', 1793), ('apple', 1778), ('"', 1657), ('?', 1572), ('store', 1455), ('2', 1305), ('iphone', 1278), ('-', 1146), ('new', 1073), ('austin', 829), ('&', 827), ('app', 793)]


I will also remove the product specific words and treat them as stopwords. I have this information already in my brand and product column, so I know which Apple or Google product the tweet is about. Also, I will remove punctuation

In [49]:
additional_stopwords = [
    "ipad", "google", "apple", "iphone", "amp",
    "android", "sxswi", "link", "#apple",
    "#google", "...", "\x89", "#ipad2",
    "0","1","2","3","4","5","6","7","8","9",
    "#iphone", "#android", "store", "austin", "#ipad"]
stop_words.update(additional_stopwords)

# Apply the remove_stopwords function to the 'tokens' column
df['Tokens'] = df['Tokens'].apply(remove_stopwords)
# Remove punctuation from the tokens
df['Tokens'] = df['Tokens'].apply(lambda tokens: [token for token in tokens if token not in string.punctuation])
#get most common words
top_words = get_most_common_words(df, 'Tokens', N=20)
print(top_words)

[('new', 1073), ('app', 793), ('launch', 633), ('social', 612), ('today', 560), ('circles', 552), ('network', 451), ('via', 428), ('pop-up', 411), ('line', 393), ('get', 391), ('free', 368), ('party', 347), ('called', 347), ('mobile', 305), ('major', 296), ('like', 280), ('one', 271), ('time', 269), ('temporary', 262)]


### Step 3: Check most common words for Apple and Google products

In [50]:
#splitting up data in brands and emotions
apple = df[df["Brand"]=="Apple"]
apple_pos = apple[apple["Emotion"]==1]
apple_nonpos = apple[apple["Emotion"]==0]
google = df[df["Brand"]=="Google"]
google_pos = google[google["Emotion"]==1]
google_nonpos = google[google["Emotion"]==0]

In [51]:
top_words = get_most_common_words(apple_pos, "Tokens", N=20)
print(top_words)

[('app', 309), ('new', 219), ('pop-up', 151), ('line', 123), ('get', 120), ('via', 103), ('one', 98), ("i'm", 96), ('cool', 96), ('temporary', 89), ('free', 88), ('opening', 87), ('downtown', 86), ('like', 81), ('go', 79), ('launch', 78), ('time', 78), ('great', 77), ('popup', 76), ('day', 73)]


In [52]:
top_words = get_most_common_words(apple_nonpos, "Tokens", N=20)
print(top_words)

[('app', 286), ('pop-up', 259), ('new', 257), ('line', 250), ('temporary', 172), ('opening', 166), ('get', 147), ('via', 137), ('free', 131), ('downtown', 130), ('popup', 129), ('open', 129), ('one', 121), ('launch', 118), ('pop', 102), ('like', 99), ('need', 96), ("i'm", 95), ('win', 93), ('people', 92)]


In [53]:
top_words = get_most_common_words(google_pos, "Tokens", N=20)
print(top_words)

[('new', 139), ('party', 105), ('circles', 105), ('social', 103), ('maps', 101), ('network', 84), ('launch', 81), ('mobile', 73), ('app', 72), ('mayer', 64), ('today', 63), ('called', 60), ('great', 59), ('marissa', 59), ("google's", 56), ('major', 54), ('time', 49), ('w', 41), ('possibly', 41), ('get', 38)]


In [54]:
top_words = get_most_common_words(google_nonpos, "Tokens", N=20)
print(top_words)

[('social', 439), ('circles', 433), ('new', 420), ('network', 345), ('launch', 331), ('today', 316), ('called', 262), ('major', 228), ('possibly', 187), ('mobile', 166), ('party', 140), ('via', 128), ('mayer', 124), ("google's", 121), ('marissa', 117), ('maps', 106), ('app', 86), ('#circles', 79), ('search', 77), ('bing', 69)]


## Starting with a baseline model
My steps for modeling are the following:
1. Removing stop-words 
2. Train-Test-Split
3. Address class imbalance
4. Build and train baseline model with basic preprocessing and a vectorizer
5. Evaluate the baseline model
6. Finetune the preprocessing
7. Potentially include other features
8. Iterate through different models

### Step 1: Removing stopwords
Based on my EDA, I will remove a specific list of stopwords that has to do with the sxsw festival and product related words that won't have a lot of value. 

In [55]:
#write function to remove stopwords
def remove_stopwords(tweet):
    stop_words = set(stopwords.words('english')) #basic stopwords
    additional_stopwords = [
        "#sxsw", "sxsw", "sxswi", "#sxswi", "rt","ipad",
        "google", "apple", "iphone", "amp",
        "android", "sxswi", "link", "#apple",
        "#google", "...", "\x89", "#ipad2",
        "0","1","2","3","4","5","6","7","8","9",
        "#iphone", "#android", "store", "austin", "#ipad"
    ] + list(string.punctuation) 
    stop_words.update(additional_stopwords)
    
    filtered_tweet = ' '.join([word for word in tweet.split() if word.lower() not in stop_words])
    return filtered_tweet
# add column with filtered tweets
df["Tweets_filtered"] = df["Tweet"].apply(remove_stopwords)

### Step 2: Train-Test-Split
To avoid data leakage, I will now split my cleaned dataset into train and test data

In [56]:
# Split the dataset into training and testing sets
X = df[['Tweets_filtered']]  # Feature
y = df['Emotion']  # Target variable

# Split the data into 75% training and 25% testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Print the shapes of the resulting datasets
print("Training data shape:", X_train.shape, y_train.shape)
print("Testing data shape:", X_test.shape, y_test.shape)
# Reshape X_train and X_test
X_train = X_train.values.ravel()
X_test = X_test.squeeze()
# Print the updated shapes
print("Training data shape:", X_train.shape, y_train.shape)
print("Testing data shape:", X_test.shape, y_test.shape)

Training data shape: (6685, 1) (6685,)
Testing data shape: (2229, 1) (2229,)
Training data shape: (6685,) (6685,)
Testing data shape: (2229,) (2229,)


### Step 2: Building a base model
I will now build a logistic regression model with random oversampling and Count vectorization but without applying a specific tokenizer. 

In [57]:
# Define pipeline
pipe_lr = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('oversample', RandomOverSampler(random_state=42)),
    ('classifier', LogisticRegression())
])

# Fit the pipeline on the resampled training data
pipe_lr.fit(X_train, y_train)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Pipeline(steps=[('vectorizer', CountVectorizer()),
                ('oversample', RandomOverSampler(random_state=42)),
                ('classifier', LogisticRegression())])

In [58]:
# Predict on the test data
y_pred = pipe_lr.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)

Accuracy: 0.72
Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.79      0.79      1490
           1       0.58      0.59      0.59       739

    accuracy                           0.72      2229
   macro avg       0.69      0.69      0.69      2229
weighted avg       0.72      0.72      0.72      2229



The classification report of the linear regression model indicates moderate performance. Precision was higher for class 0 (0.80) compared to class 1 (0.58), same as recall which was higher for class 0 (0.78) compared to class 1 (0.60). The F1-score was also higher for class 0 (0.79) compared to class 1 (0.59). The overall accuracy of the model was 0.78. 

Next, I will try the same model but with TweetTokenizer. Based on my EDA, I will do tokenization by using TweetTokenizer that handles hashtags and mentions.

In [59]:
# Instantiate tweet tokenizer to later include in the pipeline
tokenizer = TweetTokenizer()
lr_pipe_tknzr = Pipeline([('vectorizer', CountVectorizer(tokenizer=tokenizer.tokenize)),
                    ('oversample', RandomOverSampler(random_state=42)),
                    ('lr', LogisticRegression(random_state=42))])

lr_pipe_tknzr.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Pipeline(steps=[('vectorizer',
                 CountVectorizer(tokenizer=<bound method TweetTokenizer.tokenize of <nltk.tokenize.casual.TweetTokenizer object at 0x0000020D04629970>>)),
                ('oversample', RandomOverSampler(random_state=42)),
                ('lr', LogisticRegression(random_state=42))])

In [60]:
# Predict on the test data
y_pred = lr_pipe_tknzr.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)

Accuracy: 0.74
Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.79      0.80      1490
           1       0.60      0.62      0.61       739

    accuracy                           0.74      2229
   macro avg       0.70      0.71      0.70      2229
weighted avg       0.74      0.74      0.74      2229



This slightly improved the model. I will try a different vectorization (TFIDF) to see if that improves the model

In [61]:
lr_pipe_tknzr_oversample = Pipeline([
    ('vectorizer', TfidfVectorizer(tokenizer=tokenizer.tokenize)),
    ('oversample', RandomOverSampler(random_state=42)),
    ('lr', LogisticRegression(random_state=42))
])

lr_pipe_tknzr_oversample.fit(X_train, y_train)

Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(tokenizer=<bound method TweetTokenizer.tokenize of <nltk.tokenize.casual.TweetTokenizer object at 0x0000020D04629970>>)),
                ('oversample', RandomOverSampler(random_state=42)),
                ('lr', LogisticRegression(random_state=42))])

In [62]:
# Predict on the test data
y_pred = lr_pipe_tknzr_oversample.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)

Accuracy: 0.72
Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.76      0.78      1490
           1       0.57      0.63      0.60       739

    accuracy                           0.72      2229
   macro avg       0.69      0.70      0.69      2229
weighted avg       0.73      0.72      0.72      2229



This is worse than before. I will go back to CountVectorizer but try a different classifier (random forest).

In [63]:
# Pipeline with Random Forest
rfc_pipe = Pipeline([
    ('vectorizer', CountVectorizer(tokenizer=tokenizer.tokenize)),
    ('undersample', RandomUnderSampler(random_state=42)),
    ('rfc', RandomForestClassifier(random_state=42))
])
rfc_pipe.fit(X_train, y_train)

Pipeline(steps=[('vectorizer',
                 CountVectorizer(tokenizer=<bound method TweetTokenizer.tokenize of <nltk.tokenize.casual.TweetTokenizer object at 0x0000020D04629970>>)),
                ('undersample', RandomUnderSampler(random_state=42)),
                ('rfc', RandomForestClassifier(random_state=42))])

In [64]:
# Predict on the test data
y_pred = rfc_pipe.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)

Accuracy: 0.70
Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.71      0.76      1490
           1       0.54      0.68      0.60       739

    accuracy                           0.70      2229
   macro avg       0.68      0.69      0.68      2229
weighted avg       0.72      0.70      0.71      2229



Now, I will try a GridSearch to tune hyperparameters.

In [90]:
# Instantiate tweet tokenizer to later include in the pipeline
tokenizer = TweetTokenizer()

# Define the pipeline
lr_pipe_grid = Pipeline([
    ('vectorizer', CountVectorizer(tokenizer=tokenizer.tokenize, ngram_range=(1, 3))),
    ('oversample', RandomOverSampler(random_state=42)),
    ('lr', LogisticRegression(random_state=42))
])

# Define the hyperparameter grid
param_grid = {
    'vectorizer__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'lr__C': [0.1, 1, 10],
    'lr__solver': ['liblinear', 'sag', 'saga'],
    'lr__max_iter': [100, 1000, 10000]
}

# Perform grid search
grid_search = GridSearchCV(lr_pipe_grid, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Predict on the test data using the best model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Print the best hyperparameters
print("Best Hyperparameters:", grid_search.best_params_)









Accuracy: 0.7388963660834454
Best Hyperparameters: {'lr__C': 1, 'lr__max_iter': 100, 'lr__solver': 'liblinear', 'vectorizer__ngram_range': (1, 3)}


Accuracy: The RFC achieved an accuracy of 0.73, indicating that 73% of the predictions were correct. This is better than the previous model.

Class 0 (majority class) had a precision of 0.76, recall of 0.87, and an F1-score of 0.81. This means that the RFC performed relatively well in identifying class 0 instances.

Class 1 (minority class) had a lower precision of 0.63, recall of 0.45, and an F1-score of 0.53. The RFC had more difficulty correctly identifying class 1 instances.

The macro average F1-score, which considers the average performance across both classes, was 0.67. The weighted average F1-score, which accounts for class imbalances, was slightly higher at 0.72.

In summary, while the RFC achieved a reasonable accuracy, it struggled to correctly classify instances of the minority class (class 1) compared to the majority class (class 0). 

In [41]:
rfc_pipe_undersample_weighted = Pipeline([
    ('vectorizer', TfidfVectorizer(tokenizer=tokenizer.tokenize, stop_words=stop_words)),
    ('undersample', RandomUnderSampler(random_state=42)),
    ('rfc', RandomForestClassifier(class_weight={0: 0.7, 1: 0.3}, random_state=42))
])

rfc_pipe_undersample_weighted.fit(X_train, y_train)

Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(stop_words={'!', '"', '#', '#apple', '#google',
                                             '#sxsw', '#sxswi', '$', '%', '&',
                                             "'", '(', ')', '*', '+', ',', '-',
                                             '.', '...', '/', '0', '1', '2',
                                             '3', '4', '5', '6', '7', '8', '9', ...},
                                 tokenizer=<bound method TweetTokenizer.tokenize of <nltk.tokenize.casual.TweetTokenizer object at 0x00000127FF741820>>)),
                ('undersample', RandomUnderSampler(random_state=42)),
                ('rfc',
                 RandomForestClassifier(class_weight={0: 0.7, 1: 0.3},
                                        random_state=42))])

In [42]:
# Predict on the test data
y_pred = rfc_pipe_undersample_weighted.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)

Accuracy: 0.69
Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.73      0.76      1490
           1       0.53      0.62      0.57       739

    accuracy                           0.69      2229
   macro avg       0.66      0.67      0.67      2229
weighted avg       0.71      0.69      0.70      2229



In [67]:
# Pipeline with Naive Bayes
svm_pipe = Pipeline([
    ('vectorizer', CountVectorizer(tokenizer=tokenizer.tokenize, stop_words=stop_words)),
    ('smote', SMOTE(sampling_strategy=1, random_state=42)),
    ('svm', SVC(random_state=42))
])

svm_pipe.fit(X_train, y_train)

Pipeline(steps=[('vectorizer',
                 CountVectorizer(stop_words={'#android', '#apple', '#google',
                                             '#ipad', '#ipad2', '#iphone',
                                             '#sxsw', '#sxswi', '...', '0', '1',
                                             '2', '3', '4', '5', '6', '7', '8',
                                             '9', 'a', 'about', 'above',
                                             'after', 'again', 'against', 'ain',
                                             'all', 'am', 'amp', 'an', ...},
                                 tokenizer=<bound method TweetTokenizer.tokenize of <nltk.tokenize.casual.TweetTokenizer object at 0x0000020D04629970>>)),
                ('smote', SMOTE(random_state=42, sampling_strategy=1)),
                ('svm', SVC(random_state=42))])

In [68]:
# Predict on the test data
y_pred = svm_pipe.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)

Accuracy: 0.71
Classification Report:
               precision    recall  f1-score   support

           0       0.77      0.80      0.78      1490
           1       0.56      0.52      0.54       739

    accuracy                           0.71      2229
   macro avg       0.67      0.66      0.66      2229
weighted avg       0.70      0.71      0.70      2229



### Step 3: Baseline model
I will do TF-IDF vectorization as my features for the baseline model. I will start with a simple logistic regression model and then continue to iterate after evaluation.


Next, I will build a baseline logistic regression model with TF-IDF vectorization. In later models, I will remove stopwords and do lemmatization.

In [29]:
print(y_train.shape)
print(X_train.shape)

(6685,)
(6685, 1)


In [56]:
# Reshape X_train_resampled and X_test
X_train_resampled = X_train_resampled.values.ravel()
# Initialize the TF-IDF vectorizer
vectorizer = TfidfVectorizer()

# Fit and transform the training data
X_train_tfidf = vectorizer.fit_transform(X_train_resampled)

# Reshape X_test
X_test = X_test.squeeze()

# Transform the test data using the fitted vectorizer
X_test_tfidf = vectorizer.transform(X_test)

# Train a logistic regression model
model = LogisticRegression()
model.fit(X_train_tfidf, y_train_resampled)

LogisticRegression()

Now, I will evaluate the baseline model by calculating the accuracy and a classification report.

In [57]:
# Predict on the test data
y_pred = model.predict(X_test_tfidf)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)

Accuracy: 0.71
Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.74      0.78      1490
           1       0.56      0.64      0.60       739

    accuracy                           0.71      2229
   macro avg       0.68      0.69      0.69      2229
weighted avg       0.72      0.71      0.72      2229



The classification report of the linear regression model indicates moderate performance. Precision was higher for class 0 (0.82) compared to class 1 (0.50), while recall was higher for class 1 (0.70) compared to class 0 (0.66). The F1-score was also higher for class 0 (0.73) compared to class 1 (0.59). The overall accuracy of the model was 0.67. 

In [132]:
# Reshape X_test
X_test = X_test.squeeze()
# predict target with model on testing set
y_pred = lr_pipe.predict(X_test)
# print a classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.69      0.75      1490
           1       0.53      0.70      0.60       739

    accuracy                           0.69      2229
   macro avg       0.67      0.69      0.67      2229
weighted avg       0.72      0.69      0.70      2229



The classification report of the basic logistic regression model states an accuracy of .69. This means that 69% of the instances in the test set were correctly classified. The precision score indicates that the model is rather good at predicting the non-positive cases but not the positive ones. 
Based on my EDA (other notebook), I will do tokenization by using TweetTokenizer that handles hashtags and mentions and remove stop words (the product names, often ocurring mentions without value (sxsw, rt)). 
First, I will instantiate my tokenizer and then define my list of stopwords so that I can include it in the model.

In [61]:
# Preprocess the text using NLP techniques
# Convert X_train and X_test back to pandas Series
X_train_resampled = pd.Series(X_train_resampled)
X_test_resampled = pd.Series(X_test)
#initialising Tokenizer 
tokenizer = TweetTokenizer(preserve_case=False)
#defining stop words
stop_words = set(stopwords.words('english'))
additional_stopwords = [
    "#sxsw", "sxsw", "sxswi", "#sxswi", "rt","ipad",
    "google", "apple", "iphone", "amp",
    "android", "sxswi", "link", "#apple",
    "#google", "...",
    "0","1","2","3","4","5","6","7","8","9"
] + list(string.punctuation) 
stop_words.update(additional_stopwords)
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    # Tokenization using TweetTokenizer
    tokens = tokenizer.tokenize(text)
    
    # Remove stopwords
    tokens = [token for token in tokens if token.lower() not in stop_words]
    
    # Lemmatization
    tokens = [lemmatizer.lemmatize(token.lower()) for token in tokens]
    
    # Join tokens back into a single string
    preprocessed_text = ' '.join(tokens)
    
    return preprocessed_text

# Apply preprocessing to the training data
X_train_preprocessed = X_train_resampled.apply(preprocess_text)

# Apply preprocessing to the test data
X_test_preprocessed = X_test.apply(preprocess_text)

# Initialize the TF-IDF vectorizer
vectorizer = TfidfVectorizer()

# Fit and transform the preprocessed training data
X_train_tfidf = vectorizer.fit_transform(X_train_preprocessed)

# Transform the preprocessed test data using the fitted vectorizer
X_test_tfidf = vectorizer.transform(X_test_preprocessed)

# Train a logistic regression model
model = LogisticRegression()
model.fit(X_train_tfidf, y_train_resampled)

# Predict on the test data
y_pred = model.predict(X_test_tfidf)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)

Accuracy: 0.68
Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.69      0.74      1490
           1       0.52      0.68      0.59       739

    accuracy                           0.68      2229
   macro avg       0.67      0.68      0.67      2229
weighted avg       0.72      0.68      0.69      2229



## Text Preprocessing

Now that I have my dataframe cleaned up, I will start with preparing the tweet texts. Here are the decisions, I have taken:

Stop word removal: I will remove some basic stop words and use TF-IDF to apply weighting

Stemming or Lemmatization: In my basic model, I will try stemming and later try another model with lemmatization.

Tokenization: I will use a specific Tweet Tokenizer that handles hashtags and mentions

### Step 1: Tokenization using TweetTokenizer

In [192]:
#initialising Tokenizer 
tknzr = TweetTokenizer(strip_handles=True, preserve_case=False)
df['Tokens'] = df['Tweet'].apply(tknzr.tokenize)

In [193]:
#initialising Tokenizer 
tknzr = TweetTokenizer(strip_handles=True, preserve_case=False)
df['Tokens'] = df['Tweet'].apply(tknzr.tokenize)

#writing a function to get the 20 most common words
def get_most_common_words(df, column_name, N=20):
    # Flatten the list of tokens into a single list
    all_tokens = [token for tokens in df[column_name] for token in tokens]

    # Calculate the frequency distribution
    freq_dist = FreqDist(all_tokens)

    # Get the top N common words
    most_common_words = freq_dist.most_common(N)

    return most_common_words

# applying the function
top_words = get_most_common_words(df, 'Tokens', N=20)
print(top_words)

[('#sxsw', 8916), ('.', 5771), ('the', 4346), ('link', 4249), ('}', 4234), ('{', 4231), ('to', 3529), (',', 3459), ('at', 3045), ('rt', 2918), ('for', 2503), ('ipad', 2367), ('!', 2338), ('a', 2295), ('google', 2082), ('in', 1887), (':', 1793), ('apple', 1778), ('of', 1676), ('is', 1668)]


In the list of most common words, there are a lot of common words / stopwords included that I will get rid of. Also, I will include "sxsw" which is an acronym for the Southwest Bank and "rt" which probably stands for retweet to my list of stopwords.I will first remove the stopwords and then see what else I can remove.

### Step 2: Removing Stopwords

In [194]:
# Get the set of English stopwords
stop_words = set(stopwords.words('english'))
additional_stopwords = ["#sxsw", "sxsw", "sxswi", "#sxswi", "rt"]
stop_words.update(additional_stopwords)

# Function to remove stopwords from a list of tokens
def remove_stopwords(tokens):
    return [token for token in tokens if token not in stop_words]

# Apply the remove_stopwords function to the 'tokens' column
df['Tokens'] = df['Tokens'].apply(remove_stopwords)
#get the top 20 words
top_words = get_most_common_words(df, 'Tokens', N=20)
print(top_words)

In [195]:
top_words = get_most_common_words(df, 'Tokens', N=20)
print(top_words)

[('.', 5771), ('link', 4249), ('}', 4234), ('{', 4231), (',', 3459), ('ipad', 2367), ('!', 2338), ('google', 2082), (':', 1793), ('apple', 1778), ('"', 1657), ('?', 1572), ('store', 1455), ('2', 1305), ('iphone', 1278), ('-', 1146), ('new', 1073), ('austin', 829), ('&', 827), ('app', 793)]


I will also remove the product specific words and treat them as stopwords. I have this information already in my brand and product column, so I know which Apple or Google product the tweet is about. Also, I will remove punctuation

In [196]:
additional_stopwords = [
    "ipad", "google", "apple", "iphone", "amp",
    "android", "sxswi", "link", "#apple",
    "#google", "...",
    "0","1","2","3","4","5","6","7","8","9"]
stop_words.update(additional_stopwords)

# Apply the remove_stopwords function to the 'tokens' column
df['Tokens'] = df['Tokens'].apply(remove_stopwords)
# Remove punctuation from the tokens
df['Tokens'] = df['Tokens'].apply(lambda tokens: [token for token in tokens if token not in string.punctuation])
#get most common words
top_words = get_most_common_words(df, 'Tokens', N=20)
print(top_words)

In [197]:
top_words = get_most_common_words(df, 'Tokens', N=20)
print(top_words)

[('store', 1455), ('2', 1305), ('new', 1073), ('austin', 829), ('app', 793), ('\x89', 676), ('launch', 633), ('social', 612), ('today', 560), ('circles', 552), ('network', 451), ('via', 428), ('pop-up', 411), ('line', 393), ('get', 391), ('free', 368), ('party', 347), ('called', 347), ('mobile', 305), ('major', 296)]


## Feature engineering

In [201]:
# Convert lemmatized tokens back to text
df['processed_text'] = df['Tokens'].apply(lambda tokens: ' '.join(tokens))

# Initialize the TF-IDF vectorizer
vectorizer = TfidfVectorizer()

# Fit and transform the processed text to obtain the TF-IDF features
tfidf_features = vectorizer.fit_transform(df['processed_text'])

# Get the feature names (words) corresponding to the columns in the TF-IDF matrix
feature_names = vectorizer.get_feature_names()

# Convert the TF-IDF features to a DataFrame for further analysis
tfidf_df = pd.DataFrame(tfidf_features.toarray(), columns=feature_names)

Unnamed: 0,00,000,01,02,03,0310apple,06,08,10,100,...,ûò,ûòand,ûó,ûócan,ûójust,ûólewis,ûólots,ûómy,ûóthe,ü_
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Model selection and training

## Model evaluation