In [1]:
import numpy as np
import pandas as pd
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC
from sklearn.preprocessing import LabelEncoder
import re
import nltk
from nltk import word_tokenize
nltk.download('stopwords')
import string
import emoji
from nltk.corpus import stopwords
import ssl
from imblearn.over_sampling import SMOTE

[nltk_data] Downloading package stopwords to C:\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Handle SSL certificate verification
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

# Download ALL necessary resources in one go
nltk.download('punkt_tab')
nltk.download('punkt')
nltk.download('perluniprops')
nltk.download('universal_tagset')
nltk.download('stopwords')

# Clear the NLTK cache
nltk.data.clear_cache()

[nltk_data] Downloading package punkt_tab to C:\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to C:\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package perluniprops to C:\nltk_data...
[nltk_data]   Package perluniprops is already up-to-date!
[nltk_data] Downloading package universal_tagset to C:\nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!
[nltk_data] Downloading package stopwords to C:\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
train_data = pd.read_csv('./dataset/twitter_training.csv', header=None)
val_data = pd.read_csv('./dataset/twitter_validation.csv', header=None)

In [4]:
train_data.columns = ['id', 'company', 'sentiment', 'text']
val_data.columns = ['id', 'company', 'sentiment', 'text']
train_data

Unnamed: 0,id,company,sentiment,text
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
...,...,...,...,...
74677,9200,Nvidia,Positive,Just realized that the Windows partition of my...
74678,9200,Nvidia,Positive,Just realized that my Mac window partition is ...
74679,9200,Nvidia,Positive,Just realized the windows partition of my Mac ...
74680,9200,Nvidia,Positive,Just realized between the windows partition of...


In [5]:
train_data['id'].value_counts()

id
2401    6
6164    6
6141    6
6142    6
6143    6
       ..
4678    6
4679    6
4680    6
4681    6
9200    6
Name: count, Length: 12447, dtype: int64

In [6]:
train_data['company'].value_counts()

company
TomClancysRainbowSix                 2400
MaddenNFL                            2400
Microsoft                            2400
LeagueOfLegends                      2394
CallOfDuty                           2394
Verizon                              2382
CallOfDutyBlackopsColdWar            2376
ApexLegends                          2376
Facebook                             2370
WorldOfCraft                         2364
Dota2                                2364
NBA2K                                2352
TomClancysGhostRecon                 2346
Battlefield                          2346
FIFA                                 2340
Xbox(Xseries)                        2334
Overwatch                            2334
johnson&johnson                      2328
Amazon                               2316
PlayStation5(PS5)                    2310
HomeDepot                            2310
Cyberpunk2077                        2304
CS-GO                                2304
GrandTheftAuto(GTA)       

In [7]:
train_data.drop(['id', 'company'], axis=1, inplace=True)
val_data.drop(['id', 'company'], axis=1, inplace=True)

In [8]:
train_data

Unnamed: 0,sentiment,text
0,Positive,im getting on borderlands and i will murder yo...
1,Positive,I am coming to the borders and I will kill you...
2,Positive,im getting on borderlands and i will kill you ...
3,Positive,im coming on borderlands and i will murder you...
4,Positive,im getting on borderlands 2 and i will murder ...
...,...,...
74677,Positive,Just realized that the Windows partition of my...
74678,Positive,Just realized that my Mac window partition is ...
74679,Positive,Just realized the windows partition of my Mac ...
74680,Positive,Just realized between the windows partition of...


In [9]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74682 entries, 0 to 74681
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   sentiment  74682 non-null  object
 1   text       73996 non-null  object
dtypes: object(2)
memory usage: 1.1+ MB


In [10]:
val_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   sentiment  1000 non-null   object
 1   text       1000 non-null   object
dtypes: object(2)
memory usage: 15.8+ KB


In [11]:
train_data.describe(include='all').T

Unnamed: 0,count,unique,top,freq
sentiment,74682,4,Negative,22542
text,73996,69491,"At the same time, despite the fact that there ...",172


In [12]:
val_data.describe(include='all').T

Unnamed: 0,count,unique,top,freq
sentiment,1000,4,Neutral,285
text,1000,999,Wow,2


In [13]:
print(train_data.shape)
print(val_data.shape)

(74682, 2)
(1000, 2)


In [18]:
train_data.isnull().mean()

sentiment    0.000000
text         0.009186
dtype: float64

In [24]:
train_data.isnull().any(axis=0).sum()

1

In [19]:
val_data.isnull().mean()

sentiment    0.0
text         0.0
dtype: float64

In [22]:
print(train_data.duplicated().mean())
print(val_data.duplicated().mean())

0.0657320371709381
0.001


In [10]:
train_data.dropna(inplace=True)
train_data.drop_duplicates(inplace=True)
train_data.shape

(71656, 4)

In [11]:
# Initialize constants
URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')
HTML_PATTERN = re.compile(r'<.*?>')

# Chat word mappings
CHAT_WORDS = {
    'AFAIK': 'As Far As I Know', 'AFK': 'Away From Keyboard', 'ASAP': 'As Soon As Possible',
    'ATK': 'At The Keyboard', 'ATM': 'At The Moment', 'A3': 'Anytime, Anywhere, Anyplace',
    'BAK': 'Back At Keyboard', 'BBL': 'Be Back Later', 'BBS': 'Be Back Soon',
    'BFN': 'Bye For Now', 'B4N': 'Bye For Now', 'BRB': 'Be Right Back',
    'BRT': 'Be Right There', 'BTW': 'By The Way', 'B4': 'Before',
    'CU': 'See You', 'CUL8R': 'See You Later', 'CYA': 'See You',
    'FAQ': 'Frequently Asked Questions', 'FC': 'Fingers Crossed',
    'FWIW': "For What It's Worth", 'FYI': 'For Your Information',
    'GAL': 'Get A Life', 'GG': 'Good Game', 'GN': 'Good Night',
    'GMTA': 'Great Minds Think Alike', 'GR8': 'Great!', 'G9': 'Genius',
    'IC': 'I See', 'ICQ': 'I Seek you', 'ILU': 'I Love You',
    'IMHO': 'In My Honest Opinion', 'IMO': 'In My Opinion',
    'IOW': 'In Other Words', 'IRL': 'In Real Life',
    'KISS': 'Keep It Simple', 'LDR': 'Long Distance Relationship',
    'LMAO': 'Laugh My Off', 'LOL': 'Laughing Out Loud',
    'LTNS': 'Long Time No See', 'L8R': 'Later',
    'MTE': 'My Thoughts Exactly', 'M8': 'Mate',
    'NRN': 'No Reply Necessary', 'OIC': 'Oh I See',
    'PITA': 'Pain In The A', 'PRT': 'Party',
    'PRW': 'Parents Are Watching', 'QPSA': 'Que Pasa',
    'ROFL': 'Rolling On Floor Laughing', 'ROFLOL': 'Rolling On Floor Laughing Out Loud',
    'SK8': 'Skate', 'THX': 'Thank You', 'TTFN': 'Ta-Ta For Now',
    'TTYL': 'Talk To You Later', 'U': 'You', 'U2': 'You Too',
    'U4E': 'Yours For Ever', 'WB': 'Welcome Back',
    'WTF': 'What The F', 'WTG': 'Way To Go',
    'WUF': 'Where Are You From', 'W8': 'Wait',
    'TFW': 'That feeling when', 'MFW': 'My face when',
    'MRW': 'My reaction when', 'IFYP': 'I feel your pain',
    'TNTL': 'Trying not to laugh', 'JK': 'Just kidding',
    'IDC': "I don't care", 'ILY': 'I love you',
    'IMU': 'I miss you', 'ADIH': 'Another day in hell',
    'ZZZ': 'Sleeping', 'WYWH': 'Wish you were here',
    'TIME': 'Tears in my eyes', 'BAE': 'Before anyone else',
    'FIMH': 'Forever in my heart', 'BSAAW': 'Big smile and a wink',
    'BWL': 'Bursting with laughter', 'BFF': 'Best friends forever',
    'CSL': "Can't stop laughing"
}

def preprocess_text(text):
    """
    Comprehensive text preprocessing function that combines all steps
    """
    if not isinstance(text, str) or not text.strip():
        return ""
    
    # Convert to lowercase
    text = text.lower()
    
    # Remove emojis using emoji package
    text = emoji.replace_emoji(text, '')
    
    # Remove URLs and HTML
    text = URL_PATTERN.sub('', text)
    text = HTML_PATTERN.sub('', text)
    
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Split into words and process
    words = text.split()
    
    # Convert chat words
    processed_words = []
    for word in words:
        # Check for chat words
        if word.upper() in CHAT_WORDS:
            word_expanded = CHAT_WORDS[word.upper()]
            # Add words from expanded chat words
            processed_words.extend(word_expanded.lower().split())
        else:
            processed_words.append(word)
    
    return ' '.join(processed_words)

# Apply preprocessing to both datasets
train_data['cleaned'] = train_data['text'].apply(preprocess_text)
val_data['cleaned'] = val_data['text'].apply(preprocess_text)

In [10]:
# from autocorrect import Speller

# spell = Speller(lang="en")  # English spell checker

# train_data['lower'] = train_data['lower'].apply(lambda text: spell(text))
# val_data['lower'] = val_data['lower'].apply(lambda text: spell(text))

In [11]:
# import spacy
# nlp = spacy.load('en_core_web_sm')

# def lemmatized(text):
#   doc = nlp(text)
#   return ' '.join([token.lemma_ for token in doc])

# train_data['lower'] = train_data['lower'].apply(lambda text: lemmatized(text))
# val_data['lower'] = val_data['lower'].apply(lambda text: lemmatized(text))

In [14]:
pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', 1000)

train_data

Unnamed: 0,id,company,type,text,cleaned
0,2401,Borderlands,Positive,"im getting on borderlands and i will murder you all ,",im getting on borderlands and i will murder you all
1,2401,Borderlands,Positive,"I am coming to the borders and I will kill you all,",i am coming to the borders and i will kill you all
2,2401,Borderlands,Positive,"im getting on borderlands and i will kill you all,",im getting on borderlands and i will kill you all
3,2401,Borderlands,Positive,"im coming on borderlands and i will murder you all,",im coming on borderlands and i will murder you all
4,2401,Borderlands,Positive,"im getting on borderlands 2 and i will murder you me all,",im getting on borderlands 2 and i will murder you me all
...,...,...,...,...,...
74677,9200,Nvidia,Positive,Just realized that the Windows partition of my Mac is like 6 years behind Nvidia drivers and I have no idea how I did not notice,just realized that the windows partition of my mac is like 6 years behind nvidia drivers and i have no idea how i did not notice
74678,9200,Nvidia,Positive,Just realized that my Mac window partition is 6 years behind on Nvidia drivers and I have no idea how I didn't notice,just realized that my mac window partition is 6 years behind on nvidia drivers and i have no idea how i didnt notice
74679,9200,Nvidia,Positive,Just realized the windows partition of my Mac is now 6 years behind on Nvidia drivers and I have no idea how he didn’t notice,just realized the windows partition of my mac is now 6 years behind on nvidia drivers and i have no idea how he didn’t notice
74680,9200,Nvidia,Positive,Just realized between the windows partition of my Mac is like being 6 years behind on Nvidia drivers and cars I have no fucking idea how I ever didn ’ t notice,just realized between the windows partition of my mac is like being 6 years behind on nvidia drivers and cars i have no fucking idea how i ever didn ’ t notice


In [15]:
pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', 1000)

val_data

Unnamed: 0,id,information,type,text,cleaned
0,3364,Facebook,Irrelevant,"I mentioned on Facebook that I was struggling for motivation to go for a run the other day, which has been translated by Tom’s great auntie as ‘Hayley can’t get out of bed’ and told to his grandma, who now thinks I’m a lazy, terrible person 🤣",i mentioned on facebook that i was struggling for motivation to go for a run the other day which has been translated by tom’s great auntie as ‘hayley can’t get out of bed’ and told to his grandma who now thinks i’m a lazy terrible person
1,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects claims company acted like a 'drug dealer' bbc.co.uk/news/av/busine…,bbc news amazon boss jeff bezos rejects claims company acted like a drug dealer bbccouknewsavbusine…
2,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it functions so poorly on my @SamsungUS Chromebook? 🙄,microsoft why do i pay for word when it functions so poorly on my samsungus chromebook
3,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking, it's a truly awful game.",csgo matchmaking is so full of closet hacking its a truly awful game
4,4433,Google,Neutral,Now the President is slapping Americans in the face that he really did commit an unlawful act after his acquittal! From Discover on Google vanityfair.com/news/2020/02/t…,now the president is slapping americans in the face that he really did commit an unlawful act after his acquittal from discover on google vanityfaircomnews202002t…
...,...,...,...,...,...
995,4891,GrandTheftAuto(GTA),Irrelevant,"⭐️ Toronto is the arts and culture capital of Canada, it’s no wonder! If you want to start planning, be sure to check out our GTA Real Estate market report for Fall 2020, it has all the info you need to finally make a move! blog.remax.ca/toronto-housin… twitter.com/kevinyoufool/s…",toronto is the arts and culture capital of canada it’s no wonder if you want to start planning be sure to check out our gta real estate market report for fall 2020 it has all the info you need to finally make a move blogremaxcatorontohousin… twittercomkevinyoufools…
996,4359,CS-GO,Irrelevant,tHIS IS ACTUALLY A GOOD MOVE TOT BRING MORE VIEWERS.\n\nI was one of those people who got hooked into csgo by watching tournaments first before playing the game. And seeing these players grew is like a netflix docu series for me. Can't wait for 2021.,this is actually a good move tot bring more viewers i was one of those people who got hooked into csgo by watching tournaments first before playing the game and seeing these players grew is like a netflix docu series for me cant wait for 2021
997,2652,Borderlands,Positive,Today sucked so it’s time to drink wine n play borderlands until the sun comes up so I can hate myself all day tomorrow.,today sucked so it’s tears in my eyes to drink wine n play borderlands until the sun comes up so i can hate myself all day tomorrow
998,8069,Microsoft,Positive,Bought a fraction of Microsoft today. Small wins.,bought a fraction of microsoft today small wins


In [16]:
train_data['type'] = train_data['type'].replace('Irrelevant', 'Neutral')
val_data['type'] = val_data['type'].replace('Irrelevant', 'Neutral')

In [17]:
train_data['type'].value_counts() / len(train_data)

type
Neutral     0.422086
Negative    0.302808
Positive    0.275106
Name: count, dtype: float64

In [18]:
val_data['type'].value_counts() / len(val_data)

type
Neutral     0.457
Positive    0.277
Negative    0.266
Name: count, dtype: float64

In [16]:
tokens_text = [word_tokenize((word)) for word in train_data['cleaned']]
tokens_counter = [item for sublist in tokens_text for item in sublist]

print(len(set(tokens_counter)))

40119


In [51]:
bow_count = TfidfVectorizer(tokenizer=word_tokenize, ngram_range=(1, 4))

reviews_train, reviews_test = train_test_split(train_data, test_size=0.2, shuffle=True, random_state=0, stratify=train_data['type'])

x_train_bow = bow_count.fit_transform(reviews_train['cleaned'])
x_test_bow = bow_count.transform(reviews_test['cleaned'])
x_val_bow = bow_count.transform(val_data['cleaned'])

y_train = reviews_train['type']
y_test = reviews_test['type']
y_val = val_data['type']

le = LabelEncoder()
y_train_le = le.fit_transform(y_train)
y_test_le = le.transform(y_test)
y_val_le = le.transform(y_val)

smote = SMOTE(random_state=4, sampling_strategy='not majority')
x_train_sm, y_train_sm = smote.fit_resample(x_train_bow, y_train_le)



In [70]:
best_score = 0
best_params = {}

for c in [1, 5, 10]:
    for iter in [20, 50, 100]:
        model = LogisticRegression(C=c, solver="liblinear", max_iter=iter)
        model.fit(x_train_sm, y_train_sm)
        test_pred = model.predict(x_test_bow)
        acc = accuracy_score(y_test_le, test_pred)
        print(f"c={c}, max_iter={iter} → Accuracy: {acc}")
        
        if acc > best_score:
            best_score = acc
            best_params = {'c': c, 'max_iter': iter}

print("Best Accuracy:", best_score)
print("Best Params:", best_params)

c=1, max_iter=20 → Accuracy: 0.8672202065308401
c=1, max_iter=50 → Accuracy: 0.8672202065308401
c=1, max_iter=100 → Accuracy: 0.8672202065308401




c=5, max_iter=20 → Accuracy: 0.9157130895897293
c=5, max_iter=50 → Accuracy: 0.915852637454647
c=5, max_iter=100 → Accuracy: 0.915852637454647




c=10, max_iter=20 → Accuracy: 0.9139687412782584
c=10, max_iter=50 → Accuracy: 0.9231789003628245
c=10, max_iter=100 → Accuracy: 0.9231789003628245
Best Accuracy: 0.9231789003628245
Best Params: {'c': 10, 'max_iter': 50}


In [71]:
model2 = LogisticRegression(random_state=4, C=5, solver="liblinear", max_iter=50)
model2.fit(x_train_sm, y_train_sm)
test_pred_2 = model2.predict(x_test_bow)
print(accuracy_score(y_test_le, test_pred_2) * 100)

val_pred_2 = model2.predict(x_val_bow)
accuracy_score(y_val_le, val_pred_2) * 100

91.5852637454647


98.7

In [59]:
best_score = 0
best_params = {}

for c in [1, 3, 5]:
    for iter in [1, 10]:
        model = LinearSVC(C=c, max_iter=iter)
        model.fit(x_train_sm, y_train_sm)
        test_pred = model.predict(x_test_bow)
        acc = accuracy_score(y_test_le, test_pred)
        print(f"c={c}, max_iter={iter} → Accuracy: {acc}")
        
        if acc > best_score:
            best_score = acc
            best_params = {'c': c, 'max_iter': iter}

print("Best Accuracy:", best_score)
print("Best Params:", best_params)



c=1, max_iter=1 → Accuracy: 0.8771978788724533




c=1, max_iter=10 → Accuracy: 0.9326681551772258




c=3, max_iter=1 → Accuracy: 0.8745464694390176




c=3, max_iter=10 → Accuracy: 0.9350404688808261




c=5, max_iter=1 → Accuracy: 0.885221881105219
c=5, max_iter=10 → Accuracy: 0.9354591124755791
Best Accuracy: 0.9354591124755791
Best Params: {'c': 5, 'max_iter': 10}




In [61]:
model3 = LinearSVC(random_state=4, C=3, max_iter=10)
model3.fit(x_train_sm, y_train_sm)
test_pred_3 = model3.predict(x_test_bow)
print(accuracy_score(y_test_le, test_pred_3) * 100)

val_pred_3 = model3.predict(x_val_bow)
accuracy_score(y_val_le, val_pred_3) * 100

93.41334077588613




98.8