In [18]:
import numpy as np
import pandas as pd
import os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
import re
import nltk
from nltk import word_tokenize
nltk.download('stopwords')
import string
import emoji
from nltk.corpus import stopwords
import ssl
from imblearn.over_sampling import SMOTE
from sklearn.svm import LinearSVC

[nltk_data] Downloading package stopwords to C:\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Handle SSL certificate verification
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

# Download ALL necessary resources in one go
nltk.download('punkt_tab')
nltk.download('punkt')
nltk.download('perluniprops')
nltk.download('universal_tagset')
nltk.download('stopwords')

# Clear the NLTK cache
nltk.data.clear_cache()

[nltk_data] Downloading package punkt_tab to C:\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to C:\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package perluniprops to C:\nltk_data...
[nltk_data]   Package perluniprops is already up-to-date!
[nltk_data] Downloading package universal_tagset to C:\nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!
[nltk_data] Downloading package stopwords to C:\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
train_data = pd.read_csv('./dataset/twitter_training.csv', header=None)
val_data = pd.read_csv('./dataset/twitter_validation.csv', header=None)

In [4]:
train_data.columns = ['id', 'company', 'type', 'text']
val_data.columns = ['id', 'information', 'type', 'text']
train_data

Unnamed: 0,id,company,type,text
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
...,...,...,...,...
74677,9200,Nvidia,Positive,Just realized that the Windows partition of my...
74678,9200,Nvidia,Positive,Just realized that my Mac window partition is ...
74679,9200,Nvidia,Positive,Just realized the windows partition of my Mac ...
74680,9200,Nvidia,Positive,Just realized between the windows partition of...


In [5]:
print(train_data.shape)
print(val_data.shape)

(74682, 4)
(1000, 4)


In [6]:
print(train_data.isnull().sum() / len(train_data))
print(val_data.isnull().sum() / len(val_data))

id         0.000000
company    0.000000
type       0.000000
text       0.009186
dtype: float64
id             0.0
information    0.0
type           0.0
text           0.0
dtype: float64


In [7]:
print(train_data.duplicated().sum() / len(train_data))
print(val_data.duplicated().sum() / len(val_data))

0.036153289949385395
0.0


In [8]:
train_data.dropna(inplace=True)
train_data.drop_duplicates(inplace=True)
train_data.shape

(71656, 4)

In [9]:
# Initialize constants
URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')
HTML_PATTERN = re.compile(r'<.*?>')

# Chat word mappings
CHAT_WORDS = {
    'AFAIK': 'As Far As I Know', 'AFK': 'Away From Keyboard', 'ASAP': 'As Soon As Possible',
    'ATK': 'At The Keyboard', 'ATM': 'At The Moment', 'A3': 'Anytime, Anywhere, Anyplace',
    'BAK': 'Back At Keyboard', 'BBL': 'Be Back Later', 'BBS': 'Be Back Soon',
    'BFN': 'Bye For Now', 'B4N': 'Bye For Now', 'BRB': 'Be Right Back',
    'BRT': 'Be Right There', 'BTW': 'By The Way', 'B4': 'Before',
    'CU': 'See You', 'CUL8R': 'See You Later', 'CYA': 'See You',
    'FAQ': 'Frequently Asked Questions', 'FC': 'Fingers Crossed',
    'FWIW': "For What It's Worth", 'FYI': 'For Your Information',
    'GAL': 'Get A Life', 'GG': 'Good Game', 'GN': 'Good Night',
    'GMTA': 'Great Minds Think Alike', 'GR8': 'Great!', 'G9': 'Genius',
    'IC': 'I See', 'ICQ': 'I Seek you', 'ILU': 'I Love You',
    'IMHO': 'In My Honest Opinion', 'IMO': 'In My Opinion',
    'IOW': 'In Other Words', 'IRL': 'In Real Life',
    'KISS': 'Keep It Simple', 'LDR': 'Long Distance Relationship',
    'LMAO': 'Laugh My Off', 'LOL': 'Laughing Out Loud',
    'LTNS': 'Long Time No See', 'L8R': 'Later',
    'MTE': 'My Thoughts Exactly', 'M8': 'Mate',
    'NRN': 'No Reply Necessary', 'OIC': 'Oh I See',
    'PITA': 'Pain In The A', 'PRT': 'Party',
    'PRW': 'Parents Are Watching', 'QPSA': 'Que Pasa',
    'ROFL': 'Rolling On Floor Laughing', 'ROFLOL': 'Rolling On Floor Laughing Out Loud',
    'SK8': 'Skate', 'THX': 'Thank You', 'TTFN': 'Ta-Ta For Now',
    'TTYL': 'Talk To You Later', 'U': 'You', 'U2': 'You Too',
    'U4E': 'Yours For Ever', 'WB': 'Welcome Back',
    'WTF': 'What The F', 'WTG': 'Way To Go',
    'WUF': 'Where Are You From', 'W8': 'Wait',
    'TFW': 'That feeling when', 'MFW': 'My face when',
    'MRW': 'My reaction when', 'IFYP': 'I feel your pain',
    'TNTL': 'Trying not to laugh', 'JK': 'Just kidding',
    'IDC': "I don't care", 'ILY': 'I love you',
    'IMU': 'I miss you', 'ADIH': 'Another day in hell',
    'ZZZ': 'Sleeping', 'WYWH': 'Wish you were here',
    'TIME': 'Tears in my eyes', 'BAE': 'Before anyone else',
    'FIMH': 'Forever in my heart', 'BSAAW': 'Big smile and a wink',
    'BWL': 'Bursting with laughter', 'BFF': 'Best friends forever',
    'CSL': "Can't stop laughing"
}

def preprocess_text(text):
    """
    Comprehensive text preprocessing function that combines all steps
    """
    if not isinstance(text, str) or not text.strip():
        return ""
    
    # Convert to lowercase
    text = text.lower()
    
    # Remove emojis using emoji package
    text = emoji.replace_emoji(text, '')
    
    # Remove URLs and HTML
    text = URL_PATTERN.sub('', text)
    text = HTML_PATTERN.sub('', text)
    
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Split into words and process
    words = text.split()
    
    # Convert chat words
    processed_words = []
    for word in words:
        # Check for chat words
        if word.upper() in CHAT_WORDS:
            word_expanded = CHAT_WORDS[word.upper()]
            # Add words from expanded chat words
            processed_words.extend(word_expanded.lower().split())
        else:
            processed_words.append(word)
    
    return ' '.join(processed_words)

# Apply preprocessing to both datasets
train_data['cleaned'] = train_data['text'].apply(preprocess_text)
val_data['cleaned'] = val_data['text'].apply(preprocess_text)

In [10]:
# from autocorrect import Speller

# spell = Speller(lang="en")  # English spell checker

# train_data['lower'] = train_data['lower'].apply(lambda text: spell(text))
# val_data['lower'] = val_data['lower'].apply(lambda text: spell(text))

In [11]:
# import spacy
# nlp = spacy.load('en_core_web_sm')

# def lemmatized(text):
#   doc = nlp(text)
#   return ' '.join([token.lemma_ for token in doc])

# train_data['lower'] = train_data['lower'].apply(lambda text: lemmatized(text))
# val_data['lower'] = val_data['lower'].apply(lambda text: lemmatized(text))

In [12]:
pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', 1000)

train_data.head(20)

Unnamed: 0,id,company,type,text,cleaned
0,2401,Borderlands,Positive,"im getting on borderlands and i will murder you all ,",im getting on borderlands and i will murder you all
1,2401,Borderlands,Positive,"I am coming to the borders and I will kill you all,",i am coming to the borders and i will kill you all
2,2401,Borderlands,Positive,"im getting on borderlands and i will kill you all,",im getting on borderlands and i will kill you all
3,2401,Borderlands,Positive,"im coming on borderlands and i will murder you all,",im coming on borderlands and i will murder you all
4,2401,Borderlands,Positive,"im getting on borderlands 2 and i will murder you me all,",im getting on borderlands 2 and i will murder you me all
5,2401,Borderlands,Positive,"im getting into borderlands and i can murder you all,",im getting into borderlands and i can murder you all
6,2402,Borderlands,Positive,So I spent a few hours making something for fun. . . If you don't know I am a HUGE @Borderlands fan and Maya is one of my favorite characters. So I decided to make myself a wallpaper for my PC. . Here is the original image versus the creation I made :) Enjoy! pic.twitter.com/mLsI5wf9Jg,so i spent a few hours making something for fun if you dont know i am a huge borderlands fan and maya is one of my favorite characters so i decided to make myself a wallpaper for my pc here is the original image versus the creation i made enjoy pictwittercommlsi5wf9jg
7,2402,Borderlands,Positive,"So I spent a couple of hours doing something for fun... If you don't know that I'm a huge @ Borderlands fan and Maya is one of my favorite characters, I decided to make a wallpaper for my PC.. Here's the original picture compared to the creation I made:) Have fun! pic.twitter.com / mLsI5wf9Jg",so i spent a couple of hours doing something for fun if you dont know that im a huge borderlands fan and maya is one of my favorite characters i decided to make a wallpaper for my pc heres the original picture compared to the creation i made have fun pictwittercom mlsi5wf9jg
8,2402,Borderlands,Positive,So I spent a few hours doing something for fun... If you don't know I'm a HUGE @ Borderlands fan and Maya is one of my favorite characters.,so i spent a few hours doing something for fun if you dont know im a huge borderlands fan and maya is one of my favorite characters
9,2402,Borderlands,Positive,So I spent a few hours making something for fun. . . If you don't know I am a HUGE RhandlerR fan and Maya is one of my favorite characters. So I decided to make myself a wallpaper for my PC. . Here is the original image versus the creation I made :) Enjoy! pic.twitter.com/mLsI5wf9Jg,so i spent a few hours making something for fun if you dont know i am a huge rhandlerr fan and maya is one of my favorite characters so i decided to make myself a wallpaper for my pc here is the original image versus the creation i made enjoy pictwittercommlsi5wf9jg


In [13]:
pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', 1000)

val_data.head(20)

Unnamed: 0,id,information,type,text,cleaned
0,3364,Facebook,Irrelevant,"I mentioned on Facebook that I was struggling for motivation to go for a run the other day, which has been translated by Tom’s great auntie as ‘Hayley can’t get out of bed’ and told to his grandma, who now thinks I’m a lazy, terrible person 🤣",i mentioned on facebook that i was struggling for motivation to go for a run the other day which has been translated by tom’s great auntie as ‘hayley can’t get out of bed’ and told to his grandma who now thinks i’m a lazy terrible person
1,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects claims company acted like a 'drug dealer' bbc.co.uk/news/av/busine…,bbc news amazon boss jeff bezos rejects claims company acted like a drug dealer bbccouknewsavbusine…
2,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it functions so poorly on my @SamsungUS Chromebook? 🙄,microsoft why do i pay for word when it functions so poorly on my samsungus chromebook
3,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking, it's a truly awful game.",csgo matchmaking is so full of closet hacking its a truly awful game
4,4433,Google,Neutral,Now the President is slapping Americans in the face that he really did commit an unlawful act after his acquittal! From Discover on Google vanityfair.com/news/2020/02/t…,now the president is slapping americans in the face that he really did commit an unlawful act after his acquittal from discover on google vanityfaircomnews202002t…
5,6273,FIFA,Negative,"Hi @EAHelp I’ve had Madeleine McCann in my cellar for the past 13 years and the little sneaky thing just escaped whilst I was loading up some fifa points, she took my card and I’m having to use my paypal account but it isn’t working, can you help me resolve it please?",hi eahelp i’ve had madeleine mccann in my cellar for the past 13 years and the little sneaky thing just escaped whilst i was loading up some fifa points she took my card and i’m having to use my paypal account but it isn’t working can you help me resolve it please
6,7925,MaddenNFL,Positive,Thank you @EAMaddenNFL!! \n\nNew TE Austin Hooper in the ORANGE & BROWN!! \n\n#Browns | @AustinHooper18 \n\n pic.twitter.com/GRg4xzFKOn,thank you eamaddennfl new te austin hooper in the orange brown browns austinhooper18 pictwittercomgrg4xzfkon
7,11332,TomClancysRainbowSix,Positive,"Rocket League, Sea of Thieves or Rainbow Six: Siege🤔? I love playing all three on stream but which is the best? #stream #twitch #RocketLeague #SeaOfThieves #RainbowSixSiege #follow",rocket league sea of thieves or rainbow six siege i love playing all three on stream but which is the best stream twitch rocketleague seaofthieves rainbowsixsiege follow
8,1107,AssassinsCreed,Positive,my ass still knee-deep in Assassins Creed Odyssey with no way out anytime soon lmao,my ass still kneedeep in assassins creed odyssey with no way out anytime soon laugh my off
9,2069,CallOfDuty,Negative,FIX IT JESUS ! Please FIX IT ! What In the world is going on here. @PlayStation @AskPlayStation @Playstationsup @Treyarch @CallofDuty negative 345 silver wolf error code pic.twitter.com/ziRyhrf59Q,fix it jesus please fix it what in the world is going on here playstation askplaystation playstationsup treyarch callofduty negative 345 silver wolf error code pictwittercomziryhrf59q


In [14]:
train_data['type'] = train_data['type'].replace('Irrelevant', 'Neutral')
val_data['type'] = val_data['type'].replace('Irrelevant', 'Neutral')

In [15]:
train_data['type'].value_counts() / len(train_data)

type
Neutral     0.422086
Negative    0.302808
Positive    0.275106
Name: count, dtype: float64

In [16]:
tokens_text = [word_tokenize((word)) for word in train_data['cleaned']]
tokens_counter = [item for sublist in tokens_text for item in sublist]

print(len(set(tokens_counter)))

40119


In [19]:
reviews_train, reviews_test = train_test_split(train_data, test_size=0.2, shuffle=True, random_state=0, stratify=train_data['type'])

bow_count = CountVectorizer(tokenizer=word_tokenize, ngram_range=(1, 1))

X_train_bow = bow_count.fit_transform(reviews_train['cleaned'])
X_test_bow = bow_count.transform(reviews_test['cleaned'])
X_val_bow = bow_count.transform(val_data['cleaned'])

y_train = reviews_train['type']
y_test = reviews_test['type']
y_val = val_data['type']

smote = SMOTE(random_state=4, sampling_strategy='not majority')
X_train_sm, y_train_sm = smote.fit_resample(X_train_bow, y_train)



In [26]:
bow_count = CountVectorizer(
    tokenizer=word_tokenize,
    ngram_range=(1, 4)
)

X_train_bow = bow_count.fit_transform(reviews_train['cleaned'])
X_test_bow = bow_count.transform(reviews_test['cleaned'])
X_val_bow = bow_count.transform(val_data['cleaned'])

y_train = reviews_train['type']
y_test = reviews_test['type']
y_val = val_data['type']

le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_test_enc = le.transform(y_test)
y_val_enc = le.transform(y_val)

smote = SMOTE(random_state=4, sampling_strategy='not majority')
X_train_sm, y_train_sm = smote.fit_resample(X_train_bow, y_train_enc)



In [27]:
model1 = LogisticRegression(C=1, solver="liblinear", max_iter=200)
model1.fit(X_train_sm, y_train_sm)

test_pred = model1.predict(X_test_bow)
accuracy_score(y_test_enc, test_pred) * 100

89.55484231091265

In [28]:
val_pred = model1.predict(X_val_bow)
accuracy_score(y_val_enc, val_pred) * 100

98.3

In [38]:
bow_count = CountVectorizer(
    tokenizer=word_tokenize,
    ngram_range=(1, 4)
)

X_train_bow = bow_count.fit_transform(reviews_train['cleaned'])
X_test_bow = bow_count.transform(reviews_test['cleaned'])
X_val_bow = bow_count.transform(val_data['cleaned'])



In [99]:
bow_count = CountVectorizer(
    tokenizer=word_tokenize,
    ngram_range=(1, 4)
)

X_train_bow = bow_count.fit_transform(reviews_train['cleaned'])
X_test_bow = bow_count.transform(reviews_test['cleaned'])
X_val_bow = bow_count.transform(val_data['cleaned'])

y_train = reviews_train['type']
y_test = reviews_test['type']
y_val = val_data['type']

smote = SMOTE(random_state=4, sampling_strategy='not majority')
X_train_sm, y_train_sm = smote.fit_resample(X_train_bow, y_train)



In [23]:
bow_count = CountVectorizer(
    tokenizer=word_tokenize,
    ngram_range=(1, 4)
)

X_train_bow = bow_count.fit_transform(reviews_train['cleaned'])
X_test_bow = bow_count.transform(reviews_test['cleaned'])
X_val_bow = bow_count.transform(val_data['cleaned'])

y_train = reviews_train['type']
y_test = reviews_test['type']
y_val = val_data['type']

le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_test_enc = le.transform(y_test)
y_val_enc = le.transform(y_val)

smote = SMOTE(random_state=4, sampling_strategy='not majority')
X_train_sm, y_train_sm = smote.fit_resample(X_train_bow, y_train_enc)

In [100]:
model2 = LogisticRegression(C=0.9, solver="liblinear",max_iter=1500)
model2.fit(X_train_sm, y_train_sm)
test_pred_2 = model2.predict(X_test_bow)
accuracy_score(y_test, test_pred_2) * 100

89.52693273792912

In [101]:
val_pred_2 = model2.predict(X_val_bow)
accuracy_score(y_val, val_pred_2) * 100

98.1

In [None]:
model3 = LinearSVC(C=10, max_iter=2000, dual=False, loss='squared_hinge')
model3.fit(X_train_sm, y_train_sm)
test_pred_3 = model3.predict(X_test_bow)
accuracy_score(y_test_enc, test_pred_3) * 100

In [None]:
val_pred_3 = model3.predict(X_val_bow)
accuracy_score(y_val_enc, val_pred_3) * 100

95.6