In [17]:
import numpy as np
import pandas as pd
import os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
import re
import nltk
from nltk import word_tokenize
nltk.download('stopwords')
import string
import emoji
from nltk.corpus import stopwords
import ssl

[nltk_data] Downloading package stopwords to C:\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [18]:
# Handle SSL certificate verification
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

# Download ALL necessary resources in one go
nltk.download('punkt_tab')
nltk.download('punkt')
nltk.download('perluniprops')
nltk.download('universal_tagset')
nltk.download('stopwords')

# Clear the NLTK cache
nltk.data.clear_cache()

[nltk_data] Downloading package punkt_tab to C:\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to C:\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package perluniprops to C:\nltk_data...
[nltk_data]   Package perluniprops is already up-to-date!
[nltk_data] Downloading package universal_tagset to C:\nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!
[nltk_data] Downloading package stopwords to C:\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [19]:
# pip install emoji
# pip install autocorrect

In [20]:
train_data = pd.read_csv('./dataset/twitter_training.csv', header=None)
val_data = pd.read_csv('./dataset/twitter_validation.csv', header=None)

In [21]:
train_data.columns = ['id', 'company', 'type', 'text']
val_data.columns = ['id', 'information', 'type', 'text']
train_data

Unnamed: 0,id,company,type,text
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
...,...,...,...,...
74677,9200,Nvidia,Positive,Just realized that the Windows partition of my...
74678,9200,Nvidia,Positive,Just realized that my Mac window partition is ...
74679,9200,Nvidia,Positive,Just realized the windows partition of my Mac ...
74680,9200,Nvidia,Positive,Just realized between the windows partition of...


In [22]:
print(train_data.shape)
print(val_data.shape)

(74682, 4)
(1000, 4)


In [23]:
print(train_data.isnull().sum())
print(val_data.isnull().sum())

id           0
company      0
type         0
text       686
dtype: int64
id             0
information    0
type           0
text           0
dtype: int64


In [24]:
print(train_data.duplicated().sum())
print(val_data.duplicated().sum())

2700
0


In [25]:
train_data.dropna(inplace=True)
train_data.drop_duplicates(inplace=True)
train_data.shape

(71656, 4)

In [26]:
# Initialize constants
URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')
HTML_PATTERN = re.compile(r'<.*?>')

# Chat word mappings
CHAT_WORDS = {
    'AFAIK': 'As Far As I Know', 'AFK': 'Away From Keyboard', 'ASAP': 'As Soon As Possible',
    'ATK': 'At The Keyboard', 'ATM': 'At The Moment', 'A3': 'Anytime, Anywhere, Anyplace',
    'BAK': 'Back At Keyboard', 'BBL': 'Be Back Later', 'BBS': 'Be Back Soon',
    'BFN': 'Bye For Now', 'B4N': 'Bye For Now', 'BRB': 'Be Right Back',
    'BRT': 'Be Right There', 'BTW': 'By The Way', 'B4': 'Before',
    'CU': 'See You', 'CUL8R': 'See You Later', 'CYA': 'See You',
    'FAQ': 'Frequently Asked Questions', 'FC': 'Fingers Crossed',
    'FWIW': "For What It's Worth", 'FYI': 'For Your Information',
    'GAL': 'Get A Life', 'GG': 'Good Game', 'GN': 'Good Night',
    'GMTA': 'Great Minds Think Alike', 'GR8': 'Great!', 'G9': 'Genius',
    'IC': 'I See', 'ICQ': 'I Seek you', 'ILU': 'I Love You',
    'IMHO': 'In My Honest Opinion', 'IMO': 'In My Opinion',
    'IOW': 'In Other Words', 'IRL': 'In Real Life',
    'KISS': 'Keep It Simple', 'LDR': 'Long Distance Relationship',
    'LMAO': 'Laugh My Off', 'LOL': 'Laughing Out Loud',
    'LTNS': 'Long Time No See', 'L8R': 'Later',
    'MTE': 'My Thoughts Exactly', 'M8': 'Mate',
    'NRN': 'No Reply Necessary', 'OIC': 'Oh I See',
    'PITA': 'Pain In The A', 'PRT': 'Party',
    'PRW': 'Parents Are Watching', 'QPSA': 'Que Pasa',
    'ROFL': 'Rolling On Floor Laughing', 'ROFLOL': 'Rolling On Floor Laughing Out Loud',
    'SK8': 'Skate', 'THX': 'Thank You', 'TTFN': 'Ta-Ta For Now',
    'TTYL': 'Talk To You Later', 'U': 'You', 'U2': 'You Too',
    'U4E': 'Yours For Ever', 'WB': 'Welcome Back',
    'WTF': 'What The F', 'WTG': 'Way To Go',
    'WUF': 'Where Are You From', 'W8': 'Wait',
    'TFW': 'That feeling when', 'MFW': 'My face when',
    'MRW': 'My reaction when', 'IFYP': 'I feel your pain',
    'TNTL': 'Trying not to laugh', 'JK': 'Just kidding',
    'IDC': "I don't care", 'ILY': 'I love you',
    'IMU': 'I miss you', 'ADIH': 'Another day in hell',
    'ZZZ': 'Sleeping', 'WYWH': 'Wish you were here',
    'TIME': 'Tears in my eyes', 'BAE': 'Before anyone else',
    'FIMH': 'Forever in my heart', 'BSAAW': 'Big smile and a wink',
    'BWL': 'Bursting with laughter', 'BFF': 'Best friends forever',
    'CSL': "Can't stop laughing"
}

def preprocess_text(text):
    """
    Comprehensive text preprocessing function that combines all steps
    """
    if not isinstance(text, str) or not text.strip():
        return ""
    
    # Convert to lowercase
    text = text.lower()
    
    # Remove emojis using emoji package
    text = emoji.replace_emoji(text, '')
    
    # Remove URLs and HTML
    text = URL_PATTERN.sub('', text)
    text = HTML_PATTERN.sub('', text)
    
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Split into words and process
    words = text.split()
    
    # Convert chat words
    processed_words = []
    for word in words:
        # Check for chat words
        if word.upper() in CHAT_WORDS:
            word_expanded = CHAT_WORDS[word.upper()]
            # Add words from expanded chat words
            processed_words.extend(word_expanded.lower().split())
        else:
            processed_words.append(word)
    
    return ' '.join(processed_words)

# Apply preprocessing to both datasets
train_data['cleaned'] = train_data['text'].apply(preprocess_text)
val_data['cleaned'] = val_data['text'].apply(preprocess_text)

In [27]:
train_data

Unnamed: 0,id,company,type,text,cleaned
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...,i am coming to the borders and i will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...,im getting on borderlands and i will kill you all
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...,im getting on borderlands 2 and i will murder ...
...,...,...,...,...,...
74677,9200,Nvidia,Positive,Just realized that the Windows partition of my...,just realized that the windows partition of my...
74678,9200,Nvidia,Positive,Just realized that my Mac window partition is ...,just realized that my mac window partition is ...
74679,9200,Nvidia,Positive,Just realized the windows partition of my Mac ...,just realized the windows partition of my mac ...
74680,9200,Nvidia,Positive,Just realized between the windows partition of...,just realized between the windows partition of...


In [16]:
# pd.set_option('display.max_colwidth', None)  # Show full column width
pd.set_option('display.width', 1000)  # Avoid line breaks

train_data.head(5)

Unnamed: 0,id,company,type,text,cleaned
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...,i am coming to the borders and i will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...,im getting on borderlands and i will kill you all
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...,im getting on borderlands 2 and i will murder ...


In [28]:
val_data

Unnamed: 0,id,information,type,text,cleaned
0,3364,Facebook,Irrelevant,I mentioned on Facebook that I was struggling ...,i mentioned on facebook that i was struggling ...
1,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...,bbc news amazon boss jeff bezos rejects claims...
2,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...,microsoft why do i pay for word when it functi...
3,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,...",csgo matchmaking is so full of closet hacking ...
4,4433,Google,Neutral,Now the President is slapping Americans in the...,now the president is slapping americans in the...
...,...,...,...,...,...
995,4891,GrandTheftAuto(GTA),Irrelevant,⭐️ Toronto is the arts and culture capital of ...,toronto is the arts and culture capital of can...
996,4359,CS-GO,Irrelevant,tHIS IS ACTUALLY A GOOD MOVE TOT BRING MORE VI...,this is actually a good move tot bring more vi...
997,2652,Borderlands,Positive,Today sucked so it’s time to drink wine n play...,today sucked so it’s tears in my eyes to drink...
998,8069,Microsoft,Positive,Bought a fraction of Microsoft today. Small wins.,bought a fraction of microsoft today small wins


In [29]:
# from autocorrect import Speller

# spell = Speller(lang="en")  # English spell checker

# train_data['lower'] = train_data['lower'].apply(lambda text: spell(text))
# val_data['lower'] = val_data['lower'].apply(lambda text: spell(text))

In [30]:
# import spacy
# nlp = spacy.load('en_core_web_sm')

# def lemmatized(text):
#   doc = nlp(text)
#   return ' '.join([token.lemma_ for token in doc])

# train_data['lower'] = train_data['lower'].apply(lambda text: lemmatized(text))
# val_data['lower'] = val_data['lower'].apply(lambda text: lemmatized(text))

In [31]:
tokens_text = [word_tokenize((word)) for word in train_data['cleaned']]
tokens_counter = [item for sublist in tokens_text for item in sublist]

print(len(set(tokens_counter)))

40112


In [32]:
reviews_train, reviews_test = train_test_split(train_data, test_size=0.2, random_state=0)

In [33]:
bow_count = CountVectorizer(
    tokenizer=word_tokenize,
    ngram_range=(1, 1)
)

X_train_bow = bow_count.fit_transform(reviews_train['cleaned'])
X_test_bow = bow_count.transform(reviews_test['cleaned'])
X_val_bow = bow_count.transform(val_data['cleaned'])



In [34]:
y_train_bow = reviews_train['type']
y_test_bow = reviews_test['type']
y_val_bow = val_data['type']

In [35]:
y_test_bow.value_counts() / y_test_bow.shape[0]

type
Negative      0.300447
Positive      0.273723
Neutral       0.248953
Irrelevant    0.176877
Name: count, dtype: float64

In [36]:
model1 = LogisticRegression(C=1, solver="liblinear", max_iter=200)
model1.fit(X_train_bow, y_train_bow)

test_pred = model1.predict(X_test_bow)
accuracy_score(y_test_bow, test_pred) * 100



81.698297516048

In [37]:
val_pred = model1.predict(X_val_bow)
accuracy_score(y_val_bow, val_pred) * 100

92.9

In [38]:
bow_count = CountVectorizer(
    tokenizer=word_tokenize,
    ngram_range=(1, 4)
)

X_train_bow = bow_count.fit_transform(reviews_train['cleaned'])
X_test_bow = bow_count.transform(reviews_test['cleaned'])
X_val_bow = bow_count.transform(val_data['cleaned'])



In [39]:
model2 = LogisticRegression(C=0.9, solver="liblinear",max_iter=1500)
model2.fit(X_train_bow, y_train_bow)
test_pred_2 = model2.predict(X_test_bow)
accuracy_score(y_test_bow, test_pred_2) * 100

90.18978509628802

In [40]:
val_pred_2 = model2.predict(X_val_bow)
accuracy_score(y_val_bow, val_pred_2) * 100

98.6

In [56]:
le = LabelEncoder()
y_train_bow_num = le.fit_transform(y_train_bow)
y_test_bow_num = le.transform(y_test_bow)
y_val_bow_num = le.transform(y_val_bow)

In [None]:
RF = RandomForestClassifier()
RF.fit(X_train_bow, y_train_bow_num)

test_pred_2 = RF.predict(X_test_bow)
accuracy_score(y_test_bow_num, test_pred_2) * 100

In [None]:
val_pred_2 = RF.predict(X_val_bow)
accuracy_score(y_val_bow_num, val_pred_2) * 100