In [36]:
import numpy as np
import pandas as pd
import os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
import re
import nltk
from nltk import word_tokenize
nltk.download('stopwords')
import string
import emoji
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to C:\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [17]:
import nltk
import ssl

# Handle SSL certificate verification
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

# Download ALL necessary resources in one go
nltk.download('punkt_tab')
nltk.download('punkt')
nltk.download('perluniprops')
nltk.download('universal_tagset')
nltk.download('stopwords')

# Clear the NLTK cache
nltk.data.clear_cache()

[nltk_data] Downloading package punkt_tab to C:\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to C:\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package perluniprops to C:\nltk_data...
[nltk_data]   Package perluniprops is already up-to-date!
[nltk_data] Downloading package universal_tagset to C:\nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!
[nltk_data] Downloading package stopwords to C:\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# pip install emoji
# pip install autocorrect

In [26]:
train_data = pd.read_csv('./dataset/twitter_training.csv', header=None)
val_data = pd.read_csv('./dataset/twitter_validation.csv', header=None)

In [27]:
train_data.columns = ['id', 'information', 'type', 'text']
val_data.columns = ['id', 'information', 'type', 'text']
train_data

Unnamed: 0,id,information,type,text
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
...,...,...,...,...
74677,9200,Nvidia,Positive,Just realized that the Windows partition of my...
74678,9200,Nvidia,Positive,Just realized that my Mac window partition is ...
74679,9200,Nvidia,Positive,Just realized the windows partition of my Mac ...
74680,9200,Nvidia,Positive,Just realized between the windows partition of...


In [29]:
train_data.shape

(74682, 4)

In [30]:
train_data.isnull().sum()

id               0
information      0
type             0
text           686
dtype: int64

In [31]:
val_data.isnull().sum()

id             0
information    0
type           0
text           0
dtype: int64

In [32]:
train_data.duplicated().sum()

2700

In [33]:
val_data.duplicated().sum()

0

In [34]:
train_data.dropna(inplace=True)
train_data.drop_duplicates(inplace=True)
train_data.shape

(71656, 4)

In [14]:
train_data['lower'] = train_data['text'].str.lower()
train_data['lower'] = [str(data) for data in train_data['lower']]

val_data['lower'] = val_data['text'].str.lower()
val_data['lower'] = [str(data) for data in val_data['lower']]

In [15]:
PUNCT_TO_REMOVE = string.punctuation
def remove_punctuation(text):
  return text.translate(str.maketrans('', '', PUNCT_TO_REMOVE))

train_data['lower'] = train_data['lower'].apply(lambda text: remove_punctuation(text))
val_data['lower'] = val_data['lower'].apply(lambda text: remove_punctuation(text))

In [16]:
from nltk.corpus import stopwords

STOP_WORDS = set(stopwords.words('english'))

def remove_stop_words(text):
  return " ".join([word for word in str(text).split() if word not in STOP_WORDS])

train_data['lower'] = train_data['lower'].apply(lambda text: remove_stop_words(text))
val_data['lower'] = val_data['lower'].apply(lambda text: remove_stop_words(text))

In [17]:
import re

def remove_emoji(text):
  emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
  return emoji_pattern.sub(r'', text)

train_data['lower'] = train_data['lower'].apply(lambda text: remove_emoji(text))
val_data['lower'] = val_data['lower'].apply(lambda text: remove_emoji(text))

In [18]:
def remove_urls(text):
  url_pattern = re.compile(r'https?://\S+|www\.\S+')
  return url_pattern.sub(r'', text)

train_data['lower'] = train_data['lower'].apply(lambda text: remove_urls(text))
val_data['lower'] = val_data['lower'].apply(lambda text: remove_urls(text))

In [19]:
def remove_html(text):
  html_pattern = re.compile('<.*?>')
  return html_pattern.sub(r'', text)

train_data['lower'] = train_data['lower'].apply(lambda text: remove_html(text))
val_data['lower'] = val_data['lower'].apply(lambda text: remove_html(text))

In [20]:
chat_word = {
    'AFAIK': 'As Far As I Know',
    'AFK': 'Away From Keyboard',
    'ASAP': 'As Soon As Possible',
    'ATK': 'At The Keyboard',
    'ATM': 'At The Moment',
    'A3': 'Anytime, Anywhere, Anyplace',
    'BAK': 'Back At Keyboard',
    'BBL': 'Be Back Later',
    'BBS': 'Be Back Soon',
    'BFN': 'Bye For Now',
    'B4N': 'Bye For Now',
    'BRB': 'Be Right Back',
    'BRT': 'Be Right There',
    'BTW': 'By The Way',
    'B4': 'Before',
    'CU': 'See You',
    'CUL8R': 'See You Later',
    'CYA': 'See You',
    'FAQ': 'Frequently Asked Questions',
    'FC': 'Fingers Crossed',
    'FWIW': "For What It's Worth",
    'FYI': 'For Your Information',
    'GAL': 'Get A Life',
    'GG': 'Good Game',
    'GN': 'Good Night',
    'GMTA': 'Great Minds Think Alike',
    'GR8': 'Great!',
    'G9': 'Genius',
    'IC': 'I See',
    'ICQ': 'I Seek you (also a chat program)',
    'ILU': 'ILU: I Love You',
    'IMHO': 'In My Honest/Humble Opinion',
    'IMO': 'In My Opinion',
    'IOW': 'In Other Words',
    'IRL': 'In Real Life',
    'KISS': 'Keep It Simple, Stupid',
    'LDR': 'Long Distance Relationship',
    'LMAO': 'Laugh My A.. Off',
    'LOL': 'Laughing Out Loud',
    'LTNS': 'Long Time No See',
    'L8R': 'Later',
    'MTE': 'My Thoughts Exactly',
    'M8': 'Mate',
    'NRN': 'No Reply Necessary',
    'OIC': 'Oh I See',
    'PITA': 'Pain In The A..',
    'PRT': 'Party',
    'PRW': 'Parents Are Watching',
    'QPSA?': 'Que Pasa?',
    'ROFL': 'Rolling On The Floor Laughing',
    'ROFLOL': 'Rolling On The Floor Laughing Out Loud',
    'ROTFLMAO': 'Rolling On The Floor Laughing My A.. Off',
    'SK8': 'Skate',
    'STATS': 'Your sex and age',
    'ASL': 'Age, Sex, Location',
    'THX': 'Thank You',
    'TTFN': 'Ta-Ta For Now!',
    'TTYL': 'Talk To You Later',
    'U': 'You',
    'U2': 'You Too',
    'U4E': 'Yours For Ever',
    'WB': 'Welcome Back',
    'WTF': 'What The F...',
    'WTG': 'Way To Go!',
    'WUF': 'Where Are You From?',
    'W8': 'Wait...',
    '7K': 'Sick:-D Laugher',
    'TFW': 'That feeling when',
    'MFW': 'My face when',
    'MRW': 'My reaction when',
    'IFYP': 'I feel your pain',
    'TNTL': 'Trying not to laugh',
    'JK': 'Just kidding',
    'IDC': "I don't care",
    'ILY': 'I love you',
    'IMU': 'I miss you',
    'ADIH': 'Another day in hell',
    'ZZZ': 'Sleeping, bored, tired',
    'WYWH': 'Wish you were here',
    'TIME': 'Tears in my eyes',
    'BAE': 'Before anyone else',
    'FIMH': 'Forever in my heart',
    'BSAAW': 'Big smile and a wink',
    'BWL': 'Bursting with laughter',
    'BFF': 'Best friends forever',
    'CSL': "Can't stop laughing"
}

In [21]:
def short_conv(text):
  new_text= []
  for w in text.split():
    if w.upper() in chat_word:
      new_text.append(chat_word[w.upper()])
    else:
      new_text.append(w)
  return ' '.join(new_text)

train_data['lower'] = train_data['lower'].apply(lambda text: short_conv(text))
val_data['lower'] = val_data['lower'].apply(lambda text: short_conv(text))

NameError: name 'stopwords' is not defined

In [22]:
# from autocorrect import Speller

# spell = Speller(lang="en")  # English spell checker

# train_data['lower'] = train_data['lower'].apply(lambda text: spell(text))
# val_data['lower'] = val_data['lower'].apply(lambda text: spell(text))

In [23]:
# import spacy
# nlp = spacy.load('en_core_web_sm')

# def lemmatized(text):
#   doc = nlp(text)
#   return ' '.join([token.lemma_ for token in doc])

# train_data['lower'] = train_data['lower'].apply(lambda text: lemmatized(text))
train_data.head(20)

Unnamed: 0,id,information,type,text,lower
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...,im getting borderlands murder
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...,coming borders kill
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...,im getting borderlands kill
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...,im coming borderlands murder
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...,im getting borderlands 2 murder
5,2401,Borderlands,Positive,im getting into borderlands and i can murder y...,im getting borderlands murder
6,2402,Borderlands,Positive,So I spent a few hours making something for fu...,spent hours making something fun dont know hug...
7,2402,Borderlands,Positive,So I spent a couple of hours doing something f...,spent couple hours something fun dont know im ...
8,2402,Borderlands,Positive,So I spent a few hours doing something for fun...,spent hours something fun dont know im huge bo...
9,2402,Borderlands,Positive,So I spent a few hours making something for fu...,spent hours making something fun dont know hug...


In [24]:
# train_data['lower'] = train_data['text'].str.lower()
# train_data['lower'] = [str(data) for data in train_data['lower']]
# train_data['lower'] = train_data['lower'].apply(lambda x: re.sub('[^A-Za-z0-9 ]+', ' ', x))

# train_data

In [25]:
# val_data['lower'] = val_data['lower'].apply(lambda text: lemmatized(text))
val_data.head(20)

Unnamed: 0,id,information,type,text,lower
0,3364,Facebook,Irrelevant,I mentioned on Facebook that I was struggling ...,mentioned facebook struggling motivation go ru...
1,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...,bbc news amazon boss jeff bezos rejects claims...
2,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...,microsoft pay word functions poorly samsungus ...
3,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,...",csgo matchmaking full closet hacking truly awf...
4,4433,Google,Neutral,Now the President is slapping Americans in the...,president slapping americans face really commi...
5,6273,FIFA,Negative,Hi @EAHelp I’ve had Madeleine McCann in my cel...,hi eahelp i’ve madeleine mccann cellar past 13...
6,7925,MaddenNFL,Positive,Thank you @EAMaddenNFL!! \n\nNew TE Austin Hoo...,thank eamaddennfl new te austin hooper orange ...
7,11332,TomClancysRainbowSix,Positive,"Rocket League, Sea of Thieves or Rainbow Six: ...",rocket league sea thieves rainbow six siege🤔 l...
8,1107,AssassinsCreed,Positive,my ass still knee-deep in Assassins Creed Odys...,ass still kneedeep assassins creed odyssey way...
9,2069,CallOfDuty,Negative,FIX IT JESUS ! Please FIX IT ! What In the wor...,fix jesus please fix world going playstation a...


In [None]:
# val_data['lower'] = val_data['text'].str.lower()
# val_data['lower'] = [str(data) for data in val_data['lower']]
# val_data['lower'] = val_data['lower'].apply(lambda x: re.sub('[^A-Za-z0-9 ]+', ' ', x))

# val_data

Unnamed: 0,id,information,type,text,lower
0,3364,Facebook,Irrelevant,I mentioned on Facebook that I was struggling ...,i mentioned on facebook that i was struggling ...
1,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...,bbc news amazon boss jeff bezos rejects clai...
2,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...,microsoft why do i pay for word when it funct...
3,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,...",csgo matchmaking is so full of closet hacking ...
4,4433,Google,Neutral,Now the President is slapping Americans in the...,now the president is slapping americans in the...
...,...,...,...,...,...
995,4891,GrandTheftAuto(GTA),Irrelevant,⭐️ Toronto is the arts and culture capital of ...,toronto is the arts and culture capital of c...
996,4359,CS-GO,Irrelevant,tHIS IS ACTUALLY A GOOD MOVE TOT BRING MORE VI...,this is actually a good move tot bring more vi...
997,2652,Borderlands,Positive,Today sucked so it’s time to drink wine n play...,today sucked so it s time to drink wine n play...
998,8069,Microsoft,Positive,Bought a fraction of Microsoft today. Small wins.,bought a fraction of microsoft today small wins


In [26]:
tokens_text = [word_tokenize(str(word)) for word in train_data['lower']]
tokens_counter = [item for sublist in tokens_text for item in sublist]

print(len(set(tokens_counter)))

41730


In [27]:
stopwords_nltk = nltk.corpus.stopwords
stop_words = stopwords_nltk.words('english')

In [28]:
bow_count = CountVectorizer(
    tokenizer=word_tokenize,
    stop_words=stop_words,
    ngram_range=(1, 1)
)

In [29]:
reviews_train, reviews_test = train_test_split(train_data, test_size=0.2, random_state=0)

In [30]:
X_train_bow = bow_count.fit_transform(reviews_train['lower'])
X_test_bow = bow_count.transform(reviews_test['lower'])



In [31]:
y_train_bow = reviews_train['type']
y_test_bow = reviews_test['type']

In [32]:
y_test_bow.value_counts() / y_test_bow.shape[0]

type
Negative      0.300447
Positive      0.273723
Neutral       0.248953
Irrelevant    0.176877
Name: count, dtype: float64

In [33]:
model1 = LogisticRegression(C=1, solver="liblinear", max_iter=200)
model1.fit(X_train_bow, y_train_bow)

test_pred = model1.predict(X_test_bow)
accuracy_score(y_test_bow, test_pred) * 100

81.81691320122802

In [49]:
X_val_bow = bow_count.transform(val_data['lower'])
y_val_bow = val_data['type']

In [35]:
val_pred = model1.predict(X_val_bow)
accuracy_score(y_val_bow, val_pred) * 100

92.7

In [50]:
bow_count = CountVectorizer(
    tokenizer=word_tokenize,
    ngram_range=(1, 4)
)

X_train_bow = bow_count.fit_transform(reviews_train['lower'])
X_test_bow = bow_count.transform(reviews_test['lower'])
X_val_bow = bow_count.transform(val_data['lower'])



In [51]:
model2 = LogisticRegression(C=1.5, solver="saga", max_iter=1000)
model2.fit(X_train_bow, y_train_bow)
test_pred_2 = model2.predict(X_test_bow)
accuracy_score(y_test_bow, test_pred_2) * 100



78.57242534189227

In [52]:
val_pred_2 = model2.predict(X_val_bow)
accuracy_score(y_val_bow, val_pred_2) * 100

91.60000000000001

In [17]:
le = LabelEncoder()
y_train_bow_num = le.fit_transform(y_train_bow)
y_test_bow_num = le.transform(y_test_bow)
y_val_bow_num = le.transform(y_val_bow)

In [None]:
RF = RandomForestClassifier()
RF.fit(X_train_bow, y_train_bow_num)

test_pred_2 = RF.predict(X_test_bow)
accuracy_score(y_test_bow_num, test_pred_2) * 100

In [None]:
val_pred_2 = RF.predict(X_val_bow)
accuracy_score(y_val_bow_num, val_pred_2) * 100