In [45]:
# Import libraries

import numpy as np
import pandas as pd
import ssl
import nltk
from nltk import word_tokenize
nltk.download('stopwords')
from nltk.corpus import stopwords
from autocorrect import Speller
import spacy
from sklearn.preprocessing import OneHotEncoder
import gensim
from gensim.models import Word2Vec, KeyedVectors
import gensim.downloader
import re
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import GaussianNB

[nltk_data] Downloading package stopwords to C:\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# Handle SSL

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download('punkt_tab')
nltk.download('punkt')
nltk.download('perluniprops')
nltk.download('universal_tagset')
nltk.download('stopwords')

nltk.data.clear_cache()

[nltk_data] Downloading package punkt_tab to C:\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to C:\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package perluniprops to C:\nltk_data...
[nltk_data]   Package perluniprops is already up-to-date!
[nltk_data] Downloading package universal_tagset to C:\nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!
[nltk_data] Downloading package stopwords to C:\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
# Read the Twitter dataset

twitter_df = pd.read_csv('./dataset/twitter_training.csv')
twitter_df.head()

Unnamed: 0,2401,Borderlands,Positive,"im getting on borderlands and i will murder you all ,"
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...


In [7]:
# Read the IMDB dataset

dff = pd.read_csv('./imdb_data/IMDB Dataset.csv')
dff.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [8]:
# Accessing a specific review and converting to lowercase

dff['review'][100].lower()

"this short film that inspired the soon-to-be full length feature - spatula madness - is a hilarious piece that contends against similar cartoons yielding multiple writers. the short film stars edward the spatula who after being fired from his job, joins in the fight against the evil spoons. this premise allows for some funny content near the beginning, but is barely present for the remainder of the feature. this film's 15-minute running time is absorbed by some odd-ball comedy and a small musical number. unfortunately not much else lies below it. the plot that is set up doesn't really have time to show. but it's surely follows it plot better than many high-budget hollywood films. this film is worth watching at least a few times. take it for what it is, and don't expect a deep story."

In [9]:
# Convert all reviews in the dataset to lowercase

dff['review'] = dff['review'].str.lower()

In [10]:
# Define function to remove HTML tags from text using regex

import re

def remove_html_tags(text):
  pattern= re.compile('<.*?>')
  return pattern.sub('', text)

In [11]:
# Apply HTML tag removal to reviews and display results

dff['review'] = dff['review'].apply(remove_html_tags)
dff.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. the filming tec...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


In [12]:
# Dictionary of chat/internet slang abbreviations and their meanings

chat_word = {
    'AFAIK': 'As Far As I Know',
    'AFK': 'Away From Keyboard',
    'ASAP': 'As Soon As Possible',
    'ATK': 'At The Keyboard',
    'ATM': 'At The Moment',
    'A3': 'Anytime, Anywhere, Anyplace',
    'BAK': 'Back At Keyboard',
    'BBL': 'Be Back Later',
    'BBS': 'Be Back Soon',
    'BFN': 'Bye For Now',
    'B4N': 'Bye For Now',
    'BRB': 'Be Right Back',
    'BRT': 'Be Right There',
    'BTW': 'By The Way',
    'B4': 'Before',
    'CU': 'See You',
    'CUL8R': 'See You Later',
    'CYA': 'See You',
    'FAQ': 'Frequently Asked Questions',
    'FC': 'Fingers Crossed',
    'FWIW': "For What It's Worth",
    'FYI': 'For Your Information',
    'GAL': 'Get A Life',
    'GG': 'Good Game',
    'GN': 'Good Night',
    'GMTA': 'Great Minds Think Alike',
    'GR8': 'Great!',
    'G9': 'Genius',
    'IC': 'I See',
    'ICQ': 'I Seek you (also a chat program)',
    'ILU': 'ILU: I Love You',
    'IMHO': 'In My Honest/Humble Opinion',
    'IMO': 'In My Opinion',
    'IOW': 'In Other Words',
    'IRL': 'In Real Life',
    'KISS': 'Keep It Simple, Stupid',
    'LDR': 'Long Distance Relationship',
    'LMAO': 'Laugh My A.. Off',
    'LOL': 'Laughing Out Loud',
    'LTNS': 'Long Time No See',
    'L8R': 'Later',
    'MTE': 'My Thoughts Exactly',
    'M8': 'Mate',
    'NRN': 'No Reply Necessary',
    'OIC': 'Oh I See',
    'PITA': 'Pain In The A..',
    'PRT': 'Party',
    'PRW': 'Parents Are Watching',
    'QPSA?': 'Que Pasa?',
    'ROFL': 'Rolling On The Floor Laughing',
    'ROFLOL': 'Rolling On The Floor Laughing Out Loud',
    'ROTFLMAO': 'Rolling On The Floor Laughing My A.. Off',
    'SK8': 'Skate',
    'STATS': 'Your sex and age',
    'ASL': 'Age, Sex, Location',
    'THX': 'Thank You',
    'TTFN': 'Ta-Ta For Now!',
    'TTYL': 'Talk To You Later',
    'U': 'You',
    'U2': 'You Too',
    'U4E': 'Yours For Ever',
    'WB': 'Welcome Back',
    'WTF': 'What The F...',
    'WTG': 'Way To Go!',
    'WUF': 'Where Are You From?',
    'W8': 'Wait...',
    '7K': 'Sick:-D Laugher',
    'TFW': 'That feeling when',
    'MFW': 'My face when',
    'MRW': 'My reaction when',
    'IFYP': 'I feel your pain',
    'TNTL': 'Trying not to laugh',
    'JK': 'Just kidding',
    'IDC': "I don't care",
    'ILY': 'I love you',
    'IMU': 'I miss you',
    'ADIH': 'Another day in hell',
    'ZZZ': 'Sleeping, bored, tired',
    'WYWH': 'Wish you were here',
    'TIME': 'Tears in my eyes',
    'BAE': 'Before anyone else',
    'FIMH': 'Forever in my heart',
    'BSAAW': 'Big smile and a wink',
    'BWL': 'Bursting with laughter',
    'BFF': 'Best friends forever',
    'CSL': "Can't stop laughing"
}

In [13]:
# Function to convert chat abbreviations to full text

def short_conv(text):
  new_text= []
  for w in text.split():
    if w.upper() in chat_word:
      new_text.append(chat_word[w.upper()])
    else:
      new_text.append(w)
  return ' '.join(new_text)

# Test the chat conversion function
short_conv("ASAP let me know please")

'As Soon As Possible let me know please'

In [None]:
# Initialize and test the spell checker

spell = Speller(lang="en")  # English spell checker
text = " ceertainli I dont kniw what is wrrong herre"
corrected_text = spell(text)

print(corrected_text)

 certainly I dont know what is wrong here


In [18]:
# Create function to remove them

STOPWORDS = set(stopwords.words('english'))
def remove_stop_words(text):
  return ' '.join([word for word in str(text).split() if word not in STOPWORDS])

In [19]:
# Test the stopwords removal function

text="I wasn't sure that this might happened"
remove_stop_words(text)

'I sure might happened'

In [21]:
# Using spaCy for lemmatization

nlp = spacy.load('en_core_web_sm')

# Process example sentence and show lemmatization results
sentence = "The children were playing in the park, running and laughing as they enjoyed their freedom, unaware of the time passing quickly by."
doc = nlp(sentence)

for toke in doc:
  print(f'{toke.text} = {toke.lemma_}')

The = the
children = child
were = be
playing = play
in = in
the = the
park = park
, = ,
running = run
and = and
laughing = laugh
as = as
they = they
enjoyed = enjoy
their = their
freedom = freedom
, = ,
unaware = unaware
of = of
the = the
time = time
passing = pass
quickly = quickly
by = by
. = .


In [24]:
# Example of one-hot encoding with a simple dataset

data = { 'ID':[1,2,3,4,5], 'Color':['red','green','blue','green','blue']}
data_df = pd.DataFrame(data)

encoder = OneHotEncoder()
encoded_data = encoder.fit_transform(data_df[['Color']])
encoded_df = pd.DataFrame(encoded_data.toarray(), columns= encoder.get_feature_names_out(['Color']))
final_df = pd.concat([data_df, encoded_df], axis=1)
final_df

Unnamed: 0,ID,Color,Color_blue,Color_green,Color_red
0,1,red,0.0,0.0,1.0
1,2,green,0.0,1.0,0.0
2,3,blue,1.0,0.0,0.0
3,4,green,0.0,1.0,0.0
4,5,blue,1.0,0.0,0.0


In [27]:
# Load pre-trained GloVe embeddings

glove_vectors = gensim.downloader.load('glove-twitter-25')

In [28]:
# Example of accessing word vectors

glove_vectors['books']

array([ 0.64268  ,  0.045608 ,  1.0344   , -0.2208   ,  0.73695  ,
       -0.83979  ,  1.3606   , -1.417    ,  0.02012  , -0.91255  ,
        0.11603  ,  0.24853  , -3.7822   , -0.21286  , -0.13444  ,
       -0.1682   ,  0.70644  ,  0.10234  ,  0.42941  ,  0.21326  ,
       -0.83451  , -1.1294   , -1.0398   ,  0.25531  ,  0.0081801],
      dtype=float32)

In [29]:
# Find similar words using word embeddings

glove_vectors.most_similar('books')

[('book', 0.94181889295578),
 ('stories', 0.9077752828598022),
 ('added', 0.8998989462852478),
 ('script', 0.8935744762420654),
 ('reference', 0.8861762285232544),
 ('feature', 0.8841565251350403),
 ('shared', 0.8795639276504517),
 ('included', 0.8661485910415649),
 ('features', 0.8640419840812683),
 ('reading', 0.860645592212677)]

In [30]:
# Find word that doesn't match in the group

glove_vectors.doesnt_match(['book', 'teacher', 'school', 'woman'])

'woman'

In [31]:
# Calculate similarity between two words

glove_vectors.similarity('dog', 'cat')

0.959082

In [32]:
# Prepare dataset for modeling by taking a subset

full_df = dff.iloc[:15000]
full_df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. the filming tec...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


In [33]:
# Check size of the dataset

full_df.size

30000

In [34]:
# Check for duplicate entries

full_df.duplicated().sum()

39

In [35]:
# Remove duplicates from the dataset

df = full_df.drop_duplicates()
df.shape

(14961, 2)

In [37]:
sw_list = stopwords.words('english')

# Comprehensive text preprocessing function
def preprocess_text(text):
    """
    Preprocess text by removing HTML tags, stopwords, and converting to lowercase
    """
    # Remove HTML tags
    text = re.sub(re.compile('<.*?>'), '', text)
    
    # Convert to lowercase
    text = text.lower()
    
    # Remove stopwords and join words
    return ' '.join(word for word in text.split() if word not in sw_list)

# Apply preprocessing to all reviews
df['review'] = df['review'].apply(preprocess_text)

df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['review'] = df['review'].apply(preprocess_text)


Unnamed: 0,review,sentiment
0,one reviewers mentioned watching 1 oz episode ...,positive
1,wonderful little production. filming technique...,positive
2,thought wonderful way spend time hot summer we...,positive
3,basically there's family little boy (jake) thi...,negative
4,"petter mattei's ""love time money"" visually stu...",positive
...,...,...
14995,bobcat goldthwait commended attempting somethi...,negative
14996,"since days ""clarissa explains all"" bit crush m...",positive
14997,traveling couple (horton hamilton)stumble onto...,negative
14998,film deeply disappointing. wenders displays li...,negative


In [38]:
# Prepare features (X) and target variable (y)

X = df.iloc[:,0:1]
y = df['sentiment']

In [40]:
# Convert sentiment labels to numerical values

encoder = LabelEncoder()
y = encoder.fit_transform(y)

In [42]:
# Split data into training and test sets

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4)
X_train.shape

(11968, 1)

In [44]:
# Convert text to Bag of Words representation

cv = CountVectorizer()
X_train_bow = cv.fit_transform(X_train['review']).toarray()
X_test_bow = cv.transform(X_test['review']).toarray()

X_train_bow.shape

(11968, 57328)

In [None]:
# Train and predict using Naive Bayes classifier

gnb = GaussianNB()
gnb.fit(X_train_bow, y_train)
y_pred_gnb = gnb.predict(X_test_bow)
gnb_acc = accuracy_score(y_test, y_pred_gnb) * 100
gnb_f1 = f1_score(y_test, y_pred_gnb, average='weighted') * 100

print(f"Accuracy Score: {gnb_acc:.4f}")
print(f"F1 Score: {gnb_f1:.4f}")

In [None]:
svc = SVC(kernel='rbf', C=40, gamma='scale',class_weight='balanced' , random_state=4)
svc.fit(x_train_pca, y_train_encoded)

y_pred_svc = svc.predict(x_test_pca)
svc_acc = accuracy_score(y_test_encoded, y_pred_svc) * 100
f1_svc = f1_score(y_test_encoded, y_pred_svc, average='weighted') * 100

print(f"Accuracy Score: {svc_acc:.4f}")
print(f"F1 Score: {f1_svc:.4f}")

In [38]:
# Calculate accuracy and confusion matrix of Naive Bayes model
from sklearn.metrics import accuracy_score, confusion_matrix

accuracy_score(y_pred, y_test)
confusion_matrix(y_test, y_pred)

array([[1144,  359],
       [ 651,  839]], dtype=int64)

In [39]:
# Train and evaluate Random Forest classifier
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
rf.fit(X_train_bow, y_train)
y_pred = rf.predict(X_test_bow)
accuracy_score(y_test, y_pred)

0.8499832943534915

In [40]:
# Try Bag of Words with limited features and n-grams
cv = CountVectorizer(ngram_range=(1,2), max_features=5000)
X_train_bow = cv.fit_transform(X_train['review']).toarray()
X_test_bow = cv.transform(X_test['review']).toarray()

# Train and evaluate Random Forest with new features
rf = RandomForestClassifier()
rf.fit(X_train_bow,y_train)
y_pred = rf.predict(X_test_bow)
accuracy_score(y_test,y_pred)

0.8422986969595724

In [41]:
# Convert text to TF-IDF representation
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()

X_train_tfidf = tfidf.fit_transform(X_train['review']).toarray()
X_test_tfidf = tfidf.transform(X_test['review']).toarray()

In [42]:
# Train and evaluate Random Forest with TF-IDF features
rf = RandomForestClassifier()

rf.fit(X_train_tfidf,y_train)
y_pred = rf.predict(X_test_tfidf)

accuracy_score(y_test,y_pred)

0.8459739391914467

In [80]:
pip install lightgbm

Collecting lightgbm
  Downloading lightgbm-4.6.0-py3-none-win_amd64.whl.metadata (17 kB)
Downloading lightgbm-4.6.0-py3-none-win_amd64.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   -------------- ------------------------- 0.5/1.5 MB 4.2 MB/s eta 0:00:01
   ------------------------------------ --- 1.3/1.5 MB 3.4 MB/s eta 0:00:01
   ---------------------------------------- 1.5/1.5 MB 3.5 MB/s eta 0:00:00
Installing collected packages: lightgbm
Successfully installed lightgbm-4.6.0
Note: you may need to restart the kernel to use updated packages.


In [None]:
# Train and evaluate LightGBM classifier
import lightgbm

lgbm = lightgbm.LGBMClassifier()

lgbm.fit(X_train_bow,y_train)
y_pred = lgbm.predict(X_test_bow)

accuracy_score(y_test,y_pred)

[LightGBM] [Info] Number of positive: 5890, number of negative: 6078
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.089846 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 20112
[LightGBM] [Info] Number of data points in the train set: 11968, number of used features: 4992
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.492146 -> initscore=-0.031420
[LightGBM] [Info] Start training from score -0.031420


0.868025392582693

In [43]:
import gensim
from nltk import sent_tokenize
from gensim.utils import simple_preprocess

story = []
for doc in df['review']:
    raw_sent = sent_tokenize(doc)
    for sent in raw_sent:
        story.append(simple_preprocess(sent))

model = gensim.models.Word2Vec(
    window=10,
    min_count=2
)

In [44]:
model.build_vocab(story)
model.train(story, total_examples=model.corpus_count, epochs=model.epochs)

(8821390, 9295445)

In [45]:
len(model.wv.index_to_key)

38121

In [46]:
def document_vector(doc):
    doc = [word for word in doc.split() if word in model.wv.index_to_key]
    return np.mean(model.wv[doc], axis=0)

document_vector(df['review'].value[0])

AttributeError: 'Series' object has no attribute 'value'