# Import Library

In [1]:
import re
import string
import numpy as np

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from wordcloud import WordCloud, STOPWORDS

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, Dropout, Bidirectional

2024-07-27 06:40:41.002541: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-27 06:40:41.002648: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-27 06:40:41.181464: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

  pid, fd = os.forkpty()


Archive:  /usr/share/nltk_data/corpora/wordnet.zip
   creating: /usr/share/nltk_data/corpora/wordnet/
  inflating: /usr/share/nltk_data/corpora/wordnet/lexnames  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adv  
  inflating: /usr/share/nltk_data/corpora/wordnet/adv.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/cntlist.rev  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/LICENSE  
  inflating: /usr/share/nltk_data/corpora/wordnet/citation.bib  
  inflating: /usr/share/nltk_data/corpora/wordnet/noun.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/verb.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/README  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.sense  
  inflating: /usr/share/nltk_data

# Read Data

In [4]:
df = pd.read_csv('/kaggle/input/quora-dataset/dataset.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15000 entries, 0 to 14999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   content    15000 non-null  object
 1   sentiment  15000 non-null  object
dtypes: object(2)
memory usage: 234.5+ KB


# Text Preprocessing

In [5]:
def cleaningText(text):
    text = re.sub(r'[0-9]+', '', text) # remove numbers
    text = re.sub(r'[^\w\s]', '', text) # remove numbers
    text = text.replace('\n', ' ') # replace new line into space
    text = text.translate(str.maketrans('', '', string.punctuation)) # remove all punctuations
    text = text.strip(' ') # remove characters space from both left and right text
    return text

def casefoldingText(text): # Converting all the characters in a text into lower case
    text = text.lower()
    return text

def tokenizingText(text): # Tokenizing or splitting a string, text into a list of tokens
    text = word_tokenize(text)
    return text

def filteringText(text): # Remove stopwors in a text
    listStopwords = set(stopwords.words('english'))
    stop = ['app', 'quora']
    listStopwords.update(stop)
    filtered = []
    for txt in text:
        if txt not in listStopwords:
            filtered.append(txt)
    text = filtered
    return text

def lemmatization(text):
    lemmatizer = WordNetLemmatizer()
    lemma_words = [lemmatizer.lemmatize(word) for word in text]
    return lemma_words

In [6]:
def preprocess(text):
    text = cleaningText(text)
    text = casefoldingText(text)
    text = tokenizingText(text)
#     text = filteringText(text)
    text = lemmatization(text)
    return text

In [7]:
df['text'] = df['content'].apply(preprocess)

In [8]:
df.head()

Unnamed: 0,content,sentiment,text
0,"Beside being a yappy left leaning app, it used...",negative,"[beside, being, a, yappy, left, leaning, app, ..."
1,This is one of the best apps i've ever used in...,positive,"[this, is, one, of, the, best, apps, ive, ever..."
2,After the recent update the feeds are not load...,negative,"[after, the, recent, update, the, feed, are, n..."
3,"The Quora as a social network is great,but the...",negative,"[the, quora, a, a, social, network, is, greatb..."
4,Please improve the user experience. It's prett...,negative,"[please, improve, the, user, experience, it, p..."


# Traditional

## Feature Extraction

In [9]:
corpus = df['text'].apply(lambda x: ' '.join(x))
trainx, testx, y_train_vec, y_test_vec = train_test_split(corpus, df['sentiment'], test_size=0.2, random_state=507)

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(ngram_range=(1,2), max_features=2000)
X_train_vec = vectorizer.fit_transform(trainx)
X_test_vec = vectorizer.transform(testx)

## Model

In [11]:
mnb = MultinomialNB(alpha=0.5)
mnb.fit(X_train_vec, y_train_vec)

mnb_train_acc = accuracy_score(y_train_vec, mnb.predict(X_train_vec))
mnb_test_acc = accuracy_score(y_test_vec, mnb.predict(X_test_vec))

print("MNB Train Accuracy:", mnb_train_acc)
print("MNB Test Accuracy:", mnb_test_acc)

MNB Train Accuracy: 0.8636666666666667
MNB Test Accuracy: 0.8663333333333333


In [12]:
svm = SVC(C=0.1, kernel='linear')
svm.fit(X_train_vec, y_train_vec)

svm_train_acc = accuracy_score(y_train_vec, svm.predict(X_train_vec))
svm_test_acc = accuracy_score(y_test_vec, svm.predict(X_test_vec))

print("SVM Train Accuracy:", svm_train_acc)
print("SVM Test Accuracy:", svm_test_acc)

SVM Train Accuracy: 0.8703333333333333
SVM Test Accuracy: 0.8746666666666667


In [13]:
from sklearn.metrics import confusion_matrix

svm_cm = confusion_matrix(y_test_vec, svm.predict(X_test_vec))
mnb_cm = confusion_matrix(y_test_vec, mnb.predict(X_test_vec))

print("SVM Confusion Matrix:\n", svm_cm)
print("MultinomialNB Confusion Matrix:\n", mnb_cm)

SVM Confusion Matrix:
 [[1744    0   44]
 [ 119    0   42]
 [ 171    0  880]]
MultinomialNB Confusion Matrix:
 [[1708    0   80]
 [ 108    1   52]
 [ 160    1  890]]


# Deep Learning

## Featrue Extraction

In [14]:
from gensim.models import Word2Vec

X = df['text'].values

#Dimension of vectors we are generating
EMBEDDING_DIM = 100

#Creating Word Vectors by Word2Vec Method (takes time...)
w2v_model = Word2Vec(sentences=X, vector_size=EMBEDDING_DIM, window=5, min_count=1)

len(w2v_model.wv.index_to_key)

16438

In [15]:
w2v_weights = w2v_model.wv.vectors
vocab_size, embedding_size = w2v_weights.shape

In [16]:
w2v_weights.shape

(16438, 100)

In [17]:
max_fatures = 2000
tokenizer = Tokenizer(num_words=None)

tokenizer.fit_on_texts(X)
X = tokenizer.texts_to_sequences(X)

In [18]:
maxlen = 100

#Making all news of size maxlen defined above
X = pad_sequences(X, padding='pre', maxlen=maxlen)

In [19]:
#Train test split
y = pd.get_dummies(df['sentiment']).values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) 

## LSTM

In [20]:
# Define parameters
drop = 0.5

#Defining Neural Network
model1 = Sequential()
model1.add(Embedding(input_dim=vocab_size, output_dim=embedding_size, weights=[w2v_weights], input_length=maxlen))

# Last LSTM layer without return_sequences
model1.add(LSTM(100, return_sequences=False, dropout=0.3, recurrent_dropout=0.3))

# model1.add(Dense(512, activation='relu'))
# model1.add(Dropout(drop))

model1.add(Dense(128, activation='relu', kernel_regularizer='l2'))
model1.add(Dropout(drop))

# model1.add(Dense(64, activation='relu', kernel_regularizer='l2'))
# model1.add(Dropout(drop))

model1.add(Dense(32, activation='relu', kernel_regularizer='l2'))
model1.add(Dropout(drop))

model1.add(Dense(3, activation='softmax'))

model1.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['acc'])

model1.summary()



In [21]:
history1 = model1.fit(X_train,
                    y_train,
                    epochs=10,
                    batch_size=64,
                    validation_data=(X_test, y_test),
                    verbose=1
                    )

Epoch 1/10
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 167ms/step - acc: 0.5463 - loss: 1.9862 - val_acc: 0.7157 - val_loss: 0.9166
Epoch 2/10
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 163ms/step - acc: 0.6968 - loss: 0.8870 - val_acc: 0.8203 - val_loss: 0.5757
Epoch 3/10
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 164ms/step - acc: 0.8179 - loss: 0.6069 - val_acc: 0.8483 - val_loss: 0.5125
Epoch 4/10
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 161ms/step - acc: 0.8591 - loss: 0.4871 - val_acc: 0.8730 - val_loss: 0.3982
Epoch 5/10
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 160ms/step - acc: 0.8885 - loss: 0.4028 - val_acc: 0.8807 - val_loss: 0.3953
Epoch 6/10
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 162ms/step - acc: 0.9039 - loss: 0.3410 - val_acc: 0.8717 - val_loss: 0.4019
Epoch 7/10
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m

## Bi-LSTM

In [42]:
model2 = Sequential()
drop = 0.8

model2.add(Embedding(input_dim=vocab_size,
                    output_dim=embedding_size,
                    weights=[w2v_weights],
                    input_length=maxlen))

# model2.add(Bidirectional(LSTM(50, return_sequences=True)))
model2.add(Bidirectional(LSTM(50)))

# model2.add(Dense(512, activation='relu', kernel_regularizer='l2'))
# model2.add(Dropout(drop))

model2.add(Dense(64, activation='relu', kernel_regularizer='l2'))
model2.add(Dropout(drop))

model2.add(Dense(3, activation='softmax'))

model2.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

model2.summary()

In [43]:
history2 = model2.fit(X_train, y_train,
                     epochs=10,
                     batch_size=64,
                     validation_data=(X_test, y_test),
                     verbose=1)

Epoch 1/10
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 15ms/step - accuracy: 0.5661 - loss: 1.4642 - val_accuracy: 0.7820 - val_loss: 0.7730
Epoch 2/10
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - accuracy: 0.7842 - loss: 0.7490 - val_accuracy: 0.8493 - val_loss: 0.5071
Epoch 3/10
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - accuracy: 0.8647 - loss: 0.4946 - val_accuracy: 0.8763 - val_loss: 0.4028
Epoch 4/10
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step - accuracy: 0.8981 - loss: 0.3766 - val_accuracy: 0.8787 - val_loss: 0.3964
Epoch 5/10
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step - accuracy: 0.9133 - loss: 0.3171 - val_accuracy: 0.8820 - val_loss: 0.4149
Epoch 6/10
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 13ms/step - accuracy: 0.9224 - loss: 0.2663 - val_accuracy: 0.8823 - val_loss: 0.4491
Epoch 7/10
[1m188/188

# Inference

In [36]:
text = 'quora is a remarkable platform that has carved a unique niche in the world of online information sharing and community engagement'
tfidf = vectorizer.transform([text])
w2v = tokenizer.texts_to_sequences([text])
w2v = pad_sequences(w2v, padding='pre', maxlen=maxlen)

In [40]:
print("MultinomialNB Prediction:", mnb.predict(tfidf))
print("SVM Prediction:", svm.predict(tfidf))

labels = ['negative', 'neutral', 'positive']
idx1 = np.argmax(model1.predict(w2v))
idx2 = np.argmax(model2.predict(w2v))

print("LSTM Prediction:", labels[idx1])
print("Bi-LSTM Prediction:", labels[idx2])

MultinomialNB Prediction: ['positive']
SVM Prediction: ['positive']
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 258ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 198ms/step
LSTM Prediction: positive
Bi-LSTM Prediction: positive
