1. Data Pre-processing

Read the data

In [3]:
import pandas as pd

# Load the raw review dataset from a path relative to this notebook.
df = pd.read_csv('../dataset/amazon_datasets.csv')

# Show (rows, columns) as the cell output to sanity-check the load.
df.shape

(4574, 6)

Clean Review

In [4]:
import re
import nltk
from nltk.corpus import stopwords
# Build the English stop-word set used by preprocess_svm. If the NLTK corpus is
# not installed yet, fetch it once and retry so the notebook still runs from a
# fresh kernel on a new machine.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords', quiet=True)
    stop_words = set(stopwords.words('english'))

# General Preprocessing (normalization, spaces)
def basic_preprocess(text):
    """Lowercase ``text``, delete every run of digits, and collapse whitespace.

    Anything that is not a ``str`` (for example a missing review read as NaN)
    is mapped to the empty string.
    """
    if isinstance(text, str):
        lowered = text.lower()
        digits_removed = re.sub(r'\d+', '', lowered)
        # Collapse whitespace runs to one space, then trim both ends.
        return re.sub(r'\s+', ' ', digits_removed).strip()
    return ""


# VADER Preprocessing
def preprocess_vader(text):
    """Return the basic-normalized text for VADER; non-strings become ""."""
    return basic_preprocess(text) if isinstance(text, str) else ""


# Transformer Models Preprocessing (keep meaningful punctuation)
# NOTE: a later cell defines another function with this same name (a tokenizer
# wrapper); once that cell has run, this definition is shadowed.
def preprocess_transformers(text):
    """Normalize ``text`` and strip punctuation other than ``,``, ``!`` and ``?``.

    Non-string input becomes the empty string.
    """
    if isinstance(text, str):
        normalized = basic_preprocess(text)
        # Keep word characters, whitespace, and the three punctuation marks.
        return re.sub(r'[^\w\s,!?]', '', normalized)
    return ""


# Deep Learning Models (CNN, LSTM) Preprocessing (Remove punctuation)
def preprocess_dl_models(text):
    """Normalize ``text`` and remove every character that is neither a word
    character nor whitespace. Non-string input becomes the empty string.
    """
    if isinstance(text, str):
        normalized = basic_preprocess(text)
        return re.sub(r'[^\w\s]', '', normalized)
    return ""


# SVM Preprocessing (Remove punctuation, and stop words)
def preprocess_svm(text):
    """Normalize ``text``, remove punctuation, and drop stop words.

    Non-string input becomes the empty string. Tokens are split on whitespace
    and re-joined with single spaces after filtering against ``stop_words``.
    """
    if not isinstance(text, str):
        return ""
    without_punctuation = re.sub(r'[^\w\s]', '', basic_preprocess(text))
    kept_words = filter(lambda word: word not in stop_words, without_punctuation.split())
    return ' '.join(kept_words)


# Assign Sentiment on the basis of customer ratings
def assign_sentiment(rating):
    """Map a numeric star rating to a sentiment label.

    Returns:
        1  when ``rating >= 4`` (positive),
        0  when ``3 <= rating < 4`` (neutral),
        -1 otherwise (negative).

    The neutral band is a range rather than an exact ``== 3`` match, so a
    fractional rating such as 3.5 is not labelled negative. A value for which
    both comparisons are false (NaN included) falls through to -1.
    """
    if rating >= 4:
        return 1
    if rating >= 3:
        return 0
    return -1

In [195]:
# import re
# import nltk
# from nltk.corpus import stopwords
# # nltk.download('stopwords')

# stop_words = set(stopwords.words('english'))

# # General Preprocessing (normalization, spaces)
# def basic_preprocess(text):
#     if not isinstance(text, str):
#         return ""
#     text = text.lower()  # Convert text to lowercase
#     text = re.sub(r'\d+', '', text)  # Remove numbers
#     text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
#     return text

# # Unified Preprocessing (for all models)
# def preprocess_data(text):
#     text = basic_preprocess(text)
#     text = re.sub(r'[^\w\s,!?]', '', text)  # Remove punctuation except meaningful ones
#     return text

# # Assign Sentiment on the basis of customer ratings
# def assign_sentiment(rating):
#     if rating >= 4:
#         return 1
#     elif rating == 3:
#         return 0
#     else:
#         return -1

Applying Preprocessing

In [5]:
# Derive one cleaned text column per model family from the raw 'review' column,
# plus the numeric sentiment label derived from 'rating'.
df['vader']         =   df['review'].apply(preprocess_vader)
df['transformers']  =   df['review'].apply(preprocess_transformers)
df['cnn_lstm']      =   df['review'].apply(preprocess_dl_models)
df['svm']           =   df['review'].apply(preprocess_svm)
df['sentiment']     =   df['rating'].apply(assign_sentiment)

# Persist the augmented frame next to the notebook (written to the current
# working directory, without the index column).
df.to_csv('clean_datasets.csv', index=False)

# Optional: uncomment to work on a 500-row subset for faster iteration.
# df = df.head(500)
df.head()
# df.shape

Unnamed: 0,ID,product_title,user_name,rating,review,review_date,vader,transformers,cnn_lstm,svm,sentiment
0,1,OnePlus Nord N30,forest,5,I bought this phone at the recommendation of a...,"May 19, 2024",i bought this phone at the recommendation of a...,i bought this phone at the recommendation of a...,i bought this phone at the recommendation of a...,bought phone recommendation friend happy im so...,1
1,2,OnePlus Nord N30,Drew,5,I have this phone for a few months now and for...,"July 26, 2024",i have this phone for a few months now and for...,i have this phone for a few months now and for...,i have this phone for a few months now and for...,phone months price great phone looking somethi...,1
2,3,OnePlus Nord N30,forest,4,I like that this phone has a good battery life...,"January 2, 2024",i like that this phone has a good battery life...,i like that this phone has a good battery life...,i like that this phone has a good battery life...,like phone good battery life charges superfast...,1
3,4,OnePlus Nord N30,Amazon Customer,5,Short version: I got this for my mom since she...,"August 10, 2024",short version: i got this for my mom since she...,short version i got this for my mom since she ...,short version i got this for my mom since she ...,short version got mom since habit buying cheap...,1
4,5,OnePlus Nord N30,C Jack,5,I have a Samsung s22 Ultra. I've been having b...,"May 27, 2024",i have a samsung s ultra. i've been having bat...,i have a samsung s ultra ive been having batte...,i have a samsung s ultra ive been having batte...,samsung ultra ive battery connectivity issues ...,1


In [197]:
# df['clean_review']  =   df['review'].apply(preprocess_data)
# df['sentiment']     =   df['rating'].apply(assign_sentiment)

# df.to_csv('clean_datasets.csv', index=False)

# df = df.head(500)
# df.shape

Split Data Into Train and Test Set

In [198]:
from sklearn.model_selection import train_test_split

# Split every text variant and the labels in one call so that all train/test
# pairs refer to the same rows (80/20 split, fixed seed).
(
    X_text_train, X_text_test,
    X_vader_train, X_vader_test,
    X_transformers_train, X_transformers_test,
    X_cnn_lstm_train, X_cnn_lstm_test,
    X_svm_train, X_svm_test,
    y_train, y_test,
) = train_test_split(
    df['review'],
    df['vader'],
    df['transformers'],
    df['cnn_lstm'],
    df['svm'],
    df['sentiment'],
    test_size=0.2,
    random_state=42,
)

In [199]:
# from sklearn.model_selection import train_test_split

# X = df['clean_review']
# Y = df['sentiment']

# X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

3. Initializing Models

Transformer-Based Models: ( RoBERTa, BERT )

In [200]:
# import torch
# import torch.nn.functional as F
# from transformers import RobertaForSequenceClassification, RobertaTokenizer, BertForSequenceClassification, BertTokenizer

# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# def preprocess_transformers(texts, tokenizer, max_length=128):
#     if tokenizer.pad_token is None:
#         tokenizer.add_special_tokens({'pad_token': '[PAD]'})
#     inputs = tokenizer(texts, return_tensors="pt", truncation=True, padding=True, max_length=max_length)
#     return inputs

# # Load Pre-Train Models
# roberta_tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
# roberta_model = RobertaForSequenceClassification.from_pretrained("roberta-base")

# bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# bert_model = BertForSequenceClassification.from_pretrained("bert-base-uncased")

# # Roberta Model
# def roberta_sentiment(text):
#     roberta_inputs = preprocess_transformers([text], roberta_tokenizer)
#     roberta_inputs = {key: val.to(device) for key, val in roberta_inputs.items()}  # Move inputs to device
#     with torch.no_grad():
#         roberta_output = roberta_model(**roberta_inputs).logits
#     probabilities = F.softmax(roberta_output, dim=-1)
#     positive_class_prob = probabilities[0, 1].item()
#     return positive_class_prob


# # Bert Model
# def bert_sentiment(text):
#     bert_inputs = preprocess_transformers([text], bert_tokenizer)
#     bert_inputs = {key: val.to(device) for key, val in bert_inputs.items()}  # Move inputs to device
#     with torch.no_grad():
#         bert_output = bert_model(**bert_inputs).logits
#     probabilities = F.softmax(bert_output, dim=-1)
#     positive_class_prob = probabilities[0, 1].item()
#     return positive_class_prob


# # Roberta Model
# def roberta_sentiment(text):
#     roberta_inputs = preprocess_transformers([text], roberta_tokenizer)
#     roberta_inputs = {key: val.to(device) for key, val in roberta_inputs.items()}  
#     with torch.no_grad():
#         roberta_output = roberta_model(**roberta_inputs).logits
#     probabilities = F.softmax(roberta_output, dim=-1)
#     positive_class_prob = probabilities[0, 1].item()
#     negative_class_prob = probabilities[0, 0].item()
#     neutral_class_prob = probabilities[0, 2].item()
#     compound_score = (positive_class_prob - negative_class_prob) * (1 - neutral_class_prob)
#     return compound_score

# # Bert Model
# def bert_sentiment(text):
#     bert_inputs = preprocess_transformers([text], bert_tokenizer)
#     bert_inputs = {key: val.to(device) for key, val in bert_inputs.items()}
#     with torch.no_grad():
#         bert_output = bert_model(**bert_inputs).logits
#     probabilities = F.softmax(bert_output, dim=-1)
#     positive_class_prob = probabilities[0, 1].item()
#     negative_class_prob = probabilities[0, 0].item()
#     neutral_class_prob = probabilities[0, 2].item()
#     compound_score = (positive_class_prob - negative_class_prob) * (1 - neutral_class_prob)
#     return compound_score

In [201]:
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForSequenceClassification, BertTokenizer, BertForSequenceClassification
# from transformers import RobertaForSequenceClassification, BertForSequenceClassification, AutoTokenizer

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def preprocess_transformers(texts, tokenizer, max_length=128):
    """Tokenize ``texts`` with ``tokenizer`` into padded, truncated "pt" tensors.

    Args:
        texts: input passed straight through to ``tokenizer``.
        tokenizer: callable tokenizer exposing ``pad_token`` and
            ``add_special_tokens``.
        max_length: truncation length forwarded to the tokenizer.

    Returns:
        Whatever ``tokenizer(...)`` returns for these options.
    """
    # Register a pad token when the tokenizer has none, so padding=True works.
    # NOTE(review): no model embedding resize happens here — confirm that is
    # acceptable for any tokenizer that actually reaches this branch.
    pad_token_missing = tokenizer.pad_token is None
    if pad_token_missing:
        tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    encode_options = dict(return_tensors="pt", truncation=True, padding=True, max_length=max_length)
    return tokenizer(texts, **encode_options)

# Load pre-trained sentiment checkpoints.
# roberta_sentiment / bert_sentiment move their input tensors to `device`, so
# each model is moved there as well; otherwise inference fails with a device
# mismatch whenever CUDA is selected. eval() makes inference mode explicit.
roberta_tokenizer = AutoTokenizer.from_pretrained('cardiffnlp/twitter-roberta-base-sentiment')
roberta_model = AutoModelForSequenceClassification.from_pretrained('cardiffnlp/twitter-roberta-base-sentiment').to(device)
roberta_model.eval()

bert_tokenizer = BertTokenizer.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')
bert_model = BertForSequenceClassification.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment').to(device)
bert_model.eval()


# Roberta Model
def roberta_sentiment(text):
    """Score ``text`` with the RoBERTa model as P(positive) - P(negative).

    The checkpoint loaded into ``roberta_model``
    (cardiffnlp/twitter-roberta-base-sentiment) orders its logits as
    0 = negative, 1 = neutral, 2 = positive, so the positive probability is
    read from index 2. The result lies in [-1, 1].
    """
    roberta_inputs = preprocess_transformers([text], roberta_tokenizer)
    # Inputs must be on the same device as the model.
    roberta_inputs = {key: val.to(device) for key, val in roberta_inputs.items()}
    with torch.no_grad():
        roberta_output = roberta_model(**roberta_inputs).logits
    probabilities = F.softmax(roberta_output, dim=-1)
    negative_class_prob = probabilities[0, 0].item()
    positive_class_prob = probabilities[0, 2].item()
    return positive_class_prob - negative_class_prob


def bert_sentiment(text):
    """Score ``text`` with the BERT review model as P(positive) - P(negative).

    The checkpoint loaded into ``bert_model``
    (nlptown/bert-base-multilingual-uncased-sentiment) predicts a star rating
    over five classes, index 0 = 1 star through index 4 = 5 stars. Following
    the rating convention used elsewhere in this notebook (>= 4 positive,
    3 neutral, otherwise negative), 4-5 stars count as positive and 1-2 stars
    as negative. The result lies in [-1, 1].
    """
    bert_inputs = preprocess_transformers([text], bert_tokenizer)
    # Inputs must be on the same device as the model.
    bert_inputs = {key: val.to(device) for key, val in bert_inputs.items()}
    with torch.no_grad():
        bert_output = bert_model(**bert_inputs).logits
    star_probabilities = F.softmax(bert_output, dim=-1)[0]
    negative_class_prob = (star_probabilities[0] + star_probabilities[1]).item()
    positive_class_prob = (star_probabilities[3] + star_probabilities[4]).item()
    return positive_class_prob - negative_class_prob
    



In [202]:
# import torch
# import torch.nn.functional as F
# from transformers import RobertaTokenizer, RobertaForSequenceClassification, BertTokenizer, BertForSequenceClassification, GPT2Tokenizer, GPT2Model

# # Load Pre-Trained Models
# roberta_tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
# roberta_model = RobertaForSequenceClassification.from_pretrained("roberta-base")

# bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# bert_model = BertForSequenceClassification.from_pretrained("bert-base-uncased")

# gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# gpt2_model = GPT2Model.from_pretrained("gpt2")

# # Function to preprocess data for transformer models (RoBERTa, BERT, GPT-2)
# def preprocess_transformers(texts, tokenizer, max_length=128):
#     if tokenizer.pad_token is None:
#         tokenizer.add_special_tokens({'pad_token': '[PAD]'})
#     inputs = tokenizer(texts, return_tensors="pt", truncation=True, padding=True, max_length=max_length)
#     return inputs

# # Function to run a sentiment model
# def roberta_sentiment(text):
#     inputs = preprocess_transformers([text], roberta_tokenizer)
#     outputs = roberta_model(**inputs).logits
#     probabilities = F.softmax(outputs, dim=-1)
#     positive_prob = probabilities[0, 1].item()
#     return positive_prob

# def bert_sentiment(text):
#     inputs = preprocess_transformers([text], bert_tokenizer)
#     outputs = bert_model(**inputs).logits
#     probabilities = F.softmax(outputs, dim=-1)
#     positive_prob = probabilities[0, 1].item()
#     return positive_prob


# def gpt2_sentiment(text):
#     inputs = preprocess_transformers([text], gpt2_tokenizer)
#     outputs = gpt2_model(**inputs).last_hidden_state.mean(dim=1)
#     positive_prob = torch.sigmoid(outputs).mean().item()
#     return positive_prob

Lexicon-Based Approaches: ( VADER, TextBlob)

In [203]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob

# Single shared analyzer instance, reused by every vader_sentiment call.
vader_analyzer = SentimentIntensityAnalyzer()

# Vader Model
def vader_sentiment(text):
    """Return the 'compound' entry of VADER's polarity scores for ``text``."""
    scores = vader_analyzer.polarity_scores(text)
    return scores['compound']


# TextBlob Model
def textblob_sentiment(text):
    """Return a TextBlob-based score computed as positive minus negative share.

    The polarity is rescaled via ``(polarity + 1) / 2`` to a "positive" share,
    its complement is the "negative" share, and their difference is returned.
    """
    positive_prob = (TextBlob(text).sentiment.polarity + 1) / 2  # Normalize to [0, 1]
    return positive_prob - (1 - positive_prob)

# def textblob_sentiment(text):
#     polarity = TextBlob(text).sentiment.polarity
    
#     neutral_prob = 1 - abs(polarity)
    
#     positive_prob = (polarity + 1) / 2  # Convert [-1, 1] polarity to [0, 1]
#     negative_prob = 1 - positive_prob
#     compound_score = (positive_prob - negative_prob) * (1 - neutral_prob)
#     print('textblob', positive_prob, negative_prob, neutral_prob, compound_score )
#     return compound_score

Machine Learning Model: SVM with TF-IDF

In [205]:
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.metrics import accuracy_score
# from sklearn.svm import SVC
# from sklearn.preprocessing import StandardScaler
# from sklearn.pipeline import make_pipeline


# # TF-IDF Method
# tfidf_vectorizer = TfidfVectorizer(max_features=5000)
# X_train_tfidf = tfidf_vectorizer.fit_transform(X_svm_train)
# X_test_tfidf = tfidf_vectorizer.transform(X_svm_test)

# def tfidf_features(texts):
#     return tfidf_vectorizer.transform(texts).toarray()


# # SVM Model with TF-IDF features
# svm_model = make_pipeline(StandardScaler(with_mean=False), SVC(probability=True))
# svm_model.fit(X_train_tfidf, y_train)
# svm_predictions = svm_model.predict_proba(X_test_tfidf)[:, 1]  # Probability of positive class

# def svm_score(tfidf_features):
#     svm_model.predict_proba(tfidf_features)[:, 1]

In [206]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
# import numpy as np

# SVM Model definition using TF-IDF Vectorizer
# The pipeline vectorizes raw text itself (TF-IDF capped at 5000 features), so
# callers pass strings directly. probability=True enables predict_proba, which
# svm_sentiment relies on.
svm_model = make_pipeline(TfidfVectorizer(max_features=5000), SVC(probability=True))

# Training the SVM model
# Fits on the stop-word-stripped training text against the sentiment labels.
svm_model.fit(X_svm_train, y_train)

# SVM Sentiment Prediction function
def svm_sentiment(text):
    """Return the SVM score for ``text``: last-class minus first-class probability.

    Reads positions 0 and 2 of the single-row ``predict_proba`` output as the
    negative and positive probabilities respectively.
    NOTE(review): this assumes the fitted model has exactly the three classes
    in ascending label order (-1, 0, 1) — confirm against the training labels.
    """
    class_probabilities = svm_model.predict_proba([text])[0]
    return class_probabilities[2] - class_probabilities[0]

In [207]:
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.metrics import accuracy_score
# from sklearn.svm import SVC
# from sklearn.preprocessing import StandardScaler
# from sklearn.pipeline import make_pipeline


# # TF-IDF Method
# tfidf_vectorizer = TfidfVectorizer(max_features=5000)
# X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
# X_test_tfidf = tfidf_vectorizer.transform(X_test)

# def tfidf_features(texts):
#     return tfidf_vectorizer.transform(texts).toarray()


# # SVM Model with TF-IDF features
# svm_model = make_pipeline(StandardScaler(with_mean=False), SVC(probability=True))
# svm_model.fit(X_train_tfidf, Y_train)
# svm_predictions = svm_model.predict_proba(X_test_tfidf)[:, 1]  # Probability of positive class

# def svm_score(tfidf_features):
#     svm_model.predict_proba(tfidf_features)[:, 1]

In [208]:
# from sklearn.svm import SVC
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.metrics import accuracy_score
# from sklearn.preprocessing import StandardScaler
# from sklearn.pipeline import make_pipeline


# # Function to preprocess for SVM
# def preprocess_svm(texts, vectorizer):
#     return vectorizer.transform(texts)

# # Train SVM model with TF-IDF vectorizer
# def train_svm(x_data, y_data):
#     vectorizer = TfidfVectorizer(max_features=2000)
#     X_train_tfidf = vectorizer.fit_transform(x_data)
#     svm = SVC(probability=True)
#     svm.fit(X_train_tfidf, y_data)
#     return svm, vectorizer

# def svm_sentiment(text, vectorizer, model):
#     X = preprocess_svm([text], vectorizer)
#     return model.predict_proba(X)[0][1]

Deep Learning Models: (LSTM, CNN)

In [209]:
# import numpy as np
# from tensorflow.keras.preprocessing.text import Tokenizer
# from tensorflow.keras.preprocessing.sequence import pad_sequences
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Embedding, LSTM, Dense, Conv1D, MaxPooling1D, Flatten, Dropout

# # lstm Model 
# def build_lstm(vocab_size, embedding_dim=128, max_length=100):
#     model = Sequential()
#     model.add(Embedding(vocab_size, embedding_dim, input_length=max_length))
#     model.add(LSTM(128, return_sequences=False))
#     model.add(Dense(1, activation='sigmoid'))
#     model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
#     return model


# # CNN Model
# def build_cnn(vocab_size, embedding_dim=128, max_length=100):
#     model = Sequential()
#     model.add(Embedding(vocab_size, embedding_dim, input_length=max_length))
#     model.add(Conv1D(128, 5, activation='relu'))
#     model.add(MaxPooling1D(pool_size=2))
#     model.add(Flatten())
#     model.add(Dense(1, activation='sigmoid'))
#     model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
#     return model

# # Create Tokenizer for LSTM and CNN
# tokenizer = Tokenizer(num_words=5000)
# tokenizer.fit_on_texts(X_cnn_lstm_train)
# X_train_seq = tokenizer.texts_to_sequences(X_cnn_lstm_train)
# X_test_seq = tokenizer.texts_to_sequences(X_cnn_lstm_test)

# max_length = 100
# X_train_pad = pad_sequences(X_train_seq, maxlen=max_length)
# X_test_pad = pad_sequences(X_test_seq, maxlen=max_length)

# vocab_size = len(tokenizer.word_index) + 1

# # Build and train LSTM and CNN models
# lstm_model = build_lstm(vocab_size, max_length=max_length)
# cnn_model = build_cnn(vocab_size, max_length=max_length)

# lstm_model.fit(X_train_pad, np.array(y_train), epochs=5, batch_size=32, validation_split=0.2)
# cnn_model.fit(X_train_pad, np.array(y_train), epochs=5, batch_size=32, validation_split=0.2)

# # LSTM and CNN Predictions
# lstm_predictions = lstm_model.predict(X_test_pad).flatten()
# cnn_predictions = cnn_model.predict(X_test_pad).flatten()

In [210]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# Parameters
# Shared by the tokenizer, the padding step, and both build_lstm / build_cnn.
vocab_size = 10000  # Vocabulary size
max_length = 100  # Max length for input sequences
embedding_dim = 100  # Embedding vector size

# Tokenization and padding
# The tokenizer is fitted on the training text only; words it does not keep are
# mapped to the '<OOV>' token.
tokenizer = Tokenizer(num_words=vocab_size, oov_token='<OOV>')
tokenizer.fit_on_texts(X_cnn_lstm_train)
X_train_seq = tokenizer.texts_to_sequences(X_cnn_lstm_train)
# X_test_seq = tokenizer.texts_to_sequences(X_cnn_lstm_test)

# Pad/truncate every sequence to max_length, padding at the end ('post').
X_train_pad = pad_sequences(X_train_seq, maxlen=max_length, padding='post')
# X_test_pad = pad_sequences(X_test_seq, maxlen=max_length, padding='post')


# lstm Model
def build_lstm():
    """Build and compile the LSTM classifier.

    Architecture: Embedding(vocab_size, embedding_dim) -> LSTM(128) ->
    Dense(1, sigmoid), compiled with binary cross-entropy, Adam, and accuracy.
    Sizes come from the module-level vocab_size / embedding_dim / max_length.
    """
    layer_stack = [
        Embedding(vocab_size, embedding_dim, input_length=max_length),
        LSTM(128, return_sequences=False),
        Dense(1, activation='sigmoid'),
    ]
    model = Sequential(layer_stack)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model


# Create and train the LSTM model
lstm_model = build_lstm()
# build_lstm ends in a single sigmoid unit trained with binary cross-entropy,
# which expects 0/1 targets. y_train holds -1/0/1 sentiment labels, so the
# model is trained on "is positive" (1.0) versus everything else (0.0); this
# matches lstm_sentiment, which reads the output as the positive probability.
lstm_model.fit(
    X_train_pad,
    (np.array(y_train) == 1).astype('float32'),
    epochs=5,
    batch_size=32,
    validation_split=0.2,
)

# LSTM Sentiment Prediction
def lstm_sentiment(text):
    """Return the LSTM score for ``text`` as positive minus negative probability.

    The text is tokenized and post-padded to ``max_length``; the model's single
    output is taken as the positive probability and its complement as negative.
    """
    token_ids = tokenizer.texts_to_sequences([text])
    model_input = pad_sequences(token_ids, maxlen=max_length, padding='post')
    positive_prob = lstm_model.predict(model_input)[0][0]
    return positive_prob - (1 - positive_prob)



Epoch 1/5
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 328ms/step - accuracy: 0.8245 - loss: 0.6415 - val_accuracy: 0.7750 - val_loss: 0.6546
Epoch 2/5
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 137ms/step - accuracy: 0.8079 - loss: 0.6138 - val_accuracy: 0.7750 - val_loss: 0.6164
Epoch 3/5
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 146ms/step - accuracy: 0.8264 - loss: 0.5660 - val_accuracy: 0.7750 - val_loss: 0.6527
Epoch 4/5
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 121ms/step - accuracy: 0.8283 - loss: 0.5211 - val_accuracy: 0.7750 - val_loss: 0.6239
Epoch 5/5
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 143ms/step - accuracy: 0.8159 - loss: 0.5208 - val_accuracy: 0.7750 - val_loss: 0.6079


In [211]:
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten

# CNN Model
def build_cnn():
    """Build and compile the 1-D CNN classifier.

    Architecture: Embedding -> Conv1D(128 filters, kernel 5, ReLU) ->
    MaxPooling1D(2) -> Flatten -> Dense(1, sigmoid), compiled with binary
    cross-entropy, Adam, and accuracy. Sizes come from the module-level
    vocab_size / embedding_dim / max_length.
    """
    layer_stack = [
        Embedding(vocab_size, embedding_dim, input_length=max_length),
        Conv1D(128, 5, activation='relu'),
        MaxPooling1D(pool_size=2),
        Flatten(),
        Dense(1, activation='sigmoid'),
    ]
    model = Sequential(layer_stack)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# Create and train the CNN model
cnn_model = build_cnn()
# build_cnn ends in a single sigmoid unit trained with binary cross-entropy,
# which expects 0/1 targets. y_train holds -1/0/1 sentiment labels, so the
# model is trained on "is positive" (1.0) versus everything else (0.0); this
# matches cnn_sentiment, which reads the output as the positive probability.
cnn_model.fit(
    X_train_pad,
    (np.array(y_train) == 1).astype('float32'),
    epochs=5,
    batch_size=32,
    validation_split=0.2,
)

# Function for predicting sentiment using CNN
def cnn_sentiment(text):
    """Return the CNN score for ``text`` as positive minus negative probability.

    The text is tokenized and post-padded to ``max_length``; the model's single
    output is taken as the positive probability and its complement as negative.
    """
    token_ids = tokenizer.texts_to_sequences([text])
    model_input = pad_sequences(token_ids, maxlen=max_length, padding='post')
    positive_prob = cnn_model.predict(model_input)[0][0]
    return positive_prob - (1 - positive_prob)

Epoch 1/5
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 131ms/step - accuracy: 0.7506 - loss: 0.6453 - val_accuracy: 0.7750 - val_loss: 0.6875
Epoch 2/5
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step - accuracy: 0.8048 - loss: 0.5978 - val_accuracy: 0.7750 - val_loss: 0.6461
Epoch 3/5
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step - accuracy: 0.8153 - loss: 0.4772 - val_accuracy: 0.7750 - val_loss: 0.6332
Epoch 4/5
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step - accuracy: 0.8333 - loss: 0.3719 - val_accuracy: 0.7750 - val_loss: 0.6439
Epoch 5/5
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step - accuracy: 0.8595 - loss: 0.3368 - val_accuracy: 0.7750 - val_loss: 0.6126


In [212]:
# import numpy as np
# from tensorflow.keras.preprocessing.text import Tokenizer
# from tensorflow.keras.preprocessing.sequence import pad_sequences
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Embedding, LSTM, Dense, Conv1D, MaxPooling1D, Flatten, Dropout


# # LSTM Model
# def Build_lstm_model(input_length):
#     model = Sequential()
#     model.add(Embedding(MAX_NUM_WORDS, 128, input_length=input_length))
#     model.add(LSTM(128, return_sequences=False))
#     model.add(Dropout(0.2))
#     model.add(Dense(1, activation='sigmoid'))
#     model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
#     return model


# def Build_cnn_model(input_length):
#     model = Sequential()
#     model.add(Embedding(input_dim=5000, output_dim=128, input_length=input_length))
#     model.add(Conv1D(64, 5, activation='relu'))
#     model.add(MaxPooling1D(pool_size=4))
#     model.add(Flatten())
#     model.add(Dense(1, activation='sigmoid'))
#     model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
#     return model


# # Tokenizer for LSTM/CNN
# MAX_NUM_WORDS = 20000
# MAX_SEQUENCE_LENGTH = 100


# # Create Tokenizer for LSTM and CNN
# tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
# tokenizer.fit_on_texts(X_train)
# X_train_seq = tokenizer.texts_to_sequences(X_train)
# X_test_seq = tokenizer.texts_to_sequences(X_test)

# X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_SEQUENCE_LENGTH)
# X_test_pad = pad_sequences(X_test_seq, maxlen=MAX_SEQUENCE_LENGTH)


# # LSTM Model training
# lstm_model = Build_lstm_model(input_length=MAX_SEQUENCE_LENGTH)
# lstm_model.fit(X_train_pad, np.array(Y_train), epochs=5, batch_size=32, validation_split=0.2)


# # CNN Model training
# cnn_model = Build_cnn_model(input_length=MAX_SEQUENCE_LENGTH)
# cnn_model.fit(X_train_pad, np.array(Y_train), epochs=5, batch_size=32, validation_split=0.2)


# # LSTM and CNN Predictions
# lstm_predictions = lstm_model.predict(X_test_pad).flatten()
# cnn_predictions = cnn_model.predict(X_test_pad).flatten()

Hybrid Model

In [213]:
# nltk.download('averaged_perceptron_tagger')
# nltk.download('averaged_perceptron_tagger_eng')
# def hybrid_model(texts, vader_texts, transformers_texts, svm_texts):
#     results = []
    
#     for i, text in enumerate(texts):
#         # Vader Sentiment
#         vader_score = vader_sentiment(vader_texts[i])
        
#         # Transformer Models (RoBERTa, BERT)
#         roberta_score = roberta_sentiment(transformers_texts[i])
#         bert_score = bert_sentiment(transformers_texts[i])

#         # SVM Prediction (using TF-IDF)
#         svm_score = svm_sentiment(svm_texts[i])

#         # Additional Models: TextBlob
#         textblob_score = textblob_sentiment(text)

#         # LSTM and CNN Predictions
#         lstm_score = lstm_predictions[i]
#         cnn_score = cnn_predictions[i]

#         # SVM Prediction (using TF-IDF)
#         # tfidf_features = tfidf_vectorizer.transform([svm_texts[i]])
#         # svm_score = svm_model.predict(tfidf_features)[0]
        

#         # POS Tags (custom function for counting nouns and verbs)
#         noun_count, verb_count = pos_tags(text)

#         # Calculate average score (simple hybrid result aggregation)
#         avg_score = np.mean([
#             vader_score, textblob_score,  roberta_score, bert_score, lstm_score, cnn_score, svm_score
#         ])

#         # VADER
#         if vader_score > 0.1:
#             sentiment_vader = "Positive"
#         elif vader_score == -0.1:
#             sentiment_vader = "Neutral"
#         else:
#             sentiment_vader = "Negative"

#         # TextBlob
#         if textblob_score > 0.1:
#             sentiment_textblob = "Positive"
#         elif textblob_score == -0.1:
#             sentiment_textblob = "Neutral"
#         else:
#             sentiment_textblob = "Negative"

#         # RoBERTa
#         if roberta_score > 0.1:
#             sentiment_roberta = "Positive"
#         elif roberta_score < -0.1:
#             sentiment_roberta = "Negative"
#         else:
#             sentiment_roberta = "Neutral"

#         # BERT
#         if bert_score > 0.1:
#             sentiment_bert = "Positive"
#         elif bert_score < -0.1:
#             sentiment_bert = "Negative"
#         else:
#             sentiment_bert = "Neutral"

#         # CNN
#         if cnn_score > 0.55:
#             sentiment_cnn = "Positive"
#         elif cnn_score < 0.45:
#             sentiment_cnn = "Negative"
#         else:
#             sentiment_cnn = "Neutral"

#         # LSTM
#         if lstm_score > 0.55:
#             sentiment_lstm = "Positive"
#         elif lstm_score < 0.45:
#             sentiment_lstm = "Negative"
#         else:
#             sentiment_lstm = "Neutral"
        
#         # SVM
#         if svm_score == 1:
#             sentiment_svm = "Positive"
#         elif svm_score == 0:
#             sentiment_svm = "Negative"
#         else:
#             sentiment_svm = "Neutral"

#         # Average score
#         if avg_score > 0.55:
#             sentiment_avg = "Positive"
#         elif avg_score < 0.45:
#             sentiment_avg = "Negative"
#         else:
#             sentiment_avg = "Neutral"

#         # Append results for this text
#         results.append({
#             'text': text,
#             'vader': vader_score,
#             'textblob': textblob_score,
#             'roberta': roberta_score,
#             'bert': bert_score,
#             'cnn': cnn_score,
#             'lstm': lstm_score,
#             'svm': svm_score,
#             'average_score': avg_score,
#             'sentiment': df['sentiment'][i],
#             'vader_sentiment': sentiment_vader,
#             'textblob_sentiment': sentiment_textblob,
#             'roberta_sentiment': sentiment_roberta,
#             'bert_sentiment': sentiment_bert,
#             'cnn_sentiment': sentiment_cnn,
#             'lstm_sentiment': sentiment_lstm,
#             'svm_sentiment': sentiment_svm,
#             'avg_sentiment': sentiment_avg,
#             'nouncount_': noun_count,
#             'verbcount_': verb_count
#         })
    
#     return results


# df['review'] = df['review'].fillna('').astype(str)
# min_length = min(len(X_text_test), len(X_vader_test), len(X_transformers_test), len(X_cnn_lstm_test), len(X_svm_test))

# texts = X_text_test[:min_length].reset_index(drop=True)
# vader_texts = X_vader_test[:min_length].reset_index(drop=True)
# transformers_texts = X_transformers_test[:min_length].reset_index(drop=True)
# cnn_lstm_texts = X_cnn_lstm_test[:min_length].reset_index(drop=True)
# svm_texts = X_svm_test[:min_length].reset_index(drop=True)

# # Run the hybrid model
# hybrid_results = hybrid_model(texts, vader_texts, transformers_texts, svm_texts)


In [214]:
# nltk.download('averaged_perceptron_tagger')
# nltk.download('averaged_perceptron_tagger_eng')
# def hybrid_model(texts, vader_texts, transformers_texts, svm_texts):
#     results = []
    
#     for i, text in enumerate(texts):
#         # Vader Sentiment
#         vader_score = vader_sentiment(vader_texts[i])
#         textblob_score = textblob_sentiment(text)

#         # Transformer Models (RoBERTa, BERT)
#         roberta_score = roberta_sentiment(transformers_texts[i])
#         bert_score = bert_sentiment(transformers_texts[i])
#         gpt2_score = gpt2_sentiment(transformers_texts[i])

#         # LSTM and CNN Predictions
#         lstm_score = lstm_predictions[i]
#         cnn_score = cnn_predictions[i]

#         # SVM Prediction (using TF-IDF)
#         tfidf_features = tfidf_vectorizer.transform([svm_texts[i]])
#         svm_score = svm_model.predict(tfidf_features)[0]



#         # POS Tags (custom function for counting nouns and verbs)
#         noun_count, verb_count = pos_tags(text)

#         # Calculate average score (simple hybrid result aggregation)
#         avg_score = np.mean([
#             vader_score, textblob_score,  roberta_score, bert_score, lstm_score, cnn_score, svm_score
#         ])

#         # VADER
#         if vader_score > 0:
#             sentiment_vader = "Positive"
#         elif vader_score == 0:
#             sentiment_vader = "Neutral"
#         else:
#             sentiment_vader = "Negative"

#         # TextBlob
#         if textblob_score > 0:
#             sentiment_textblob = "Positive"
#         elif textblob_score == 0:
#             sentiment_textblob = "Neutral"
#         else:
#             sentiment_textblob = "Negative"

#         # Gpt2
#         if gpt2_score > 0.55:
#             sentiment_gpt2 = "Positive"
#         elif gpt2_score < 0.45:
#             sentiment_gpt2 = "Neutral"
#         else:
#             sentiment_gpt2 = "Negative"

#         # RoBERTa
#         if roberta_score > 0.55:
#             sentiment_roberta = "Positive"
#         elif roberta_score < 0.45:
#             sentiment_roberta = "Negative"
#         else:
#             sentiment_roberta = "Neutral"

#         # BERT
#         if bert_score > 0.55:
#             sentiment_bert = "Positive"
#         elif bert_score < 0.45:
#             sentiment_bert = "Negative"
#         else:
#             sentiment_bert = "Neutral"

#         # CNN
#         if cnn_score > 0.55:
#             sentiment_cnn = "Positive"
#         elif cnn_score < 0.45:
#             sentiment_cnn = "Negative"
#         else:
#             sentiment_cnn = "Neutral"

#         # LSTM
#         if lstm_score > 0.55:
#             sentiment_lstm = "Positive"
#         elif lstm_score < 0.45:
#             sentiment_lstm = "Negative"
#         else:
#             sentiment_lstm = "Neutral"
        
#         # SVM
#         if svm_score == 1:
#             sentiment_svm = "Positive"
#         elif svm_score == 0:
#             sentiment_svm = "Negative"
#         else:
#             sentiment_svm = "Neutral"

#         # Average score
#         if avg_score > 0.55:
#             sentiment_avg = "Positive"
#         elif avg_score < 0.45:
#             sentiment_avg = "Negative"
#         else:
#             sentiment_avg = "Neutral"

#         # Append results for this text
#         results.append({
#             'text': text,
#             'vader': vader_score,
#             'textblob': textblob_score,
#             'roberta': roberta_score,
#             'bert': bert_score,
#             'gpt2': gpt2_score,
#             'cnn': cnn_score,
#             'lstm': lstm_score,
#             'svm': svm_score,
#             'average_score': avg_score,
#             'sentiment': df['sentiment'][i],
#             'vader_sentiment': sentiment_vader,
#             'textblob_sentiment': sentiment_textblob,
#             'roberta_sentiment': sentiment_roberta,
#             'bert_sentiment': sentiment_bert,
#             'gpt2_sentiment': sentiment_gpt2,
#             'cnn_sentiment': sentiment_cnn,
#             'lstm_sentiment': sentiment_lstm,
#             'svm_sentiment': sentiment_svm,
#             'avg_sentiment': sentiment_avg,
#             'nouncount_': noun_count,
#             'verbcount_': verb_count
#         })
    
#     return results


# df['review'] = df['review'].fillna('').astype(str)
# min_length = min(len(X_text_test), len(X_vader_test), len(X_transformers_test), len(X_cnn_lstm_test), len(X_svm_test))

# texts = X_text_test[:min_length].reset_index(drop=True)
# vader_texts = X_vader_test[:min_length].reset_index(drop=True)
# transformers_texts = X_transformers_test[:min_length].reset_index(drop=True)
# cnn_lstm_texts = X_cnn_lstm_test[:min_length].reset_index(drop=True)
# svm_texts = X_svm_test[:min_length].reset_index(drop=True)

# # Run the hybrid model
# hybrid_results = hybrid_model(texts, vader_texts, transformers_texts, svm_texts)


In [215]:
def _sentiment_label(score, positive_above=0.0, negative_below=0.0):
    """Turn a numeric score into a "Positive" / "Negative" / "Neutral" label.

    Args:
        score: Numeric score produced by one of the sentiment scorers.
        positive_above: The label is "Positive" when ``score`` is strictly
            greater than this value.
        negative_below: The label is "Negative" when ``score`` is strictly
            less than this value.

    Returns:
        "Positive", "Negative", or "Neutral" (anything between the two
        cut-offs, cut-offs included).
    """
    if score > positive_above:
        return "Positive"
    if score < negative_below:
        return "Negative"
    return "Neutral"


def hybrid_model(text, vader_text, transformers_text, cnn_lstm_text, svm_text):
    """Score one review with every model and return scores plus labels.

    Args:
        text: Review text passed to ``textblob_sentiment`` and echoed back
            under the ``"text"`` key.
        vader_text: Text passed to ``vader_sentiment``.
        transformers_text: Text passed to ``roberta_sentiment`` and
            ``bert_sentiment``.
        cnn_lstm_text: Text passed to ``cnn_sentiment`` and ``lstm_sentiment``.
        svm_text: Text passed to ``svm_sentiment``.

    Returns:
        dict with the input text, one ``*_score`` entry per model, the
        ``hybrid_score`` (mean of the VADER, RoBERTa, BERT, TextBlob and SVM
        scores), and one ``*_sentiment`` label per score.
    """
    # Scorers are called in the same order as before so any output they print
    # (e.g. model progress bars) is unchanged.
    vader_score = vader_sentiment(vader_text)
    roberta_score = roberta_sentiment(transformers_text)
    bert_score = bert_sentiment(transformers_text)
    textblob_score = textblob_sentiment(text)
    svm_score = svm_sentiment(svm_text)
    cnn_score = cnn_sentiment(cnn_lstm_text)
    lstm_score = lstm_sentiment(cnn_lstm_text)

    # Combine the predictions (averaging for simplicity).
    # NOTE(review): cnn_score and lstm_score are reported below but are NOT part
    # of this mean. Their cut-offs (0.45 / 0.55) differ from the zero-centred
    # ones used for the other models, so they presumably live on a different
    # scale -- confirm that leaving them out is intentional.
    hybrid_score = np.mean([vader_score, roberta_score, bert_score, textblob_score, svm_score])

    # Key order is preserved: it determines the column order of any DataFrame
    # built from these records.
    return {
        "text": text,
        "vader_score": vader_score,
        "textblob_score": textblob_score,
        "roberta_score": roberta_score,
        "bert_score": bert_score,
        "cnn_score": cnn_score,
        "lstm_score": lstm_score,
        "svm_score": svm_score,
        "hybrid_score": hybrid_score,
        # Zero-centred scores: any positive value is Positive, any negative is Negative.
        'vader_sentiment': _sentiment_label(vader_score),
        'textblob_sentiment': _sentiment_label(textblob_score),
        'roberta_sentiment': _sentiment_label(roberta_score),
        'bert_sentiment': _sentiment_label(bert_score),
        # CNN / LSTM use a neutral band of [0.45, 0.55].
        'cnn_sentiment': _sentiment_label(cnn_score, positive_above=0.55, negative_below=0.45),
        'lstm_sentiment': _sentiment_label(lstm_score, positive_above=0.55, negative_below=0.45),
        'svm_sentiment': _sentiment_label(svm_score),
        # The averaged score uses a neutral band of [-0.1, 0.1].
        'hybrid_sentiment': _sentiment_label(hybrid_score, positive_above=0.1, negative_below=-0.1),
    }

In [216]:
# Normalise the raw review column: missing values become '' and everything is a str.
df['review'] = df['review'].fillna('').astype(str)

# Truncate every test split to the shortest one so the per-review inputs stay aligned.
min_length = min(len(X_text_test), len(X_vader_test), len(X_transformers_test), len(X_cnn_lstm_test), len(X_svm_test))

# X_text_test was created in an earlier cell, so the assignment to df['review']
# above does not reach it; apply the same normalisation to the raw test reviews
# so a missing review is scored as '' instead of being passed on as NaN.
texts = X_text_test[:min_length].reset_index(drop=True).fillna('').astype(str)
vader_texts = X_vader_test[:min_length].reset_index(drop=True)
transformers_texts = X_transformers_test[:min_length].reset_index(drop=True)
cnn_lstm_texts = X_cnn_lstm_test[:min_length].reset_index(drop=True)
svm_texts = X_svm_test[:min_length].reset_index(drop=True)

# One record (dict) per review; zip walks the aligned splits positionally
# instead of relying on index-label lookups.
records = []
for text, vader_text, transformers_text, cnn_lstm_text, svm_text in zip(
    texts, vader_texts, transformers_texts, cnn_lstm_texts, svm_texts
):
    records.append(hybrid_model(text, vader_text, transformers_text, cnn_lstm_text, svm_text))

# `results_list` is kept as the name of the final DataFrame, as before.
results_list = pd.DataFrame(records)
results_list.to_csv('sentiment_results.csv', index=False)

print("Results saved to sentiment_results.csv")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step   
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 873ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 68ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 66ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 65ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 65ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 230ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m

Compare model sentiments

In [217]:
# results_list = pd.DataFrame(results_list)
# results_list.to_csv('sentiment_results.csv', index=False)

In [218]:
# results_df = pd.DataFrame(hybrid_results)

# results_df.to_csv('amazon_sentiment_result.csv', index=False)

In [219]:
# import pandas as pd

# df = pd.read_csv('../dataset/amazon_datasets.csv')

# # df = df.head(500)
# df.shape

# import re
# import nltk
# from nltk.corpus import stopwords
# # nltk.download('stopwords')

# stop_words = set(stopwords.words('english'))

# # General Preprocessing (normalization, spaces)
# def basic_preprocess(text):
#     if not isinstance(text, str):
#         return ""
#     text = text.lower()  # Convert text to lowercase
#     text = re.sub(r'\d+', '', text)  # Remove numbers
#     text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
#     return text

# # Unified Preprocessing (for all models)
# def preprocess_data(text):
#     text = basic_preprocess(text)
#     text = re.sub(r'[^\w\s,!?]', '', text)  # Remove punctuation except meaningful ones
#     return text

# # Assign Sentiment on the basis of customer ratings
# def assign_sentiment(rating):
#     if rating >= 4:
#         return 1
#     elif rating == 3:
#         return 0
#     else:
#         return -1
    

# df['clean_review']  =   df['review'].apply(preprocess_data)
# df['sentiment']     =   df['rating'].apply(assign_sentiment)

# df.to_csv('clean_datasets.csv', index=False)

# df = df.head(500)
# df.shape

# from sklearn.model_selection import train_test_split

# X_text = df['review']
# X = df['clean_review']
# Y = df['sentiment']

# X_text_train, X_text_test, X_train, X_test, Y_train, Y_test = train_test_split(X_text, X, Y, test_size=0.2, random_state=32)


# import torch
# import torch.nn.functional as F
# from transformers import RobertaTokenizer, RobertaForSequenceClassification, BertTokenizer, BertForSequenceClassification, GPT2Tokenizer, GPT2Model

# # Load Pre-Trained Models
# roberta_tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
# roberta_model = RobertaForSequenceClassification.from_pretrained("roberta-base")

# bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# bert_model = BertForSequenceClassification.from_pretrained("bert-base-uncased")

# gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# gpt2_model = GPT2Model.from_pretrained("gpt2")

# # Function to preprocess data for transformer models (RoBERTa, BERT, GPT-2)
# def preprocess_transformers(texts, tokenizer, max_length=128):
#     if tokenizer.pad_token is None:
#         tokenizer.add_special_tokens({'pad_token': '[PAD]'})
#     inputs = tokenizer(texts, return_tensors="pt", truncation=True, padding=True, max_length=max_length)
#     return inputs

# # Function to run a sentiment model
# def roberta_sentiment(text):
#     inputs = preprocess_transformers([text], roberta_tokenizer)
#     outputs = roberta_model(**inputs).logits
#     probabilities = F.softmax(outputs, dim=-1)
#     positive_prob = probabilities[0, 1].item()
#     return positive_prob

# def bert_sentiment(text):
#     inputs = preprocess_transformers([text], bert_tokenizer)
#     outputs = bert_model(**inputs).logits
#     probabilities = F.softmax(outputs, dim=-1)
#     positive_prob = probabilities[0, 1].item()
#     return positive_prob


# def gpt2_sentiment(text):
#     if len(text.strip()) == 0:  # If the text is empty or just whitespace
#         return 0.0  # Assign a neutral score or some default value
    
#     # Preprocess text
#     inputs = preprocess_transformers([text], gpt2_tokenizer)
    
#     if inputs['input_ids'].shape[1] == 0:  # Check if input tensor is empty
#         return 0.0
#     # inputs = preprocess_transformers([text], gpt2_tokenizer)
#     outputs = gpt2_model(**inputs).last_hidden_state.mean(dim=1)
#     positive_prob = torch.sigmoid(outputs).mean().item()
#     return positive_prob


# from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
# from textblob import TextBlob

# vader_analyzer = SentimentIntensityAnalyzer()

# # Vader Model 
# def vader_sentiment(text):
#     return vader_analyzer.polarity_scores(text)['compound']

# # TextBlob Model 
# def textblob_sentiment(text):
#     return TextBlob(text).sentiment.polarity


# from nltk import pos_tag

# def pos_tags(text):
#     tags = pos_tag(text.split())
#     noun_count = sum(1 for word, tag in tags if tag.startswith('NN'))
#     verb_count = sum(1 for word, tag in tags if tag.startswith('VB'))
#     return noun_count, verb_count


# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.metrics import accuracy_score
# from sklearn.svm import SVC
# from sklearn.preprocessing import StandardScaler
# from sklearn.pipeline import make_pipeline


# # TF-IDF Method
# tfidf_vectorizer = TfidfVectorizer(max_features=5000)
# X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
# X_test_tfidf = tfidf_vectorizer.transform(X_test)

# def tfidf_features(texts):
#     return tfidf_vectorizer.transform(texts).toarray()


# # SVM Model with TF-IDF features
# svm_model = make_pipeline(StandardScaler(with_mean=False), SVC(probability=True))
# svm_model.fit(X_train_tfidf, Y_train)
# svm_predictions = svm_model.predict_proba(X_test_tfidf)[:, 1]  # Probability of positive class

# def svm_score(tfidf_features):
#     svm_model.predict_proba(tfidf_features)[:, 1]


# import numpy as np
# from tensorflow.keras.preprocessing.text import Tokenizer
# from tensorflow.keras.preprocessing.sequence import pad_sequences
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Embedding, LSTM, Dense, Conv1D, MaxPooling1D, Flatten, Dropout


# # LSTM Model
# def Build_lstm_model(input_length):
#     model = Sequential()
#     model.add(Embedding(MAX_NUM_WORDS, 128, input_length=input_length))
#     model.add(LSTM(128, return_sequences=False))
#     model.add(Dropout(0.2))
#     model.add(Dense(1, activation='sigmoid'))
#     model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
#     return model


# def Build_cnn_model(input_length):
#     model = Sequential()
#     model.add(Embedding(input_dim=5000, output_dim=128, input_length=input_length))
#     model.add(Conv1D(64, 5, activation='relu'))
#     model.add(MaxPooling1D(pool_size=4))
#     model.add(Flatten())
#     model.add(Dense(1, activation='sigmoid'))
#     model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
#     return model


# # Tokenizer for LSTM/CNN
# MAX_NUM_WORDS = 20000
# MAX_SEQUENCE_LENGTH = 100


# # Create Tokenizer for LSTM and CNN
# tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
# tokenizer.fit_on_texts(X_train)
# X_train_seq = tokenizer.texts_to_sequences(X_train)
# X_test_seq = tokenizer.texts_to_sequences(X_test)

# X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_SEQUENCE_LENGTH)
# X_test_pad = pad_sequences(X_test_seq, maxlen=MAX_SEQUENCE_LENGTH)


# # LSTM Model training
# lstm_model = Build_lstm_model(input_length=MAX_SEQUENCE_LENGTH)
# lstm_model.fit(X_train_pad, np.array(Y_train), epochs=5, batch_size=32, validation_split=0.2)


# # CNN Model training
# cnn_model = Build_cnn_model(input_length=MAX_SEQUENCE_LENGTH)
# cnn_model.fit(X_train_pad, np.array(Y_train), epochs=5, batch_size=32, validation_split=0.2)


# # LSTM and CNN Predictions
# lstm_predictions = lstm_model.predict(X_test_pad).flatten()
# cnn_predictions = cnn_model.predict(X_test_pad).flatten()




# # nltk.download('averaged_perceptron_tagger')
# # nltk.download('averaged_perceptron_tagger_eng')
# def hybrid_model(texts, clean_text):
#     results = []
    
#     for i, text in enumerate(texts):
#         # Vader Sentiment
#         vader_score = vader_sentiment(clean_text[i])
#         textblob_score = textblob_sentiment(text)

#         # Transformer Models (RoBERTa, BERT)
#         roberta_score = roberta_sentiment(clean_text[i])
#         bert_score = bert_sentiment(clean_text[i])
#         gpt2_score = gpt2_sentiment(clean_text[i])

#         # LSTM and CNN Predictions
#         lstm_score = lstm_predictions[i]
#         cnn_score = cnn_predictions[i]

#         # SVM Prediction (using TF-IDF)
#         tfidf_features = tfidf_vectorizer.transform([clean_text[i]])
#         svm_score = svm_model.predict(tfidf_features)[0]



#         # POS Tags (custom function for counting nouns and verbs)
#         noun_count, verb_count = pos_tags(text)

#         # Calculate average score (simple hybrid result aggregation)
#         avg_score = np.mean([
#             vader_score, textblob_score,  roberta_score, bert_score, gpt2_score, lstm_score, cnn_score, svm_score
#         ])

#         # VADER
#         if vader_score > 0:
#             sentiment_vader = "Positive"
#         elif vader_score == 0:
#             sentiment_vader = "Neutral"
#         else:
#             sentiment_vader = "Negative"

#         # TextBlob
#         if textblob_score > 0:
#             sentiment_textblob = "Positive"
#         elif textblob_score == 0:
#             sentiment_textblob = "Neutral"
#         else:
#             sentiment_textblob = "Negative"

#         # Gpt2
#         if gpt2_score > 0.55:
#             sentiment_gpt2 = "Positive"
#         elif gpt2_score < 0.45:
#             sentiment_gpt2 = "Neutral"
#         else:
#             sentiment_gpt2 = "Negative"

#         # RoBERTa
#         if roberta_score > 0.55:
#             sentiment_roberta = "Positive"
#         elif roberta_score < 0.45:
#             sentiment_roberta = "Negative"
#         else:
#             sentiment_roberta = "Neutral"

#         # BERT
#         if bert_score > 0.55:
#             sentiment_bert = "Positive"
#         elif bert_score < 0.45:
#             sentiment_bert = "Negative"
#         else:
#             sentiment_bert = "Neutral"

#         # CNN
#         if cnn_score > 0.55:
#             sentiment_cnn = "Positive"
#         elif cnn_score < 0.45:
#             sentiment_cnn = "Negative"
#         else:
#             sentiment_cnn = "Neutral"

#         # LSTM
#         if lstm_score > 0.55:
#             sentiment_lstm = "Positive"
#         elif lstm_score < 0.45:
#             sentiment_lstm = "Negative"
#         else:
#             sentiment_lstm = "Neutral"
        
#         # SVM
#         if svm_score == 1:
#             sentiment_svm = "Positive"
#         elif svm_score == 0:
#             sentiment_svm = "Negative"
#         else:
#             sentiment_svm = "Neutral"

#         # Average score
#         if avg_score > 0.55:
#             sentiment_avg = "Positive"
#         elif avg_score < 0.45:
#             sentiment_avg = "Negative"
#         else:
#             sentiment_avg = "Neutral"

#         # Append results for this text
#         results.append({
#             'text': text,
#             'clean_text': clean_text[i],
#             'vader': vader_score,
#             'textblob': textblob_score,
#             'roberta': roberta_score,
#             'bert': bert_score,
#             'gpt2': gpt2_score,
#             'cnn': cnn_score,
#             'lstm': lstm_score,
#             'svm': svm_score,
#             'average_score': avg_score,
#             'sentiment': df['sentiment'][i],
#             'vader_sentiment': sentiment_vader,
#             'textblob_sentiment': sentiment_textblob,
#             'roberta_sentiment': sentiment_roberta,
#             'bert_sentiment': sentiment_bert,
#             'gpt2_sentiment': sentiment_gpt2,
#             'cnn_sentiment': sentiment_cnn,
#             'lstm_sentiment': sentiment_lstm,
#             'svm_sentiment': sentiment_svm,
#             'avg_sentiment': sentiment_avg,
#             'nouncount_': noun_count,
#             'verbcount_': verb_count
#         })
    
#     return results


# def clean_texts(texts):
#     return [str(text) if isinstance(text, str) else "" for text in texts]

# df['review'] = df['review'].fillna('').astype(str)
# min_length = min(len(X_text_test), len(X_test))

# texts = X_text_test[:min_length].reset_index(drop=True)
# texts = clean_texts(texts)
# clean_text = X_test[:min_length].reset_index(drop=True)
# # vader_texts = X_vader_test[:min_length].reset_index(drop=True)
# # transformers_texts = X_transformers_test[:min_length].reset_index(drop=True)
# # cnn_lstm_texts = X_cnn_lstm_test[:min_length].reset_index(drop=True)
# # svm_texts = X_svm_test[:min_length].reset_index(drop=True)

# # Run the hybrid model
# hybrid_results = hybrid_model(texts, clean_text)

# results_df = pd.DataFrame(hybrid_results)

# results_df.to_csv('amazon_sentiment_result2.csv', index=False)

In [None]:
# import numpy as np
# from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
# from textblob import TextBlob
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.svm import SVC
# from sklearn.pipeline import make_pipeline
# from transformers import pipeline
# from keras.models import Sequential
# from keras.layers import Embedding, LSTM, Dense, Conv1D, MaxPooling1D, Flatten
# from keras.preprocessing.text import Tokenizer
# from keras.preprocessing.sequence import pad_sequences

# # Instantiate pre-trained models
# vader_analyzer = SentimentIntensityAnalyzer()  # VADER Model
# roberta_pipeline = pipeline('sentiment-analysis', model='roberta-base')  # RoBERTa Model
# bert_pipeline = pipeline('sentiment-analysis', model='bert-base-uncased')  # BERT Model

# # SVM Model Setup (with TF-IDF)
# svm_model = make_pipeline(TfidfVectorizer(max_features=5000), SVC(probability=True))
# svm_model.fit(X_train_svm, y_train)  # Train the SVM model on your preprocessed data

# # CNN-LSTM model for sentiment analysis
# tokenizer = Tokenizer(num_words=5000)
# tokenizer.fit_on_texts(X_train_cnn_lstm)
# X_train_seq = tokenizer.texts_to_sequences(X_train_cnn_lstm)
# X_train_pad = pad_sequences(X_train_seq, maxlen=100)

# cnn_lstm_model = Sequential()
# cnn_lstm_model.add(Embedding(input_dim=5000, output_dim=128, input_length=100))
# cnn_lstm_model.add(Conv1D(128, kernel_size=5, activation='relu'))
# cnn_lstm_model.add(MaxPooling1D(pool_size=2))
# cnn_lstm_model.add(LSTM(128))
# cnn_lstm_model.add(Dense(1, activation='sigmoid'))

# cnn_lstm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# cnn_lstm_model.fit(X_train_pad, y_train, epochs=5, batch_size=64)

# # Function Definitions for each model
# def vader_sentiment(text):
#     score = vader_analyzer.polarity_scores(text)
#     return score['compound']  # Returns a score between -1 and 1

# def roberta_sentiment(text):
#     result = roberta_pipeline(text)[0]
#     return result['score'] if result['label'] == 'POSITIVE' else -result['score']

# def bert_sentiment(text):
#     result = bert_pipeline(text)[0]
#     return result['score'] if result['label'] == 'POSITIVE' else -result['score']

# def textblob_sentiment(text):
#     return TextBlob(text).sentiment.polarity

# def svm_sentiment(text):
#     probabilities = svm_model.predict_proba([text])[0]
#     positive_prob = probabilities[1]
#     negative_prob = probabilities[0]
#     return (positive_prob - negative_prob)

# def cnn_lstm_sentiment(text):
#     text_seq = tokenizer.texts_to_sequences([text])
#     text_pad = pad_sequences(text_seq, maxlen=100)
#     score = cnn_lstm_model.predict(text_pad)[0][0]
#     return score * 2 - 1  # Scale between -1 and 1

# # Hybrid Model - Combining all sentiment results
# def hybrid_sentiment(text):
#     # Get scores from each model
#     scores = [
#         vader_sentiment(text),
#         roberta_sentiment(text),
#         bert_sentiment(text),
#         textblob_sentiment(text),
#         svm_sentiment(text),
#         cnn_lstm_sentiment(text)
#     ]
#     # Averaging scores from all models
#     return np.mean(scores)

# # Example usage of hybrid model
# sample_text = "I absolutely love this product! It exceeded my expectations."
# hybrid_score = hybrid_sentiment(sample_text)
# print(f"Hybrid Sentiment Score for the text: {hybrid_score}")