# Classical Sentiment Analysis

In [1]:
import numpy as np
import re
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline, make_pipeline

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer

from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression


from sklearn.metrics import accuracy_score

## Naive Bayes

#### Using Bag-O-Words

In [2]:
# Naive Bayes on bag-of-words counts, evaluated on the predefined train/test CSVs.

# Load both splits. The shuffle is seeded so the row order is identical after a
# kernel restart. Only the first two CSV columns are kept: column 0 is exposed
# as "Sentence" and column 1 as "Type"; rebuilding the frame also resets the index.
train_dataset = pd.read_csv("dataset_dim2/train_clean_data.csv").sample(frac=1, random_state=42)
train_dataset = pd.DataFrame({"Sentence": train_dataset.iloc[:, 0].to_list(), "Type": train_dataset.iloc[:, 1].to_list()})
test_dataset = pd.read_csv("dataset_dim2/test_clean_data.csv").sample(frac=1, random_state=42)
test_dataset = pd.DataFrame({"Sentence": test_dataset.iloc[:, 0].to_list(), "Type": test_dataset.iloc[:, 1].to_list()})

# Vectorize the sentences: the vocabulary is learned from the training split only
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(train_dataset['Sentence'])
y_train = train_dataset['Type']

# Train the classifier
classifier = MultinomialNB()
classifier.fit(X_train, y_train)

# Predict on both splits. The test sentences go through transform() only, so
# they never influence the fitted vocabulary.
X_test = vectorizer.transform(test_dataset['Sentence'])
y_test = test_dataset['Type']
y_pred_train = classifier.predict(X_train)
y_pred_test = classifier.predict(X_test)

# Evaluate the classifier (train accuracy is reported to expose overfitting)
accuracy_train = accuracy_score(y_train, y_pred_train)
print("Train Accuracy:", accuracy_train)
accuracy_test = accuracy_score(y_test, y_pred_test)
print("Test Accuracy:", accuracy_test)


Train Accuracy: 0.9949494949494949
Test Accuracy: 0.864406779661017


#### Using Term Frequency-Inverse Document Frequency (Tfidf)

In [3]:
# Naive Bayes on TF-IDF features, evaluated on the predefined train/test CSVs.

# Load both splits. The shuffle is seeded so the row order is identical after a
# kernel restart. Only the first two CSV columns are kept: column 0 is exposed
# as "Sentence" and column 1 as "Type"; rebuilding the frame also resets the index.
train_dataset = pd.read_csv("dataset_dim2/train_clean_data.csv").sample(frac=1, random_state=42)
train_dataset = pd.DataFrame({"Sentence": train_dataset.iloc[:, 0].to_list(), "Type": train_dataset.iloc[:, 1].to_list()})
test_dataset = pd.read_csv("dataset_dim2/test_clean_data.csv").sample(frac=1, random_state=42)
test_dataset = pd.DataFrame({"Sentence": test_dataset.iloc[:, 0].to_list(), "Type": test_dataset.iloc[:, 1].to_list()})

# Vectorize the sentences: vocabulary and IDF weights come from the training split only
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train_dataset['Sentence'])
y_train = train_dataset['Type']

# Train the classifier
classifier = MultinomialNB()
classifier.fit(X_train, y_train)

# Predict on both splits. The test sentences go through transform() only, so
# they never influence the fitted vocabulary or IDF weights.
X_test = vectorizer.transform(test_dataset['Sentence'])
y_test = test_dataset['Type']
y_pred_train = classifier.predict(X_train)
y_pred_test = classifier.predict(X_test)

# Evaluate the classifier (train accuracy is reported to expose overfitting)
accuracy_train = accuracy_score(y_train, y_pred_train)
print("Train Accuracy:", accuracy_train)
accuracy_test = accuracy_score(y_test, y_pred_test)
print("Test Accuracy:", accuracy_test)

Train Accuracy: 0.98989898989899
Test Accuracy: 0.8813559322033898


#### Using HashingVectorizer

In [4]:


# Naive Bayes on hashed token counts.

# Load the dataset. Both shuffles are seeded so that the seeded split below
# always receives the rows in the same order.
train_dataset = pd.read_csv("dataset_dim2/train_clean_data.csv").sample(frac=1, random_state=42)
test_dataset = pd.read_csv("dataset_dim2/test_clean_data.csv").sample(frac=1, random_state=42)

# Pool both CSVs: column 0 holds the text, column 1 the label.
# NOTE(review): pooling and re-splitting discards the predefined train/test
# split, so these accuracies are measured on a different test set than the
# cells that evaluate on the test CSV directly -- confirm this is intended.
texts = train_dataset.iloc[:, 0].to_list() + test_dataset.iloc[:, 0].to_list()
labels = train_dataset.iloc[:, 1].to_list() + test_dataset.iloc[:, 1].to_list()

# Split the pooled data 75/25. A fixed random_state (the same value the other
# re-split cells use) makes the printed accuracies reproducible; it was None
# before, which produced a different split on every run.
docs_train, docs_test, y_train, y_test = train_test_split(
    texts, labels, test_size=0.25, random_state=42)

# Vectorize the text data using HashingVectorizer. It is stateless, so there is
# no fit step; alternate_sign=False keeps every feature non-negative, which
# MultinomialNB requires.
vectorizer = HashingVectorizer(n_features=2**20, alternate_sign=False)
X_train = vectorizer.transform(docs_train)
X_test = vectorizer.transform(docs_test)

# Train a Naive Bayes classifier
nb = MultinomialNB()
nb.fit(X_train, y_train)

# Predict on both the training and the test set
y_pred_train = nb.predict(X_train)
y_pred_test = nb.predict(X_test)

# Evaluate the model
print("Train Accuracy:", accuracy_score(y_train, y_pred_train))
print("Test Accuracy:", accuracy_score(y_test, y_pred_test))

Train Accuracy: 0.953125
Test Accuracy: 0.7076923076923077


## Logistic Regression 

#### Using Bag-O-Words

In [5]:


# Logistic regression on bag-of-words counts.

# Load the dataset. The shuffles are seeded: with an unseeded shuffle the
# random_state=42 split below would still pick different rows on every run.
train_dataset = pd.read_csv("dataset_dim2/train_clean_data.csv").sample(frac=1, random_state=42)
test_dataset = pd.read_csv("dataset_dim2/test_clean_data.csv").sample(frac=1, random_state=42)

# Pool both CSVs: column 0 holds the text, column 1 the label.
# NOTE(review): pooling and re-splitting discards the predefined train/test
# split, so these accuracies are measured on a different test set than the
# cells that evaluate on the test CSV directly -- confirm this is intended.
texts = train_dataset.iloc[:, 0].to_list() + test_dataset.iloc[:, 0].to_list()
labels = train_dataset.iloc[:, 1].to_list() + test_dataset.iloc[:, 1].to_list()

# Split dataset into training and testing set (75/25)
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.25, random_state=42)

# Create a bag-of-words (CountVectorizer) + logistic regression pipeline.
# Fitting through the pipeline learns the vocabulary from X_train only.
pipeline = make_pipeline(CountVectorizer(), LogisticRegression())

# Train the classifier
pipeline.fit(X_train, y_train)

# Predict the sentiment of the training and testing sets
pred_train = pipeline.predict(X_train)
pred_test = pipeline.predict(X_test)

# Evaluate the classifier
print("Train Accuracy:", accuracy_score(y_train, pred_train))
print("Test Accuracy:", accuracy_score(y_test, pred_test))

Train Accuracy: 1.0
Test Accuracy: 0.676923076923077


#### Using Tfidf

In [6]:


# Logistic regression on TF-IDF features.

# Load the dataset. The shuffles are seeded: with an unseeded shuffle the
# random_state=42 split below would still pick different rows on every run.
train_dataset = pd.read_csv("dataset_dim2/train_clean_data.csv").sample(frac=1, random_state=42)
test_dataset = pd.read_csv("dataset_dim2/test_clean_data.csv").sample(frac=1, random_state=42)

# Pool both CSVs: column 0 holds the text, column 1 the label.
# NOTE(review): pooling and re-splitting discards the predefined train/test
# split, so these accuracies are measured on a different test set than the
# cells that evaluate on the test CSV directly -- confirm this is intended.
texts = train_dataset.iloc[:, 0].to_list() + test_dataset.iloc[:, 0].to_list()
labels = train_dataset.iloc[:, 1].to_list() + test_dataset.iloc[:, 1].to_list()

# Split dataset into training and testing set (75/25)
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.25, random_state=42)

# Create a TF-IDF vectorizer and logistic regression classifier pipeline.
# Fitting through the pipeline learns vocabulary and IDF weights from X_train only.
pipeline = make_pipeline(TfidfVectorizer(), LogisticRegression())

# Train the classifier
pipeline.fit(X_train, y_train)

# Predict the sentiment of the training and testing sets
pred_train = pipeline.predict(X_train)
pred_test = pipeline.predict(X_test)

# Evaluate the classifier
print("Train Accuracy:", accuracy_score(y_train, pred_train))
print("Test Accuracy:", accuracy_score(y_test, pred_test))

Train Accuracy: 0.9895833333333334
Test Accuracy: 0.6615384615384615


## Random Forest

#### Using Bag-O-Words

In [7]:

# Random forest on bag-of-words counts.

# Load the dataset. The shuffles are seeded: with an unseeded shuffle the
# random_state=42 split below would still pick different rows on every run.
train_dataset = pd.read_csv("dataset_dim2/train_clean_data.csv").sample(frac=1, random_state=42)
test_dataset = pd.read_csv("dataset_dim2/test_clean_data.csv").sample(frac=1, random_state=42)

# Pool both CSVs: column 0 holds the text, column 1 the label.
# NOTE(review): pooling and re-splitting discards the predefined train/test
# split, so these accuracies are measured on a different test set than the
# cells that evaluate on the test CSV directly -- confirm this is intended.
texts = train_dataset.iloc[:, 0].to_list() + test_dataset.iloc[:, 0].to_list()
labels = train_dataset.iloc[:, 1].to_list() + test_dataset.iloc[:, 1].to_list()

# Split dataset into training and testing set (75/25)
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.25, random_state=42)

# Create a pipeline with a bag-of-words vectorizer and RandomForest classifier.
# The vectorizer step is named 'bow' because it is a CountVectorizer, not TF-IDF.
pipeline = Pipeline([
    ('bow', CountVectorizer()),
    ('clf', RandomForestClassifier(n_estimators=100, random_state=42))
])

# Train the classifier
pipeline.fit(X_train, y_train)

# Predict the sentiment of the training and testing sets
pred_train = pipeline.predict(X_train)
pred_test = pipeline.predict(X_test)

# Evaluate the classifier
print("Train Accuracy:", accuracy_score(y_train, pred_train))
print("Test Accuracy:", accuracy_score(y_test, pred_test))

Train Accuracy: 1.0
Test Accuracy: 0.6923076923076923


#### Using Tfidf

In [8]:


# Random forest on TF-IDF features (unigrams and bigrams).

# Load the dataset. The shuffles are seeded: with an unseeded shuffle the
# random_state=42 split below would still pick different rows on every run.
train_dataset = pd.read_csv("dataset_dim2/train_clean_data.csv").sample(frac=1, random_state=42)
test_dataset = pd.read_csv("dataset_dim2/test_clean_data.csv").sample(frac=1, random_state=42)

# Pool both CSVs: column 0 holds the text, column 1 the label.
# NOTE(review): pooling and re-splitting discards the predefined train/test
# split, so these accuracies are measured on a different test set than the
# cells that evaluate on the test CSV directly -- confirm this is intended.
texts = train_dataset.iloc[:, 0].to_list() + test_dataset.iloc[:, 0].to_list()
labels = train_dataset.iloc[:, 1].to_list() + test_dataset.iloc[:, 1].to_list()

# Split dataset into training and testing set (75/25)
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.25, random_state=42)

# Create a pipeline with TF-IDF vectorizer and RandomForest classifier.
# ngram_range=(1, 2) adds bigram features on top of single words.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2))),
    ('clf', RandomForestClassifier(n_estimators=100, random_state=42))
])

# Train the classifier
pipeline.fit(X_train, y_train)

# Predict the sentiment of the training and testing sets
pred_train = pipeline.predict(X_train)
pred_test = pipeline.predict(X_test)

# Evaluate the classifier
print("Train Accuracy:", accuracy_score(y_train, pred_train))
print("Test Accuracy:", accuracy_score(y_test, pred_test))

Train Accuracy: 1.0
Test Accuracy: 0.676923076923077
