In [16]:
import numpy as np
import re
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score

## Naive Bayes

#### Using Bag-O-Words

In [17]:
# Load the dataset
train_dataset = pd.read_csv("dataset_dim2/train_clean_data.csv").sample(frac=1)
train_dataset = pd.DataFrame({"Sentence": train_dataset.iloc[:, 0].to_list(), "Type": train_dataset.iloc[:, 1].to_list()})
test_dataset = pd.read_csv("dataset_dim2/test_clean_data.csv").sample(frac=1)
test_dataset = pd.DataFrame({"Sentence": test_dataset.iloc[:, 0].to_list(), "Type": test_dataset.iloc[:, 1].to_list()})

# Tokenize the sentences
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(train_dataset['Sentence'])
y_train = train_dataset['Type']

# Train the classifier
classifier = MultinomialNB()
classifier.fit(X_train, y_train)

# Test the classifier
X_test = vectorizer.transform(test_dataset['Sentence'])
y_test = test_dataset['Type']
y_pred = classifier.predict(X_test)

# Evaluate the classifier
accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", accuracy)


Test Accuracy: 0.864406779661017


#### Using Term Frequency-Inverse Document Frequency (Tfid)

In [18]:
# Load the dataset
train_dataset = pd.read_csv("dataset_dim2/train_clean_data.csv").sample(frac=1)
train_dataset = pd.DataFrame({"Sentence": train_dataset.iloc[:, 0].to_list(), "Type": train_dataset.iloc[:, 1].to_list()})
test_dataset = pd.read_csv("dataset_dim2/test_clean_data.csv").sample(frac=1)
test_dataset = pd.DataFrame({"Sentence": test_dataset.iloc[:, 0].to_list(), "Type": test_dataset.iloc[:, 1].to_list()})

# Tokenize the sentences
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train_dataset['Sentence'])
y_train = train_dataset['Type']

# Train the classifier
classifier = MultinomialNB()
classifier.fit(X_train, y_train)

# Test the classifier
X_test = vectorizer.transform(test_dataset['Sentence'])
y_test = test_dataset['Type']
y_pred = classifier.predict(X_test)

# Evaluate the classifier
accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", accuracy)

Test Accuracy: 0.8813559322033898


#### Using HashingVectorizer

In [19]:


# Load the dataset
train_dataset = pd.read_csv("dataset_dim2/train_clean_data.csv").sample(frac=1)
test_dataset = pd.read_csv("dataset_dim2/test_clean_data.csv").sample(frac=1)
# Sample dataset
texts = train_dataset.iloc[:, 0].to_list() + test_dataset.iloc[:, 0].to_list()
labels = train_dataset.iloc[:, 1].to_list() + test_dataset.iloc[:, 1].to_list()

# Split the dataset into training and test sets
docs_train, docs_test, y_train, y_test = train_test_split(
    texts, labels, test_size=0.25, random_state=None)

# Vectorize the text data using HashingVectorizer
vectorizer = HashingVectorizer(n_features=2**20, alternate_sign=False)
X_train = vectorizer.transform(docs_train)
X_test = vectorizer.transform(docs_test)

# Train a Naive Bayes classifier
nb = MultinomialNB()
nb.fit(X_train, y_train)

# Predict the test set results
y_pred = nb.predict(X_test)

# Step 6: Evaluate the model
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")

Accuracy: 0.8153846153846154


## Logistic Regression 

#### Using Bag-O-Word

In [20]:


train_dataset = pd.read_csv("dataset_dim2/train_clean_data.csv").sample(frac=1)
test_dataset = pd.read_csv("dataset_dim2/test_clean_data.csv").sample(frac=1)
# Sample dataset
texts = train_dataset.iloc[:, 0].to_list() + test_dataset.iloc[:, 0].to_list()
labels = train_dataset.iloc[:, 1].to_list() + test_dataset.iloc[:, 1].to_list()

# Split dataset into training and testing set
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.25, random_state=42)

# Create a TF-IDF vectorizer and logistic regression classifier pipeline
pipeline = make_pipeline(CountVectorizer(), LogisticRegression())

# Train the classifier
pipeline.fit(X_train, y_train)

# Predict the sentiment of the testing set
predictions = pipeline.predict(X_test)

# Evaluate the classifier
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy}")

Accuracy: 0.6615384615384615


#### Using Tfid

In [21]:


train_dataset = pd.read_csv("dataset_dim2/train_clean_data.csv").sample(frac=1)
test_dataset = pd.read_csv("dataset_dim2/test_clean_data.csv").sample(frac=1)
# Sample dataset
texts = train_dataset.iloc[:, 0].to_list() + test_dataset.iloc[:, 0].to_list()
labels = train_dataset.iloc[:, 1].to_list() + test_dataset.iloc[:, 1].to_list()

# Split dataset into training and testing set
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.25, random_state=42)

# Create a TF-IDF vectorizer and logistic regression classifier pipeline
pipeline = make_pipeline(TfidfVectorizer(), LogisticRegression())

# Train the classifier
pipeline.fit(X_train, y_train)

# Predict the sentiment of the testing set
predictions = pipeline.predict(X_test)

# Evaluate the classifier
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy}")

Accuracy: 0.6307692307692307


## Random Forest

#### Using Bag-O-Words

In [28]:

train_dataset = pd.read_csv("dataset_dim2/train_clean_data.csv").sample(frac=1)
test_dataset = pd.read_csv("dataset_dim2/test_clean_data.csv").sample(frac=1)
texts = train_dataset.iloc[:, 0].to_list() + test_dataset.iloc[:, 0].to_list()
labels = train_dataset.iloc[:, 1].to_list() + test_dataset.iloc[:, 1].to_list()

# Split dataset into training and testing set
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.25, random_state=42)

# Create a pipeline with TF-IDF vectorizer and RandomForest classifier
pipeline = Pipeline([
    ('tfidf', CountVectorizer()),
    ('clf', RandomForestClassifier(n_estimators=100, random_state=42))
])

# Train the classifier
pipeline.fit(X_train, y_train)

# Predict the sentiment of the testing set
predictions = pipeline.predict(X_test)

# Evaluate the classifier
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy}")

Accuracy: 0.7076923076923077


#### Using Tfid

In [27]:


train_dataset = pd.read_csv("dataset_dim2/train_clean_data.csv").sample(frac=1)
test_dataset = pd.read_csv("dataset_dim2/test_clean_data.csv").sample(frac=1)
texts = train_dataset.iloc[:, 0].to_list() + test_dataset.iloc[:, 0].to_list()
labels = train_dataset.iloc[:, 1].to_list() + test_dataset.iloc[:, 1].to_list()

# Split dataset into training and testing set
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.25, random_state=42)

# Create a pipeline with TF-IDF vectorizer and RandomForest classifier
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2))),
    ('clf', RandomForestClassifier(n_estimators=100, random_state=42))
])

# Train the classifier
pipeline.fit(X_train, y_train)

# Predict the sentiment of the testing set
predictions = pipeline.predict(X_test)

# Evaluate the classifier
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy}")

Accuracy: 0.7230769230769231


## TESTING

In [10]:


train_dataset = pd.read_csv("dataset_dim2/train_clean_data.csv").sample(frac=1)
test_dataset = pd.read_csv("dataset_dim2/test_clean_data.csv").sample(frac=1)
texts = train_dataset.iloc[:, 0].to_list() + test_dataset.iloc[:, 0].to_list()
labels = train_dataset.iloc[:, 1].to_list() + test_dataset.iloc[:, 1].to_list()

# TF-IDF Vectorization
vectorizer = TfidfVectorizer(max_features=100)
features = vectorizer.fit_transform(texts).toarray()

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

# Convert to PyTorch tensors
X_train, X_test, y_train, y_test = map(torch.tensor, (X_train, X_test, np.array(y_train), np.array(y_test)))

# Define a custom dataset
class TextDataset(Dataset):
    def __init__(self, X, y):
        self.X = X.float()
        self.y = y.long()

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

# Create DataLoader
# Create test DataLoader
test_dataset = TextDataset(X_test, y_test)
test_loader = DataLoader(test_dataset, batch_size=2, shuffle=False)

model = SentimentClassifier(features.shape[1])

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
for epoch in range(5):  # loop over the dataset multiple times
    for i, data in enumerate(train_loader, 0):
        inputs, labels = data
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch + 1}, Loss: {loss.item()}")

print("Finished Training")
# Validation loop
model.eval()  # Set the model to evaluation mode
correct = 0
total = 0
with torch.no_grad():
    for data in test_loader:
        inputs, labels = data
        outputs = model(inputs)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = correct / total
print(f"Accuracy: {accuracy}")

# You can add code here for validation/testing


Epoch 1, Loss: 0.6291394233703613
Epoch 2, Loss: 0.6717043519020081
Epoch 3, Loss: 0.5689048171043396
Epoch 4, Loss: 0.6689860820770264
Epoch 5, Loss: 0.7063190340995789
Finished Training
Accuracy: 0.5576923076923077
