<a href="https://colab.research.google.com/github/McIlwee-Nevan/CS482-Final-Project_BeierMamarilMcIlwee/blob/main/models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("rtatman/deceptive-opinion-spam-corpus")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/rtatman/deceptive-opinion-spam-corpus?dataset_version_number=2...


100%|██████████| 456k/456k [00:00<00:00, 50.5MB/s]

Extracting files...
Path to dataset files: /root/.cache/kagglehub/datasets/rtatman/deceptive-opinion-spam-corpus/versions/2





Import Dataset and Preprocess

In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

data = pd.read_csv(path+"/deceptive-opinion.csv")
X = np.copy(data['text'])
y = np.copy(data['deceptive'])
y = np.array(y == 'truthful').astype(int)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42)

vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)


Next, build and evaluate the models:

In [5]:
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Define classifiers
classifiers = [
    LinearRegression(),
    DecisionTreeClassifier(max_depth=10),
    RandomForestClassifier(max_depth=20),
    XGBClassifier()
]

# Define classifier labels
classifier_labels = ['Linear Regression', 'Decision Tree', 'Random Forest', 'XGBoost']

accuracies = np.zeros(4)
precision = np.zeros(4)
recall = np.zeros(4)
f1 = np.zeros(4)

index = 0
for model in classifiers:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    if isinstance(model, LinearRegression):
        for i in range(len(y_pred)):
            y_pred[i] = 1 if y_pred[i] > 0.5 else 0
        accuracies[index] = accuracy_score(y_true=y_test, y_pred=y_pred)
        precision_score, recall_score, f1_score, support_score = precision_recall_fscore_support(y_true=y_test, y_pred=y_pred, average='binary')
        precision[index] = precision_score
        recall[index] = recall_score
        f1[index] = f1_score
        print(f'{classifier_labels[index]} Accuracy: {accuracies[index]:.4f}')
        print(f'{classifier_labels[index]} Precision: {precision[index]:.4f}, Recall: {recall[index]:.4f}, F1 Score: {f1[index]:.4f}')
    else:
        accuracies[index] = accuracy_score(y_true=y_test, y_pred=y_pred)
        precision_score, recall_score, f1_score, support_score = precision_recall_fscore_support(y_true=y_test, y_pred=y_pred, average='binary')
        precision[index] = precision_score
        recall[index] = recall_score
        f1[index] = f1_score
        print(f'{classifier_labels[index]} Accuracy: {accuracies[index]:.4f}')
        print(f'{classifier_labels[index]} Precision: {precision[index]:.4f}, Recall: {recall[index]:.4f}, F1 Score: {f1[index]:.4f}')
    print('\n')
    index += 1

Linear Regression Accuracy: 0.8500
Linear Regression Precision: 0.8846, Recall: 0.8214, F1 Score: 0.8519


Decision Tree Accuracy: 0.7219
Decision Tree Precision: 0.7365, Recall: 0.7321, F1 Score: 0.7343


Random Forest Accuracy: 0.8406
Random Forest Precision: 0.8874, Recall: 0.7976, F1 Score: 0.8401


XGBoost Accuracy: 0.7969
XGBoost Precision: 0.8160, Recall: 0.7917, F1 Score: 0.8036




Train and Test Neural Network

In [6]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from sklearn.metrics import classification_report

In [7]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [8]:
# Create tensors from data set, create data loaders
X_train_tensor = torch.from_numpy(X_train.todense()).to(torch.float32)
X_test_tensor = torch.from_numpy(X_test.todense()).to(torch.float32)
y_train_tensor = torch.from_numpy(y_train)
y_test_tensor = torch.from_numpy(y_test)

train_loader = DataLoader(list(zip(X_train_tensor, y_train_tensor)), batch_size=64, shuffle=True)
test_loader = DataLoader(list(zip(X_test_tensor, y_test_tensor)), batch_size=64, shuffle=True)

In [9]:
class NeuralNet(nn.Module):
    def __init__(self, input_dim : int):
        super(NeuralNet, self).__init__()

        self.fc1 = nn.Linear(input_dim, 400)
        self.fc2 = nn.Linear(400, 200)
        self.fc3 = nn.Linear(200, 100)
        self.fc4 = nn.Linear(100, 2)

        self.relu = nn.ReLU()

    def forward(self, x):
        out = self.relu(self.fc1(x))
        out = self.relu(self.fc2(out))
        out = self.relu(self.fc3(out))
        out = self.relu(self.fc4(out))

        return out

In [10]:
# Initialize model and choose criterion/optimizer
nn_model = NeuralNet(8703)
optimizer = torch.optim.Adam(nn_model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

In [11]:
# Train model
num_epochs = 10
loss_arr = np.zeros(num_epochs)

for epoch in range(num_epochs):
    nn_model.train()

    for (x, y) in train_loader:
        x, y = x.to(device), y.to(device)

        z = nn_model.forward(x)
        loss = criterion(z, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')
    loss_arr[epoch] = loss.item()

Epoch [1/10], Loss: 0.6720
Epoch [2/10], Loss: 0.2862
Epoch [3/10], Loss: 0.0021
Epoch [4/10], Loss: 0.0049
Epoch [5/10], Loss: 0.0026
Epoch [6/10], Loss: 0.0002
Epoch [7/10], Loss: 0.0001
Epoch [8/10], Loss: 0.0001
Epoch [9/10], Loss: 0.0001
Epoch [10/10], Loss: 0.0001


In [12]:
def evaluate_model(model, data_loader):
    model.eval()
    y_true = []
    y_pred = []
    with torch.no_grad():
        for (x, y) in data_loader:
            x, y = x.to(device), y.to(device)
            z = model(x)
            _, predicted = torch.max(z, 1)
            y_true.extend(y.cpu().numpy())
            y_pred.extend(predicted.cpu().numpy())
    return y_true, y_pred

In [13]:
# Get accuracy and print report
y_true, y_pred = evaluate_model(nn_model, test_loader)
print("Classification Report: \n", classification_report(y_true, y_pred))
print("Accuracy: ", accuracy_score(y_true, y_pred))

Classification Report: 
               precision    recall  f1-score   support

           0       0.86      0.89      0.87       152
           1       0.90      0.86      0.88       168

    accuracy                           0.88       320
   macro avg       0.88      0.88      0.88       320
weighted avg       0.88      0.88      0.88       320

Accuracy:  0.878125


In [15]:
# Save epochs, loss, and parameters
torch.save({
            'epoch': num_epochs,
            'model_state_dict': nn_model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': loss_arr[-1],
            }, './nn_model.pt')

from google.colab import files
files.download('./nn_model.pt')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [41]:
import torch.nn.functional as F

user_review = input("Enter a review: ")

user_review = vectorizer.transform([user_review])
user_review = torch.from_numpy(user_review.todense()).to(torch.float32)

neural_network = NeuralNet(8703)

output = neural_network.forward(user_review)

probabilities = F.softmax(output, dim=1)

predicted_class = torch.argmax(probabilities, dim=1).item()

print(predicted_class)

Enter a review: m a local and I’ve had a few interactions with staff here over the years and every single one has been excellent. The staff here are friendly and genuinely kind and that is honestly such an asset to this location. Shoutout to Mikayla and Ashley for being especially kind and accommodating. I’m truly grateful!
1
