In [18]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Load the dataset
train_dataset = pd.read_csv("train_clean_data.csv").sample(frac=1)
train_dataset = pd.DataFrame({"Sentence": train_dataset.iloc[:, 0].to_list(), "Type": train_dataset.iloc[:, 1].to_list()})
test_dataset = pd.read_csv("test_clean_data.csv").sample(frac=1)
test_dataset = pd.DataFrame({"Sentence": test_dataset.iloc[:, 0].to_list(), "Type": test_dataset.iloc[:, 1].to_list()})

# Tokenize the sentences
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(train_dataset['Sentence'])
y_train = train_dataset['Type']

# Train the classifier
classifier = MultinomialNB()
classifier.fit(X_train, y_train)

# Test the classifier
X_test = vectorizer.transform(test_dataset['Sentence'])
y_test = test_dataset['Type']
y_pred = classifier.predict(X_test)

# Evaluate the classifier
accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", accuracy)


Test Accuracy: 0.358974358974359


In [21]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline

train_dataset = pd.read_csv("train_clean_data.csv").sample(frac=1)
test_dataset = pd.read_csv("test_clean_data.csv").sample(frac=1)
# Sample dataset
texts = train_dataset.iloc[:, 0].to_list() + test_dataset.iloc[:, 0].to_list()
labels = train_dataset.iloc[:, 1].to_list() + test_dataset.iloc[:, 1].to_list()

# Split dataset into training and testing set
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.25, random_state=42)

# Create a TF-IDF vectorizer and logistic regression classifier pipeline
pipeline = make_pipeline(TfidfVectorizer(), LogisticRegression())

# Train the classifier
pipeline.fit(X_train, y_train)

# Predict the sentiment of the testing set
predictions = pipeline.predict(X_test)

# Evaluate the classifier
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy}")

Accuracy: 0.5142857142857142


In [24]:
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline

# Split dataset into training and testing set
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.25, random_state=42)

# Create a pipeline with TF-IDF vectorizer and RandomForest classifier
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2))),
    ('clf', RandomForestClassifier(n_estimators=100, random_state=42))
])

# Train the classifier
pipeline.fit(X_train, y_train)

# Predict the sentiment of the testing set
predictions = pipeline.predict(X_test)

# Evaluate the classifier
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy}")

Accuracy: 0.4857142857142857


In [39]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import numpy as np

# Sample dataset
texts = ["I love this product", "Worst purchase ever", "Will buy again, happy with the product", "Not what I expected", "I'm so happy I found this"]
labels = [1, 0, 1, 0, 1]  # 1 for positive, 0 for negative

# TF-IDF Vectorization
vectorizer = TfidfVectorizer(max_features=100)
features = vectorizer.fit_transform(texts).toarray()

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

# Convert to PyTorch tensors
X_train, X_test, y_train, y_test = map(torch.tensor, (X_train, X_test, np.array(y_train), np.array(y_test)))

# Define a custom dataset
class TextDataset(Dataset):
    def __init__(self, X, y):
        self.X = X.float()
        self.y = y.long()

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

# Create DataLoader
train_dataset = TextDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)

# Define the model
class SentimentClassifier(nn.Module):
    def __init__(self, num_features):
        super(SentimentClassifier, self).__init__()
        self.fc1 = nn.Linear(num_features, 5)
        self.fc2 = nn.Linear(5, 2)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return x

model = SentimentClassifier(features.shape[1])

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
for epoch in range(5):  # loop over the dataset multiple times
    for i, data in enumerate(train_loader, 0):
        inputs, labels = data
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch + 1}, Loss: {loss.item()}")

print("Finished Training")

# You can add code here for validation/testing


Epoch 1, Loss: 0.745625376701355
Epoch 2, Loss: 0.4215049743652344
Epoch 3, Loss: 0.7429709434509277
Epoch 4, Loss: 0.42887240648269653
Epoch 5, Loss: 0.7482790350914001
Finished Training


In [40]:
# Assuming X_test and y_test are already defined and processed

# Create test DataLoader
test_dataset = TextDataset(X_test, y_test)
test_loader = DataLoader(test_dataset, batch_size=2, shuffle=False)

# Function to calculate accuracy
def calculate_accuracy(model, data_loader):
    model.eval()  # Put the model in evaluation mode
    correct, total = 0, 0
    with torch.no_grad():  # No need to track gradients
        for inputs, labels in data_loader:
            outputs = model(inputs)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    return correct / total

# Calculate and print test accuracy
test_accuracy = calculate_accuracy(model, test_loader)
print(f"Test Accuracy: {test_accuracy:.2f}")

Test Accuracy: 0.00


In [43]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from collections import Counter
from torch.nn.utils.rnn import pad_sequence
import torch.optim as optim

train_dataset = pd.read_csv("train_clean_data.csv").sample(frac=1)
test_dataset = pd.read_csv("test_clean_data.csv").sample(frac=1)
texts = train_dataset.iloc[:, 0].to_list()
labels = train_dataset.iloc[:, 1].to_list()

# Step 1: Preprocess the Dataset
# Tokenization
tokenized_texts = [[word.lower() for word in text.split()] for text in texts]

# Build vocabulary
word_counts = Counter(word for sentence in tokenized_texts for word in sentence)
vocab = {word: i+2 for i, (word, _) in enumerate(word_counts.most_common())}
vocab['<PAD>'] = 0  # Padding
vocab['<UNK>'] = 1  # Unknown words

# Convert texts and labels to tensors
text_tensor = [torch.tensor([vocab.get(word, vocab['<UNK>']) for word in sentence]) for sentence in tokenized_texts]
label_tensor = torch.tensor(labels)

# Padding sequences for batch processing
text_tensor = pad_sequence(text_tensor, batch_first=True, padding_value=vocab['<PAD>'])

# Step 2: Create DataLoader
class TextDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.texts[idx], self.labels[idx]

dataset = TextDataset(text_tensor, label_tensor)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

# Modify the Model slightly to handle the small vocab size and dataset
class RNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, text):
        embedded = self.embedding(text)
        output, (hidden, cell) = self.rnn(embedded)
        hidden = hidden.squeeze(0)
        output = self.fc(hidden)
        return output

# Instantiate the model
model = RNN(len(vocab), embedding_dim=50, hidden_dim=20, output_dim=1)

# Training
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.BCEWithLogitsLoss()

def train(model, dataloader, optimizer, criterion):
    model.train()
    for texts, labels in dataloader:
        optimizer.zero_grad()
        predictions = model(texts).squeeze(1)
        loss = criterion(predictions, labels.float())
        loss.backward()
        optimizer.step()

train(model, dataloader, optimizer, criterion)

In [44]:

test_texts =  test_dataset.iloc[:, 0].to_list()
test_labels =  test_dataset.iloc[:, 1].to_list()  # 1 for positive, 0 for negative


# Step 1: Preprocess the testing dataset
tokenized_test_texts = [[word.lower() for word in text.split()] for text in test_texts]
test_text_tensor = [torch.tensor([vocab.get(word, vocab['<UNK>']) for word in sentence]) for sentence in tokenized_test_texts]
test_label_tensor = torch.tensor(test_labels)

# Padding sequences for batch processing
test_text_tensor = pad_sequence(test_text_tensor, batch_first=True, padding_value=vocab['<PAD>'])

# Step 2: Predict sentiments on the testing dataset
def predict(model, test_text_tensor):
    model.eval()  # Put the model in evaluation mode
    with torch.no_grad():
        predictions = model(test_text_tensor).squeeze(1)
        predictions = torch.sigmoid(predictions)
        predicted_labels = (predictions >= 0.5).int()  # Convert to 0/1 labels
    return predicted_labels

predicted_labels = predict(model, test_text_tensor)

# Step 3: Calculate accuracy
def calculate_accuracy(predicted_labels, true_labels):
    correct = (predicted_labels == true_labels).float()  # Convert to float for division 
    accuracy = correct.sum() / len(correct)
    return accuracy

accuracy = calculate_accuracy(predicted_labels, test_label_tensor)
print(f'Accuracy: {accuracy.item()}')


Accuracy: 0.25641027092933655
