In [1]:
#Task1.1

In [2]:
import pandas as pd
import torch
import numpy as np
from nltk import word_tokenize
from nltk.corpus import stopwords

from matplotlib import pyplot
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk import word_tokenize
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, classification_report

import nltk
nltk.download('punkt')
nltk.download('stopwords')



def preprocess_pandas(data, columns):
    df_ = pd.DataFrame(columns=columns)
    data['Sentence'] = data['Sentence'].str.lower()
    data['Sentence'] = data['Sentence'].replace('[a-zA-Z0-9-_.]+@[a-zA-Z0-9-_.]+', '', regex=True)                      # remove emails
    data['Sentence'] = data['Sentence'].replace('((25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(\.|$)){4}', '', regex=True)    # remove IP address
    data['Sentence'] = data['Sentence'].str.replace('[^\w\s]', '', regex=True)  # NEW (explicit)                                                      # remove special characters
    data['Sentence'] = data['Sentence'].replace('\d', '', regex=True)                                                   # remove numbers
    for index, row in data.iterrows():
        word_tokens = word_tokenize(row['Sentence'])
        filtered_sent = [w for w in word_tokens if not w in stopwords.words('english')]
        df_.loc[len(df_)] = {
            "index": row['index'],
            "Class": row['Class'],
            "Sentence": " ".join(filtered_sent)
        }
    return data

# If this is the primary file that is executed (ie not an import of another file)
if __name__ == "__main__":
    # get data, pre-process and split
    file_path = r"C:\Users\toste\Downloads\amazon_cells_labelled.txt"
    data = pd.read_csv(file_path, delimiter='\t', header=None)
    data.columns = ['Sentence', 'Class']
    data['index'] = data.index                                          # add new column index
    columns = ['index', 'Class', 'Sentence']
    data = preprocess_pandas(data, columns)                             # pre-process
    training_data, validation_data, training_labels, validation_labels = train_test_split( # split the data into training, validation, and test splits
        data['Sentence'].values.astype('U'),
        data['Class'].values.astype('int32'),
        test_size=0.10,
        random_state=0,
        shuffle=True
    )

    # vectorize data using TFIDF and transform for PyTorch for scalability
    word_vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1,2), max_features=50000, max_df=0.5, use_idf=True, norm='l2')
    training_data = word_vectorizer.fit_transform(training_data)        # transform texts to sparse matrix
    training_data = training_data.todense()                             # convert to dense matrix for Pytorch
    vocab_size = len(word_vectorizer.vocabulary_)
    validation_data = word_vectorizer.transform(validation_data)
    validation_data = validation_data.todense()
    train_x_tensor = torch.from_numpy(np.array(training_data)).type(torch.FloatTensor)
    train_y_tensor = torch.from_numpy(np.array(training_labels)).long()
    validation_x_tensor = torch.from_numpy(np.array(validation_data)).type(torch.FloatTensor)
    validation_y_tensor = torch.from_numpy(np.array(validation_labels)).long()




[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\toste\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\toste\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
import pandas as pd
import os
print(os.getcwd())


file_path = r"C:\Users\toste\Downloads\amazon_cells_labelled.txt"
df = pd.read_csv(file_path, sep="\t", header=None, names=["text", "label"])

# Step 2: View the first few rows
print(df.head())

# Optional: Check distribution of labels
print(df['label'].value_counts())



C:\Users\toste
                                                text  label
0  So there is no way for me to plug it in here i...      0
1                        Good case, Excellent value.      1
2                             Great for the jawbone.      1
3  Tied to charger for conversations lasting more...      0
4                                  The mic is great.      1
0    500
1    500
Name: label, dtype: int64


In [4]:
import torch.nn as nn
import torch.optim as optim

# Define a simple feedforward neural network
class SentimentANN(nn.Module):
    def __init__(self, input_size, hidden_size=64):
        super(SentimentANN, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.3)
        self.fc2 = nn.Linear(hidden_size, 2)  # 2 output classes (negative or positive)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        return x

In [5]:
# Initialize model
input_size = train_x_tensor.shape[1]
model = SentimentANN(input_size)

# Loss & Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
epochs = 10
for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()
    outputs = model(train_x_tensor)
    loss = criterion(outputs, train_y_tensor)
    loss.backward()
    optimizer.step()
    print(f"Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}")

Epoch [1/10], Loss: 0.6999
Epoch [2/10], Loss: 0.6969
Epoch [3/10], Loss: 0.6934
Epoch [4/10], Loss: 0.6900
Epoch [5/10], Loss: 0.6858
Epoch [6/10], Loss: 0.6815
Epoch [7/10], Loss: 0.6767
Epoch [8/10], Loss: 0.6714
Epoch [9/10], Loss: 0.6661
Epoch [10/10], Loss: 0.6604


In [6]:
# Evaluation
model.eval()
with torch.no_grad():
    predictions = model(validation_x_tensor)
    predicted_labels = torch.argmax(predictions, axis=1)

# Metrics
print("\nClassification Report:")
print(classification_report(validation_y_tensor, predicted_labels))


Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        47
           1       0.53      1.00      0.69        53

    accuracy                           0.53       100
   macro avg       0.27      0.50      0.35       100
weighted avg       0.28      0.53      0.37       100



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [7]:
#Task1.1 larger training set

In [3]:
import pandas as pd
import torch
import numpy as np
from nltk import word_tokenize
from nltk.corpus import stopwords

from matplotlib import pyplot
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk import word_tokenize
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, classification_report

import nltk
nltk.download('punkt')
nltk.download('stopwords')



def preprocess_pandas(data, columns):
    df_ = pd.DataFrame(columns=columns)
    data['Sentence'] = data['Sentence'].str.lower()
    data['Sentence'] = data['Sentence'].replace('[a-zA-Z0-9-_.]+@[a-zA-Z0-9-_.]+', '', regex=True)                      # remove emails
    data['Sentence'] = data['Sentence'].replace('((25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(\.|$)){4}', '', regex=True)    # remove IP address
    data['Sentence'] = data['Sentence'].str.replace('[^\w\s]', '', regex=True)  # NEW (explicit)                                                      # remove special characters
    data['Sentence'] = data['Sentence'].replace('\d', '', regex=True)                                                   # remove numbers
    for index, row in data.iterrows():
        word_tokens = word_tokenize(row['Sentence'])
        filtered_sent = [w for w in word_tokens if not w in stopwords.words('english')]
        df_.loc[len(df_)] = {
            "index": row['index'],
            "Class": row['Class'],
            "Sentence": " ".join(filtered_sent)
        }
    return data

# If this is the primary file that is executed (ie not an import of another file)
if __name__ == "__main__":
    # get data, pre-process and split
    file_path = r"C:\Users\toste\Downloads\amazon_cells_labelled_LARGE_25K.txt"
    data = pd.read_csv(file_path, delimiter='\t', header=None)
    data.columns = ['Sentence', 'Class']
    data['index'] = data.index                                          # add new column index
    columns = ['index', 'Class', 'Sentence']
    data = preprocess_pandas(data, columns)                             # pre-process
    training_data, validation_data, training_labels, validation_labels = train_test_split( # split the data into training, validation, and test splits
        data['Sentence'].values.astype('U'),
        data['Class'].values.astype('int32'),
        test_size=0.10,
        random_state=0,
        shuffle=True
    )

    # vectorize data using TFIDF and transform for PyTorch for scalability
    word_vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1,2), max_features=50000, max_df=0.5, use_idf=True, norm='l2')
    training_data = word_vectorizer.fit_transform(training_data)        # transform texts to sparse matrix
    training_data = training_data.todense()                             # convert to dense matrix for Pytorch
    vocab_size = len(word_vectorizer.vocabulary_)
    validation_data = word_vectorizer.transform(validation_data)
    validation_data = validation_data.todense()
    train_x_tensor = torch.from_numpy(np.array(training_data)).type(torch.FloatTensor)
    train_y_tensor = torch.from_numpy(np.array(training_labels)).long()
    validation_x_tensor = torch.from_numpy(np.array(validation_data)).type(torch.FloatTensor)
    validation_y_tensor = torch.from_numpy(np.array(validation_labels)).long()



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\toste\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\toste\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
import pandas as pd
import os
print(os.getcwd())


file_path = r"C:\Users\toste\Downloads\amazon_cells_labelled_LARGE_25K.txt"
df = pd.read_csv(file_path, sep="\t", header=None, names=["text", "label"])

# Step 2: View the first few rows
print(df.head())

# Optional: Check distribution of labels
print(df['label'].value_counts())


In [5]:
import torch.nn as nn
import torch.optim as optim

# Define a simple feedforward neural network
class SentimentANN(nn.Module):
    def __init__(self, input_size, hidden_size=64):
        super(SentimentANN, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.3)
        self.fc2 = nn.Linear(hidden_size, 2)  # 2 output classes (negative or positive)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        return x

In [6]:
# Initialize model
input_size = train_x_tensor.shape[1]
model = SentimentANN(input_size)

# Loss & Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
epochs = 10
for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()
    outputs = model(train_x_tensor)
    loss = criterion(outputs, train_y_tensor)
    loss.backward()
    optimizer.step()
    print(f"Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}")

Epoch [1/10], Loss: 0.6981
Epoch [2/10], Loss: 0.6951
Epoch [3/10], Loss: 0.6917
Epoch [4/10], Loss: 0.6875
Epoch [5/10], Loss: 0.6828
Epoch [6/10], Loss: 0.6778
Epoch [7/10], Loss: 0.6727
Epoch [8/10], Loss: 0.6675
Epoch [9/10], Loss: 0.6620
Epoch [10/10], Loss: 0.6562


In [7]:
# Evaluation
model.eval()
with torch.no_grad():
    predictions = model(validation_x_tensor)
    predicted_labels = torch.argmax(predictions, axis=1)

# Metrics
print("\nClassification Report:")
print(classification_report(validation_y_tensor, predicted_labels))


Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.72      0.80      1011
           1       0.83      0.95      0.89      1489

    accuracy                           0.86      2500
   macro avg       0.87      0.83      0.84      2500
weighted avg       0.86      0.86      0.85      2500



In [None]:
#Task 1.2

In [8]:
import torch
import torch.nn as nn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import pandas as pd

# 1. Load the Amazon dataset
file_path = r"C:\Users\toste\Downloads\amazon_cells_labelled.txt"
df = pd.read_csv(file_path, sep='\t', header=None, names=['text', 'label'])

# 2. Vectorize text using TF-IDF
vectorizer = TfidfVectorizer(max_features=512)
X = vectorizer.fit_transform(df['text']).toarray()
y = df['label'].values

# 3. Split into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# 4. Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32).unsqueeze(1)  # [batch, 1, emb]
X_val_tensor = torch.tensor(X_val, dtype=torch.float32).unsqueeze(1)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
y_val_tensor = torch.tensor(y_val, dtype=torch.long)

# 5. Define a Transformer-based sentiment classifier
class TransformerSentimentClassifier(nn.Module):
    def __init__(self, dim=512, num_classes=2, num_heads=4, num_layers=2):
        super().__init__()
        encoder_layer = nn.TransformerEncoderLayer(d_model=dim, nhead=num_heads, batch_first=True)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.classifier = nn.Linear(dim, num_classes)

    def forward(self, x):
        x = self.transformer(x)
        x = x.mean(dim=1)  # mean over sequence
        return self.classifier(x)

# 6. Instantiate and train the model
model = TransformerSentimentClassifier(dim=512)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

print("Training...")
for epoch in range(10):
    model.train()
    optimizer.zero_grad()
    outputs = model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)
    loss.backward()
    optimizer.step()
    print(f"Epoch {epoch+1}/10 - Loss: {loss.item():.4f}")

# 7. Evaluate the model
model.eval()
with torch.no_grad():
    outputs = model(X_val_tensor)
    preds = torch.argmax(outputs, dim=1)

print("\nClassification Report:")
print(classification_report(y_val, preds.numpy()))


Training...
Epoch 1/10 - Loss: 0.7602
Epoch 2/10 - Loss: 1.8161
Epoch 3/10 - Loss: 2.9878
Epoch 4/10 - Loss: 1.1187
Epoch 5/10 - Loss: 0.9864
Epoch 6/10 - Loss: 0.6333
Epoch 7/10 - Loss: 0.2949
Epoch 8/10 - Loss: 0.4198
Epoch 9/10 - Loss: 0.3079
Epoch 10/10 - Loss: 0.1833

Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.91      0.82        93
           1       0.91      0.73      0.81       107

    accuracy                           0.81       200
   macro avg       0.83      0.82      0.81       200
weighted avg       0.83      0.81      0.81       200



In [10]:
import torch
import torch.nn as nn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import pandas as pd

# 1. Load the Amazon dataset
file_path = r"C:\Users\toste\Downloads\amazon_cells_labelled_LARGE_25K.txt"
df = pd.read_csv(file_path, sep='\t', header=None, names=['text', 'label'])

# 2. Vectorize text using TF-IDF
vectorizer = TfidfVectorizer(max_features=512)
X = vectorizer.fit_transform(df['text']).toarray()
y = df['label'].values

# 3. Split into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# 4. Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32).unsqueeze(1)  # [batch, 1, emb]
X_val_tensor = torch.tensor(X_val, dtype=torch.float32).unsqueeze(1)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
y_val_tensor = torch.tensor(y_val, dtype=torch.long)

# 5. Define a Transformer-based sentiment classifier
class TransformerSentimentClassifier(nn.Module):
    def __init__(self, dim=512, num_classes=2, num_heads=4, num_layers=2):
        super().__init__()
        encoder_layer = nn.TransformerEncoderLayer(d_model=dim, nhead=num_heads, batch_first=True)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.classifier = nn.Linear(dim, num_classes)

    def forward(self, x):
        x = self.transformer(x)
        x = x.mean(dim=1)  # mean over sequence
        return self.classifier(x)

# 6. Instantiate and train the model
model = TransformerSentimentClassifier(dim=512)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

print("Training...")
for epoch in range(10):
    model.train()
    optimizer.zero_grad()
    outputs = model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)
    loss.backward()
    optimizer.step()
    print(f"Epoch {epoch+1}/10 - Loss: {loss.item():.4f}")

# 7. Evaluate the model
model.eval()
with torch.no_grad():
    outputs = model(X_val_tensor)
    preds = torch.argmax(outputs, dim=1)

print("\nClassification Report:")
print(classification_report(y_val, preds.numpy()))


Training...
Epoch 1/10 - Loss: 0.7679
Epoch 2/10 - Loss: 4.5095
Epoch 3/10 - Loss: 1.8153
Epoch 4/10 - Loss: 1.9324
Epoch 5/10 - Loss: 0.8216
Epoch 6/10 - Loss: 1.0137
Epoch 7/10 - Loss: 1.0012
Epoch 8/10 - Loss: 0.6770
Epoch 9/10 - Loss: 0.5360
Epoch 10/10 - Loss: 0.6123

Classification Report:
              precision    recall  f1-score   support

           0       0.59      0.95      0.73      1926
           1       0.95      0.59      0.73      3074

    accuracy                           0.73      5000
   macro avg       0.77      0.77      0.73      5000
weighted avg       0.81      0.73      0.73      5000

