In [11]:
import os

# Show where the kernel started (pure-Python replacement for `!pwd`).
print(os.getcwd())

# The notebook lives one level below the directory that holds
# "data/exp1_data" (presumably the repository root), so step up once to make
# the relative data path used later resolve. The guard keeps the cell
# idempotent: re-running it must not climb one directory higher each time.
if not os.path.isdir(os.path.join("data", "exp1_data")):
    os.chdir("..")

/Users/shenqingyun/Desktop/git_repository/DMAC/playground


In [13]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
import json

from dmac.io.loader import Project1Loader

# Define a TextCNN model
class TextCNN(nn.Module):
    """Convolutional text classifier: parallel convolutions + max-over-time pooling.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Width of each token embedding.
        num_classes: Number of output logits.
        num_filters: Output channels produced by each convolution.
        filter_sizes: Iterable of window heights (in tokens), one convolution
            per entry.
    """

    def __init__(self, vocab_size, embedding_dim, num_classes, num_filters, filter_sizes):
        # Fixed: the original called the non-existent `__init()`, which raised
        # AttributeError before any sub-module could be registered.
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # Each kernel spans the full embedding width, so it only slides along
        # the token axis.
        self.convs = nn.ModuleList([
            nn.Conv2d(1, num_filters, (fs, embedding_dim))
            for fs in filter_sizes
        ])
        self.fc = nn.Linear(len(filter_sizes) * num_filters, num_classes)

    def forward(self, x):
        """Map a batch of token ids to class logits.

        Args:
            x: Integer tensor of token ids, expected as (batch, seq_len).
                seq_len must be at least the largest filter size.

        Returns:
            Tensor of shape (batch, num_classes) holding unnormalised logits.
        """
        # (batch, seq_len) -> (batch, 1, seq_len, embedding_dim)
        embedded = self.embedding(x).unsqueeze(1)
        # Per filter size: (batch, num_filters, seq_len - fs + 1)
        feature_maps = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]
        # Max over the whole token axis: (batch, num_filters) per filter size
        pooled = [F.max_pool1d(fmap, fmap.size(2)).squeeze(2) for fmap in feature_maps]
        # (batch, len(filter_sizes) * num_filters) -> (batch, num_classes)
        return self.fc(torch.cat(pooled, 1))

# Load GloVe word embeddings
def load_glove_embeddings(glove_file, word_to_index):
    """Build an embedding matrix for a vocabulary from a GloVe text file.

    Each non-empty line of the file is parsed as ``<word> <v1> <v2> ...``.

    Args:
        glove_file: Path to the UTF-8 GloVe text file.
        word_to_index: Mapping from word to row index; indices are expected
            to lie in ``[0, len(word_to_index))``.

    Returns:
        A float32 array of shape ``(len(word_to_index), dim)`` where ``dim``
        is the vector width found in the file. Rows for words that are not in
        the file are left as zeros.

    Raises:
        ValueError: If the file contains no vectors.
    """
    embeddings_index = {}
    with open(glove_file, 'r', encoding='utf-8') as file:
        for line in file:
            values = line.split()
            if not values:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            embeddings_index[values[0]] = np.array(values[1:], dtype='float32')

    if not embeddings_index:
        raise ValueError(f"No embeddings found in {glove_file!r}")

    # Take the width from any parsed vector rather than assuming a specific
    # word (the original looked up 'the') is present in the file.
    embedding_dim = len(next(iter(embeddings_index.values())))

    # Fixed: np.zeros takes the shape as ONE tuple; passing two integers made
    # the second one the dtype argument and raised TypeError.
    embedding_matrix = np.zeros((len(word_to_index), embedding_dim), dtype='float32')
    for word, i in word_to_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix

# Define hyperparameters
max_sequence_length = 100  # Tokens kept per sample: longer inputs are truncated, shorter ones padded
embedding_dim = 25  # Must equal the width of the pre-trained vectors ("glove-twitter-25")
num_filters = 128  # Output channels per convolution
filter_sizes = [2, 3, 4]  # Convolution window heights, in tokens
learning_rate = 0.001
batch_size = 64
num_epochs = 10

# Seed torch so weight initialisation and DataLoader shuffling are repeatable
# under Restart Kernel -> Run All.
seed = 42
torch.manual_seed(seed)

# Load the raw training samples (no preprocessing happens here). The path is
# relative to the working directory set by the os.chdir("..") cell at the top.
# Per the captured output, the first sample is a dict with an integer 'label'
# and the text under 'raw'.
data = Project1Loader().load_data("data/exp1_data/train_data.txt")




Load complete, Data[0]: 
{'label': 0, 'raw': "I only watched the Wanda Sykes portion of this show. I think it was interesting to watch because it was before she came out as a lesbian. She was married to a man at the time. She actually made some jokes that raised my eyebrows since she is now a lesbian. I didn't like this because it seemed hypocritical but I think Wanda Sykes is really funny. She is one of the few comedians who can make me really LOL (Laugh Out Loud). If you want to see what her comedy was like before she came out a lesbian or you are a Wanda Sykes fan, watch it. I am a huge fan of hers. I would like to see her in a live how. I am glad she is true to herself now and came out as a lesbian. I hope Wanda keeps on making me and others laugh for a long time to come."}


In [15]:
from gensim import downloader

labels = [sample["label"] for sample in data]
sentences = [sample["raw"] for sample in data]

# Tokenize on whitespace once and reuse the result for both the vocabulary
# and the id sequences.
tokenized_sentences = [sentence.split() for sentence in sentences]

# Build the vocabulary as a list so `len(vocab)` is the embedding-table size.
# - Index 0 is reserved for padding; previously the pad id 0 collided with
#   whichever real word happened to be enumerated first.
# - Words are sorted so ids are stable across kernel restarts; iterating a
#   set of strings gives a different order in each Python process.
pad_token = "<pad>"
pad_index = 0
unique_words = {word for tokens in tokenized_sentences for word in tokens}
unique_words.discard(pad_token)
vocab = [pad_token] + sorted(unique_words)
word_to_index = {word: i for i, word in enumerate(vocab)}
assert word_to_index[pad_token] == pad_index
sequences = [[word_to_index[word] for word in tokens] for tokens in tokenized_sentences]

# Truncate / right-pad every sequence to exactly max_sequence_length ids
padded_sequences = [
    sequence[:max_sequence_length] + [pad_index] * (max_sequence_length - len(sequence))
    for sequence in sequences
]
assert all(len(sequence) == max_sequence_length for sequence in padded_sequences)

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42)

# Create PyTorch DataLoader
train_dataset = TensorDataset(torch.LongTensor(X_train), torch.LongTensor(y_train))
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Load pre-trained GloVe embeddings.
# downloader.load returns a gensim KeyedVectors object, not an array, so the
# (len(vocab), dim) matrix that torch.from_numpy needs is built explicitly.
glove_vectors = downloader.load("glove-twitter-25")
assert glove_vectors.vector_size == embedding_dim, "embedding_dim must match the pre-trained vectors"

embedding_matrix = np.zeros((len(vocab), glove_vectors.vector_size), dtype="float32")
for word, index in word_to_index.items():
    if index == pad_index:
        continue  # keep the padding row at zero
    # Fall back to the lowercased form so capitalised tokens can still match.
    for candidate in (word, word.lower()):
        if candidate in glove_vectors:
            embedding_matrix[index] = glove_vectors[candidate]
            break

In [None]:
# Create the TextCNN model
# NOTE(review): num_classes=10 is kept from the original; the only label
# visible in the captured output is 0, so confirm this matches the label set.
model = TextCNN(vocab_size=len(vocab), embedding_dim=embedding_dim, num_classes=10, num_filters=num_filters, filter_sizes=filter_sizes)
# Initialise the embedding table from the pre-trained matrix, then freeze it.
model.embedding.weight.data.copy_(torch.from_numpy(embedding_matrix))
model.embedding.weight.requires_grad = False  # Freeze the embedding layer

# Define loss and optimizer (only trainable parameters are handed to Adam)
criterion = nn.CrossEntropyLoss()
trainable_parameters = [param for param in model.parameters() if param.requires_grad]
optimizer = optim.Adam(trainable_parameters, lr=learning_rate)

# Training loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    # Batch variables use their own names so the notebook-level `labels`
    # list is not silently rebound to a tensor.
    for batch_inputs, batch_labels in train_loader:
        optimizer.zero_grad()
        batch_logits = model(batch_inputs)
        loss = criterion(batch_logits, batch_labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss / len(train_loader)}")

# Test the model (batched instead of one forward pass per sample)
model.eval()
y_pred = []
test_inputs = torch.LongTensor(X_test)
with torch.no_grad():
    for batch_inputs in test_inputs.split(batch_size):
        batch_logits = model(batch_inputs)
        y_pred.extend(torch.argmax(batch_logits, 1).tolist())

# Calculate F1 score
f1 = f1_score(y_test, y_pred, average='micro')
print(f"F1 Score: {f1}")