### Preprocessing

In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from gensim.models import Word2Vec
from keras.preprocessing.sequence import pad_sequences

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from cnn import TextCNN
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')





In [2]:
# Functions
# Data Cleaning and Preprocessing
def clean_text(text):
    text = text.lower()  # Lowercasing
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    return text

def tokenize_and_lemmatize(text):
    tokens = word_tokenize(text)
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
    return lemmatized_tokens

def get_embeddings(tokens):
    embeddings = [word_vectors[token] for token in tokens if token in word_vectors]
    return np.mean(embeddings, axis=0) if embeddings else np.zeros(100)

In [3]:
# Load the dataset
data = pd.read_csv('data\\train.csv\\train.csv')

data['comment_text'] = data['comment_text'].apply(clean_text)

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

data['tokens'] = data['comment_text'].apply(tokenize_and_lemmatize)

# Word2Vec Embeddings
tokenized_texts = data['tokens'].tolist()
model = Word2Vec(tokenized_texts, vector_size=100, window=5, min_count=1, workers=4)
word_vectors = model.wv

embedded_texts = np.array([get_embeddings(tokens) for tokens in tokenized_texts])

# Padding Sequences
max_length = max(len(tokens) for tokens in tokenized_texts)
padded_texts = pad_sequences(embedded_texts, maxlen=max_length, padding='post')

# Labels
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
labels = data[label_cols].values

# padded_texts is your feature matrix and labels is your target matrix

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hrita\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hrita\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hrita\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
print(labels[6])
print(padded_texts[6])
print(len(padded_texts[6]))
print('device: ', device)

[1 1 1 0 1 0]
[0 0 1 ... 0 0 0]
1250
device:  cuda


In [5]:
batch_size = 32  

# Convert to PyTorch tensors
X = torch.tensor(padded_texts, dtype=torch.float32)
y = torch.tensor(labels, dtype=torch.float32)

# Create Dataset and DataLoader
dataset = TensorDataset(X, y)
train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [6]:
# Parameters
embedding_dim = 100  # Same as Word2Vec embedding size
num_classes = 6  # Number of output classes

# Model, Loss, Optimizer
model = TextCNN(embedding_dim, num_classes)

criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

model = model.to(device)
criterion = criterion.to(device)

In [7]:
epochs = 10  # You can adjust this

for epoch in range(epochs):
    counter = 0
    for inputs, targets in train_loader:
        if (counter%20) == 0:
            print(f'Training epoch {epoch}, batch {counter}')
        counter += 1
        
        inputs = inputs.to(device).unsqueeze(1)
        print(inputs.shape)
        targets = targets.to(device)
        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, targets)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        print(f"Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}")

Training epoch 0, batch 0
torch.Size([32, 1, 1250])


RuntimeError: Given groups=1, weight of size [128, 100, 5], expected input[32, 1, 1250] to have 100 channels, but got 1 channels instead