In [1]:
import pandas as pd
import numpy as np
import spacy
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from keras.preprocessing.sequence import pad_sequences

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from torch.optim import Adam




In [3]:
# Load the spacy model for lemmatization.
# The dependency parser and the named-entity recognizer are switched off:
# preprocess_text does not use parse or entity annotations, and running those
# components on every comment is the bulk of the pipeline's cost.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

# Load stopwords from nltk (downloaded on first use, a no-op afterwards)
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Normalise one raw comment into a space-separated string of lemmas.

    Steps: lower-case the text, tokenize and lemmatize it with the
    module-level spaCy pipeline ``nlp``, drop punctuation/whitespace tokens,
    then drop every lemma found in the module-level ``stop_words`` set.

    Args:
        text: Raw comment text. Non-string values (for example the float NaN
            that pandas produces for an empty CSV field) are treated as an
            empty comment instead of raising.

    Returns:
        str: The remaining lemmas joined by single spaces; ``''`` when the
        input is not a string or nothing survives the filtering.
    """
    # Guard against missing values: calling .lower() on NaN raises AttributeError.
    if not isinstance(text, str):
        return ''

    # Data cleaning: lowercasing
    text = text.lower()

    # Tokenization and lemmatization; punctuation and whitespace tokens are the
    # "unwanted characters" and are skipped here.
    doc = nlp(text)
    lemmatized = [
        # Lemmas can come back capitalised (e.g. the pronoun "I"), so they are
        # lower-cased again to match the lower-case stop-word list.
        token.lemma_.lower()
        for token in doc
        if not (token.is_punct or token.is_space)
    ]

    # Removing stop words
    cleaned_text = [word for word in lemmatized if word not in stop_words]

    return ' '.join(cleaned_text)

# Load dataset
# NOTE(review): the backslash separators make this path Windows-specific;
# adjust it when running on another platform.
df = pd.read_csv('data\\train.csv\\train.csv')

# Preprocess the text data
df['processed_text'] = df['comment_text'].apply(preprocess_text)

# Vocabulary selection: the vectorizer is only *fitted*, to pick the
# `max_features` most frequent terms. Its dense TF-IDF matrix is deliberately
# not built: those rows are float weights per vocabulary term, not token
# sequences, and pad_sequences would cast them to integers (almost all zeros)
# and cut the 5000 columns down to `maxlen`.
vectorizer = TfidfVectorizer(max_features=5000)  # Adjust max_features based on your dataset
vectorizer.fit(df['processed_text'])

# Encode every comment as the sequence of integer ids of its in-vocabulary
# tokens. Ids are shifted by one so that 0 stays free for padding; the
# largest id is therefore len(vectorizer.vocabulary_).
analyzer = vectorizer.build_analyzer()
token_ids = {term: index + 1 for term, index in vectorizer.vocabulary_.items()}
X = [
    [token_ids[token] for token in analyzer(text) if token in token_ids]
    for text in df['processed_text']
]

# Labels (one binary column per toxicity category)
y = df[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].values

# Padding sequences: every comment becomes exactly `maxlen` ids, padded with
# zeros at the end (longer comments are truncated by pad_sequences).
maxlen = 100  # This is an arbitrary number, adjust based on your data
X_padded = pad_sequences(X, padding='post', maxlen=maxlen)

# Splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(X_padded, y, test_size=0.2, random_state=42)

# X_train / X_test now hold integer token-id sequences of shape (n, maxlen)
# and y_train / y_test the matching label rows, ready for an embedding-based
# 1D CNN.

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hrita\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


KeyboardInterrupt: 

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class TextCNN(nn.Module):
    """1D convolutional text classifier over token-id sequences.

    Token ids are embedded, passed through one Conv1d per entry of
    ``filter_sizes``, max-pooled over the sequence axis, concatenated,
    regularised with dropout and projected to ``num_classes`` raw logits.

    Args:
        vocab_size: Number of rows of the embedding table; every input id
            must be smaller than this.
        embed_dim: Size of each embedding vector.
        num_classes: Number of output logits per sample.
        filter_sizes: Kernel widths, one convolution is created per entry.
        num_filters: Output channels of each convolution.
        dropout: Dropout probability applied before the final layer
            (default 0.5, the previously hard-coded value).
        padding_idx: Optional id whose embedding is fixed to zeros and
            excluded from gradient updates (default None, i.e. no such id).
    """

    def __init__(self, vocab_size, embed_dim, num_classes, filter_sizes, num_filters,
                 dropout=0.5, padding_idx=None):
        super().__init__()

        # Widest kernel; forward() needs it to pad inputs that are too short.
        self.max_filter_size = max(filter_sizes, default=0)

        # Embedding Layer
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=padding_idx)

        # Convolutional Layers with different filter sizes
        self.convs = nn.ModuleList([
            nn.Conv1d(in_channels=embed_dim,
                      out_channels=num_filters,
                      kernel_size=fs)
            for fs in filter_sizes
        ])

        # Dropout layer
        self.dropout = nn.Dropout(dropout)

        # Fully connected layer
        self.fc = nn.Linear(num_filters * len(filter_sizes), num_classes)

    def forward(self, x):
        """Compute class logits for a batch of token-id sequences.

        Args:
            x: Integer tensor of shape [batch_size, seq_length].

        Returns:
            Tensor of shape [batch_size, num_classes] holding raw logits
            (no sigmoid/softmax is applied).
        """
        x = self.embedding(x)  # [batch_size, seq_length, embed_dim]

        x = x.permute(0, 2, 1)  # [batch_size, embed_dim, seq_length]

        # Conv1d raises when the input is shorter than its kernel, so sequences
        # shorter than the widest kernel are right-padded with zeros.
        shortfall = self.max_filter_size - x.size(2)
        if shortfall > 0:
            x = F.pad(x, (0, shortfall))

        # Apply the convolutional layers, then max-pool each feature map over
        # its whole length so every filter contributes a single value.
        feature_maps = [F.relu(conv(x)) for conv in self.convs]
        pooled = [F.max_pool1d(fm, fm.shape[2]).squeeze(2) for fm in feature_maps]

        x = torch.cat(pooled, 1)  # [batch_size, num_filters * len(filter_sizes)]

        # Apply dropout
        x = self.dropout(x)

        # Pass the output through the fully connected layer
        x = self.fc(x)

        return x

In [None]:
# Seed torch so that weight initialisation, batch shuffling and dropout are
# repeatable between runs (42 matches the random_state used for the split).
torch.manual_seed(42)

# Convert your preprocessed data into PyTorch tensors.
# Inputs must be integer (long) ids for nn.Embedding; labels are floats
# because BCEWithLogitsLoss expects float targets.
X_train_tensor = torch.tensor(X_train, dtype=torch.long)
y_train_tensor = torch.tensor(y_train, dtype=torch.float)
X_test_tensor = torch.tensor(X_test, dtype=torch.long)
y_test_tensor = torch.tensor(y_test, dtype=torch.float)

# Create TensorDatasets and DataLoaders
train_data = TensorDataset(X_train_tensor, y_train_tensor)
test_data = TensorDataset(X_test_tensor, y_test_tensor)

batch_size = 32  # You can adjust this
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_data, batch_size=batch_size)

# Model parameters
# Size of the embedding table: one row per vocabulary term plus one extra row
# for id 0, the value pad_sequences uses for padding. (The previous
# placeholder `...` made nn.Embedding fail at construction time.)
vocab_size = len(vectorizer.vocabulary_) + 1
embed_dim = 128  # Embedding dimension
num_classes = 6   # Number of output classes
filter_sizes = [3, 4, 5]  # Different filter sizes
num_filters = 100 # Number of filters per filter size

# Instantiate the model
model = TextCNN(vocab_size, embed_dim, num_classes, filter_sizes, num_filters)

# Use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Loss function and optimizer
loss_function = nn.BCEWithLogitsLoss()  # Suitable for multi-label classification
optimizer = Adam(model.parameters(), lr=0.001)

# Training Loop
num_epochs = 10  # You can adjust this

for epoch in range(num_epochs):
    model.train()  # Set the model to training mode
    total_loss = 0

    for texts, labels in train_loader:
        texts, labels = texts.to(device), labels.to(device)

        # Forward pass
        outputs = model(texts)
        loss = loss_function(outputs, labels)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean batch loss for the epoch; max(..., 1) avoids a ZeroDivisionError
    # when the training loader is empty.
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss/max(len(train_loader), 1):.4f}")