In [22]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

import pandas as pd
import numpy as np
import re

In [23]:
csv_path = "/kaggle/input/combined-llm-essay/train_essays_combined.csv"
df = pd.read_csv(csv_path)
print(len(df))

2078


In [20]:
# Data Preprocessing (Vectorization)
# Replace with your preprocessing logic
vectorizer = CountVectorizer(max_features=1000)
X = vectorizer.fit_transform(df['text']).toarray()
y = df['generated'].values

In [39]:
vectorizer = TfidfVectorizer(ngram_range=(1,1),
                             tokenizer=lambda x: re.findall(r'[^\W]+', x),
                             token_pattern=None,
                             strip_accents='unicode')
X = vectorizer.fit_transform(df['text']).toarray()
y = df['generated'].values

In [40]:
# Convert to PyTorch tensors
X_tensor = torch.tensor(X, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.float32)
print(X_tensor.shape, y_tensor.shape)

torch.Size([2078, 17994]) torch.Size([2078])


In [41]:
# Splitting data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_tensor, y_tensor, test_size=0.2, random_state=42)


In [42]:
print(X_train.shape, X_test.shape)

torch.Size([1662, 17994]) torch.Size([416, 17994])


In [43]:
# Create Dataset and DataLoader for batch processing
train_data = TensorDataset(X_train, y_train)
test_data = TensorDataset(X_test, y_test)
train_loader = DataLoader(train_data, batch_size=64, shuffle=True)
test_loader = DataLoader(test_data, batch_size=64, shuffle=False)


In [44]:
# Neural Network Definition
class TextClassifier(nn.Module):
    def __init__(self, input_size):
        super(TextClassifier, self).__init__()
        self.fc1 = nn.Linear(input_size, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 1)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.sigmoid(self.fc3(x))
        return x

In [45]:
# Model Initialization
model = TextClassifier(input_size=17994)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [46]:
# Training Loop
for epoch in range(10):  # number of epochs
    for inputs, labels in train_loader:
        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs.squeeze(), labels)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f'Epoch {epoch+1}, Loss: {loss.item()}')


Epoch 1, Loss: 0.42881301045417786
Epoch 2, Loss: 0.051531240344047546
Epoch 3, Loss: 0.005774134770035744
Epoch 4, Loss: 0.003481342690065503
Epoch 5, Loss: 0.0014595434768125415
Epoch 6, Loss: 0.00190728681627661
Epoch 7, Loss: 0.0008285972289741039
Epoch 8, Loss: 0.0012144295033067465
Epoch 9, Loss: 0.0006635321769863367
Epoch 10, Loss: 0.0004038279876112938


In [47]:
# Model Evaluation
model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    for inputs, labels in test_loader:
        outputs = model(inputs)
        predicted = (outputs.squeeze() > 0.5).float()
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print(f'Accuracy: {100 * correct / total}%')

Accuracy: 99.75961538461539%


In [48]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/combined-llm-essay/train_essays_combined.csv
/kaggle/input/llm-detect-ai-generated-text/sample_submission.csv
/kaggle/input/llm-detect-ai-generated-text/train_prompts.csv
/kaggle/input/llm-detect-ai-generated-text/test_essays.csv
/kaggle/input/llm-detect-ai-generated-text/train_essays.csv


In [49]:
df_test = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/test_essays.csv')
print(len(df_test))

3


In [50]:
X_test = vectorizer.transform(df_test['text']).toarray()
print(X_test)
# # Model Evaluation
model.eval()
predictions = []
with torch.no_grad():
    correct = 0
    total = 0
    for inputs in X_test:
        inputs = torch.Tensor(inputs)
        outputs = model(inputs)
        predictions.append(outputs.cpu().detach().numpy())
predictions = np.array(predictions).squeeze()
print(predictions)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[0.40898916 0.40898916 0.40898916]


In [51]:
submission_df = pd.DataFrame({
    'id': df_test['id'],
    'generated': predictions
})

In [52]:
submission_df.to_csv('submission.csv', index=False)