In [1]:
import torch
import json
import numpy as np
import pandas as pd
import math
import torch.nn as nn
from torch.optim import Adam
from torch.utils.data import Dataset,  DataLoader
from torch import Tensor

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from sklearn import metrics



In [2]:
# Load the two halves of the training corpus: human samples and machine samples.
with open('dataset/set1_human.json') as human_file:
    set1_human = json.load(human_file)

with open('dataset/set1_machine.json') as machine_file:
    set1_machine = json.load(machine_file)

# Quick sanity check on the class sizes.
print(len(set1_human), len(set1_machine))

122584 3500


In [3]:
# Load the samples stored in dataset/test.json.
with open('dataset/test.json') as test_file:
    testing_set = json.load(test_file)

In [4]:
# Keep only the first 600 entries of the loaded test file.
# NOTE(review): the reason for the 600 cutoff is not visible in this notebook - confirm.
testing_set = testing_set[:600]

In [5]:
# Tag every sample dict in place with its class: 1.0 for human, 0.0 for machine.
for group, value in ((set1_human, 1.0), (set1_machine, 0.0)):
    for data in group:
        data['label'] = value

In [6]:
# Merge both classes and hold out 10% of the samples, stratified so the
# held-out part keeps the same human/machine ratio as the full set.
data = set1_human + set1_machine
y_set = np.concatenate([np.ones(len(set1_human)), np.zeros(len(set1_machine))])
# A fixed random_state makes the split (and therefore every score reported
# later) reproducible across Restart & Run All.
train_data, test_data, _, _ = train_test_split(
    data, y_set, test_size=0.1, stratify=y_set, random_state=42
)

In [7]:
# Number of samples per mini-batch, shared by every DataLoader in this notebook.
batch_size = 256

class TrainDataset(Dataset):
    """Labelled dataset yielding fixed-length token-id tensors.

    Args:
        data: sequence of dicts, each with a 'txt' entry (list of integer
            ids) and a 'label' entry (number).
        max_len: length every text is truncated / zero-padded to
            (defaults to 250, the value previously hard-coded).

    Each item is a ``(text, label)`` pair where ``text`` is an int64 tensor
    of shape ``(max_len,)`` and ``label`` is a 0-dim tensor.
    """

    def __init__(self, data, max_len=250):
        self.data = data
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sample = self.data[idx]

        # Force an integer dtype: torch.tensor([]) would otherwise be float32,
        # which cannot be used as embedding indices and cannot be batched
        # together with the int64 tensors produced by non-empty texts.
        text = torch.tensor(sample['txt'][:self.max_len], dtype=torch.long)
        if len(text) < self.max_len:
            # Right-pad with id 0 up to the fixed length.
            text = torch.nn.functional.pad(text, (0, self.max_len - len(text)), "constant", 0)
        label = torch.tensor(sample['label'])

        return text, label

In [8]:
class Classifier(nn.Module):
    """Bag-of-embeddings binary classifier.

    Token ids are embedded, averaged over the sequence dimension and passed
    through a three-layer MLP that emits one logit per sample.

    Args:
        vocab_size: number of rows in the embedding table.
        emb_size: embedding dimension.
        hidden_size1: width of the first hidden layer.
        hidden_size2: width of the second hidden layer.
    """

    def __init__(self, vocab_size, emb_size, hidden_size1, hidden_size2):
        super(Classifier, self).__init__()

        self.embedding = nn.Embedding(vocab_size, emb_size)

        self.model = nn.Sequential(
            nn.Linear(emb_size, hidden_size1),
            nn.ReLU(),
            nn.Linear(hidden_size1, hidden_size2),
            nn.ReLU(),
            nn.Linear(hidden_size2, 1),
        )

    def forward(self, text):
        """Return raw logits of shape ``(batch,)`` for ``text`` of shape ``(batch, seq)``."""
        text_embed = self.embedding(text)
        # Mean over the sequence dimension; every position contributes,
        # including any padding ids present in the input.
        text_embed = text_embed.mean(dim=1)
        output = self.model(text_embed)
        # Drop only the trailing size-1 feature dim. A bare squeeze() would
        # also drop the batch dim when batch == 1, yielding a 0-dim tensor
        # that no longer matches a (1,)-shaped target.
        return output.squeeze(-1)


In [9]:
class TestDataset(Dataset):
    """Unlabelled counterpart of the training dataset: yields only the text tensor.

    Args:
        data: sequence of dicts, each with a 'txt' entry (list of integer ids).
        max_len: length every text is truncated / zero-padded to
            (defaults to 250, the value previously hard-coded).

    Each item is an int64 tensor of shape ``(max_len,)``.
    """

    def __init__(self, data, max_len=250):
        self.data = data
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # Force an integer dtype: torch.tensor([]) would otherwise be float32,
        # which cannot be used as embedding indices and cannot be batched
        # together with the int64 tensors produced by non-empty texts.
        text = torch.tensor(self.data[idx]['txt'][:self.max_len], dtype=torch.long)
        if len(text) < self.max_len:
            # Right-pad with id 0 up to the fixed length.
            text = torch.nn.functional.pad(text, (0, self.max_len - len(text)), "constant", 0)

        return text

In [10]:
# Training loader: reshuffled every epoch.
train_dataset = TrainDataset(data=train_data)
training_set = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)

# Held-out loader: labelled samples from the split, iterated in fixed order.
test_dataset = TrainDataset(data=test_data)
testing_dataset = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

clf = Classifier(5000, 256, 150, 200).to(device)

opt = Adam(clf.parameters(), lr=1e-3)

# pos_weight scales the loss of positive targets (label 1.0); a value below 1
# down-weights them. The criterion is moved to `device` together with its
# pos_weight buffer so it lives next to the logits it is applied to.
loss_function = nn.BCEWithLogitsLoss(pos_weight=torch.tensor(0.3, dtype=torch.float)).to(device)

# Best held-out macro-F1 seen so far; used to decide when to checkpoint.
score = 0.

for epoch in range(10):
    # ---- training pass ----
    clf.train()

    batch_loss = 0.0
    train_preds = []
    train_targets = []

    for batch in training_set:
        text, label = batch[0].to(device), batch[1].to(device)
        # reshape(-1) keeps the logits 1-D even for a single-sample batch, so
        # they always match `label` and .tolist() always returns a list.
        y_hat = clf(text).reshape(-1)
        loss = loss_function(y_hat, label)

        opt.zero_grad()
        loss.backward()
        opt.step()

        batch_loss += loss.item()
        train_preds += torch.round(torch.sigmoid(y_hat)).tolist()
        train_targets += label.tolist()

    # Mean loss per training batch for this epoch.
    batch_loss /= len(training_set)
    train_f1 = f1_score(train_targets, train_preds, average='macro')

    # ---- held-out pass ----
    clf.eval()

    test_loss = 0.0
    test_preds = []
    test_targets = []
    with torch.no_grad():
        for batch in testing_dataset:
            text, label = batch[0].to(device), batch[1].to(device)
            y_hat = clf(text).reshape(-1)
            loss = loss_function(y_hat, label)

            test_loss += loss.item()
            test_preds += torch.round(torch.sigmoid(y_hat)).tolist()
            test_targets += label.tolist()
    # Mean loss per held-out batch for this epoch.
    test_loss /= len(testing_dataset)

    test_f1 = f1_score(test_targets, test_preds, average='macro')

    # Report the epoch-averaged losses. Printing `loss.item()` here would only
    # show the loss of the very last held-out batch.
    print(f'Epoch {epoch}: Train loss is {batch_loss:.4f}, Test loss is {test_loss:.4f}, Train f1 score is {train_f1:.4f}, Test f1 score is {test_f1:.4f}')

    # Checkpoint whenever the held-out macro-F1 improves.
    if test_f1 > score:
        score = test_f1
        torch.save(clf.state_dict(), "classifier.pt")

In [12]:
# Wrap the samples loaded from test.json for inference.
# Note: this rebinds `test_dataset` / `testing_dataset`, which until now
# referred to the labelled held-out split.
test_dataset = TestDataset(data=testing_set)
testing_dataset = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

In [13]:
# Run the classifier over the inference loader and collect 0/1 predictions.
# NOTE(review): `clf` holds whatever weights are currently in memory; the
# best-scoring weights written to "classifier.pt" during training are not
# loaded here - confirm that is intended.
clf.eval()
test_preds = []

with torch.no_grad():
    for batch in testing_dataset:
        text = batch.to(device)

        # reshape(-1) keeps the logits 1-D even when the final batch holds a
        # single sample, so .tolist() always yields a list to extend with.
        y_hat = clf(text).reshape(-1)

        # sigmoid -> probability, round -> hard 0.0 / 1.0 decision at 0.5.
        test_preds += torch.round(torch.sigmoid(y_hat)).tolist()
        


In [14]:
# Convert the float predictions (0.0 / 1.0) to plain integers.
y_pred = list(map(int, test_preds))

In [15]:
# One-column table of the integer predictions, in the order they were produced.
df = pd.DataFrame(y_pred, columns = ['Predicted'])

In [16]:
# Write the predictions to out.csv. With default arguments the DataFrame index
# is written as an unnamed first column.
# NOTE(review): confirm the consumer of out.csv expects that index column.
df.to_csv('out.csv')