In [1]:
import pandas as pd
import numpy as np
from gensim.models import word2vec, Word2Vec
from torch import nn
import os
from tqdm import tqdm

### Utils

In [2]:
from gensim import utils

# sampling 
def downsampling(data, ratio):
    majority = data[data["target"] == 0]
    minority = data[data["target"] == 1]
    lower_majority = majority.sample(n=int(ratio*len(minority)), replace=False, random_state=20, axis=0)
    return pd.concate([lower_majority, minority])
   
def evaluation(outputs, labels):
    # outputs => probability (float)
    # labels => labels
    outputs[outputs>=0.5] = 1 # 大於等於 0.5 為正面
    outputs[outputs<0.5] = 0 # 小於 0.5 為負面
    correct = torch.sum(torch.eq(outputs, labels)).item()
    return correct

def tokenize_text(text):
    """
    text: string
    return: list of word
    """
    return utils.simple_preprocess(text) #lower case, deaccent, remove punctuation, split


### Training word2vec model on quora data

In [3]:
from gensim import utils

# train word to vector
def train_word2vec(x):
    model = word2vec.Word2Vec(x, size=250, window=5, min_count=5, workers=12, iter=10, sg=1)
    return model
if False:
    print("Loading training data")
    train = pd.read_csv(data_path + "train.csv")
    sentences = train["question_text"].apply(utils.simple_preprocess)

    model = train_word2vec(sentences)
    print("saving model ...")
    model.save(w2v_path)

Reference: https://colab.research.google.com/drive/16d1Xox0OW-VNuxDn1pvy2UXFIPfieCb9

In [26]:
def load_embed(typeToLoad):
    def get_coefs(word, *arr):
        return word, np.asarray(arr, dtype='float16')

    if typeToLoad == "glove":
        file = 'embeddings/glove.840B.300d/glove.840B.300d.txt'
        embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(file, encoding = "utf8", errors='ignore') if len(o) > 100)
    elif typeToLoad == "word2vec":
        # file = 'embeddings⁩/GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin⁩'
        file = 'embeddings/GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin'
        embeddings_index = KeyedVectors.load_word2vec_format(file, binary=True)  # query word vector from the file
    elif typeToLoad == "fasttext":
        # file = "⁨embeddings⁩/wiki-news-300d-1M⁩/wiki-news-300d-1M.vec"
        file = 'embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec'
        embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(file, encoding='latin', errors='ignore'))
    elif typeToLoad == "paragram":
        file = 'embeddings/paragram_300_sl999/paragram_300_sl999.txt'
        embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(file, encoding = "utf8", errors='ignore') )
    elif typeToLoad == 'trained_word2vec':
        file = w2v_path
        embeddings_index = Word2Vec.load(file)

    return embeddings_index

### Data Preprocess

In [68]:
class Preprocess():
    def __init__(self, sentences, embedding, embedding_dim):
        self.sentences = sentences
        self.embedding = embedding
        self.embedding_dim = embedding_dim
        self.idx2word = []
        self.word2idx = {}
        self.embedding_matrix = []
        
    def add_embedding(self,word):
        #use random vector to represent words not in word embedding model, euch as UNK
        vector = torch.empty(1, self.embedding_dim, dtype=torch.float)
        torch.nn.init.uniform_(vector)
        self.word2idx[word] = len(self.word2idx)
        self.idx2word.append(word)
        self.embedding_matrix = torch.cat([self.embedding_matrix.float(), vector], 0)
        
    def make_embedding(self):
        #make embedding matrix
        print("Making embedding...")
        for word, vector in self.embedding.items():
            self.word2idx[word] = len(self.word2idx)
            self.idx2word.append(word)
            self.embedding_matrix.append(vector)
        print("loop is done")
        self.embedding_matrix = torch.tensor(self.embedding_matrix)
        self.add_embedding("<PAD>")
        self.add_embedding("<UNK>")
        print("total words {}".format(len(self.embedding_matrix)))
        return self.embedding_matrix
    
    def pad_sentence(self, sentence, max_len=20):
        if len(sentence) > max_len:
            sentence = sentence[:max_len]
        else:
            pad_len = max_len - len(sentence)
            for _ in range(pad_len):
                sentence.append(self.word2idx["<PAD>"])
        assert len(sentence) == max_len
        return sentence
    
    def sentence_word2idx(self, max_len):
        sentence_list = []
        for i, sen in enumerate(self.sentences):
            print("sentence count #{}".format(i+1), end='\r')
            sentence_idx = []
            for word in sen:
                if word in self.word2idx.keys():
                    sentence_idx.append(self.word2idx[word])
                else:
                    sentence_idx.append(self.word2idx["<UNK>"])
            sentence_idx = self.pad_sentence(sentence_idx, max_len)
            sentence_list.append(sentence_idx)
        return torch.LongTensor(sentence_list)
    
    def labels_to_tensor(self, y):
        y = [int(label) for label in y]
        return torch.LongTensor(y)
        

In [66]:
vector = torch.empty(1, 300, dtype=torch.float)
torch.nn.init.uniform_(vector)
torch.cat([preprocess.embedding_matrix.float(), vector], 0)

RuntimeError: [enforce fail at ..\c10\core\CPUAllocator.cpp:72] data. DefaultCPUAllocator: not enough memory: you tried to allocate 3494806800 bytes. Buy new RAM!


In [67]:
gc.collect()

218

In [76]:
preprocess.add_embedding("<PAD>")
preprocess.add_embedding("<UNK>")

RuntimeError: [enforce fail at ..\c10\core\CPUAllocator.cpp:72] data. DefaultCPUAllocator: not enough memory: you tried to allocate 3494806800 bytes. Buy new RAM!


### Dataset

In [6]:
# generate dataset for dataLoader
import torch
from torch.utils import data

class CustomDataset(data.Dataset):
    """
    Expected data shape like:(data_num, data_len)
    Data can be a list of numpy array or a list of lists
    input data shape : (data_num, seq_len, feature_dim)
    
    __len__ will return the number of data
    """
    def __init__(self, X, y=None):
        self.data = X
        self.label = y
    def __getitem__(self, idx):
        if self.label is None: return self.data[idx]
        return self.data[idx], self.label[idx]
    def __len__(self):
        return len(self.data)

### Model

In [7]:
import torch
from torch import nn
class LSTM_Net(nn.Module):
    def __init__(self, embedding, embedding_dim, hidden_dim, num_layers, dropout=0.5, fix_embedding=True):
        super(LSTM_Net, self).__init__()
        # embedding layer
        self.embedding = torch.nn.Embedding(embedding.size(0),embedding.size(1))
        self.embedding.weight = torch.nn.Parameter(embedding)
        # if fix_embedding is False, embedding weight wont be updated
        self.embedding.weight.requires_grad = False if fix_embedding else True
        self.embedding_dim = embedding.size(1)
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.dropout = dropout
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers, batch_first=True)
        self.classifier = nn.Sequential( nn.Dropout(dropout),
                                         nn.Linear(hidden_dim, 1),
                                         nn.Sigmoid() )
    def forward(self, inputs):
        inputs = self.embedding(inputs)
        x, _ = self.lstm(inputs, None)
        # x 的 dimension (batch, seq_len, hidden_size)
        # 取用 LSTM 最後一層的 hidden state
        x = x[:, -1, :] 
        x = self.classifier(x)
        return x

### Train

In [8]:
import torch
from torch import nn
import torch.optim as optim
import torch.nn.functional as F

def training(batch_size, n_epoch, lr, model_dir, train, valid, model, device):
    total = sum(p.numel() for p in model.parameters())
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('\nstart training, parameter total:{}, trainable:{}\n'.format(total, trainable))
    
    criterion = nn.BCELoss() #binary cross entropy loss function 
    t_batch = len(train) 
    v_batch = len(valid) 
    optimizer = optim.Adam(model.parameters(), lr=lr) # 
    total_loss, total_acc, best_acc = 0, 0, 0
    for epoch in range(n_epoch):
        
        model.train() # training mode, weights are updated
        total_loss, total_acc = 0, 0
        # training module
        for i, (inputs, labels) in enumerate(train):
            inputs = inputs.to(device, dtype=torch.long) 
            labels = labels.to(device, dtype=torch.float) 
            optimizer.zero_grad() 
            outputs = model(inputs) 
            outputs = outputs.squeeze() 
            loss = criterion(outputs, labels) # compute training loss
            loss.backward() # 算 loss 的 gradient
            optimizer.step() 
            correct = evaluation(outputs, labels)
            total_acc += (correct / batch_size)
            total_loss += loss.item()
            print('[ Epoch{}: {}/{} ] loss:{:.3f} acc:{:.3f} '.format(
            	epoch+1, i+1, t_batch, loss.item(), correct*100/batch_size), end='\r')
        print('\nTrain | Loss:{:.5f} Acc: {:.3f}'.format(total_loss/t_batch, total_acc/t_batch*100))

        # validation module
        model.eval() # fixed weights
        with torch.no_grad(): #disables tracking of gradients in autograd.
            total_loss, total_acc = 0, 0
            for i, (inputs, labels) in enumerate(valid):
                inputs = inputs.to(device, dtype=torch.long) 
                labels = labels.to(device, dtype=torch.float) 
                outputs = model(inputs) 
                outputs = outputs.squeeze() 
                loss = criterion(outputs, labels) 
                correct = evaluation(outputs, labels) 
                total_acc += (correct / batch_size)
                total_loss += loss.item()

            print("Valid | Loss:{:.5f} Acc: {:.3f} ".format(total_loss/v_batch, total_acc/v_batch*100))
            if total_acc > best_acc:
                best_acc = total_acc
                torch.save(model, "{}/ckpt.model".format(model_dir))
                print('saving model with acc {:.3f}'.format(total_acc/v_batch*100))
        print('-----------------------------------------------')


### Test

In [9]:
import torch
from torch import nn
import torch.optim as optim
import torch.nn.functional as F

def testing(batch_size, test_loader, model, device):
    model.eval()
    ret_output = []
    with torch.no_grad():
        for i, inputs in enumerate(tqdm(test_loader)):
            inputs = inputs.to(device, dtype=torch.long)
            outputs = model(inputs)
            outputs = outputs.squeeze(0) #squeeze removes all demensions with input size of 1, squeeze(0) only 
            outputs[outputs>=0.5] = 1 # 大於等於 0.5 為正面
            outputs[outputs<0.5] = 0 # 小於 0.5 為負面
            ret_output += outputs.int().tolist()
    
    return ret_output

### Main

In [None]:
# main.py
import os
import gc
import torch
import argparse
import numpy as np
from torch import nn
from gensim.models import word2vec

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

os.chdir("../quora-insincere-questions-classification/")
print(os.getcwd())

data_path = "./"
output_path = "result/"
w2v_path = os.path.join(output_path, 'w2v_all.model')
embedding_dim = 300
cv_n = 5
train_ratio = 0.2
val_ratio = 0.01
sen_len = 20
fix_embedding = True # fix embedding during training
batch_size = 256
epoch = 5
lr = 0.001
model_dir = output_path # model directory for checkpoint model

In [38]:
print("Loading embedding")
glove_embedding = load_embed("glove")
paragram_embedding = load_embed("paragram")

Loading embedding


In [40]:
print("Combining embedding")
corpus = set(list(glove_embedding.keys()) + list(paragram_embedding.keys()))
init_vector = [0]*embedding_dim
embedding = dict()
for word in tqdm(corpus):
    glove_vector = glove_embedding.get(word, init_vector)
    paragram_vector = paragram_embedding.get(word, init_vector)
    vector = 0.7*np.asarray(glove_vector) + 0.3*np.asarray(paragram_vector)
    embedding[word] = vector

Combining embedding


100%|█████████████████████████████████████████████████████████████████████| 2912338/2912338 [01:47<00:00, 26969.08it/s]


In [46]:
del glove_embedding
del paragram_embedding
gc.collect()

362

In [74]:
del corpus
gc.collect()

167

In [32]:
%%time
print("Loading training data")
train = pd.read_csv(data_path + "train.csv")
train_x, y = train["question_text"], train["target"]
train_x = train_x.apply(tokenize_text)

Loading training data
Wall time: 40.7 s


In [75]:
%%time
# preprocess training data
preprocess = Preprocess(train_x, embedding, embedding_dim)
embedding_matrix = preprocess.make_embedding()
train_x = preprocess.sentence_word2idx(max_len=sen_len)
y = preprocess.labels_to_tensor(y)

Making embedding...
loop is done


RuntimeError: [enforce fail at ..\c10\core\CPUAllocator.cpp:72] data. DefaultCPUAllocator: not enough memory: you tried to allocate 3494806800 bytes. Buy new RAM!


In [55]:
#train_x = preprocess.sentence_word2idx(max_len=sen_len)
y = preprocess.labels_to_tensor(y)

sentence count #52

KeyError: '<UNK>'

In [159]:
from sklearn.t=metrics import f1_score
from sklearn.model_selection import StratifiedKFold

#cv
skf = StratifiedKFold(n_split=cv_n)
cv_k = 0
resuls = []
for train_index, test_index in skf.split(train_x, y):
    cv_k += 1
    print("--------- cv", cv_k)
    X_train, y_train = train_x[train_index], y[train_index]
    X_test, y_test = train_x[test_index], y[test_index]
    
    train_len = int(len(X_train) * (1-val_ratio))
    X_train, X_val, y_train, y_val = X_train[:train_len], X_train[train_len:], y_train[:train_len], y_train[train_len:]
    print("train size {}, validation size {}".format(train_len, len(X_train) - train_len))

    train_dataset = CustomDataset(X=X_train, y=y_train)
    val_dataset = CustomDataset(X=X_val, y=y_val)
    train_loader = torch.utils.data.DataLoader(dataset = train_dataset,
                                                batch_size = batch_size,
                                                shuffle = True,
                                                num_workers = 0)

    val_loader = torch.utils.data.DataLoader(dataset = val_dataset,
                                                batch_size = batch_size,
                                                shuffle = False,
                                                num_workers = 0)

    model = LSTM_Net(embedding_matrix, \
                     embedding_dim=embedding_dim, \
                     hidden_dim=150, num_layers=1, dropout=0.5, fix_embedding=fix_embedding)
    model = model.to(device) 
    
    print("Training")
    model_path = os.path.join(model_dir,f'qiqc_cv{cv_k}.model'
    training(batch_size, epoch, lr, model_path, train_loader, val_loader, model, device)
    
    print("Testing")
    test_dataset = CustomDataset(X=X_test, y=None)
    test_loader = torch.utils.data.DataLoader(dataset = test_dataset,
                                                batch_size = batch_size,
                                                shuffle = False,
                                                num_workers = 0)
    print('\nload model ...')
    model = torch.load(model_path)
    outputs = testing(batch_size, test_loader, model, device)
    f1 = f1_score(y_test, outputs)
    print("f1 score", f1)
    results.append(f1)
print("Average cv score", np.mean(results) )

Loading training data
Making embedding...
loop is done48488
total words 48490
train size 219011, validation size 2213

start training, parameter total:12363851, trainable:241351

[ Epoch1: 856/856 ] loss:0.112 acc:47.656 
Train | Loss:0.14956 Acc: 94.577
Valid | Loss:0.11292 Acc: 91.667 


  "type " + obj.__name__ + ". It won't be checked "


saving model with acc 91.667
-----------------------------------------------
[ Epoch2: 856/856 ] loss:0.102 acc:48.828 
Train | Loss:0.12009 Acc: 95.171
Valid | Loss:0.10908 Acc: 92.188 
saving model with acc 92.188
-----------------------------------------------
[ Epoch3: 856/856 ] loss:0.126 acc:49.219 
Train | Loss:0.11493 Acc: 95.402
Valid | Loss:0.10638 Acc: 92.231 
saving model with acc 92.231
-----------------------------------------------
[ Epoch4: 856/856 ] loss:0.066 acc:49.219 
Train | Loss:0.11105 Acc: 95.523
Valid | Loss:0.10549 Acc: 92.405 
saving model with acc 92.405
-----------------------------------------------
[ Epoch5: 856/856 ] loss:0.141 acc:48.047 
Train | Loss:0.10739 Acc: 95.675
Valid | Loss:0.10358 Acc: 92.144 
-----------------------------------------------


In [None]:
print("Loading embedding")
glove_embedding = load_embedding("glove")
paragram_embedding = load_embedding("paragram")

print("Combining embedding")
corpus = set(list(glove_embedding.keys()) + list(paragram_embedding.keys()))
init_vector = [0]*embedding_dim
embedding = []
for word in tdqm(corpus):
    glove_vector = glove_embedding.get(word, init_vector)
    paragram_vector = paragram_embedding.get(word, init_vector)
    vector = 0.7*np.asarray(glove_vector) + 0.3*np.asarray(paramgram_vector)
    embedding.append(vector)
del glove_embedding
del paragram_embedding
gc.collect()
%%

In [159]:
# preprocess training data
preprocess = Preprocess(train_x, embedding, embedding_dim)
embedding_matrix = preprocess.make_embedding()
train_x = preprocess.sentence_word2idx(max_len=sen_len)
y = preprocess.labels_to_tensor(y)


%%
#cv
skf = StratifiedKFold(n_split=cv_n)
for train_index, test_index in skf.split(train_x, y):
    X_train, y_train = train_X[]
train_len = int(len(sample_train) * (1-val_ratio))
X_train, X_val, y_train, y_val = train_x[:train_len], train_x[train_len:], y[:train_len], y[train_len:]
print("train size {}, validation size {}".format(train_len, len(sample_train) - train_len))

train_dataset = CustomDataset(X=X_train, y=y_train)
val_dataset = CustomDataset(X=X_val, y=y_val)
train_loader = torch.utils.data.DataLoader(dataset = train_dataset,
                                            batch_size = batch_size,
                                            shuffle = True,
                                            num_workers = 0)

val_loader = torch.utils.data.DataLoader(dataset = val_dataset,
                                            batch_size = batch_size,
                                            shuffle = False,
                                            num_workers = 0)

model = LSTM_Net(embedding, embedding_dim=250, hidden_dim=150, num_layers=1, dropout=0.5, fix_embedding=fix_embedding)
model = model.to(device) 

training(batch_size, epoch, lr, model_dir, train_loader, val_loader, model, device)

Loading training data
Making embedding...
loop is done48488
total words 48490
train size 219011, validation size 2213

start training, parameter total:12363851, trainable:241351

[ Epoch1: 856/856 ] loss:0.112 acc:47.656 
Train | Loss:0.14956 Acc: 94.577
Valid | Loss:0.11292 Acc: 91.667 


  "type " + obj.__name__ + ". It won't be checked "


saving model with acc 91.667
-----------------------------------------------
[ Epoch2: 856/856 ] loss:0.102 acc:48.828 
Train | Loss:0.12009 Acc: 95.171
Valid | Loss:0.10908 Acc: 92.188 
saving model with acc 92.188
-----------------------------------------------
[ Epoch3: 856/856 ] loss:0.126 acc:49.219 
Train | Loss:0.11493 Acc: 95.402
Valid | Loss:0.10638 Acc: 92.231 
saving model with acc 92.231
-----------------------------------------------
[ Epoch4: 856/856 ] loss:0.066 acc:49.219 
Train | Loss:0.11105 Acc: 95.523
Valid | Loss:0.10549 Acc: 92.405 
saving model with acc 92.405
-----------------------------------------------
[ Epoch5: 856/856 ] loss:0.141 acc:48.047 
Train | Loss:0.10739 Acc: 95.675
Valid | Loss:0.10358 Acc: 92.144 
-----------------------------------------------


### Predict1

In [162]:
from sklearn.metrics import classification_report

test_x = test0["question_text"].apply(tokenize_text)
test_y = test0["target"]

preprocess = Preprocess(test_x, sen_len, w2v_path=w2v_path)
embedding = preprocess.make_embedding(load=True)
test_x = preprocess.sentence_word2idx()

test_dataset = CustomDataset(X=test_x, y=None)
test_loader = torch.utils.data.DataLoader(dataset = test_dataset,
                                            batch_size = batch_size,
                                            shuffle = False,
                                            num_workers = 0)
print('\nload model ...')
model = torch.load(os.path.join(model_dir, 'ckpt.model'))
outputs = testing(batch_size, test_loader, model, device)

Making embedding...
loop is done48488
total words 48490
sentence count #200000
load model ...


AttributeError: 'list' object has no attribute 'tolist'

In [165]:
from sklearn.metrics import f1_score
print(f1_score(test_y, outputs))

0.5792168389115387


### Predict on submitting data

In [156]:
print("Loading test data")
test = pd.read_csv(data_path + "test.csv")
test_x = test["question_text"].apply(tokenize_text)

preprocess = Preprocess(test_x, embedding, embedding_dim)
embedding = preprocess.make_embedding()
test_x = preprocess.sentence_word2idx(max_len = sen_len)

test_dataset = CustomDataset(X=test_x, y=None)
test_loader = torch.utils.data.DataLoader(dataset = test_dataset,
                                            batch_size = batch_size,
                                            shuffle = False,
                                            num_workers = 0)
print('\nload model ...')
model = torch.load(os.path.join(model_dir, 'qiqc1.model'))
outputs = testing(batch_size, test_loader, model, device)

tmp = pd.DataFrame({"id":[str(i) for i in range(len(test_x))],"label":outputs})
print("save csv ...")
tmp.to_csv(os.path.join(output_path, 'predict.csv'), index=False)
print("Finish Predicting")


Loading test data
Making embedding...
loop is done48488
total words 48490
sentence count #375806
load model ...
save csv ...
Finish Predicting
