In [1]:
import pandas as pd
import numpy as np
from collections import Counter
from tqdm import tqdm_notebook as tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import train_test_split

In [2]:
train_df = pd.read_csv("data/train.csv")
test_df = pd.read_csv("data/test.csv")
sub = pd.read_csv("data/sample_submission.csv")

train_df["title1_en"] = train_df["title1_en"].apply(lambda x: x.replace('""', '').replace('"', '').replace("!", "").replace("?", ""))
train_df["title2_en"] = train_df["title2_en"].apply(lambda x: x.replace('""', '').replace('"', '').replace("!", "").replace("?", ""))

test_df["title1_en"] = train_df["title1_en"].apply(lambda x: x.replace('""', '').replace('"', '').replace("!", "").replace("?", ""))
test_df["title2_en"] = train_df["title2_en"].apply(lambda x: x.replace('""', '').replace('"', '').replace("!", "").replace("?", ""))

train_df.replace('unrelated', 0, inplace=True)
train_df.replace('agreed', 1, inplace=True)
train_df.replace('disagreed', 2, inplace=True)

y = list(train_df["label"])

display(train_df.head()) 
display(test_df.head())
display(sub.head())

Unnamed: 0,id,tid1,tid2,title1_zh,title2_zh,title1_en,title2_en,label
0,0,0,1,2017养老保险又新增两项，农村老人人人可申领，你领到了吗,警方辟谣“鸟巢大会每人领5万” 仍有老人坚持进京,There are two new old-age insurance benefits f...,Police disprove bird's nest congress each pers...,0
1,3,2,3,"""你不来深圳，早晚你儿子也要来""，不出10年深圳人均GDP将超香港",深圳GDP首超香港？深圳统计局辟谣：只是差距在缩小,"If you do not come to Shenzhen, sooner or late...",Shenzhen's GDP outstrips Hong Kong Shenzhen St...,0
2,1,2,4,"""你不来深圳，早晚你儿子也要来""，不出10年深圳人均GDP将超香港",GDP首超香港？深圳澄清：还差一点点……,"If you do not come to Shenzhen, sooner or late...",The GDP overtopped Hong Kong Shenzhen clarifie...,0
3,2,2,5,"""你不来深圳，早晚你儿子也要来""，不出10年深圳人均GDP将超香港",去年深圳GDP首超香港？深圳统计局辟谣：还差611亿,"If you do not come to Shenzhen, sooner or late...",Shenzhen's GDP topped Hong Kong last year Shen...,0
4,9,6,7,"""用大蒜鉴别地沟油的方法,怎么鉴别地沟油",吃了30年食用油才知道，一片大蒜轻松鉴别地沟油,How to discriminate oil from gutter oil by mea...,It took 30 years of cooking oil to know that o...,1


Unnamed: 0,id,tid1,tid2,title1_zh,title2_zh,title1_en,title2_en
0,321187,167562,59521,萨拉赫人气爆棚!埃及总统大选未参选获百万选票 现任总统压力山大,辟谣！里昂官方否认费基尔加盟利物浦，难道是价格没谈拢？,There are two new old-age insurance benefits f...,Police disprove bird's nest congress each pers...
1,321190,167564,91315,萨达姆被捕后告诫美国的一句话，发人深思,10大最让美国人相信的荒诞谣言，如蜥蜴人掌控着美国,"If you do not come to Shenzhen, sooner or late...",Shenzhen's GDP outstrips Hong Kong Shenzhen St...
2,321189,167563,167564,萨达姆此项计划没有此国破坏的话，美国还会对伊拉克发动战争吗,萨达姆被捕后告诫美国的一句话，发人深思,"If you do not come to Shenzhen, sooner or late...",The GDP overtopped Hong Kong Shenzhen clarifie...
3,321193,167564,160994,萨达姆被捕后告诫美国的一句话，发人深思,被绞刑处死的萨达姆是替身？他的此男人举动击破替身谣言！,"If you do not come to Shenzhen, sooner or late...",Shenzhen's GDP topped Hong Kong last year Shen...
4,321191,167564,15084,萨达姆被捕后告诫美国的一句话，发人深思,中国川贝枇杷膏在美国受到热捧？纯属谣言！,How to discriminate oil from gutter oil by mea...,It took 30 years of cooking oil to know that o...


Unnamed: 0,Id,Category
0,347448,unrelated
1,347449,unrelated
2,359100,unrelated
3,359101,unrelated
4,359102,unrelated


In [None]:
c = Counter(train_df["title2_en"])
c

In [3]:
train_t1 = train_df["title1_en"]
train_t2 = train_df["title2_en"]

test_t1 = test_df["title1_en"]
test_t2 = test_df["title2_en"]

label = train_df["label"]


word_to_ix = {}
for title1, title2 in zip(tqdm(train_t1), train_t2):
    for word in title1.split():
        if word not in word_to_ix.keys():
            word_to_ix[word] = len(word_to_ix)+1
    for word in title2.split():
        if word not in word_to_ix.keys():
            word_to_ix[word] = len(word_to_ix)+1
            
            
for title1, title2 in zip(tqdm(test_t1), test_t2):
    for word in title1.split():
        if word not in word_to_ix.keys():
            word_to_ix[word] = len(word_to_ix)+1
    for word in title2.split():
        if word not in word_to_ix.keys():
            word_to_ix[word] = len(word_to_ix)+1

len(word_to_ix)

HBox(children=(IntProgress(value=0, max=320552), HTML(value='')))




HBox(children=(IntProgress(value=0, max=80126), HTML(value='')))




99853

In [22]:
# Dataset
class TitleDataset(Dataset):
    """Face Landmarks dataset."""
 
    def __init__(self, titles1, titles2, labels, dic=None, transform=None, seq_length=50, if_test=False):

        self.titles1 = titles1
        self.titles2 = titles2
        self.labels = labels
        self.transform = transform
        self.dic=dic
        self.seq_length=seq_length
        self.if_test=if_test
 
    def __len__(self):
        return len(self.titles1)
 
    def __getitem__(self, idx):
        title1 = self.titles1[idx]
        title2 = self.titles2[idx]
        
        if self.if_test:
            # dummy label
            label = title1
        else:
            label = torch.tensor(self.labels[idx])

        sample = {'t1': title1, 't2': title2, 'label': label}
 
        if self.transform:
            sample = self.transform(sample, self.dic, max_seq_length=self.seq_length)
 
        return sample


class Toidx(object):
    def __call__(self, sample, word_to_idx, max_seq_length=50):
    
        def prepare_sequence(seq, to_ix):
            #zero padding and word--->ix in seq.
            idxs = [to_ix[w] for w in seq.split()]
            if len(idxs) > max_seq_length:
                idxs = idxs[:max_seq_length] 
            else:
                idxs += [0] * (max_seq_length - len(idxs))
            return torch.tensor(idxs, dtype=torch.long)
        
        t1, t2, label = sample['t1'], sample['t2'], sample["label"]
        return {'t1': prepare_sequence(t1, word_to_idx), 't2': prepare_sequence(t2, word_to_idx), 'label': label}



In [5]:
class LSTM_Classifier(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, target_size=3, seq_length=50):
        super(LSTM_Classifier, self).__init__()
        self.hidden_dim = hidden_dim
        self.embedding_dim = embedding_dim

        self.word_embeddings = nn.Embedding(vocab_size+1, embedding_dim, padding_idx=0)


        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm1 = nn.LSTM(embedding_dim, hidden_dim, batch_first=False)
        self.lstm2 = nn.LSTM(embedding_dim, hidden_dim, batch_first=False)

        # The linear layer that maps from hidden state space to tag space
        self.fc1 = nn.Linear(hidden_dim*2, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, target_size)
        self.initial_hidden = self.init_hidden()
        
        
        self.seq_length=seq_length

    def init_hidden(self):
        # Before we've done anything, we dont have any hidden state.
        # Refer to the Pytorch documentation to see exactly
        # why they have this dimensionality.
        # The axes semantics are (num_layers, minibatch_size, hidden_dim)
        return (torch.zeros(1, 1, self.hidden_dim),
                torch.zeros(1, 1, self.hidden_dim))

    def forward(self, sentence1, sentence2):
        embeds1 = self.word_embeddings(sentence1)
        embeds2 = self.word_embeddings(sentence2)
        #print("embedding size:",embeds1.size(), len(sentence1))
        
        embeds1 = embeds1.view(self.seq_length, len(sentence1), self.embedding_dim)
        embeds2 = embeds2.view(self.seq_length, len(sentence1), self.embedding_dim)

        lstm_out1, self.hidden = self.lstm1(embeds1)#, self.initial_hidden)
        lstm_out2, self.hidden = self.lstm2(embeds2)#, self.initial_hidden)

        concat = torch.cat((lstm_out1[-1], lstm_out2[-1]), dim=1)
        #print("lstm out:", lstm_out1[-1].size())
        #print("concat:", concat.size())
        
        fc1 = F.relu(self.fc1(concat))
        fc2 = F.relu(self.fc2(fc1))
        
        class_scores = F.log_softmax(fc2)
        return class_scores

In [None]:
EMBEDDING_DIM = 256
HIDDEN_DIM = 128
max_seq_length = 50

title1_en = list(train_df["title1_en"])
title2_en = list(train_df["title2_en"])



train1_en, val1_en, train2_en, val2_en, y_train, y_val = train_test_split(title1_en, title2_en, y, test_size=0.2, random_state=42)
print("training data:{}, validation data:{}".format(len(y_train), len(y_val)))


device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device:",device)

model = LSTM_Classifier(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), target_size=3, seq_length=max_seq_length)
model.to(device)

loss_function = nn.NLLLoss()
#loss_function = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)


train_dataset = TitleDataset(train1_en, train2_en, y_train, dic=word_to_ix, transform=Toidx(), seq_length=max_seq_length)
val_dataset = TitleDataset(val1_en, val2_en, y_val, dic=word_to_ix, transform=Toidx(), seq_length=max_seq_length)


batch=64
train_loader = DataLoader(train_dataset, batch_size=batch, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=256, shuffle=False)


def train(epoch):
    model.train()

    for batch_idx, sample_batch in enumerate(train_loader):
        #print("batch_idx:",batch_idx)
        en_title1 = sample_batch["t1"].to(device)
        en_title2 = sample_batch["t2"].to(device)
        y = sample_batch["label"].to(device)
        
        optimizer.zero_grad()
        outputs = model(en_title1, en_title2)
        
        loss = loss_function(outputs, y)
        loss.backward()
        optimizer.step()
    print("epoch:{},train_loss:{:.4f}".format(epoch+1 ,loss))
            
    return model
    
        

def test():
    with torch.no_grad():
        model.eval()
        test_loss = 0
        correct = 0
        for batch_idx, sample_batch in enumerate(val_loader):
            en_title1 = sample_batch["t1"].to(device)
            en_title2 = sample_batch["t2"].to(device)
            y = sample_batch["label"].to(device)
            
            output = model(en_title1, en_title2)

            # sum up batch loss
            test_loss += loss_function(output, y).item()
            # get the index of the max log-probability
            pred = output.max(1, keepdim=True)[1]
            #print(pred.eq(y.view_as(pred)).sum().item())
            correct += pred.eq(y.view_as(pred)).sum().item()

        #test_loss /= len(val_loader.dataset)
        test_loss /= batch_idx
        print('Validation set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'
              .format(test_loss, correct, len(val_loader.dataset),
                      100. * correct / len(val_loader.dataset)))    
        return test_loss

        
def save_model(model, path="model/LSTM.model"):
    torch.save(model, path)  
        
lowest_loss = 1000000000
for epoch in range(100):
    #print(epoch+1)
    model = train(epoch)
    val_loss = test()
    
    if val_loss < lowest_loss:
        #print("saving model...")
        lowest_loss = val_loss
        save_model(model)

    


training data:256441, validation data:64111
device: cuda:0




epoch:1,train_loss:0.6205
Validation set: Average loss: 0.7169, Accuracy: 43938/64111 (69%)



  "type " + obj.__name__ + ". It won't be checked "


epoch:2,train_loss:0.6807
Validation set: Average loss: 0.7168, Accuracy: 43938/64111 (69%)

epoch:3,train_loss:0.7096
Validation set: Average loss: 0.7168, Accuracy: 43938/64111 (69%)

epoch:4,train_loss:0.6901
Validation set: Average loss: 0.7168, Accuracy: 43938/64111 (69%)

epoch:5,train_loss:0.7549
Validation set: Average loss: 0.7167, Accuracy: 43938/64111 (69%)

epoch:6,train_loss:0.7696
Validation set: Average loss: 0.7168, Accuracy: 43938/64111 (69%)

epoch:7,train_loss:0.6376
Validation set: Average loss: 0.7170, Accuracy: 43938/64111 (69%)

epoch:8,train_loss:0.6353
Validation set: Average loss: 0.7167, Accuracy: 43938/64111 (69%)

epoch:9,train_loss:0.7533
Validation set: Average loss: 0.7167, Accuracy: 43938/64111 (69%)

epoch:10,train_loss:0.6818
Validation set: Average loss: 0.7167, Accuracy: 43938/64111 (69%)

epoch:11,train_loss:0.6026
Validation set: Average loss: 0.7167, Accuracy: 43938/64111 (69%)

epoch:12,train_loss:0.7536
Validation set: Average loss: 0.7168, Acc

In [59]:
c = Counter(y_val)
c

Counter({1: 18482, 0: 43938, 2: 1691})

In [49]:
#推論
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


EMBEDDING_DIM = 256
HIDDEN_DIM = 128
max_seq_length = 50

model = LSTM_Classifier(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), target_size=3, seq_length=max_seq_length)

PATH = "model/LSTM.model"
model = torch.load(PATH)
model.eval()



title1_en_test = list(test_df["title1_en"])
title2_en_test = list(test_df["title2_en"])
id_ = test_df["id"]

# test dataset. label is None.
test_dataset = TitleDataset(title1_en_test, title2_en_test , None, dic=word_to_ix, transform=Toidx(), seq_length=max_seq_length, if_test=True)
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False)

with torch.no_grad():
    model.eval()
    predictions = []
    for sample_batch in tqdm(test_loader):
        en_title1 = sample_batch["t1"].to(device)
        en_title2 = sample_batch["t2"].to(device)
        output = model(en_title1, en_title2)
        
        pred = output.max(1, keepdim=True)[1].cpu()
        #print(output.cpu(), pred)
        predictions.extend(list(pred.numpy()))
        
#'unrelated', 0
#'agreed', 1
#'disagreed', 2

new_predictions = []
for p in predictions:
    if p == 0:
        new_predictions.append("unrelated")
    elif p==1:
        new_predictions.append("agreed")
    elif p==2:
        new_predictions.append("disagreed")        

HBox(children=(IntProgress(value=0, max=626), HTML(value='')))



In [51]:
c = Counter(new_predictions)
c

Counter({'unrelated': 80126})

In [50]:
submit_csv = pd.concat([id_, pd.Series(new_predictions)], axis=1)
#display(submit_csv)

submit_csv.columns = ["Id", "Category"]
submit_csv.to_csv("submit.csv", header=True, index=False)
submit = pd.read_csv("submit.csv")
submit

Unnamed: 0,Id,Category
0,321187,unrelated
1,321190,unrelated
2,321189,unrelated
3,321193,unrelated
4,321191,unrelated
5,321194,unrelated
6,321192,unrelated
7,321197,unrelated
8,321195,unrelated
9,321199,unrelated


In [27]:
sub

Unnamed: 0,Id,Category
0,347448,unrelated
1,347449,unrelated
2,359100,unrelated
3,359101,unrelated
4,359102,unrelated
5,359103,unrelated
6,359104,unrelated
7,359105,unrelated
8,359106,unrelated
9,359107,unrelated
