In [17]:
import pandas as pd
import numpy as np
from collections import Counter
from tqdm import tqdm_notebook as tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import train_test_split

import re

In [None]:
train_df = pd.read_csv("data/train.csv")
test_df = pd.read_csv("data/test.csv")
sub = pd.read_csv("data/sample_submission.csv")

def clean_str(series):
    series = series.str.lower()
    #print(series.head())
    #clean = lambda x: x.replace('""', '').replace('"', '').replace("!", "").replace("?", "").replace("'s", "").replace(",", "").replace(".", "").replace("#", "").replace("→", "").replace("'re'", " are").replace("< / i >", "").replace("< i >", "")
    
    def clean_seq(seq):
        seq = re.sub(r'[,."'']+', '', seq)
        seq = re.sub(r'[0-9]+', '<NUM>', seq) 
        return seq
    
    series = series.apply(clean_seq)
    #print(series.head())
    return series


train_df["title1_en"] = clean_str(train_df["title1_en"])
train_df["title2_en"] = clean_str(train_df["title2_en"])

test_df["title1_en"] = clean_str(test_df["title1_en"])
test_df["title2_en"] = clean_str(test_df["title2_en"])

train_df.replace('unrelated', 0, inplace=True)
train_df.replace('agreed', 1, inplace=True)
train_df.replace('disagreed', 2, inplace=True)

y = list(train_df["label"])

display(train_df.head()) 
display(test_df.head())
#display(sub.head())

In [21]:
train_df["title1_en"][:100]

0     there are two new old-age insurance benefits f...
1     "if you do not come to shenzhen, sooner or lat...
2     "if you do not come to shenzhen, sooner or lat...
3     "if you do not come to shenzhen, sooner or lat...
4     "how to discriminate oil from gutter oil by me...
5     "if you do not come to shenzhen, sooner or lat...
6     "if you eat durian, you will kill yourself if ...
7     "if you do not come to shenzhen, sooner or lat...
8     "frog frog? it's a fertility test! let's play"...
9     "how to discriminate oil from gutter oil by me...
10    "how to discriminate oil from gutter oil by me...
11    "how to discriminate oil from gutter oil by me...
12    "how to discriminate oil from gutter oil by me...
13    "how to discriminate oil from gutter oil by me...
14    "the plane is about to take off. a man knelt d...
15    "the plane is about to take off. a man knelt d...
16       "men kneel at the hatch!" all because of love!
17    "the plane is about to take off. a man kne

In [88]:
id1_train = set(train_df["tid1"])
id2_train = set(train_df["tid2"])
id1_test = set(test_df["tid1"])
id2_test = set(test_df["tid2"])

id1_train & id1_test, len(id2_train & id2_test)

({167562, 167563}, 34148)

In [83]:
len(agree_dic)

69170

In [66]:
c = Counter( train_df["title1_en"])
c

Counter({'there are two new old-age insurance benefits for old people in rural areas have you got them': 1,
         'if you do not come to shenzhen sooner or later your son will also come in less than 10 years shenzhen per capita gdp will exceed hong kong': 5,
         'how to discriminate oil from gutter oil by means of garlic': 6,
         'if you eat durian you will kill yourself if you eat it wrongly': 1,
         'frog frog it a fertility test let play jewel v ': 1,
         'the plane is about to take off a man knelt down at the hatch that was the tear-jerking scene today': 2,
         'the plane is about to take off a man knelt down at the hatch that was the tear-jerking scene': 1,
         'men kneel at the hatch all because of love': 1,
         'the plane is about to take off a man knelt down at the hatch this is the most tear-jerking scene': 1,
         'do you know how harmful it is to drink alcohol when children drink': 3,
         ' farmer  how much per acre per acre for

In [59]:
c = Counter( train_df["title2_en"])
c

Counter({'police disprove bird nest congress each person gets 50 ,000 yuan still old people insist on going to beijing': 5,
         'shenzhen gdp outstrips hong kong shenzhen statistics bureau dismisses rumors: only the gap is narrowing': 4,
         'the gdp overtopped hong kong shenzhen clarified: a little bit more': 4,
         'shenzhen gdp topped hong kong last year shenzhen bureau of statistics refutes rumors: 611 billion': 4,
         'it took 30 years of cooking oil to know that one piece of garlic is easy to spot': 2,
         'shenzhen gdp overtakes hong kong bureau of statistics refutes rumor: unsurpass but the gap shrinks again': 4,
         "durian can't eat with anything , it the same as coffee , it heart disease ": 2,
         'shenzhen gdp outpaces hong kong defending rumors: the gap has narrowed yet again': 4,
         'a store in xianning contains cotton a multi-agency association in chongyang university': 3,
         'a single piece of garlic can spot gutter oil com

### 単語辞書の作成

In [3]:


train_t1 = train_df["title1_en"]
train_t2 = train_df["title2_en"]

test_t1 = test_df["title1_en"]
test_t2 = test_df["title2_en"]

label = train_df["label"]


word_to_ix = {}
for title1, title2 in zip(tqdm(train_t1), train_t2):
    for word in title1.split():
        if word not in word_to_ix.keys():
            word_to_ix[word] = len(word_to_ix)+1
    for word in title2.split():
        if word not in word_to_ix.keys():
            word_to_ix[word] = len(word_to_ix)+1
            
            
for title1, title2 in zip(tqdm(test_t1), test_t2):
    for word in title1.split():
        if word not in word_to_ix.keys():
            word_to_ix[word] = len(word_to_ix)+1
    for word in title2.split():
        if word not in word_to_ix.keys():
            word_to_ix[word] = len(word_to_ix)+1

len(word_to_ix)

HBox(children=(IntProgress(value=0, max=320552), HTML(value='')))




HBox(children=(IntProgress(value=0, max=80126), HTML(value='')))




59645

In [4]:
# Dataset
class TitleDataset(Dataset):
    """Face Landmarks dataset."""
 
    def __init__(self, titles1, titles2, labels, dic=None, transform=None, seq_length=50, if_test=False):

        self.titles1 = titles1
        self.titles2 = titles2
        self.labels = labels
        self.transform = transform
        self.dic=dic
        self.seq_length=seq_length
        self.if_test=if_test
 
    def __len__(self):
        return len(self.titles1)
 
    def __getitem__(self, idx):
        title1 = self.titles1[idx]
        title2 = self.titles2[idx]
        
        if self.if_test:
            # dummy label
            label = title1
        else:
            label = torch.tensor(self.labels[idx])

        sample = {'t1': title1, 't2': title2, 'label': label}
 
        if self.transform:
            sample = self.transform(sample, self.dic, max_seq_length=self.seq_length)
 
        return sample


class Toidx(object):
    def __call__(self, sample, word_to_idx, max_seq_length=50):
    
        def prepare_sequence(seq, to_ix):
            #zero padding and word--->ix in seq.
            idxs = [to_ix[w] for w in seq.split()]
            if len(idxs) > max_seq_length:
                idxs = idxs[:max_seq_length] 
            else:
                idxs += [0] * (max_seq_length - len(idxs))
            return torch.tensor(idxs, dtype=torch.long)
        
        t1, t2, label = sample['t1'], sample['t2'], sample["label"]
        return {'t1': prepare_sequence(t1, word_to_idx), 't2': prepare_sequence(t2, word_to_idx), 'label': label}



In [5]:
class LSTM_Classifier(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, target_size=3, seq_length=50):
        super(LSTM_Classifier, self).__init__()
        self.hidden_dim = hidden_dim
        self.embedding_dim = embedding_dim

        self.word_embeddings = nn.Embedding(vocab_size+1, embedding_dim, padding_idx=0)


        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm1 = nn.LSTM(embedding_dim, hidden_dim, batch_first=False, num_layers=1)
        self.lstm2 = nn.LSTM(embedding_dim, hidden_dim, batch_first=False, num_layers=1)

        # The linear layer that maps from hidden state space to tag space
        self.fc1 = nn.Linear(hidden_dim*2, hidden_dim*2)
        self.fc2 = nn.Linear(hidden_dim*2, target_size)
        self.initial_hidden = self.init_hidden()
        
        
        self.seq_length=seq_length

    def init_hidden(self):
        # Before we've done anything, we dont have any hidden state.
        # Refer to the Pytorch documentation to see exactly
        # why they have this dimensionality.
        # The axes semantics are (num_layers, minibatch_size, hidden_dim)
        return (torch.zeros(1, 1, self.hidden_dim),
                torch.zeros(1, 1, self.hidden_dim))

    def forward(self, sentence1, sentence2):
        embeds1 = self.word_embeddings(sentence1)
        embeds2 = self.word_embeddings(sentence2)
        #print("embedding size:",embeds1.size(), len(sentence1))
        
        embeds1 = embeds1.view(self.seq_length, len(sentence1), self.embedding_dim)
        embeds2 = embeds2.view(self.seq_length, len(sentence1), self.embedding_dim)

        lstm_out1, self.hidden = self.lstm1(embeds1)#, self.initial_hidden)
        lstm_out2, self.hidden = self.lstm2(embeds2)#, self.initial_hidden)

        concat = torch.cat((lstm_out1[-1], lstm_out2[-1]), dim=1)
        #print("lstm out:", lstm_out1[-1].size())
        #print("concat:", concat.size())
        
        fc1 = F.relu(self.fc1(concat))
        fc2 = self.fc2(fc1)
        
        #class_scores = F.log_softmax(fc2)
        return fc2#class_scores

In [7]:
class MLP_Classifier(nn.Module):

    def __init__(self, embedding_dim, vocab_size, target_size=3, seq_length=50):
        super(MLP_Classifier, self).__init__()
        self.embedding_dim = embedding_dim
        self.word_embeddings = nn.Embedding(vocab_size+1, embedding_dim, padding_idx=0)


        # The linear layer that maps from hidden state space to tag space
        self.fc1 = nn.Linear(embedding_dim*2, embedding_dim*2)
        self.fc1_bn = nn.BatchNorm1d(embedding_dim*2)
        self.fc1_drop = nn.Dropout(p=0.5, inplace=False)
        
        self.fc2 = nn.Linear(embedding_dim*2, target_size)        
        
        self.seq_length=seq_length

    def forward(self, sentence1, sentence2):
        embeds1 = self.word_embeddings(sentence1)
        embeds1 = torch.sum(embeds1, 1)
        #print("embed", embeds1.size())
        
        
        embeds2 = self.word_embeddings(sentence2)
        embeds2 = torch.sum(embeds2, 1)

        #print("embedding size:",embeds1.size(), len(sentence1))
        
        #embeds1 = embeds1.view(self.seq_length, len(sentence1), self.embedding_dim)
        #embeds2 = embeds2.view(self.seq_length, len(sentence1), self.embedding_dim)

        concat = torch.cat((embeds1, embeds2), dim=1)
        #print("concat:", concat.size())
        
        fc1 = self.fc1_drop(F.relu(self.fc1_bn(self.fc1(concat))))
        fc2 = self.fc2(fc1)
        
        #class_scores = F.log_softmax(fc2)
        return fc2#class_scores

In [19]:
def make_weights_for_balanced_classes(labels, nclasses):                        
    count = [0] * nclasses                                                      
    for label in labels:                                                         
        count[label] += 1                                                     
    weight_per_class = [0.] * nclasses                                      
    N = float(sum(count))                                                   
    for i in range(nclasses):                                                   
        weight_per_class[i] = N/float(count[i])                                 
    weight = [0] * len(labels)                                              
    for idx, lab in enumerate(labels):                                          
        weight[idx] = weight_per_class[lab]                                  
    return weight              

In [None]:

agree_dic = {}
disagree_dic = {}

#initialize dic
for id1 in tid1:
    agree_dic[id1] = []
    disagree_dic[id1] = []
    
for idx, id1 in enumerate(tqdm(tid1)):
    label = labels[idx]
    id2 = tid2[idx]
    if label == 1:
        agree_dic[id1].append(id2)
    elif label == 2:
        disagree_dic[id1].append(id2)

In [9]:
# Class weight gan be got as : n_samples / (n_classes * np.bincount(y))
# 不均衡データなので
c = Counter(y)
class_weight = []
for label, num in c.items():
    class_weight.append(len(y)/(3*num))    
class_weight = torch.FloatTensor(class_weight).to(device)
print("class weight:", class_weight)

class weight: tensor([ 0.4872,  1.1493, 12.9265], device='cuda:0')


In [11]:
EMBEDDING_DIM = 512
HIDDEN_DIM = 128
max_seq_length = 50

title1_en = list(train_df["title1_en"])
title2_en = list(train_df["title2_en"])
labels = list(train_df["label"])
tid1 = train_df["tid1"]
tid2 = train_df["tid2"]


#weights = make_weights_for_balanced_classes(labels, 3)       
#weights = torch.DoubleTensor(weights)                                       
#sampler = torch.utils.data.sampler.WeightedRandomSampler(weights, len(weights))       


train1_en, val1_en, train2_en, val2_en, y_train, y_val = train_test_split(title1_en, title2_en, y, test_size=0.2, random_state=42)
print("training data:{}, validation data:{}".format(len(y_train), len(y_val)))

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device:",device)

#model = LSTM_Classifier(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), target_size=3, seq_length=max_seq_length)
model = MLP_Classifier(EMBEDDING_DIM, len(word_to_ix), target_size=3, seq_length=max_seq_length)

model.to(device)


#loss_function = nn.NLLLoss()
loss_function = nn.CrossEntropyLoss(weight=class_weight)
#optimizer = optim.SGD(model.parameters(), lr=0.001)
optimizer = optim.Adam(model.parameters(), lr=0.01)


train_dataset = TitleDataset(train1_en, train2_en, y_train, dic=word_to_ix, transform=Toidx(), seq_length=max_seq_length)

val_dataset = TitleDataset(val1_en, val2_en, y_val, dic=word_to_ix, transform=Toidx(), seq_length=max_seq_length)


batch=256
train_loader = DataLoader(train_dataset, batch_size=batch, shuffle=True)#, sampler = sampler, pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size=512, shuffle=False)


def train(epoch):
    model.train()

    for batch_idx, sample_batch in enumerate(train_loader):
        #print("batch_idx:",batch_idx)
        en_title1 = sample_batch["t1"].to(device)
        en_title2 = sample_batch["t2"].to(device)
        y = sample_batch["label"].to(device)
        
        optimizer.zero_grad()
        outputs = model(en_title1, en_title2)
        
        loss = loss_function(outputs, y)
        loss.backward()
        optimizer.step()
        
        
        #optimizer.zero_grad()
        #outputs = model(en_title2, en_title1)
        
        #loss = loss_function(outputs, y)
        #loss.backward()
        #optimizer.step()

    print("epoch:{},train_loss:{:.4f}".format(epoch+1 ,loss))
            
    return model
    
        

def test():
    with torch.no_grad():
        model.eval()
        test_loss = 0
        correct = 0
        
        for batch_idx, sample_batch in enumerate(val_loader):
            en_title1 = sample_batch["t1"].to(device)
            en_title2 = sample_batch["t2"].to(device)
            y = sample_batch["label"].to(device)
            
            output = model(en_title1, en_title2)

            # sum up batch loss
            test_loss += loss_function(output, y).item()
            # get the index of the max log-probability
            pred = output.max(1, keepdim=True)[1]
            #print(pred.eq(y.view_as(pred)).sum().item())
            correct += pred.eq(y.view_as(pred)).sum().item()

        #test_loss /= len(val_loader.dataset)
        test_loss /= batch_idx
        accuracy = 100. * correct / len(val_loader.dataset)
        print('Validation set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'
              .format(test_loss, correct, len(val_loader.dataset),
                      accuracy))    

        return test_loss, accuracy

        
def save_model(model, path="model/MLP.model"):
    torch.save(model, path)  
        
lowest_loss = 1000000000
highest_accuracy = 0
for epoch in range(100):
    #print(epoch+1)
    model = train(epoch)
    val_loss, accuracy = test()

#     if val_loss < lowest_loss:
#         lowest_loss = val_loss
#         save_model(model)

        
    if accuracy > highest_accuracy:
        print("saving model...")
        highest_accuracy = accuracy
        save_model(model)

    

training data:256441, validation data:64111
device: cuda:0
epoch:1,train_loss:0.7302
Validation set: Average loss: 0.5103, Accuracy: 45806/64111 (71%)



  "type " + obj.__name__ + ". It won't be checked "


epoch:2,train_loss:0.4484
Validation set: Average loss: 0.5175, Accuracy: 48035/64111 (75%)

epoch:3,train_loss:0.2458
Validation set: Average loss: 0.5971, Accuracy: 49267/64111 (77%)

epoch:4,train_loss:0.3856
Validation set: Average loss: 0.7100, Accuracy: 51093/64111 (80%)

epoch:5,train_loss:0.2704
Validation set: Average loss: 0.8168, Accuracy: 51496/64111 (80%)

epoch:6,train_loss:0.2016
Validation set: Average loss: 0.9057, Accuracy: 51975/64111 (81%)

epoch:7,train_loss:0.1895
Validation set: Average loss: 1.1602, Accuracy: 52472/64111 (82%)

epoch:8,train_loss:0.2409
Validation set: Average loss: 1.2581, Accuracy: 53263/64111 (83%)

epoch:9,train_loss:0.2250
Validation set: Average loss: 1.2931, Accuracy: 52813/64111 (82%)



KeyboardInterrupt: 

In [12]:
epoch, lowest_loss, highest_accuracy

(9, 1000000000, 83.07934675796665)

In [13]:
#推論
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


EMBEDDING_DIM = 512
HIDDEN_DIM = 128
max_seq_length = 50

#model = LSTM_Classifier(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), target_size=3, seq_length=max_seq_length)
model = MLP_Classifier(EMBEDDING_DIM, len(word_to_ix), target_size=3, seq_length=max_seq_length)

#PATH = "model/LSTM.model"
PATH = "model/MLP.model"
model = torch.load(PATH)
print("model loaded.")


title1_en_test = list(test_df["title1_en"])
title2_en_test = list(test_df["title2_en"])
id_ = test_df["id"]

# test dataset. label is None.
test_dataset = TitleDataset(title1_en_test, title2_en_test , None, dic=word_to_ix, transform=Toidx(), seq_length=max_seq_length, if_test=True)
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False)

with torch.no_grad():
    model.eval()
    predictions = []
    for sample_batch in tqdm(test_loader):
        en_title1 = sample_batch["t1"].to(device)
        en_title2 = sample_batch["t2"].to(device)
        output = model(en_title1, en_title2)
        
        pred = output.max(1, keepdim=True)[1].cpu()
        #print(output.cpu(), pred)
        predictions.extend(list(pred.numpy()))
        
#'unrelated', 0
#'agreed', 1
#'disagreed', 2

new_predictions = []
for p in predictions:
    if p == 0:
        new_predictions.append("unrelated")
    elif p==1:
        new_predictions.append("agreed")
    elif p==2:
        new_predictions.append("disagreed")        

model loaded.


HBox(children=(IntProgress(value=0, max=626), HTML(value='')))




In [14]:
c = Counter(new_predictions)
c

Counter({'unrelated': 50790, 'agreed': 25350, 'disagreed': 3986})

In [15]:
submit_csv = pd.concat([id_, pd.Series(new_predictions)], axis=1)
#display(submit_csv)

submit_csv.columns = ["Id", "Category"]
submit_csv.to_csv("submit.csv", header=True, index=False)
submit = pd.read_csv("submit.csv")
submit

Unnamed: 0,Id,Category
0,321187,unrelated
1,321190,unrelated
2,321189,unrelated
3,321193,unrelated
4,321191,unrelated
5,321194,unrelated
6,321192,unrelated
7,321197,agreed
8,321195,unrelated
9,321199,unrelated


In [27]:
sub

Unnamed: 0,Id,Category
0,347448,unrelated
1,347449,unrelated
2,359100,unrelated
3,359101,unrelated
4,359102,unrelated
5,359103,unrelated
6,359104,unrelated
7,359105,unrelated
8,359106,unrelated
9,359107,unrelated
