In [1]:
import pandas as pd
import numpy as np
import regex as re
from sklearn.model_selection import train_test_split
from collections import Counter
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.nn.utils.rnn import pack_padded_sequence,pad_packed_sequence
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
import seaborn as sns

In [2]:
train_file = pd.read_csv('../data//train.csv')
final_test_file = pd.read_csv('../data/test.csv')


In [3]:
train_file.shape[0]

256442

In [4]:
data_per_class=Counter(train_file['label'])
print("Number of rows grouped by label are:")
for i,label in enumerate(data_per_class):
    print(
        f"{label} class rows: {data_per_class[label]} ({data_per_class[label] / train_file.shape[0]*100:.3f})% of total train data")



Number of rows grouped by label are:
unrelated class rows: 175598 (68.475)% of total train data
agreed class rows: 74238 (28.949)% of total train data
disagreed class rows: 6606 (2.576)% of total train data


In [5]:
label_encoding = {"unrelated":0,"agreed":1,"disagreed":2}

# preprocessing to remove special chars and convert text to lowercase.
def preprocessing(txt):
    txt = re.sub('[^a-zA-Z0-9 ]', '', txt)
    txt = txt.lower()
    return txt

# convert text labels to numbers
def convert_labels(txt):
    return label_encoding[txt]


title1_en=train_file['title1_en'].apply(preprocessing)
title2_en = train_file['title2_en'].apply(preprocessing)
train_file['title1_en']=title1_en
train_file['title2_en']=title2_en

title1_en=final_test_file['title1_en'].apply(preprocessing)
title2_en = final_test_file['title2_en'].apply(preprocessing)
final_test_file['title1_en']=title1_en
final_test_file['title2_en']=title2_en

labels=train_file['label'].apply(convert_labels)
train_file['label']=labels

In [4]:
import pytorch_transformers

In [6]:
print(list(dir(pytorch_transformers)))


['AdamW', 'AutoConfig', 'AutoModel', 'AutoModelForQuestionAnswering', 'AutoModelForSequenceClassification', 'AutoModelWithLMHead', 'AutoTokenizer', 'BERT_PRETRAINED_CONFIG_ARCHIVE_MAP', 'BERT_PRETRAINED_MODEL_ARCHIVE_MAP', 'BasicTokenizer', 'BertConfig', 'BertForMaskedLM', 'BertForMultipleChoice', 'BertForNextSentencePrediction', 'BertForPreTraining', 'BertForQuestionAnswering', 'BertForSequenceClassification', 'BertForTokenClassification', 'BertModel', 'BertPreTrainedModel', 'BertTokenizer', 'CONFIG_NAME', 'ConstantLRSchedule', 'Conv1D', 'DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP', 'DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP', 'DistilBertConfig', 'DistilBertForMaskedLM', 'DistilBertForQuestionAnswering', 'DistilBertForSequenceClassification', 'DistilBertModel', 'DistilBertTokenizer', 'GPT2Config', 'GPT2DoubleHeadsModel', 'GPT2LMHeadModel', 'GPT2Model', 'GPT2PreTrainedModel', 'GPT2Tokenizer', 'GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP', 'GPT2_PRETRAINED_MODEL_ARCHIVE_MAP', 'OPENAI_GPT_PRETRAINED_

In [6]:
from transformers import pipeline

In [52]:
translator_en_fr = pipeline("translation", model="Helsinki-NLP/opus-mt-en-fr")
translator_fr_en = pipeline("translation", model="Helsinki-NLP/opus-mt-fr-en")
translator_en_de = pipeline("translation", model="Helsinki-NLP/opus-mt-en-de")
translator_de_en = pipeline("translation", model="Helsinki-NLP/opus-mt-de-en")

Downloading: 100%|██████████| 1.30k/1.30k [00:00<00:00, 668kB/s]
Downloading: 100%|██████████| 284M/284M [00:11<00:00, 25.8MB/s] 
Downloading: 100%|██████████| 42.0/42.0 [00:00<00:00, 41.8kB/s]
Downloading: 100%|██████████| 750k/750k [00:00<00:00, 3.50MB/s]
Downloading: 100%|██████████| 778k/778k [00:00<00:00, 3.77MB/s]
Downloading: 100%|██████████| 1.21M/1.21M [00:00<00:00, 5.47MB/s]
Downloading: 100%|██████████| 1.11k/1.11k [00:00<00:00, 557kB/s]
Downloading: 100%|██████████| 284M/284M [00:09<00:00, 30.4MB/s] 
Downloading: 100%|██████████| 42.0/42.0 [00:00<00:00, 10.5kB/s]
Downloading: 100%|██████████| 778k/778k [00:00<00:00, 4.53MB/s]
Downloading: 100%|██████████| 750k/750k [00:00<00:00, 4.87MB/s]
Downloading: 100%|██████████| 1.21M/1.21M [00:00<00:00, 5.93MB/s]


In [41]:
train_file['title1_en'][0:2].to_list()


['there are two new oldage insurance benefits for old people in rural areas have you got them',
 'if you do not come to shenzhen sooner or later your son will also come in less than 10 years shenzhen per capita gdp will exceed hong kong']

In [54]:
print(train_file['title1_en'][0:2].to_list())

en_fr = translator_en_fr(train_file['title1_en'][0:2].to_list())
trans_en_fr = []
[trans_en_fr.append(x['translation_text']) for x in en_fr]

fr_en = translator_fr_en(trans_en_fr)
trans_fr_en=[]
[trans_fr_en.append(x['translation_text']) for x in fr_en]

en_de = translator_en_de(train_file['title1_en'][0:2].to_list())
trans_en_de = []
[trans_en_de.append(x['translation_text']) for x in en_de]

de_en = translator_de_en(trans_en_de)
trans_de_en = []
[trans_de_en.append(x['translation_text']) for x in de_en]


print(trans_en_fr)
print(trans_fr_en)
print(trans_en_de)
print(trans_de_en)


['there are two new oldage insurance benefits for old people in rural areas have you got them', 'if you do not come to shenzhen sooner or later your son will also come in less than 10 years shenzhen per capita gdp will exceed hong kong']
["Il y a deux nouvelles prestations d'assurance vieillesse pour les personnes âgées dans les zones rurales.", 'si vous ne venez pas à Shenzhen tôt ou tard votre fils viendra aussi dans moins de 10 ans Shenzhen par habitant gdp dépassera hong kong']
['There are two new old-age insurance benefits for the elderly in rural areas.', "if you don't come to Shenzhen sooner or later your son will also come in less than 10 years Shenzhen per inhabitant gdp will exceed Hong Kong"]
['Es gibt zwei neue Altersversicherung Leistungen für alte Menschen in ländlichen Gebieten haben Sie sie', 'wenn Sie nicht früher oder später nach Shenzhen kommen, wird Ihr Sohn auch in weniger als 10 Jahren Shenzhen pro Kopf gdp werden hong kong überschreiten']
['There are two new old 

In [33]:
ccc=Counter(train_file['title1_en'])


In [34]:
len(ccc)


62648

In [35]:
no_c=0
new_dict={}
for i,j in enumerate(ccc):
    if(ccc[j] > 1):
        new_dict[j] = ccc[j]

        no_c+=1



In [38]:
no_c
len(new_dict)
new_dict


{'if you do not come to shenzhen sooner or later your son will also come in less than 10 years shenzhen per capita gdp will exceed hong kong': 4,
 'how to discriminate oil from gutter oil by means of garlic': 3,
 'it took 30 years of cooking oil to know that one piece of garlic is easy to spot': 4,
 'a single piece of garlic can spot gutter oil come on do the following to keep you out of the gutter oil': 3,
 'its very practical to use a single piece of garlic to distinguish oil from oil ': 2,
 'do you know how harmful it is to drink alcohol when children drink': 2,
 'note to parents this hidden trouble must be removed dont let it endanger the childs health': 11,
 ' farmer  how much per acre per acre for farmers in yaozhou district tongchuan city': 2,
 'a thousand dollars per acre the farmers here are happy': 4,
 ' showdown   its a good day to start with breakfast  the sequel theres a lot of lights a little black a little white': 17,
 'do not spend money to eat a black meat skin is less

In [6]:
import spacy
tokenizer = spacy.load('en_core_web_sm')
def tokenize(text):
    return [token.text for token in tokenizer.tokenizer(text)]


counts = Counter()
for index, row in train_file[['title1_en','title2_en']].iterrows():
    # print()
    counts.update(tokenize(row['title1_en']+" "+row['title2_en']))

# deleted_words={}
# print(len(counts.keys()))
# for keys in list(counts):
#     if counts[keys]<5:
#         deleted_words[keys] = counts[keys]
#         del counts[keys]
# print(len(counts.keys()))
# print(deleted_words)

vocab2index = {"":0, "UNK":1}
words = ["", "UNK"]
for word in counts:
    vocab2index[word] = len(words)
    words.append(word)


def encode_sentence(text, vocab2index, max_length=90):
    tokenized = tokenize(text)
    encoded = np.zeros(max_length, dtype=int)
    enc1 = np.array([vocab2index.get(word, vocab2index["UNK"])
                    for word in tokenized])
    length = min(max_length, len(enc1))
    encoded[:length] = enc1[:length]
    return encoded, length


In [7]:
sample_sentence = train_file[['title1_en','title2_en','label']]
print(sample_sentence['title1_en'][0]+" - "+ sample_sentence['title2_en'][0]+"--"+str(sample_sentence['label'][0]))
print(len(sample_sentence['title1_en'][0]+" - " +
      sample_sentence['title2_en'][0]+"--"+str(sample_sentence['label'][0])))

sample_sentence = sample_sentence['title1_en'][0]+" "+ sample_sentence['title2_en'][0]
print(encode_sentence(sample_sentence, vocab2index))
sample_sentence2="old people oldage for rural there police"
print(encode_sentence(sample_sentence2, vocab2index))


there are two new oldage insurance benefits for old people in rural areas have you got them - police disprove birds nest congress each person gets 50000 yuan still old people insist on going to beijing--0
204
(array([ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
       19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 10, 11, 30, 31, 32, 33,
       34,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0]), 35)
(array([10, 11,  6,  9, 13,  2, 19,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
      

In [8]:
def join_encode_text(text):
    # print(type(text))
   combibed_text=(text['title1_en']+" "+text['title2_en'])
   return encode_sentence(combibed_text, vocab2index)[0]



train_file[['title1_en', 'title2_en']].apply(join_encode_text,axis=1)
train_file['encoded_titles'] = train_file[['title1_en', 'title2_en']].apply(join_encode_text, axis=1)
# train_file.head()


In [9]:
train_file.head(2)

Unnamed: 0,id,tid1,tid2,title1_en,title2_en,label,encoded_titles
0,195611,0,1,there are two new oldage insurance benefits fo...,police disprove birds nest congress each perso...,0,"[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1..."
1,191474,2,3,if you do not come to shenzhen sooner or later...,shenzhens gdp outstrips hong kong shenzhen sta...,0,"[35, 16, 36, 37, 38, 33, 39, 40, 41, 42, 43, 4..."


In [10]:
test_split_size=0.3 # percentage of test split
train_split,test_split=train_test_split(train_file, test_size=test_split_size,stratify=train_file['label'],random_state=9)


In [11]:
# To be removed
data_per_class = Counter(train_split['label'])
print("Number of rows grouped by label are:")
for i, label in enumerate(data_per_class):
    print(
        f"{label} class rows: {data_per_class[label]} ({data_per_class[label] / train_split.shape[0]*100:.3f})% of total train data")

data_per_class = Counter(test_split['label'])
print("Number of rows grouped by label are:")
for i, label in enumerate(data_per_class):
    print(
        f"{label} class rows: {data_per_class[label]} ({data_per_class[label] / test_split.shape[0]*100:.3f})% of total train data")

Number of rows grouped by label are:
0 class rows: 122918 (68.475)% of total train data
1 class rows: 51967 (28.950)% of total train data
2 class rows: 4624 (2.576)% of total train data
Number of rows grouped by label are:
0 class rows: 52680 (68.475)% of total train data
2 class rows: 1982 (2.576)% of total train data
1 class rows: 22271 (28.949)% of total train data


In [12]:
# Checking if cuda supported GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [13]:
# Creating customised Image dataset to be used for dataloader
class CustomDataset(Dataset):
    def __init__(self, data_frame):
        self.data = data_frame['encoded_titles']
        self.label = data_frame['label']
        self.length=len(data_frame)

    def __len__(self):
        return self.length

    def __getitem__(self, idx):
        text_data = torch.tensor(self.data[idx])
        label = torch.tensor(self.label[idx])
        data=(text_data,label)
        return data
    

In [14]:
train_split.reset_index()


Unnamed: 0,index,id,tid1,tid2,title1_en,title2_en,label,encoded_titles
0,197132,240811,130541,86108,japan wants to blow up the three gorges dam,japan is threatening to blow up the three gorg...,0,"[812, 695, 33, 2553, 186, 64, 406, 14726, 3670..."
1,97152,13503,64765,65649,cctv is broadcast 60 under the arrest fine the...,the police have discredited rumors that the un...,0,"[5463, 66, 5456, 3513, 1603, 64, 1607, 3039, 6..."
2,98265,71106,65402,65408,8 00 krypton drops and ofo acquisitions are ne...,denial of future acquisition rumors tiger glob...,0,"[2041, 912, 17909, 511, 167, 14212, 14528, 3, ..."
3,60437,21970,41503,41504,aeroplane crash in shanghai hongqiao airport,aeroplane landing process in shanghai hongqiao...,0,"[23191, 4951, 12, 747, 21163, 2308, 23191, 570..."
4,22656,21585,15176,15182,wash it when you wash it even if you have gray...,eat the salt and its shampoo the white hair tu...,0,"[2700, 109, 219, 16, 2700, 109, 295, 35, 16, 1..."
...,...,...,...,...,...,...,...,...
179504,61816,166852,42519,42518,phone calls dont want to pick up and is too as...,phone calls dont want to pick up teach you to ...,1,"[598, 7209, 36, 134, 365, 33, 1230, 186, 167, ..."
179505,221247,179586,145652,12203,a professor at tsinghua university is tremulou...,dressed up for a long time can really become t...,0,"[70, 2709, 176, 7221, 158, 66, 47073, 9347, 51..."
179506,49199,130975,33620,33642,top 10 quotes they always have a second half o...,these ten famous quotes all have the latter ha...,1,"[559, 49, 8009, 832, 2530, 15, 70, 856, 2921, ..."
179507,57831,165342,39603,39609,wanda claims to wu jing did not expect that wa...,wang jialin sues wujing for 1 billion,1,"[9478, 1403, 33, 433, 6115, 289, 37, 293, 114,..."


In [15]:
train_split_dataset=CustomDataset(train_split.reset_index())
test_split_dataset=CustomDataset(test_split.reset_index())

In [16]:
batch=100
loaders = {
    'train' : DataLoader(train_split_dataset,batch_size=batch, shuffle=True,num_workers=0),
    'test'  : DataLoader(test_split_dataset,batch_size=batch,shuffle=True,num_workers=0),
}

In [17]:
a=loaders['train']

In [32]:
class LSTM_simple(nn.Module):
    def __init__(self,vocab_size,embed_dimension,lstm_units,hidden_dimension):
        super().__init__()
        self.embeddings=nn.Embedding(vocab_size,embed_dimension,padding_idx=0)
        self.lstm_layer1= nn.LSTM(embed_dimension,lstm_units,hidden_dimension,bidirectional=True,batch_first=True)
        self.dropout_layer=nn.Dropout(0.3)
        self.full_layer1 = nn.Linear(hidden_dimension,3)

    def forward(self, text):
        print(text.shape)
        print(text)
        embedded_text=self.embeddings(text)
        print(embedded_text.shape)
        print(embedded_text)
        print(self.lstm_layer1(embedded_text))
        embedded_text = self.dropout_layer(embedded_text)
        lstm_out, (ht, ct) = self.lstm_layer1(embedded_text)
        
        
        # x=self.full_layer1(x)

        return self.full_layer1(ht[-1])



In [19]:
len(vocab2index)


49491

In [20]:
LSTM_simple_model=LSTM_simple(len(vocab2index),281,128,128)

In [21]:
optimizer=optim.Adam(LSTM_simple_model.parameters(),lr=0.0001)
loss_func=nn.CrossEntropyLoss()

In [33]:
num_epochs = 5
training_loss_list = []
train_accuracy_list = []
validation_loss_list = []
validation_accuracy_list = []


def train(num_epochs, LSTM_simple_model, loaders):
    LSTM_simple_model.train()
    # Train the model
    total_step = len(loaders['train'])
    for epoch in range(num_epochs):
        training_loss = []
        train_accuracy = []
        for i, (data, labels) in enumerate(loaders['train']):
            step_acc = 0.0
            # gives batch data, normalize x when iterate train_loader
            b_x = Variable(data)   # batch x
            b_y = Variable(labels)   # batch y

            # clear gradients for this training step
            optimizer.zero_grad()

            output = LSTM_simple_model(b_x) # predicted output from the net
            print(output)

            # pred_y = torch.max(output, 1)[0].data.squeeze()
            # step_acc = (pred_y == b_y).sum().item() / float(labels.size(0))
            # train_accuracy.append(
            #     (pred_y == b_y).sum().item() / float(labels.size(0)))

            #calc cross entropy loss
            loss = loss_func(output, b_y)
            # backpropagation, compute gradients
            loss.backward()     # apply gradients
            optimizer.step()
            training_loss.append(loss.item())

            if (i+1) % 50 == 0:
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, acc: {:.4f}' .format(
                    epoch + 1, num_epochs, i + 1, total_step, loss.item(), step_acc))

        # v_acc, v_loss = validation()
        # val_acc_ls.append(v_acc)
        # val_loss_ls.append(v_loss)

        train_accuracy = (np.array(train_accuracy).mean())
        train_accuracy_list.append(train_accuracy)
        training_loss = (np.array(training_loss).mean())
        training_loss_list.append(training_loss)

        # val_loss_ls=np.array(val_loss_ls).mean()
        # val_acc_ls=np.array(val_acc_ls).mean()
        # validation_loss_list.append(v_loss)
        # validation_accuracy_list.append(v_acc)

        print(
            # f"Epoch {epoch+1} loss: {training_loss:.4f} acc: {train_accuracy:.4f} val_loss: {v_loss:.4f} val_acc: {v_acc:4f}")
            f"Epoch {epoch+1} loss: {training_loss:.4f} acc: {train_accuracy:.4f}")


train(num_epochs, LSTM_simple_model, loaders)


AttributeError: 'tuple' object has no attribute 'size'

In [27]:
train_split["encoded_titles"][98265]


array([ 2041,   912, 17909,   511,   167, 14212, 14528,     3, 14788,
       18337,  8735,  1213,  8299,    75,  9914,  1809,  1605,  2464,
        1677,   209,    33,   209,  7474,    75,  1862,   939,    62,
        5727,  3194,  3317,   167,  1862,   823,   633,  1465,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0])

In [29]:
from torchsummary import summary
summary(LSTM_simple_model, [train_split["encoded_titles"][98265]])


RuntimeError: Expected tensor for argument #1 'indices' to have one of the following scalar types: Long, Int; but got torch.cuda.FloatTensor instead (while checking arguments for embedding)