In [67]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
from torch.autograd import Variable
import pandas as pd
import numpy as np
torch.manual_seed(777)
from gensim.models import Word2Vec
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
from torch.utils.data import DataLoader,Dataset
from sklearn.metrics import f1_score,confusion_matrix,accuracy_score,precision_score,recall_score
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from pprint import pprint

from gensim.models import KeyedVectors
%matplotlib inline

In [68]:
import pickle

with open("./pickle_data/quora.pickle", "rb") as f:
    token_data = pickle.load(f)

In [69]:
question=token_data['sentence']
target=token_data['target']
word2idx=token_data['word2idx']
idx2word=token_data['idx2word']


In [70]:
question[:2]

[['How',
  'did',
  'Quebec',
  'nationalist',
  'see',
  'their',
  'province',
  'a',
  'a',
  'nation',
  'in',
  'the',
  '1960s'],
 ['Do',
  'you',
  'have',
  'an',
  'adopted',
  'dog',
  'how',
  'would',
  'you',
  'encourage',
  'people',
  'to',
  'adopt',
  'and',
  'not',
  'shop']]

In [71]:
target[:2]

[0, 0]

In [72]:
li1=[]
question_list=[]
for i in question:
    for k in i:
        li1.append(word2idx[k])
    
    question_list.append(li1)
    li1=[]

In [73]:
model_word2vec=KeyedVectors.load_word2vec_format('./Embedding_model/GoogleNews-vectors-negative300.bin',binary=True)

In [74]:
model_word2vec.vector_size

300

In [75]:
embedding_matrix = []

for word in word2idx.keys():
    try:
        embedding_matrix.append(model_word2vec[word])
    except:
        embedding_matrix.append(np.zeros(300))

print(len(word2idx))
print(len(embedding_matrix))

embedding_matrix = torch.Tensor(embedding_matrix)

227234
227234


In [76]:
_,X,_,y=train_test_split(question_list,target,test_size=0.05,random_state=777,stratify=target)

In [78]:
len(X)

65307

In [79]:
len(y)

65307

In [105]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=777,stratify=y)

In [106]:
class Dataset(Dataset):
    def __init__(self, X_data, y_data):
        self.X_data = X_data
        self.y_data = y_data
        
    def __getitem__(self, index):
        return self.X_data[index], self.y_data[index]
        
    def __len__ (self):
        return len(self.X_data)

train_data = Dataset(np.array(X_train), y_train)
test_data = Dataset(np.array(X_test), y_test)

In [107]:
class Conv_net_1d(nn.Module):
    def __init__(self,out_channels,num_class,input_size,embedding_matrix,filter_list):
        super(Conv_net_1d,self).__init__()
        self.embed=nn.Embedding.from_pretrained(embeddings=embedding_matrix)    
        self.filter_1= nn.Sequential(
                            nn.Conv1d(in_channels=1,stride=1,padding=0,kernel_size=filter_list[0]*input_size,out_channels=out_channels),
                            nn.BatchNorm1d(out_channels),
                            nn.ReLU()         
                        )
        
        self.filter_2= nn.Sequential(
                            nn.Conv1d(in_channels=1,stride=1,padding=0,kernel_size=filter_list[1]*input_size,out_channels=out_channels),
                            nn.BatchNorm1d(out_channels),
                            nn.ReLU()
                        )
        
        self.filter_3= nn.Sequential(
                            nn.Conv1d(in_channels=1,stride=1,padding=0,kernel_size=filter_list[2]*input_size,out_channels=out_channels),
                            nn.BatchNorm1d(out_channels),
                            nn.ReLU()
                        )
        
        self.fc=nn.Linear(out_channels*len(filter_list),num_class)
        
    def forward(self,x):
        x=self.embed(x)
        
        x_cat=x.reshape(batch_size, 1, -1)
        
        out=[self.filter_1(x_cat),self.filter_2(x_cat),self.filter_3(x_cat)]
        
        out=[F.max_pool1d(conv,(conv.size(2),)).squeeze(2) for conv in out]
        
        out=torch.cat(out,1)
        
        
        out=self.fc(out)
        
        out=F.softmax(out,dim=1)
        
        return out
            
    def predict(self,x,test_batch_size):
    
        x=self.embed(x)
        x_cat=x.reshape(test_batch_size, 1, -1)
        out=[self.filter_1(x_cat),self.filter_2(x_cat),self.filter_3(x_cat)]
        
        out=[F.max_pool1d(conv,(conv.size(2),)).squeeze(2) for conv in out]
        
        out=torch.cat(out,1)
        
        
        out=self.fc(out)
        
        out=F.softmax(out,dim=1)
        
        return out
    

In [108]:
filter_list=[3,4,5]

In [109]:
filter_sizes=[3,4,5]

In [110]:
 def pad_sequence(batch):
        X_batch, y_batch = zip(*batch)
        max_seq_length = max([len(x) for x in X_batch])
        if max_seq_length < max(filter_list):
            max_seq_length = max(filter_list)

        res = []
        for seq in X_batch:
            if len(seq) < max_seq_length:
                pad_seq = torch.LongTensor(seq + [0]*(max_seq_length-len(seq)))
                res.append(pad_seq)
            else:
                res.append(torch.LongTensor(seq))
        return torch.cat(res).reshape(batch_size, max_seq_length), torch.LongTensor(y_batch)

In [111]:
model=Conv_net_1d(embedding_matrix=embedding_matrix,filter_list=[3,4,5],num_class=2,out_channels=100,input_size=300).to(device)
criterion = nn.CrossEntropyLoss().to(device)
optimizer = optim.Adam(model.parameters(),lr=0.01) # use L2-Norm

In [114]:
batch_size=114
test_batch_size=1

In [115]:
train_loader=DataLoader(dataset=train_data, batch_size=batch_size, shuffle=True, collate_fn=pad_sequence)
test_loader=DataLoader(dataset=test_data, batch_size=test_batch_size, shuffle=False)

In [None]:
%%time
for epoch in range(10):
    train_correct = 0
    train_count = 0
    train_loss = 0
    val_correct = 0
    val_count = 0
    val_loss = 0   
    
        
    train_y_pred=[]
    train_y_true=[]
    
    val_y_pred=[]
    val_y_true=[]
    
    # Training
    model = model.train()
    for X_batch, y_batch in train_loader:
        model.zero_grad()
        inputs = Variable(X_batch).to(device)
        targets = Variable(y_batch).to(device)
        
        preds = model(inputs)
     
        loss = criterion(preds, targets)
        train_loss += loss.item()
        
        compare_list=[]
        target_list=[]
        
        compare = torch.max(preds, 1)[1]
        
        compare_list.append([i.item() for i in compare])
        compare_list=compare_list[0]
        target_list.append([i.item() for i in targets])
        target_list=target_list[0]
        
        train_correct+=torch.sum(compare==targets).item()
        train_count += X_batch.size(0)
        
        train_y_pred.extend(compare_list)
        train_y_true.extend(target_list)
        
        loss.backward()
        optimizer.step()
    
    train_acc = train_correct/train_count
    avg_train_loss = train_loss/(train_count/batch_size)
 #   print(confusion_matrix(y_pred=train_y_pred,y_true=train_y_true))
    f1=f1_score(y_pred=train_y_pred,y_true=train_y_true)
    
    print('========= Update finished! ===========')
    
    model = model.eval()
    with torch.no_grad():          
        for X_batch, y_batch in test_loader:
            val_count += 1         
            if len(X_batch) < max(filter_list):
                X_batch = torch.LongTensor(X_batch + [word2idx.get("<pad>")]*(max(filter_sizes)-len(X_batch))).to(device)
            else:
                X_batch = torch.LongTensor(X_batch).to(device)
            input = Variable(X_batch).to(device)
            target = Variable(torch.LongTensor(y_batch)).to(device)
            pred = model.predict(input, test_batch_size=1)

            loss = criterion(pred, target)
            val_loss += loss.item()
            
            test_pred_list=[]
            test_true_list=[]
            
            _, pred = torch.max(pred, 1)
            true = y_batch.item()
            if true == pred.item():
                val_correct +=1
                          
            test_pred_list.append([i.item() for i in pred])
            test_pred_list=test_pred_list[0]
            test_true_list.append([i.item() for i in target])
            test_true_list=test_true_list[0]
            
            val_y_pred.extend(test_pred_list)
            val_y_true.extend(test_true_list)
            
        val_acc = val_correct/val_count
        avg_val_loss = val_loss/val_count
        
        f1_val=f1_score(y_pred=val_y_pred,y_true=val_y_true)
        
    if epoch % 1 == 0 or epoch == (10-1):
  #      plt.show()
  #      plt.figure(figsize=(10,5))
  #      sns.heatmap(confusion_matrix(y_pred=train_y_pred,y_true=train_y_true),cmap='summer',annot=True)
        
        print('epoch: {:d}'.format(epoch))
        print('train_acc: {:.3f} ({:d}/{:d})'.format(train_acc, train_correct, train_count))
        
        print('train_f1: {:.3f}' .format(f1))
        print('--------------------------')
        
  #      sns.heatmap(confusion_matrix(y_pred=val_y_pred,y_true=val_y_true),cmap='summer',annot=True)
        print('val_acc: {:.3f} ({:d}/{:d})'.format(val_acc, val_correct, val_count))
        print('val_f1: {:.3f}' .format(f1_val))
        print('avg_train_loss: {:.3f}'.format(avg_train_loss))        
        print('avg_val_loss: {:.3f}'.format(avg_val_loss))
        print("==========================================================")
        print('\n')
    



  'precision', 'predicted', average, warn_for)


epoch: 0
train_acc: 0.936 (42791/45714)
train_f1: 0.007
--------------------------
val_acc: 0.938 (18381/19593)
val_f1: 0.000
avg_train_loss: 0.377
avg_val_loss: 0.375




  'precision', 'predicted', average, warn_for)


