In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
from torch.autograd import Variable
import pandas as pd
import numpy as np
torch.manual_seed(777)
from gensim.models import Word2Vec
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
from torch.utils.data import DataLoader,Dataset
from sklearn.metrics import f1_score,confusion_matrix,accuracy_score,precision_score,recall_score
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from pprint import pprint

from gensim.models import KeyedVectors
%matplotlib inline

In [4]:
import pickle

with open("../pickle_data/charlevel.pickle", "rb") as f:
    token_data = pickle.load(f)

In [5]:
token_data.keys()

dict_keys(['char_chat', 'idx2char', 'char2idx', 'target'])

In [6]:
comment=token_data['char_chat']
output=token_data['target']
word2idx=token_data['char2idx']
idx2word=token_data['idx2char']

In [8]:
li1=[]
question_list=[]
for i in comment:
    for k in i:
        li1.append(word2idx[k])
    
    question_list.append(li1)
    li1=[]

In [10]:
model_word2vec=Word2Vec.load('../word2vec_model/char_model_200.model')

In [11]:
model_word2vec.vector_size

200

In [12]:
embedding_matrix = []

for word in word2idx.keys():
    try:
        embedding_matrix.append(model_word2vec[word])
    except:
        embedding_matrix.append(np.zeros(200))

print(len(word2idx))
print(len(embedding_matrix))

embedding_matrix = torch.Tensor(embedding_matrix)

1962
1962


  """


In [13]:
class Conv_net_1d(nn.Module):
    def __init__(self,out_channels,num_class,input_size,embedding_matrix,filter_list):
        
        super(Conv_net_1d,self).__init__()
        
        self.embed=nn.Embedding.from_pretrained(embeddings=embedding_matrix)
        self.filter_1= nn.Sequential(
                            nn.Conv1d(in_channels=1,stride=200,padding=0,kernel_size=filter_list[0]*input_size,out_channels=out_channels),
                            nn.BatchNorm1d(out_channels),
                            nn.Tanh()
                        )
        self.filter_2= nn.Sequential(
                            nn.Conv1d(in_channels=1,stride=200,padding=0,kernel_size=filter_list[1]*input_size,out_channels=out_channels),
                            nn.BatchNorm1d(out_channels),
                            nn.Tanh()
                        )
        self.filter_3= nn.Sequential(
                            nn.Conv1d(in_channels=1,stride=200,padding=0,kernel_size=filter_list[2]*input_size,out_channels=out_channels),
                            nn.BatchNorm1d(out_channels),
                            nn.Tanh()
                        )
        
#        self.dropout=nn.Dropout(0.9)
        self.fc=nn.Linear(out_channels*len(filter_list),num_class)
        
    def forward(self,x):
        x=self.embed(x)
        
        x_cat=x.reshape(batch_size, 1, -1)
        
        out=[self.filter_1(x_cat),self.filter_2(x_cat),self.filter_3(x_cat)]
        
        out=[F.max_pool1d(conv,(conv.size(2),)).squeeze(2) for conv in out]
        
        out=torch.cat(out,1)
        
 #       out=self.dropout(out)
        
        out=self.fc(out)
        
        out=F.softmax(out,dim=1)
        
        return out
            
    def predict(self,x,test_batch_size):
    
        x=self.embed(x)
        x_cat=x.reshape(test_batch_size, 1, -1)
        
        out=[self.filter_1(x_cat),self.filter_2(x_cat),self.filter_3(x_cat)]
        out=[F.max_pool1d(conv,(conv.size(2),)).squeeze(2) for conv in out]
        
        out = torch.cat(out, 1)
        out = self.fc(out)
        out = F.softmax(out, dim=1)
        
        
        
        return out
    

In [14]:
X_train,X_test,y_train,y_test=train_test_split(question_list,output,test_size=0.26,random_state=777,stratify=output)

In [15]:
len(question_list)

45427

In [16]:
class Dataset(Dataset):
    def __init__(self, X_data, y_data):
        self.X_data = X_data
        self.y_data = y_data
        
    def __getitem__(self, index):
        return self.X_data[index], self.y_data[index]
        
    def __len__ (self):
        return len(self.X_data)

train_data = Dataset(np.array(X_train), y_train)
test_data = Dataset(np.array(X_test), y_test)

In [17]:
len(X_train)

33615

In [18]:
def pad_sequence(batch):
        X_batch, y_batch = zip(*batch)
        max_seq_length = max([len(x) for x in X_batch])
        if max_seq_length < max(filter_list):
            max_seq_length = max(filter_list)

        res = []
        for seq in X_batch:
            if len(seq) < max_seq_length:
                pad_seq = torch.LongTensor(seq + [0]*(max_seq_length-len(seq)))
                res.append(pad_seq)
            else:
                res.append(torch.LongTensor(seq))
        return torch.cat(res).reshape(batch_size, max_seq_length), torch.LongTensor(y_batch)

In [32]:
model=Conv_net_1d(embedding_matrix=embedding_matrix,filter_list=[3,3,4],num_class=2,out_channels=25,input_size=200).to(device)
criterion = nn.CrossEntropyLoss().to(device)
optimizer = optim.Adam(model.parameters(),lr=0.003,weight_decay=1e-4) # use L2-Norm

In [33]:
filter_list=[3,3,4]
filter_sizes=[3,3,4]

In [34]:
batch_size=27*5
test_batch_size=1

In [35]:
train_loader=DataLoader(dataset=train_data, batch_size=27*5, shuffle=True, collate_fn=pad_sequence)
test_loader=DataLoader(dataset=test_data, batch_size=1, shuffle=False)

In [36]:
%%time
for epoch in range(100):
    train_correct = 0
    train_count = 0
    train_loss = 0
    val_correct = 0
    val_count = 0
    val_loss = 0   
    
        
    train_y_pred=[]
    train_y_true=[]
    
    val_y_pred=[]
    val_y_true=[]
    
    # Training
    model = model.train()
    for X_batch, y_batch in train_loader:
        model.zero_grad()
        inputs = Variable(X_batch).to(device)
        targets = Variable(y_batch).to(device)
        
        preds = model(inputs)
     
        loss = criterion(preds, targets)
        train_loss += loss.item()
        
        compare_list=[]
        target_list=[]
        
        compare = torch.max(preds, 1)[1]
        
        compare_list.append([i.item() for i in compare])
        compare_list=compare_list[0]
        target_list.append([i.item() for i in targets])
        target_list=target_list[0]
        
        train_correct+=torch.sum(compare==targets).item()
        train_count += X_batch.size(0)
        
        train_y_pred.extend(compare_list)
        train_y_true.extend(target_list)
        
        loss.backward()
        optimizer.step()
    
    train_acc = train_correct/train_count
    avg_train_loss = train_loss/(train_count/batch_size)
 #   print(confusion_matrix(y_pred=train_y_pred,y_true=train_y_true))
    f1=f1_score(y_pred=train_y_pred,y_true=train_y_true)
    
    print('========= Update finished! ===========')
    
    model = model.eval()
    with torch.no_grad():          
        for X_batch, y_batch in test_loader:
            val_count += 1         
            if len(X_batch) < max(filter_list):
                X_batch = torch.LongTensor(X_batch + [word2idx.get("<pad>")]*(max(filter_sizes)-len(X_batch))).to(device)
            else:
                X_batch = torch.LongTensor(X_batch).to(device)
            inputs = Variable(X_batch).to(device)
            target = Variable(torch.LongTensor(y_batch)).to(device)
            pred = model.predict(inputs, test_batch_size=1)

            loss = criterion(pred, target)
            val_loss += loss.item()
            
            test_pred_list=[]
            test_true_list=[]
            
            _, pred = torch.max(pred, 1)
            true = y_batch.item()
            if true == pred.item():
                val_correct +=1
                          
            test_pred_list.append([i.item() for i in pred])
            test_pred_list=test_pred_list[0]
            test_true_list.append([i.item() for i in target])
            test_true_list=test_true_list[0]
            
            val_y_pred.extend(test_pred_list)
            val_y_true.extend(test_true_list)
            
        val_acc = val_correct/val_count
        avg_val_loss = val_loss/val_count
        
        f1_val=f1_score(y_pred=val_y_pred,y_true=val_y_true)
        
    if epoch % 1 == 0 or epoch == (10-1):
  #      plt.show()
  #      plt.figure(figsize=(10,5))
  #      sns.heatmap(confusion_matrix(y_pred=train_y_pred,y_true=train_y_true),cmap='summer',annot=True)
        
        print('epoch: {:d}'.format(epoch))
        print('train_acc: {:.3f} ({:d}/{:d})'.format(train_acc, train_correct, train_count))
        
        print('train_f1: {:.3f}' .format(f1))
        print('--------------------------')
        
  #      sns.heatmap(confusion_matrix(y_pred=val_y_pred,y_true=val_y_true),cmap='summer',annot=True)
        print('val_acc: {:.3f} ({:d}/{:d})'.format(val_acc, val_correct, val_count))
        print('val_f1: {:.3f}' .format(f1_val))
        print('avg_train_loss: {:.3f}'.format(avg_train_loss))        
        print('avg_val_loss: {:.3f}'.format(avg_val_loss))
        print("==========================================================")
        print('\n')
    

epoch: 0
train_acc: 0.780 (26218/33615)
train_f1: 0.125
--------------------------
val_acc: 0.774 (9139/11812)
val_f1: 0.312
avg_train_loss: 0.532
avg_val_loss: 0.534


epoch: 1
train_acc: 0.794 (26686/33615)
train_f1: 0.229
--------------------------
val_acc: 0.783 (9245/11812)
val_f1: 0.349
avg_train_loss: 0.516
avg_val_loss: 0.525


epoch: 2
train_acc: 0.803 (26996/33615)
train_f1: 0.321
--------------------------
val_acc: 0.788 (9312/11812)
val_f1: 0.374
avg_train_loss: 0.507
avg_val_loss: 0.520


epoch: 3
train_acc: 0.811 (27251/33615)
train_f1: 0.378
--------------------------
val_acc: 0.788 (9302/11812)
val_f1: 0.386
avg_train_loss: 0.500
avg_val_loss: 0.520


epoch: 4
train_acc: 0.815 (27387/33615)
train_f1: 0.405
--------------------------
val_acc: 0.790 (9329/11812)
val_f1: 0.379
avg_train_loss: 0.495
avg_val_loss: 0.517


epoch: 5
train_acc: 0.822 (27642/33615)
train_f1: 0.444
--------------------------
val_acc: 0.782 (9232/11812)
val_f1: 0.414
avg_train_loss: 0.488
avg_val_

epoch: 31
train_acc: 0.879 (29541/33615)
train_f1: 0.676
--------------------------
val_acc: 0.778 (9184/11812)
val_f1: 0.451
avg_train_loss: 0.435
avg_val_loss: 0.529


epoch: 32
train_acc: 0.879 (29559/33615)
train_f1: 0.678
--------------------------
val_acc: 0.776 (9171/11812)
val_f1: 0.443
avg_train_loss: 0.435
avg_val_loss: 0.528


epoch: 33
train_acc: 0.881 (29608/33615)
train_f1: 0.682
--------------------------
val_acc: 0.777 (9182/11812)
val_f1: 0.437
avg_train_loss: 0.434
avg_val_loss: 0.529


epoch: 34
train_acc: 0.881 (29622/33615)
train_f1: 0.685
--------------------------
val_acc: 0.778 (9184/11812)
val_f1: 0.433
avg_train_loss: 0.433
avg_val_loss: 0.528


epoch: 35
train_acc: 0.882 (29635/33615)
train_f1: 0.686
--------------------------
val_acc: 0.775 (9149/11812)
val_f1: 0.428
avg_train_loss: 0.433
avg_val_loss: 0.530


epoch: 36
train_acc: 0.882 (29662/33615)
train_f1: 0.690
--------------------------
val_acc: 0.774 (9148/11812)
val_f1: 0.434
avg_train_loss: 0.432
av

epoch: 62
train_acc: 0.891 (29962/33615)
train_f1: 0.717
--------------------------
val_acc: 0.762 (8995/11812)
val_f1: 0.445
avg_train_loss: 0.423
avg_val_loss: 0.543


epoch: 63
train_acc: 0.892 (29988/33615)
train_f1: 0.719
--------------------------
val_acc: 0.769 (9079/11812)
val_f1: 0.443
avg_train_loss: 0.423
avg_val_loss: 0.537


epoch: 64
train_acc: 0.891 (29964/33615)
train_f1: 0.719
--------------------------
val_acc: 0.752 (8885/11812)
val_f1: 0.448
avg_train_loss: 0.423
avg_val_loss: 0.551


epoch: 65
train_acc: 0.893 (30021/33615)
train_f1: 0.723
--------------------------
val_acc: 0.757 (8945/11812)
val_f1: 0.439
avg_train_loss: 0.421
avg_val_loss: 0.546


epoch: 66
train_acc: 0.894 (30050/33615)
train_f1: 0.724
--------------------------
val_acc: 0.766 (9046/11812)
val_f1: 0.433
avg_train_loss: 0.421
avg_val_loss: 0.539




KeyboardInterrupt: 

In [None]:
.451

In [37]:
# max score= 0.484