In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch import optim
import numpy as np
import time
import pandas as pd
from collections import Counter
from tqdm import tqdm
import json
import jieba
from torch.utils.data import Dataset
import itertools
import os
import datetime
import copy
import matplotlib.pyplot as plt
from sklearn.utils import shuffle 
import math
import matplotlib.pyplot as plt
import sklearn.ensemble
import sklearn.multiclass

In [None]:
# Divide the data: merge the official training and validation CSVs, shuffle the
# rows, then write the first 72,000 rows as the training split, the next 24,000
# as the validation split and everything after row 96,000 as the test split.
tra = pd.read_csv('sentiment_analysis_trainingset.csv')
va = pd.read_csv('sentiment_analysis_validationset.csv')
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way to
# stack two frames and keeps the same row labels the old call produced.
allres = pd.concat([tra, va])
ssall = shuffle(allres)
ssall.iloc[:72000,:].to_csv('trainingset_72')
ssall.iloc[72000:96000,:].to_csv('valiset_24')
ssall.iloc[96000:,:].to_csv('testset_24')

In [None]:
# Names of the label columns read from the review CSVs, one entry per line and
# grouped by the prefix they share. The order matters: a label's position in
# this list is the index used for it in the label tensors and model outputs.
label_names = [
    # location_*
    'location_traffic_convenience',
    'location_distance_from_business_district',
    'location_easy_to_find',
    # service_*
    'service_wait_time',
    'service_waiters_attitude',
    'service_parking_convenience',
    'service_serving_speed',
    # price_*
    'price_level',
    'price_cost_effective',
    'price_discount',
    # environment_*
    'environment_decoration',
    'environment_noise',
    'environment_space',
    'environment_cleaness',
    # dish_*
    'dish_portion',
    'dish_taste',
    'dish_look',
    'dish_recommendation',
    # others_*
    'others_overall_experience',
    'others_willing_to_consume_again',
]

In [None]:
class LLoad:
    """Vocabulary loaded from a JSON word-map file.

    Attributes set by the constructor:
        word2index: the mapping exactly as read from the JSON file.
        index2word: the same mapping with keys and values swapped.
        n_words: number of entries in the mapping.
    """

    def __init__(self, filename):
        """Read the JSON mapping stored at ``filename`` and build both lookups."""
        # A context manager closes the handle deterministically; the previous
        # json.load(open(...)) left that to the garbage collector.
        with open(filename, 'r') as handle:
            word_map = json.load(handle)
        self.word2index = word_map
        self.index2word = {v: k for k, v in word_map.items()}
        self.n_words = len(word_map)

In [None]:
def encode_text(word_map, c):
    """Translate the tokens in ``c`` into ids and append the end marker.

    Tokens missing from ``word_map`` are replaced by the id stored under
    '<unk>'; the id stored under '<end>' is appended as the final element.
    """
    encoded = []
    for token in c:
        encoded.append(word_map.get(token, word_map['<unk>']))
    encoded.append(word_map['<end>'])
    return encoded

def adjust_learning_rate(optimizer, shrink_factor):
    """Multiply the learning rate of every parameter group by ``shrink_factor``.

    Prints a notice before the change and the first group's new rate after it.
    """
    print("\nDECAYING learning rate.")
    groups = optimizer.param_groups
    for position in range(len(groups)):
        current = groups[position]['lr']
        groups[position]['lr'] = current * shrink_factor
    print("The new learning rate is %f\n" % (optimizer.param_groups[0]['lr'],))

def ensure_folder(folder):
    """Create ``folder`` (and any missing parents) if it does not exist yet.

    Uses ``exist_ok=True`` instead of a separate existence check, so there is
    no window in which another process can create the directory between the
    check and the creation. An already existing directory is left untouched;
    if the path exists but is not a directory, ``os.makedirs`` raises
    ``FileExistsError``.
    """
    os.makedirs(folder, exist_ok=True)


def accuracy(scores, targets, k=1):
    """Return the top-``k`` accuracy, as a percentage of ``targets.size(0)``.

    ``scores`` is ranked along dimension 1; a row counts as correct when its
    target index is among the ``k`` highest-scoring positions.
    """
    batch_size = targets.size(0)
    top_indices = scores.topk(k, 1, True, True)[1]
    # Repeat each target across the k candidate columns for an element-wise match.
    wanted = targets.view(-1, 1).expand_as(top_indices)
    hits = top_indices.eq(wanted)
    n_correct = hits.view(-1).float().sum().item()
    return n_correct * (100.0 / batch_size)




def timestamp():
    """Return the current local time formatted as 'YYYY-MM-DD HH:MM:SS'."""
    now = datetime.datetime.now()
    return now.strftime('%Y-%m-%d %H:%M:%S')

In [None]:
# ---- Optimisation schedule ----
learning_rate = 0.002   # initial learning rate handed to the optimizer
start_epoch = 0
epochs = 10

# ---- Vocabulary / batching ----
min_word_freq = 3       # a token must occur more often than this to enter the vocabulary
chunk_size = 100        # number of samples in each batch served by the dataset
PAD_token = 0           # id used to pad short sentences

# ---- Label space ----
num_labels = 20
num_classes = 4
assert len(label_names) == 20

# ---- Encoder architecture ----
hidden_size = 500
encoder_n_layers = 2
dropout = 0.05
batch_first = False     # read by the encoders to pick the time dimension

# ---- Logging / bookkeeping ----
batch_every = 50        # record losses and run a validation chunk every N training batches
print_every = 100       # print a status line every N training batches
save_folder = 'models'

# ---- Device selection ----
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [None]:
def map_parse_user_reviews(split):
    """Load the CSV file that belongs to ``split`` and return it as a DataFrame.

    'train' reads 'trainingset_72', 'valid' reads 'valiset_24', and any other
    value falls back to 'testset_24'.
    """
    if split == 'train':
        return pd.read_csv('trainingset_72')
    if split == 'valid':
        return pd.read_csv('valiset_24')
    return pd.read_csv('testset_24')


In [None]:
### corpus dictionary
def build_wordmap(contents):
    """Build the token-to-id vocabulary from ``contents`` and save it as JSON.

    Every entry of ``contents`` is stripped and tokenised with jieba. Tokens
    that occur more than ``min_word_freq`` times receive consecutive ids
    starting at 4; ids 0-3 are reserved for '<pad>', '<start>', '<end>' and
    '<unk>'. The vocabulary size and the first ten kept tokens are printed, and
    the mapping is written to 'WORDMAP_train72.json'. Nothing is returned.
    """
    token_counts = Counter()
    for text in tqdm(contents):
        # Tally every token produced by the tokenizer for this text.
        tokens = list(jieba.cut(text.strip()))
        token_counts.update(tokens)

    # Keep only the tokens that are frequent enough.
    kept = [token for token, freq in token_counts.items() if freq > min_word_freq]

    # Regular tokens start at id 4; the special tokens take the ids below that.
    word_map = {}
    for position, token in enumerate(kept):
        word_map[token] = position + 4
    word_map['<pad>'] = 0
    word_map['<start>'] = 1
    word_map['<end>'] = 2
    word_map['<unk>'] = 3

    print('len(word_map): ' + str(len(word_map)))
    print(kept[:10])

    with open('WORDMAP_train72.json', 'w') as file:
        json.dump(word_map, file, indent=4)


if __name__ == '__main__':
    # Build the vocabulary from the training split only.
    user_reviews = map_parse_user_reviews('train')
    build_wordmap(user_reviews['content'])
    # The former trailing call `parse_user_reviews('valid')` was removed: that
    # function is defined in a later cell (NameError on a fresh kernel) and it
    # expects a DataFrame rather than a split name, so the call could only fail.


In [None]:
class Metrics_Aver(object):
    """Running meter that stores the latest value, the weighted sum, the
    sample count and the resulting average of a metric.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Set every tracked statistic back to zero."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as observed ``n`` times and refresh the average."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count


# Exponentially weighted averages
class Metrics_ExpoAver(object):
    """Exponentially weighted average meter.

    After each update ``avg = beta * avg + (1 - beta) * val``; ``val`` holds the
    most recent value and ``count`` the number of updates since the last reset.

    Args:
        beta: weight given to the previous average (default 0.9).
    """

    def __init__(self, beta=0.9):
        # Store the caller's smoothing factor; it used to be silently replaced
        # by a hard-coded 0.9 inside reset().
        self.beta = beta
        self.reset()

    def reset(self):
        """Clear the running statistics while keeping the configured ``beta``."""
        self.val = 0
        self.avg = 0
        self.count = 0

    def update(self, val):
        """Fold ``val`` into the exponentially weighted average."""
        self.val = val
        self.avg = self.beta * self.avg + (1 - self.beta) * self.val
        # Track how many values were seen (previously this stayed at 0 forever).
        self.count += 1


In [None]:
def to_categorical(y, num_classes):
    """One-hot encode ``y`` by indexing into a uint8 identity matrix.

    Each index in ``y`` selects the matching row of the
    ``num_classes`` x ``num_classes`` identity matrix.
    """
    identity_rows = np.eye(num_classes, dtype='uint8')
    return identity_rows[y]


# Meaning    Positive    Neutral    Negative  Not mentioned
# Old labels    1	        0	        -1	        -2
# New labels    3           2           1           0
def map_sentimental_type(value):
    """Shift a raw label by +2, so that -2, -1, 0 and 1 become 0, 1, 2 and 3."""
    shifted = value + 2
    return shifted


def parse_user_reviews(user_reviews):
    """Turn a reviews table into a list of sample dicts.

    For every row ``i`` in ``range(len(user_reviews))`` the result holds a dict
    with the row's 'content' value and a 'label_tensor': an int32 array of
    ``num_labels`` entries containing the shifted value of each column named
    in ``label_names``, in that order.
    """
    samples = []
    for row in range(len(user_reviews)):
        text = user_reviews['content'][row]
        labels = np.empty((num_labels,), dtype=np.int32)
        for position, column in enumerate(label_names):
            labels[position] = map_sentimental_type(user_reviews[column][row])
        samples.append({'content': text, 'label_tensor': labels})
    return samples


def zeroPadding(l, fillvalue=PAD_token):
    """Transpose the sequences in ``l`` into a list of per-position tuples.

    Shorter sequences are padded with ``fillvalue`` so that every tuple has
    one entry per input sequence.
    """
    padded_columns = itertools.zip_longest(*l, fillvalue=fillvalue)
    return list(padded_columns)


# Returns padded input sequence tensor and lengths
def input_change(indexes_batch):
    """Return ``(padVar, lengths)`` for a batch of index sequences.

    ``lengths`` is a tensor with the length of each sequence; ``padVar`` is a
    LongTensor built from the padded, transposed batch returned by
    ``zeroPadding``.
    """
    sequence_lengths = [len(sequence) for sequence in indexes_batch]
    lengths = torch.tensor(sequence_lengths)
    padVar = torch.LongTensor(zeroPadding(indexes_batch))
    return padVar, lengths


# Returns all items for a given batch of pairs
def traindata_transfer(pair_batch):
    """Convert a list of ``(input_indexes, label_array)`` pairs into tensors.

    The list is sorted in place by input length, longest first. Returns the
    padded input tensor and the lengths tensor produced by ``input_change``,
    plus a LongTensor holding the labels in the same sorted order.
    """
    pair_batch.sort(key=lambda pair: len(pair[0]), reverse=True)
    input_batch = [pair[0] for pair in pair_batch]
    output_batch = [pair[1] for pair in pair_batch]
    inp, lengths = input_change(input_batch)
    return inp, lengths, torch.LongTensor(output_batch)

### dataset input setup
class InputDataset(Dataset):
    """Dataset that serves the reviews of one split in fixed-size chunks.

    Item ``i`` is a whole batch: ``chunk_size`` consecutive samples, tokenised
    with jieba, encoded with the vocabulary and converted to tensors by
    ``traindata_transfer``. ``len(dataset)`` is the number of complete chunks.
    """

    def __init__(self, split, voc):
        """Load the CSV for ``split`` ('train', 'valid' or 'test') and parse it."""
        self.split = split
        self.voc = voc
        assert self.split in {'train', 'valid','test'}

        # Pick the file written for the requested split.
        filename = ('trainingset_72' if split == 'train'
                    else 'valiset_24' if split == 'valid'
                    else 'testset_24')

        self.samples = parse_user_reviews(pd.read_csv(filename))
        self.num_chunks = len(self.samples) // chunk_size

    def __getitem__(self, i):
        """Return ``(inputs, lengths, labels)`` for the ``i``-th chunk."""
        first = i * chunk_size
        pair_batch = []
        for idx in range(first, first + chunk_size):
            sample = self.samples[idx]
            tokens = list(jieba.cut(sample['content'].strip()))
            pair_batch.append((encode_text(self.voc.word2index, tokens),
                               sample['label_tensor']))
        return traindata_transfer(pair_batch)

    def __len__(self):
        return self.num_chunks

In [None]:
class LSTM(nn.Module):
    """Bidirectional LSTM encoder with one classification head per label.

    Args:
        input_size: vocabulary size (number of embedding rows).
        hidden_size: size of the embeddings and of each LSTM direction.
        n_layers: number of stacked LSTM layers.
        dropout: dropout between LSTM layers (forced to 0 for a single layer).

    ``forward`` returns log-probabilities shaped
    ``[batch size, num_classes, num_labels]``.
    """

    def __init__(self, input_size, hidden_size, n_layers=1, dropout=0):
        super(LSTM, self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)

        # The LSTM's input and hidden sizes are both 'hidden_size' because its
        # input is a word embedding with hidden_size features.
        self.lstm = nn.LSTM(hidden_size, hidden_size, n_layers,
                            dropout=(0 if n_layers == 1 else dropout), bidirectional=True)

        self.fc = nn.Linear(hidden_size, num_labels * num_classes)

    def forward(self, input_seq, input_lengths, hidden=None):
        """Encode a padded batch and classify every label.

        Args:
            input_seq: word indexes shaped [sent len, batch size].
            input_lengths: sequence lengths as a tensor (any device) or a list,
                sorted in decreasing order as required by
                ``pack_padded_sequence`` with its default settings.
            hidden: optional initial state passed to the LSTM.
        """
        # embedded = [sent len, batch size, hidden size]
        embedded = self.embedding(input_seq)

        # pack_padded_sequence requires the lengths on the CPU, while the
        # training loop moves them to the GPU; convert here so both work.
        input_lengths = torch.as_tensor(input_lengths)
        packed = torch.nn.utils.rnn.pack_padded_sequence(embedded, input_lengths.cpu())
        # Forward pass through the LSTM
        outputs, hidden = self.lstm(packed, hidden)
        # Unpack padding
        outputs, _ = torch.nn.utils.rnn.pad_packed_sequence(outputs)
        # Sum the forward and backward direction outputs
        # outputs = [sent len, batch size, hidden size]
        outputs = outputs[:, :, :self.hidden_size] + outputs[:, :, self.hidden_size:]

        # Pick the output at the last valid timestep of each example. The
        # gather index must live on the same device as the outputs.
        last_step = (input_lengths - 1).to(device=outputs.device, dtype=torch.long)
        idx = last_step.view(-1, 1).expand(len(input_lengths), outputs.size(2))
        # NOTE(review): the LSTM above is built with the default time-major
        # layout, so only batch_first == False looks consistent here - confirm.
        time_dimension = 1 if batch_first else 0
        idx = idx.unsqueeze(time_dimension)
        # outputs = [batch size, hidden size]
        outputs = outputs.gather(time_dimension, idx).squeeze(time_dimension)

        # outputs = [batch size, num_labels * num_classes]
        outputs = self.fc(outputs)
        # outputs = [batch size, num_classes, num_labels]
        outputs = outputs.view((-1, num_classes, num_labels))
        # Normalise over the class dimension
        outputs = F.log_softmax(outputs, dim=1)

        return outputs

In [None]:
class GRU(nn.Module):
    """Bidirectional GRU encoder with one classification head per label.

    Args:
        input_size: vocabulary size (number of embedding rows).
        hidden_size: size of the embeddings and of each GRU direction.
        n_layers: number of stacked GRU layers.
        dropout: dropout between GRU layers (forced to 0 for a single layer).

    ``forward`` returns log-probabilities shaped
    ``[batch size, num_classes, num_labels]``.
    """

    def __init__(self, input_size, hidden_size, n_layers=1, dropout=0):
        super(GRU, self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)

        # The GRU's input and hidden sizes are both 'hidden_size'. The
        # attribute keeps its historical name 'lstm' so that previously saved
        # models still load.
        self.lstm = nn.GRU(hidden_size, hidden_size, n_layers,
                           dropout=(0 if n_layers == 1 else dropout), bidirectional=True)

        self.fc = nn.Linear(hidden_size, num_labels * num_classes)

    def forward(self, input_seq, input_lengths, hidden=None):
        """Encode a padded batch and classify every label.

        Args:
            input_seq: word indexes shaped [sent len, batch size].
            input_lengths: sequence lengths as a tensor (any device) or a list,
                sorted in decreasing order as required by
                ``pack_padded_sequence`` with its default settings.
            hidden: optional initial state passed to the GRU.
        """
        # embedded = [sent len, batch size, hidden size]
        embedded = self.embedding(input_seq)

        # pack_padded_sequence requires the lengths on the CPU, while the
        # training loop moves them to the GPU; convert here so both work.
        input_lengths = torch.as_tensor(input_lengths)
        packed = torch.nn.utils.rnn.pack_padded_sequence(embedded, input_lengths.cpu())
        # Forward pass through the GRU
        outputs, hidden = self.lstm(packed, hidden)
        # Unpack padding
        outputs, _ = torch.nn.utils.rnn.pad_packed_sequence(outputs)
        # Sum the forward and backward direction outputs
        # outputs = [sent len, batch size, hidden size]
        outputs = outputs[:, :, :self.hidden_size] + outputs[:, :, self.hidden_size:]

        # Pick the output at the last valid timestep of each example. The
        # gather index must live on the same device as the outputs.
        last_step = (input_lengths - 1).to(device=outputs.device, dtype=torch.long)
        idx = last_step.view(-1, 1).expand(len(input_lengths), outputs.size(2))
        # NOTE(review): the GRU above is built with the default time-major
        # layout, so only batch_first == False looks consistent here - confirm.
        time_dimension = 1 if batch_first else 0
        idx = idx.unsqueeze(time_dimension)
        # outputs = [batch size, hidden size]
        outputs = outputs.gather(time_dimension, idx).squeeze(time_dimension)

        # outputs = [batch size, num_labels * num_classes]
        outputs = self.fc(outputs)
        # outputs = [batch size, num_classes, num_labels]
        outputs = outputs.view((-1, num_classes, num_labels))
        # Normalise over the class dimension
        outputs = F.log_softmax(outputs, dim=1)

        return outputs

In [None]:
def _mean_label_metrics(outputs, targets, criterion):
    """Return (loss, accuracy) averaged over every label column of a batch."""
    n_labels = len(label_names)
    loss = 0
    acc = 0
    for idx in range(n_labels):
        loss += criterion(outputs[:, :, idx], targets[:, idx]) / n_labels
        acc += accuracy(outputs[:, :, idx], targets[:, idx]) / n_labels
    return loss, acc


def ttrain(epoch,train_data, encoder,val_data, optimizer):
    """Train ``encoder`` for one pass over ``train_data``.

    Every ``batch_every`` training batches the smoothed and current training
    loss are recorded and one randomly chosen chunk of ``val_data`` is
    evaluated. A status line is printed every ``print_every`` batches.

    Args:
        epoch: epoch number, used only in the status line.
        train_data: indexable dataset yielding (inputs, lengths, targets).
        encoder: the model being trained.
        val_data: dataset with the same item layout, sampled for validation.
        optimizer: optimizer that updates ``encoder``'s parameters.

    Returns:
        A tuple ``(loss_train_l, loss_train_nowl, loss_l, loss_nowl,
        val_acc, train_acc)``: the recorded smoothed / current training losses,
        the running-average / current validation losses, the mean accuracy
        over the validation chunks sampled this epoch, and the exponentially
        averaged training accuracy.
    """
    loss_train_l = []
    loss_train_nowl = []
    loss_l = []
    loss_nowl = []
    batch_time = Metrics_Aver()
    train_losses = Metrics_ExpoAver()
    train_accs = Metrics_ExpoAver()
    # The validation meters live for the whole epoch. They used to be re-created
    # at every validation step, so the returned accuracy reflected a single
    # random chunk and was undefined when train_data was empty.
    vali_losses = Metrics_Aver()
    vali_accs = Metrics_Aver()
    # Loop-invariant, so build the loss function once.
    criterion = nn.CrossEntropyLoss().to(device)

    start = time.time()
    for i_batch, (input_variable, lengths, target_variable) in enumerate(train_data):
        # Validation below switches to eval mode; switch back for every batch.
        encoder.train()
        optimizer.zero_grad()
        input_variable = input_variable.to(device)
        lengths = lengths.to(device)
        target_variable = target_variable.to(device)
        outputs = encoder(input_variable, lengths)

        train_loss, train_acc = _mean_label_metrics(outputs, target_variable, criterion)

        train_loss.backward()
        optimizer.step()

        train_losses.update(train_loss.item())
        batch_time.update(time.time() - start)
        train_accs.update(train_acc)

        start = time.time()

        if i_batch % batch_every == 0:
            loss_train_l.append(train_losses.avg)
            loss_train_nowl.append(train_losses.val)

            # Validate on one randomly chosen chunk
            random_num = int(np.random.randint(0, len(val_data)))
            input_variabl, lengts, target_variabl = val_data[random_num]
            input_variabl = input_variabl.to(device)
            lengts = lengts.to(device)
            target_variabl = target_variabl.to(device)

            encoder.eval()
            # No gradients are needed for validation; skip building the graph.
            with torch.no_grad():
                valioutputs = encoder(input_variabl, lengts)
                valiloss, valiacc = _mean_label_metrics(valioutputs, target_variabl, criterion)

            vali_losses.update(valiloss.item())
            vali_accs.update(valiacc)
            loss_l.append(vali_losses.avg)
            loss_nowl.append(vali_losses.val)

        # Print status
        if i_batch % print_every == 0:
            print('[{0}] Epoch: [{1}][{2}/{3}]\t'
                  'Batch Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Accuracy {accs.val:.3f} ({accs.avg:.3f})'.format(timestamp(), epoch, i_batch, len(train_data),
                                                                    batch_time=batch_time,
                                                                    loss=train_losses,
                                                                    accs=train_accs))
    return loss_train_l,loss_train_nowl,loss_l,loss_nowl,vali_accs.avg,train_accs.avg

In [None]:
##training

loss_list= []
loss_nowlist= []
loss_train_list= []
loss_train_nowlist=[]
voc = LLoad('WORDMAP_train72.json')

print("voc.n_words: " + str(voc.n_words))

train_data = InputDataset('train', voc)
val_data = InputDataset('valid', voc)

# Initialize encoder
encoder = LSTM(voc.n_words, hidden_size, encoder_n_layers, dropout)

# Use appropriate device
encoder = encoder.to(device)

# Initialize optimizers
print('Building optimizers ...')
optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)

best_acc = 0
epochs_since_improvement = 0
trainaccu_list = []
valiaccu_list = []
# Epochs
for epoch in range(start_epoch, epochs):
    # Decay learning rate if there is no improvement for 8 consecutive epochs, and terminate training after 20
    if epochs_since_improvement == 20:
        break
    if epochs_since_improvement > 0 and epochs_since_improvement % 8 == 0:
        adjust_learning_rate(optimizer, 0.8)

    loss_train_li,loss_train_nowli,loss_li,loss_nowli ,val_acc,train_acc= ttrain(epoch,train_data, encoder,val_data, optimizer)

    is_best = val_acc > best_acc
    best_acc = max(best_acc, val_acc)
    loss_list.append(loss_li)
    loss_nowlist.append(loss_nowli)
    loss_train_list.append(loss_train_li)
    loss_train_nowlist.append(loss_train_nowli)
    if is_best:
        epochs_since_improvement = 0
        # Checkpoint only when validation accuracy improves, so that 'model'
        # holds the best epoch. It used to be overwritten after every epoch,
        # which meant the later evaluation always loaded the last epoch.
        torch.save({'encoder':encoder,'optimizer':optimizer},'model')
    else:
        epochs_since_improvement += 1
        print("\nEpochs since last improvement: %d\n" % (epochs_since_improvement,))

    # Reshuffle samples
    np.random.shuffle(train_data.samples)
    np.random.shuffle(val_data.samples)
    trainaccu_list.append(train_acc)
    valiaccu_list.append(val_acc)

# Persist the recorded loss curves
torch.save(loss_list,'atten_valloss_10epoch_batch50')
torch.save(loss_nowlist,'atten_valloss_10epoch_batch50now')
torch.save(loss_train_list,'atten_trainloss_10epoch_batch50')
torch.save(loss_train_nowlist,'atten_trainloss105epoch_batch50now')

In [None]:
# Averaged loss curves: each per-epoch list is flattened into one series.
plt.figure()
plt.plot([y for x in loss_list for y in x])
plt.plot([y for x in loss_train_list for y in x])
# Label the curves and axes the same way as the other loss figure so the plot
# can be read on its own.
plt.legend(['validation_loss', 'train_loss'])
plt.xlabel('batch step')
plt.ylabel('loss')

In [None]:
# Most-recent-value loss curves: flatten the per-epoch lists, plot validation
# first and training second (matching the legend order), then save the figure.
qwe = ['validation_loss','train_loss']
plt.figure()
plt.plot(list(itertools.chain.from_iterable(loss_nowlist)))
plt.plot(list(itertools.chain.from_iterable(loss_train_nowlist)))
plt.xlabel('batch step')
plt.ylabel('loss')
plt.legend(qwe)
plt.savefig('loss_epoch10')

In [None]:
# Per-epoch accuracy: training curve first, validation curve second, with the
# y-axis fixed to the 0-100 percentage range.
qw = ['training accuracy', 'validation accuracy']
plt.figure()
plt.plot(trainaccu_list)
plt.plot(valiaccu_list)
plt.ylim(0, 100)
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.legend(qw)
plt.savefig('accu_epoch10')

In [None]:
#test the accuracy
voc = LLoad('WORDMAP_train72.json')
# The dataset class defined in this notebook is InputDataset; 'SaDataset' does
# not exist here and raised a NameError.
test_data = InputDataset('test',voc)

# The checkpoint is a dict holding the whole encoder module and the optimizer.
# map_location lets a checkpoint trained on GPU load on a CPU-only machine.
# NOTE(review): torch.load unpickles arbitrary objects - only load trusted
# files. Recent PyTorch releases default to weights_only=True, which rejects a
# pickled module; pass weights_only=False there if loading fails.
encoder1 = torch.load('model', map_location=device)
encoder11 = encoder1['encoder']

encoder11.eval()
# Metrics_Aver is the running-average meter defined in this notebook
# ('AverageMeter' is not defined here).
ac = Metrics_Aver()

# Inference only: skip building the autograd graph.
with torch.no_grad():
    for i_batch, (input_variable, lengths, target_variable) in enumerate(test_data):
        # Set device options
        input_variable = input_variable.to(device)
        lengths = lengths.to(device)
        target_variable = target_variable.to(device)

        outputs = encoder11(input_variable, lengths)

        # Mean top-1 accuracy over all labels for this chunk
        acc = 0
        for idx, _ in enumerate(label_names):
            acc += accuracy(outputs[:, :, idx], target_variable[:, idx]) / len(label_names)

        # Keep track of metrics
        ac.update(acc)
print(ac.avg,'\n',ac.val )