In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [152]:
import nltk
nltk.download('punkt')
import numpy as np
from collections import Counter
import json
import time
import torch
from torch.utils.data import TensorDataset, DataLoader, Dataset
import torch.nn as nn
import torch.nn.functional as F
from PIL import Image
from torch.autograd import Variable
from torchvision import transforms

torch.manual_seed(2022)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


<torch._C.Generator at 0x7f9977b18c10>

In [153]:
import os
os.chdir('/content/drive/MyDrive/lab5')

In [154]:
with open('train.json','r') as f:
    train_data = json.load(f)
with open('val.json','r') as f:
    val_data = json.load(f)
with open('test.json','r') as f:
    test_data = json.load(f)

In [155]:
def process_list(data,flag):
  list = []
  if flag == 1:
    text_or_label = 'text'
  else:
    text_or_label = 'label'
  for item in data:
    list.append(item[text_or_label])
  return list

In [156]:
train_text_list = process_list(train_data,1)
val_text_list = process_list(val_data,1)
train_labels = process_list(train_data,0)
val_labels = process_list(val_data,0)

In [157]:
words = Counter() # 用于统计每个单词出现的次数
i=0
for text in train_text_list:
    words_list = nltk.word_tokenize(text) # 将句子进行分词
    words.update(words_list)  # 更新词频列表
    train_text_list[i] = words_list
    i+=1

In [158]:
words = {k:v for k,v in words.items() if v>1}
words = sorted(words, key=words.get,reverse=True)
words = ['_PAD'] + words
word2idx = {o:i for i,o in enumerate(words)}
idx2word = {i:o for i,o in enumerate(words)}

In [159]:
i=0
for text in train_text_list:  
    train_text_list[i] = [word2idx[word] if word in word2idx else 0 for word in text]
    i+=1

In [160]:
i=0
for text in val_text_list:
    val_text_list[i] = [word2idx[word] if word in word2idx else 0 for word in nltk.word_tokenize(text)]
    i+=1

In [161]:
def pad_input(text_list, seq_len):
    """
    将句子长度固定为`seq_len`，超出长度的从后面阶段，长度不足的在前面补0
    """
    features = np.zeros((len(text_list), seq_len),dtype=int)
    i=0
    for text in text_list:
        features[i, -len(text):] = np.array(text)[:seq_len]
        i+=1
    return features


In [162]:
train_text = pad_input(train_text_list, 200)
val_text = pad_input(val_text_list, 200)

In [163]:
def labels2idx(data_labels):
  i=0
  for label in data_labels:
    if(data_labels[i]=='negative'):
      data_labels[i]=0
    elif(data_labels[i]=='positive'):
        data_labels[i]=1
    elif(data_labels[i]=='neutral'):
        data_labels[i]=2
    i+=1
  return data_labels

In [164]:
train_labels = np.array(labels2idx(train_labels))
val_labels = np.array(labels2idx(val_labels))

In [165]:
batch_size = 200

train_Data = TensorDataset(torch.from_numpy(train_text), torch.from_numpy(train_labels))
val_Data = TensorDataset(torch.from_numpy(val_text), torch.from_numpy(val_labels))

train_loader = DataLoader(train_Data, shuffle=True, batch_size=batch_size)
val_loader = DataLoader(val_Data, shuffle=True, batch_size=batch_size)


In [166]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda')

In [167]:
class GlobalMaxPool1d(nn.Module):
    def __init__(self):
        super(GlobalMaxPool1d, self).__init__()
    def forward(self, x):
        return F.max_pool1d(x, kernel_size=x.shape[2]) # kenerl_size=seq_len

class TextCNN(nn.Module):
    def __init__(self, vocab, embed_size, kernel_sizes, num_channels):
        super(TextCNN, self).__init__()
        self.embedding = nn.Embedding(len(vocab), embed_size)
        self.constant_embedding = nn.Embedding(len(vocab), embed_size)
        
        self.pool = GlobalMaxPool1d()
        self.convs = nn.ModuleList()

        for c, k in zip(num_channels, kernel_sizes):
            self.convs.append(nn.Conv1d(in_channels = 2*embed_size, 
                          out_channels = c, 
                          kernel_size = k))
            
        self.decoder = nn.Linear(sum(num_channels), 3)
        self.dropout = nn.Dropout(0.5)

    def forward(self, inputs):
        embeddings = torch.cat((
            self.embedding(inputs), 
            self.constant_embedding(inputs)), dim=2)
        embeddings = embeddings.permute(0, 2, 1)
        encoding = torch.cat([
            self.pool(F.relu(conv(embeddings))).squeeze(-1) for conv in self.convs], dim=1)
        outputs = self.decoder(self.dropout(encoding))
        return outputs

embed_size, kernel_sizes, nums_channels = 200, [2, 3, 4], [100, 100, 100]
net = TextCNN(words, embed_size, kernel_sizes, nums_channels)

In [168]:
def train(train_iter, test_iter, net, loss, optimizer, device, num_epochs):
    net = net.to(device)
    print("training on ", device)
    batch_count = 0
    for epoch in range(num_epochs):
        train_l_sum, train_acc_sum, n, start = 0.0, 0.0, 0, time.time()
        for X, y in train_iter:
            X = X.to(device)
            y = y.to(device)
            y_hat = net(X)
            l = loss(y_hat, y)
            optimizer.zero_grad()
            l.backward()
            optimizer.step()
            train_l_sum += l.cpu().item()
            train_acc_sum += (y_hat.argmax(dim=1) == y).sum().cpu().item()
            n += y.shape[0]
            batch_count += 1
        test_acc = evaluate_accuracy(test_iter, net)
        print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f, time %.1f sec'
              % (epoch + 1, train_l_sum / batch_count, train_acc_sum / n, test_acc, time.time() - start))

def evaluate_accuracy(data_iter, net, device=None):
    if device is None and isinstance(net, torch.nn.Module):
        device = list(net.parameters())[0].device 
    acc_sum, n = 0.0, 0
    with torch.no_grad():
        for X, y in data_iter:
            if isinstance(net, torch.nn.Module):
                net.eval()
                acc_sum += (net(X.to(device)).argmax(dim=1) == y.to(device)).float().sum().cpu().item()
                net.train()
            else:
                if('is_training' in net.__code__.co_varnames):
                    acc_sum += (net(X, is_training=False).argmax(dim=1) == y).float().sum().item() 
                else:
                    acc_sum += (net(X).argmax(dim=1) == y).float().sum().item() 
            n += y.shape[0]
    return acc_sum / n

In [169]:
lr = 0.001
num_epochs = 10
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, net.parameters()), lr=lr)
loss = nn.CrossEntropyLoss()

In [170]:
train(train_loader, val_loader, net, loss, optimizer, device, num_epochs)

training on  cuda
epoch 1, loss 1.0928, train acc 0.530, test acc 0.582, time 0.8 sec
epoch 2, loss 0.4094, train acc 0.652, test acc 0.604, time 0.7 sec
epoch 3, loss 0.2284, train acc 0.718, test acc 0.612, time 0.7 sec
epoch 4, loss 0.1489, train acc 0.768, test acc 0.610, time 0.7 sec
epoch 5, loss 0.1007, train acc 0.816, test acc 0.611, time 0.7 sec
epoch 6, loss 0.0710, train acc 0.860, test acc 0.622, time 0.7 sec
epoch 7, loss 0.0513, train acc 0.888, test acc 0.604, time 0.7 sec
epoch 8, loss 0.0386, train acc 0.906, test acc 0.618, time 0.7 sec
epoch 9, loss 0.0292, train acc 0.926, test acc 0.619, time 0.7 sec
epoch 10, loss 0.0230, train acc 0.936, test acc 0.615, time 0.7 sec


In [171]:
def predict_sentiment(net, vocab, sentence):
    device = list(net.parameters())[0].device
    sentence = torch.tensor([word2idx[word] if word in word2idx else 0 for word in sentence], device=device)
    label = torch.argmax(net(sentence.view((1, -1))), dim=1)
    if(label.item()==0):
      return 'negative'
    elif(label.item()==1):
      return 'positive'
    else:
      return 'neutral'


In [172]:
class My_Dataset(Dataset):
    def __init__(self,main_dir):
        self.dataset=[]
        i=0
        for data in main_dir:
          self.dataset.append([data['img'],data['label'],i])
          i+=1

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, index):
        img,label,i=self.dataset[index]
        img_data=self.data_process(Image.open(img))
        if(label=='negative'):
          label = 0
        elif(label=='positive'):
          label = 1
        elif(label=='neutral'):
          label = 2
        elif(label is None):
          label = -1
        return img_data,label

    def data_process(self,x):
        return transforms.Compose(
        [
            transforms.Resize((256,256)),
            transforms.ToTensor(),
            transforms.Normalize(
                mean=[0.5,0.5,0.5],
                std=[0.5,0.5,0.5],
            ),
        ]
    )(x)

In [173]:
train_loader = DataLoader(dataset=My_Dataset(train_data), batch_size=batch_size, shuffle=True)
val_loader = DataLoader(dataset=My_Dataset(val_data), batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=My_Dataset(test_data), batch_size=batch_size)

In [174]:
class CNN(nn.Module):
    def __init__(self):
        super(CNN, self).__init__()
        self.conv1 = nn.Sequential(
            nn.Conv2d(
                in_channels=3,   
                out_channels=16, 
                kernel_size=5,  
                stride=1, 
                padding=2, 
            ),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=4)
        )

        self.conv2 = nn.Sequential(
            nn.Conv2d(
                in_channels=16,   
                out_channels=32,
                kernel_size=5,
                stride=1,
                padding=2
            ),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=4) 
        )
        self.output = nn.Linear(in_features=32*16*16, out_features=3)

    def forward(self, x):
        x = self.conv1(x)
        x = self.conv2(x)        
        x = x.view(x.size(0), -1)
        output = self.output(x)
        return output

In [175]:
cnn = CNN()
print(cnn)
optimizer = torch.optim.Adam(cnn.parameters(), lr=lr)

CNN(
  (conv1): Sequential(
    (0): Conv2d(3, 16, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
  )
  (conv2): Sequential(
    (0): Conv2d(16, 32, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
  )
  (output): Linear(in_features=8192, out_features=3, bias=True)
)


In [176]:
train(train_loader, val_loader, cnn, loss, optimizer, device, num_epochs)

training on  cuda
epoch 1, loss 0.9592, train acc 0.565, test acc 0.596, time 39.7 sec
epoch 2, loss 0.4530, train acc 0.597, test acc 0.596, time 39.1 sec
epoch 3, loss 0.2980, train acc 0.597, test acc 0.596, time 39.7 sec
epoch 4, loss 0.2210, train acc 0.598, test acc 0.595, time 40.3 sec
epoch 5, loss 0.1747, train acc 0.598, test acc 0.592, time 38.7 sec
epoch 6, loss 0.1444, train acc 0.600, test acc 0.597, time 38.9 sec
epoch 7, loss 0.1216, train acc 0.602, test acc 0.595, time 39.9 sec
epoch 8, loss 0.1047, train acc 0.612, test acc 0.593, time 38.9 sec
epoch 9, loss 0.0912, train acc 0.621, test acc 0.562, time 39.1 sec
epoch 10, loss 0.0789, train acc 0.633, test acc 0.562, time 40.0 sec


In [177]:
predict=[]
with torch.no_grad():
    for X, y in test_loader:
        if isinstance(cnn, torch.nn.Module):
            predict.append(cnn(X.to(device)).argmax(dim=1))

In [178]:
predict_pic=[]
for batch in predict:
  for i in batch:
    if(i==0):
      predict_pic.append('negative')
    elif(i==1):
      predict_pic.append('positive')
    elif(i==2):
      predict_pic.append('neutral')


In [179]:
predict_text=[]
for i in range(len(test_data)):
  if(len(nltk.word_tokenize(test_data[i]['text']))>=4):
    predict_text.append(predict_sentiment(net, words, nltk.word_tokenize(test_data[i]['text'])))
  else:
    predict_text.append('neutral')

In [187]:
predict_final=[]
for i in range(len(test_data)):
  if(predict_text[i]=='positive' and predict_pic[i]=='positive'):
    predict_final.append('positive')
  elif(predict_text[i]=='positive' and predict_pic[i]=='neutral'):
    predict_final.append('positive')
  elif(predict_text[i]=='positive' and predict_pic[i]=='negative'):
    predict_final.append('positive')
  elif(predict_text[i]=='negative' and predict_pic[i]=='positive'):
    predict_final.append('negative')
  elif(predict_text[i]=='negative' and predict_pic[i]=='neutral'):
    predict_final.append('negative')
  elif(predict_text[i]=='negative' and predict_pic[i]=='negative'):
    predict_final.append('negative')
  elif(predict_text[i]=='neutral' and predict_pic[i]=='positive'):
    predict_final.append('neutral')
  elif(predict_text[i]=='neutral' and predict_pic[i]=='negative'):
    predict_final.append('negative')
  else:
    predict_final.append('neutral')

In [198]:
import pandas as pd
test_data_file = pd.read_csv("test_without_label.txt")['guid'].values

In [207]:
with open('test_without_label.txt','w') as f:
  f.write('guid,tag\n')
  for i in range(len(test_data_file)):
    f.write(str(test_data_file[i])+','+str(predict_final[i])+'\n')
f.close()