In [1]:
import pandas as pd
import re

In [2]:
# Width of every word vector and of the averaged sentence vector; it is also
# the input size of the classifier's linear layer.
sen_vec_dimension = 200

In [3]:
# Load the training tweets and shift every label down by one
# (presumably so the classes start at 0 -- confirm against the raw data).
df = pd.read_csv('tweet_train.csv')
df['label'] = df['label']-1
# NOTE(review): no visible later cell reads 'tweet_train_new.csv' (the cleaning
# cell is called with 'tweet_train.csv'), and to_csv also writes the row index
# as an extra column by default -- confirm this file is still needed.
df.to_csv('tweet_train_new.csv')

In [4]:
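'''
hbx
'''

'''
get_text_and_label:
    Cleans the text data
    Saves the cleaned text to tweets_clean.csv
    Returns the text and the labels separately
'''
'''
get_voc_and_token_of_sentences
    Builds the token vocabulary and the token list of each sentence
    Removes tokens that contain "…"
'''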

# Cleaning patterns, applied in this order; every match becomes one space.
_URL_RE = re.compile(r'https?://\S*')                 # http and https links
_MENTION_RE = re.compile(r'@[a-zA-Z0-9_.?/&=:]*')     # @handles, underscores included
_PUNCT_RE = re.compile(r'[,.!?#:]')                   # hashtag marks and punctuation
_LINEBREAK_RE = re.compile(r'[\r\n]+')                # line breaks


def _clean_tweet(raw):
    """Return one tweet lowercased, with URLs, @mentions, punctuation and line breaks blanked."""
    cleaned = _URL_RE.sub(' ', raw)
    cleaned = _MENTION_RE.sub(' ', cleaned)
    cleaned = _PUNCT_RE.sub(' ', cleaned)
    # Replace with a space (not ''), so words on adjacent lines are not glued together.
    cleaned = _LINEBREAK_RE.sub(' ', cleaned)
    return cleaned.lower()


def _load_clean_text(csvfile, save_path):
    """Read `csvfile`, clean its 'text' column, optionally save it; return (frame, cleaned text)."""
    tweets_df = pd.read_csv(csvfile)
    # Missing texts become empty strings so that every entry can be cleaned and split.
    tweets_clean = tweets_df['text'].fillna('').astype(str).map(_clean_tweet)
    if save_path is not None:
        tweets_clean.to_csv(save_path)
    return tweets_df, tweets_clean


def get_text_and_label(csvfile, save_path='tweets_clean.csv'):
    """Load a labelled tweet CSV and return its cleaned text and its labels.

    Parameters
    ----------
    csvfile : str
        Path of a CSV file with a 'text' column and a 'label' column.
    save_path : str or None, optional
        Where the cleaned text is also written as CSV; ``None`` skips the write.

    Returns
    -------
    (pandas.Series, pandas.Series)
        The cleaned text and the 'label' column of `csvfile`, sharing one index.

    Notes
    -----
    The cleaned text is returned straight from memory rather than re-read from
    the saved file: ``pd.read_csv`` turns strings such as "null", "nan" or an
    empty field into NaN, which would break later tokenisation.
    """
    tweets_df, text = _load_clean_text(csvfile, save_path)
    # Labels come from the same file as the text, so the two always line up.
    label = tweets_df['label']
    return text,label

def get_text_for_testing(csvfile, save_path='tweets_clean.csv'):
    """Load an unlabelled tweet CSV and return its cleaned text.

    Parameters
    ----------
    csvfile : str
        Path of a CSV file with a 'text' column.
    save_path : str or None, optional
        Where the cleaned text is also written as CSV; ``None`` skips the write.
        The default is the same file `get_text_and_label` writes, so pass a
        different path to keep both outputs.

    Returns
    -------
    pandas.Series
        The cleaned text, cleaned exactly as in `get_text_and_label`.
    """
    _, text = _load_clean_text(csvfile, save_path)
    return text
    
    

def get_voc_and_token_of_sentences(text):
    """Tokenise sentences on whitespace and build their vocabulary.

    Tokens containing the ellipsis character "…" are dropped from both the
    per-sentence token lists and the vocabulary.

    Parameters
    ----------
    text : iterable of str
        The sentences to tokenise.

    Returns
    -------
    (list of str, list of list of str)
        The sorted list of distinct kept tokens, and one token list per
        sentence in input order.
    """
    # Filter while building each list: removing items from a list that is being
    # iterated skips the element after every removal.
    token_of_sentences = [
        [word for word in sent.split() if '…' not in word]
        for sent in text
    ]
    # Sorted so that downstream word indices are the same on every run.
    token_voc = sorted({word for tokens in token_of_sentences for word in tokens})
    return token_voc, token_of_sentences

In [5]:
# Cleaned training text with its labels, and cleaned text of the unlabelled
# test file. Both calls write 'tweets_clean.csv', so after this cell that file
# holds the output of the second call.
text,label = get_text_and_label('tweet_train.csv')
output_text = get_text_for_testing('tweet_test.csv')

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the labelled data for testing, then 1/9 of the remaining 90%
# (another 10% of the total) for validation; the rest is the training set.
sens_,sens_test,y_,y_test = train_test_split(text,label,test_size=0.1, random_state=0) 
sens_train,sens_val,y_train,y_val = train_test_split(sens_,y_,test_size=1/9, random_state=0)

# The vocabulary is built from the training split only; the other splits are
# just tokenised.
vocalbulary,tos_train = get_voc_and_token_of_sentences(sens_train)
_,tos_val = get_voc_and_token_of_sentences(sens_val)
_,tos_test = get_voc_and_token_of_sentences(sens_test)

# Tokens of the unlabelled tweets that are predicted at the end of the notebook.
_,tos_output = get_voc_and_token_of_sentences(output_text)

In [7]:
import numpy as np
import torch

def randomly_initialised_vectors(tokens=None, dim=None, seed=None):
    """Assign an index and a random vector to every distinct token.

    Parameters
    ----------
    tokens : iterable of str, optional
        Tokens to index. Duplicates keep their first index. ``None`` is
        treated as an empty vocabulary.
    dim : int, optional
        Length of each vector; defaults to the notebook-level
        ``sen_vec_dimension``.
    seed : int, optional
        Seed for a private random generator, for reproducible vectors. When
        omitted, NumPy's global random state is used.

    Returns
    -------
    (numpy.ndarray, dict)
        An array of shape ``(len(mapping), dim)`` with values drawn uniformly
        from [0, 1), and the token-to-row mapping. Row 0 belongs to 'UNK'.
    """
    if dim is None:
        dim = sen_vec_dimension
    if tokens is None:
        tokens = ()

    wordToIx = {'UNK': 0}
    for token in tokens:
        if token not in wordToIx:
            wordToIx[token] = len(wordToIx)

    rng = np.random if seed is None else np.random.RandomState(seed)
    # One draw for the whole table consumes the random stream in the same
    # order as drawing one row at a time.
    word_vectors = rng.random_sample((len(wordToIx), dim))
    return word_vectors,wordToIx

def make_bow_vector(tokens,wordToIdx,wordVec):
    """Average the vectors of a sentence's known tokens.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of one sentence.
    wordToIdx : dict
        Maps a token to its row in `wordVec`; tokens not in it are ignored.
    wordVec : numpy.ndarray
        Word-vector table, one row per index.

    Returns
    -------
    torch.Tensor
        A float64 tensor of shape ``(1, width)`` where ``width`` is the row
        length of `wordVec`. It is all zeros when no token is known.
    """
    vec = np.zeros(np.shape(wordVec)[-1])
    count = 0
    for word in tokens:
        idx = wordToIdx.get(word)
        if idx is not None:
            vec += wordVec[idx]
            count += 1
    # Without this guard an empty or fully unknown sentence computes 0/0 and
    # yields an all-NaN vector.
    if count:
        vec = vec / count
    return torch.from_numpy(vec).view(1, -1)

def multi_sentences_to_vectors(sentences,wordToIdx,wordVec):
    """Return one `make_bow_vector` result per tokenised sentence, in input order."""
    return [
        make_bow_vector(sentence_tokens, wordToIdx, wordVec)
        for sentence_tokens in sentences
    ]

In [8]:
# Random word vectors and the word -> row index mapping for the training vocabulary.
word_vectors,word_to_idx = randomly_initialised_vectors(vocalbulary)

# One averaged sentence vector per tweet, for each data split.
sen_vec_train = multi_sentences_to_vectors(tos_train,word_to_idx,word_vectors)
sen_vec_val = multi_sentences_to_vectors(tos_val,word_to_idx,word_vectors)
sen_vec_test = multi_sentences_to_vectors(tos_test,word_to_idx,word_vectors)
# Vectors of the unlabelled tweets; used only to print predictions at the end.
sen_vec_output = multi_sentences_to_vectors(tos_output,word_to_idx,word_vectors)

In [16]:
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
from torch.functional import F

class QuestionClassifier(nn.Module):
    """Single linear layer over sentence vectors, trained one example at a time.

    The model runs in float64. It owns its loss (cross-entropy) and its
    optimiser (SGD, learning rate 0.1).

    Parameters
    ----------
    num_labels : int
        Number of output classes.
    input_dim : int, optional
        Length of the input sentence vectors; defaults to the notebook-level
        ``sen_vec_dimension``.
    """

    def __init__(self, num_labels, input_dim=None):
        super(QuestionClassifier, self).__init__()
        if input_dim is None:
            input_dim = sen_vec_dimension
        self.f1 = nn.Linear(input_dim, num_labels)

        # Convert the parameters to float64 before the optimiser captures them.
        self.double()
        self.loss_function = nn.CrossEntropyLoss()
        self.optimizer = optim.SGD(self.parameters(), lr=0.1)

        # Not used by any method of this class; kept for external code.
        self.test_vecs = []
        self.test_label = []
        self.label_to_ix = {}

    @staticmethod
    def _label_at(labels, i):
        """Return the i-th label by position, for a pandas Series or a plain sequence."""
        return labels.iloc[i] if hasattr(labels, 'iloc') else labels[i]

    def forward(self, input):
        """Return the raw class scores (logits) for a ``(1, input_dim)`` tensor."""
        # No softmax here: CrossEntropyLoss expects unnormalised scores.
        return self.f1(input)

    def train_model(self,sentence_vectors,labels):
        """Run one pass of per-example SGD over the data.

        Parameters
        ----------
        sentence_vectors : sequence of torch.Tensor
            One ``(1, input_dim)`` float64 tensor per example.
        labels : pandas.Series or sequence of int
            Class index of each example, read by position.
        """
        for i in range(0, len(sentence_vectors)):
            target = torch.tensor([int(self._label_at(labels, i))], dtype=torch.long)
            self.zero_grad()
            loss = self.loss_function(self(sentence_vectors[i]), target)
            loss.backward()
            self.optimizer.step()

    def test_model(self, test_sentence_vectors, test_labels):
        """Return the accuracy on the given data, rounded to 4 decimals.

        The result is the number of correct predictions divided by
        ``len(test_sentence_vectors)``; it is 0.0 when that length is zero.
        """
        data_size = len(test_sentence_vectors)
        correct_num = 0
        # Evaluation needs no gradients.
        with torch.no_grad():
            for i in range(len(test_labels)):
                predicted = int(self(test_sentence_vectors[i]).argmax(dim=1))
                if self._label_at(test_labels, i) == predicted:
                    correct_num += 1

        if data_size == 0:
            return 0.0
        return round(correct_num / data_size,4)

    def predict(self,test_sentence_vectors):
        """Return the predicted class index of every sentence vector, as a list of int."""
        with torch.no_grad():
            return [int(self(vector).argmax(dim=1)) for vector in test_sentence_vectors]
        
        

In [20]:
# One output unit per distinct label seen in the training split.
output_size = len(set(y_train))
model = QuestionClassifier(output_size)
# 20 passes over the training data, reporting validation accuracy after each.
for epoch in range(20):
    model.train_model(sen_vec_train,y_train)
    # validate the model
    acc = model.test_model(sen_vec_val,y_val)
    print('epoch:', epoch, ' dev_acc: ', acc)

epoch: 0  dev_acc:  0.3
epoch: 1  dev_acc:  0.34
epoch: 2  dev_acc:  0.38
epoch: 3  dev_acc:  0.4
epoch: 4  dev_acc:  0.43
epoch: 5  dev_acc:  0.45
epoch: 6  dev_acc:  0.45
epoch: 7  dev_acc:  0.44
epoch: 8  dev_acc:  0.43
epoch: 9  dev_acc:  0.44
epoch: 10  dev_acc:  0.45
epoch: 11  dev_acc:  0.45
epoch: 12  dev_acc:  0.44
epoch: 13  dev_acc:  0.42
epoch: 14  dev_acc:  0.43
epoch: 15  dev_acc:  0.43
epoch: 16  dev_acc:  0.44
epoch: 17  dev_acc:  0.44
epoch: 18  dev_acc:  0.46
epoch: 19  dev_acc:  0.47


In [21]:
# Accuracy of the trained model on the held-out test split.
acc = model.test_model(sen_vec_test,y_test)
print('test_acc: ', acc)

test_acc:  0.59


In [22]:
# Predicted class index for every tweet of the unlabelled test file.
print(model.predict(sen_vec_output))

[3, 1, 2, 1, 4, 2, 2, 2, 2, 2, 4, 1, 0, 4, 1, 2, 3, 2, 1, 0, 2, 2, 1, 1, 1, 2, 0, 1, 4, 2, 1, 1, 1, 1, 3, 3, 3, 4, 1, 4, 2, 1, 0, 1, 2, 2, 0, 1, 4, 1, 2, 4, 1, 4, 1, 2, 2, 4, 1, 1, 3, 2, 4, 4, 1, 3, 0, 2, 4, 4, 0, 2, 1, 1, 3, 1, 2, 1, 1, 3, 2, 1, 0, 1, 1, 4, 1, 1, 2, 3, 1, 0, 1, 1, 3, 4, 1, 2, 1, 2, 2, 2, 1, 1, 4, 4, 4, 3, 2, 4, 1, 4, 0, 3, 2, 1, 1, 1, 1, 2, 3, 4, 4, 1, 2, 0, 1, 4, 1, 2, 3, 2, 2, 1, 1, 4, 1, 2, 0, 2, 1, 0, 2, 1, 4, 1, 4, 4, 3, 2, 0, 3, 4, 2, 1, 1, 4, 0, 1, 1, 1, 1, 4, 2, 4, 2, 4, 2, 1, 1, 2, 0, 3, 2, 4, 4, 1, 1, 4, 4, 1, 1, 2, 4, 1, 2, 4, 0, 4, 1, 2, 1, 4, 1, 1, 2, 2, 0, 4, 1, 1, 1, 4, 1, 2, 1, 1, 2, 1, 3, 1, 1, 1, 4, 2, 2, 4, 2, 1, 4, 2, 4, 1, 3, 3, 4, 2, 1, 1, 3, 1, 1, 2, 2, 0, 0, 4, 1, 0, 2, 1, 1, 1, 1, 2, 4, 2, 4, 1, 0, 1, 1, 1, 1, 1, 1, 3, 4, 3, 1, 1, 4, 1, 4, 1, 1, 1, 4, 0, 4, 1, 2, 2, 1, 1, 1, 1, 1, 1, 4, 2, 1, 1, 0, 4, 2, 1, 1, 4, 4, 2, 4, 3, 4, 0, 1, 3, 1, 1, 1, 4, 1, 1, 2, 3, 1, 2, 1, 2, 3, 4, 1, 1, 1, 3, 1, 0, 3, 2, 1, 2, 1, 1, 1, 2, 2, 4, 1, 1, 2, 4, 1, 2, 