### project 1 level 1

##### step 1 : pre-processing

In [None]:
# pre-processing
import pickle
import jieba
import numpy as np

# word segmentation
# Shared topic -> integer id table; filled lazily by split_words so that
# every dataset passed through it uses the same label ids.
label2id = {}

def split_words(dataset):
    """Segment every title in ``dataset`` and collect it with its label.

    Args:
        dataset: mapping of topic -> iterable of records, each record
            exposing a ``"title"`` entry.

    Returns:
        A ``(raw_docs, docs, labels)`` tuple of parallel lists:
        ``raw_docs`` holds ``[topic, title]`` pairs, ``docs`` holds the
        jieba tokens of each title joined by single spaces, and ``labels``
        holds the integer id assigned to the topic in ``label2id``.
    """
    raw_docs, docs, labels = [], [], []
    for topic, records in dataset.items():
        # Unseen topics get the next free id; known topics keep theirs.
        topic_id = label2id.setdefault(topic, len(label2id))
        for record in records:
            title = record["title"]
            raw_docs.append([topic, title])
            docs.append(" ".join(jieba.cut(title)))
            labels.append(topic_id)
    return raw_docs, docs, labels

# Load the pickled train / validation splits (passed to split_words below).
with open("dataset/train.pkl", "rb") as f:
    train_data = pickle.load(f)
with open("dataset/valid.pkl", "rb") as f:
    valid_data = pickle.load(f)

# use baidu stopword set as stopwords
# Read inside a context manager so the file handle is closed right away
# instead of being left to the garbage collector.
with open("stopwords.txt", "r", encoding='utf-8') as f:
    stopwords = [line.strip('\n') for line in f]

# Segment both splits; labels share the ids stored in label2id.
train_raw, train_docs, train_labels = split_words(train_data)
valid_raw, valid_docs, valid_labels = split_words(valid_data)

In [None]:
# convert a segmented doc into a list of tokens, dropping stopwords
def doc2sent(doc, stop_words=None):
    """Split a space-joined document into tokens and drop stopwords.

    The tokens are filtered into a new list. Removing items from the list
    while looping over it skips the element that follows each removal, so
    runs of consecutive stopwords were only partially removed.

    Args:
        doc: string of tokens separated by single spaces.
        stop_words: optional container of tokens to drop; defaults to the
            module-level ``stopwords``.

    Returns:
        List of the remaining tokens in their original order.
    """
    if stop_words is None:
        stop_words = stopwords
    return [word for word in doc.split(' ') if word not in stop_words]

# Token lists for every training document, in the same order as train_docs.
train_sents = [doc2sent(train_doc) for train_doc in train_docs]

In [None]:
# count frequency
def sents2freq(sents):
    """Count how often each word occurs across all sentences.

    Args:
        sents: iterable of sentences, each an iterable of hashable words.

    Returns:
        ``(freq, word_num)`` where ``freq`` maps each word to its number of
        occurrences and ``word_num`` is the total number of words seen.
    """
    counts = {}
    for tokens in sents:
        for token in tokens:
            counts[token] = counts.get(token, 0) + 1
    # Every processed word added exactly one to some counter, so the grand
    # total is the sum of the per-word counts.
    return counts, sum(counts.values())

# Per-word occurrence counts and the total token count over the training
# sentences; both are module-level values read by chg_weight.
word_freq, total_words = sents2freq(train_sents)

##### step 2 : embedding

In [None]:
# word embedding
# skip-gram
from gensim.models import Word2Vec
from gensim.utils import pickle as gpickle

# Train Word2Vec on the tokenised training sentences; sg=1 is the only
# hyper-parameter set here (skip-gram, per the cell header), everything else
# is left at the library defaults.
word_model1 = Word2Vec(train_sents, sg=1)
# Persist the trained model with gensim's pickle helper; later cells reload
# it from 'word2vec1.pkl'.
gpickle(word_model1, 'word2vec1.pkl')

In [None]:
# word vector to sentence vector
# method 1 : Boolean weighting
def wv2sv(sent, word_modelx):
    """Average the vectors of the in-vocabulary words of a sentence.

    Args:
        sent: iterable of tokens.
        word_modelx: model exposing keyed vectors as ``.wv`` (membership
            test, item lookup and a ``vector_size`` attribute).

    Returns:
        1-D float array of length ``word_modelx.wv.vector_size``: the
        unweighted mean of the known word vectors, or all zeros when no
        token of ``sent`` is in the vocabulary.
    """
    keyed_vectors = word_modelx.wv
    # Take the dimensionality from the model instead of assuming 100, so the
    # helper keeps working if the model is trained with another vector size.
    wv_dim = keyed_vectors.vector_size
    known = [np.asarray(keyed_vectors[word])
             for word in sent if word in keyed_vectors]
    if not known:
        return np.zeros(wv_dim)
    # Accumulate in float64, matching the zero-initialised accumulator used
    # before.
    return np.mean(known, axis=0, dtype=np.float64)

In [None]:
# for word2vec Skipgram
from gensim.models import Word2Vec
from gensim.utils import unpickle as upickle
from gensim.utils import pickle as gpickle
import numpy as np
import pickle

# Reload the persisted skip-gram model.
word_model1 = upickle('word2vec1.pkl')

# One averaged sentence vector per training sentence.
sent_vecs_w2v1 = [wv2sv(sent, word_model1) for sent in train_sents]

# The context manager guarantees the file is closed even if dumping fails.
with open('sent2vec1.pkl', 'wb') as sent_out1:
    pickle.dump(sent_vecs_w2v1, sent_out1)

In [None]:
# for valid set
def wv2sv_valid(sent, word_model):
    """Average the in-vocabulary word vectors of a validation sentence.

    The computation is exactly the one ``wv2sv`` performs for training
    sentences, so this delegates to it rather than keeping a second copy
    that could drift out of sync.

    Args:
        sent: iterable of tokens.
        word_model: model exposing keyed vectors as ``.wv``.

    Returns:
        The sentence vector produced by ``wv2sv(sent, word_model)``.
    """
    return wv2sv(sent, word_model)

# Token lists for every validation document, in the same order as valid_docs.
valid_sents = [doc2sent(valid_doc) for valid_doc in valid_docs]

In [None]:
# valid set word2vec Skipgram 
import numpy as np
import pickle

# One averaged sentence vector per validation sentence.
sent_vecs_w2v_valid1 = [wv2sv_valid(sent, word_model1) for sent in valid_sents]

# The context manager guarantees the file is closed even if dumping fails.
with open('sent2vec1v.pkl', 'wb') as sent_out_valid1:
    pickle.dump(sent_vecs_w2v_valid1, sent_out_valid1)

In [None]:
# generate sentence vector from word2vec
# method2 : use SIF to combine word vectors
# SIF Smooth Inverse Frequency weighting scheme
from gensim.models import Word2Vec
from gensim.utils import unpickle as upickle
from gensim.utils import pickle as gpickle
import numpy as np
from sklearn.decomposition import TruncatedSVD
import pickle

def chg_weight(word, a=0.0007, freq=None, total=None):
    """Return the SIF weight ``a / (a + p(word))`` for ``word``.

    ``p(word)`` is the word's relative frequency. A word that is missing
    from the frequency table is treated as having frequency zero (weight
    1.0); previously that case hit an unbound local and raised.

    Args:
        word: token to weight.
        a: smoothing constant of the weighting scheme.
        freq: mapping of word -> occurrence count; defaults to the
            module-level ``word_freq``.
        total: total number of counted words; defaults to the module-level
            ``total_words``.

    Returns:
        The weight as a float.
    """
    if freq is None:
        freq = word_freq
    if total is None:
        total = total_words
    if not total:
        # No counted words at all: every probability is zero.
        return 1.0
    p_w = freq.get(word, 0) / total
    return a / (a + p_w)

def compute_svd(X, nc=1):
    """Fit a truncated SVD on ``X`` and return its components.

    Args:
        X: matrix handed to ``TruncatedSVD.fit``.
        nc: number of components to keep.

    Returns:
        The ``components_`` attribute of the fitted estimator.
    """
    # Fixed iteration count and seed, as before, so repeated runs agree.
    return TruncatedSVD(n_components=nc, n_iter=10, random_state=0).fit(X).components_

def remove_svd(X, nc=1):
    """Subtract from ``X`` its projection onto the leading SVD components.

    Args:
        X: 2-D array with one row per sentence vector.
        nc: number of components to remove.

    Returns:
        Array of the same shape as ``X`` with the projection removed.
    """
    components = compute_svd(X, nc)
    # Coordinates of every row along each component.
    coords = X.dot(components.transpose())
    if nc == 1:
        # Single component: broadcasting rebuilds the projection.
        return X - coords * components
    return X - coords.dot(components)

def wv2sv_optim(sent, word_model):
    """Build a frequency-weighted average of a sentence's word vectors.

    Each in-vocabulary word vector is scaled by ``chg_weight(word)`` before
    averaging over the number of in-vocabulary words.

    Args:
        sent: iterable of tokens.
        word_model: model exposing keyed vectors as ``.wv``.

    Returns:
        1-D float array of length ``word_model.wv.vector_size``; all zeros
        when no token is in the vocabulary.
    """
    keyed_vectors = word_model.wv
    # Use the model's own dimensionality rather than a hard-coded 100.
    sent_vec = np.zeros(keyed_vectors.vector_size)
    words_num = 0
    for word in sent:
        if word in keyed_vectors:
            words_num += 1
            sent_vec += np.asarray(keyed_vectors[word]) * chg_weight(word)
    if words_num == 0:
        # Nothing was accumulated, so this is still the zero vector.
        return sent_vec
    return sent_vec / words_num

def SIF(sents, word_model):
    """Compute SIF sentence embeddings for ``sents``.

    Every sentence is turned into a weighted average vector with
    ``wv2sv_optim``; the leading SVD component of the resulting matrix is
    then removed with ``remove_svd``.

    NOTE(review): the component is estimated from the matrix passed in, so
    separate calls for train and validation data remove different
    components — confirm this is intended.

    Args:
        sents: sequence of token lists.
        word_model: model exposing keyed vectors as ``.wv``.

    Returns:
        Array of shape ``(len(sents), word_model.wv.vector_size)``.
    """
    # Use the model's own dimensionality rather than a hard-coded 100.
    sv_dim = word_model.wv.vector_size
    X = np.zeros((len(sents), sv_dim))
    for row, sent in enumerate(sents):
        X[row, :] = wv2sv_optim(sent, word_model)
    return remove_svd(X)

# Reload the skip-gram model and build SIF embeddings for both splits.
word_model1 = upickle('word2vec1.pkl')

sent_vecs_optim = SIF(train_sents, word_model1)
sent_vecs_valid_optim = SIF(valid_sents, word_model1)

# Persist train first, then validation, each in its own pickle file.
for out_path, matrix in (
    ('sent2vec_sif1.pkl', sent_vecs_optim),
    ('sent2vec_sif1v.pkl', sent_vecs_valid_optim),
):
    with open(out_path, 'wb') as f:
        pickle.dump(matrix, f)

In [None]:
# generate sentence vector directly from doc
# doc2vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.utils import pickle as gpickle

# Tag every training sentence with its position so Doc2Vec can address it.
train_tdocs = [TaggedDocument(tokens, [idx]) for idx, tokens in enumerate(train_sents)]
sents_num = len(train_tdocs)
doc_model = Doc2Vec(train_tdocs, window=5)
# Persist the trained model with gensim's pickle helper.
gpickle(doc_model, 'doc2vec.pkl')

In [None]:
# for doc2vec
# sent vecs for train & valid
import numpy as np
import pickle

def dv2sv(sent, doc_model):
    """Return the vector ``doc_model.infer_vector`` produces for ``sent``.

    Args:
        sent: forwarded unchanged to ``infer_vector``; the loops in this
            cell pass token lists.
        doc_model: object providing ``infer_vector``.

    Returns:
        Whatever ``doc_model.infer_vector(sent)`` returns.
    """
    return doc_model.infer_vector(sent)

# Infer one document vector per training sentence via the dv2sv helper.
sent_vecs_d2v = [dv2sv(sent, doc_model) for sent in train_sents]

# Context managers guarantee the files are closed even if dumping fails.
with open('doc2vec_train.pkl', 'wb') as sent_out2:
    pickle.dump(sent_vecs_d2v, sent_out2)

# Same for the validation sentences.
sent_vecs_d2v_valid = [dv2sv(sent, doc_model) for sent in valid_sents]

with open('doc2vec_valid.pkl', 'wb') as sent_out_valid2:
    pickle.dump(sent_vecs_d2v_valid, sent_out_valid2)

##### step 3 : classification

In [None]:
# classify by naive bayes / Logistic Regression / SVM
from sklearn.naive_bayes import GaussianNB as GNB
from sklearn.linear_model import LogisticRegression as LR
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler

def predict_label_scale(sent_vecs, sent_vecs_valid, train_labels, train_model):
    """Min-max scale the features, fit a classifier and predict labels.

    The scaler is fitted on the training features only and then applied to
    the validation features. Fitting a second scaler on the validation set
    would map the two sets with different ranges, so the classifier would
    see validation features on a different scale than it was trained on.

    Args:
        sent_vecs: training feature vectors (array-like, one row each).
        sent_vecs_valid: validation feature vectors (array-like).
        train_labels: labels aligned with ``sent_vecs``.
        train_model: zero-argument callable returning an estimator with
            ``fit(X, y)`` and ``predict(X)``.

    Returns:
        The estimator's predictions for the validation vectors.
    """
    scaler = MinMaxScaler()
    train_x = scaler.fit_transform(np.asarray(sent_vecs))
    # Reuse the training ranges; do not re-fit on validation data.
    valid_x = scaler.transform(np.asarray(sent_vecs_valid))
    model = train_model()
    model.fit(train_x, train_labels)
    return model.predict(valid_x)

def predict_label(sent_vecs, sent_vecs_valid, train_labels, train_model):
    """Fit a fresh classifier on the training vectors and predict labels.

    Args:
        sent_vecs: training feature vectors (array-like, one row each).
        sent_vecs_valid: validation feature vectors (array-like).
        train_labels: labels aligned with ``sent_vecs``.
        train_model: zero-argument callable returning an estimator with
            ``fit(X, y)`` and ``predict(X)``.

    Returns:
        The estimator's predictions for the validation vectors.
    """
    features_train = np.asarray(sent_vecs)
    features_valid = np.asarray(sent_vecs_valid)
    classifier = train_model()
    classifier.fit(features_train, train_labels)
    return classifier.predict(features_valid)

In [None]:
# for word2vec Skipgram Booleanw
import pickle

# Reload the averaged word2vec sentence vectors for both splits.
with open('sent2vec1.pkl', 'rb') as f:
    sent_vecs_w2v1 = pickle.load(f)
with open('sent2vec1v.pkl', 'rb') as f:
    sent_vecs_w2v_valid1 = pickle.load(f)

# Logistic-regression predictions, then the share of exact label matches.
pred_w2v1 = predict_label(sent_vecs_w2v1, sent_vecs_w2v_valid1, train_labels, LR)
hits_w2v1 = sum(pred_w2v1 == valid_labels)
print('acc = %.4f' % (hits_w2v1 / len(valid_labels)))

In [None]:
# for word2vec Skipgram SIF
# Logistic-regression predictions on the SIF vectors, then accuracy.
pred_w2v2 = predict_label(sent_vecs_optim, sent_vecs_valid_optim, train_labels, LR)
hits_w2v2 = sum(pred_w2v2 == valid_labels)
print('acc = %.4f' % (hits_w2v2 / len(valid_labels)))

In [None]:
# for doc2vec
import pickle

# Reload the doc2vec sentence vectors for both splits.
with open('doc2vec_train.pkl', 'rb') as f:
    sent_vecs_d2v = pickle.load(f)
with open('doc2vec_valid.pkl', 'rb') as f:
    sent_vecs_d2v_valid = pickle.load(f)

# Logistic-regression predictions, then the share of exact label matches.
pred_d2v = predict_label(sent_vecs_d2v, sent_vecs_d2v_valid, train_labels, LR)
hits_d2v = sum(pred_d2v == valid_labels)
print('acc = %.4f' % (hits_d2v / len(valid_labels)))

##### step 4 : similarity

In [None]:
# word mover distance
# use wmdistance defined in gensim
def Find_wmd(query_sent, word_model, top_k=20):
    """Print the training titles closest to a query under Word Mover's Distance.

    The raw query is segmented with jieba and stopword-filtered in the same
    way as the training titles, then compared against every entry of
    ``train_sents``. The ``top_k`` entries with the smallest distance are
    printed as ``topic title`` lines taken from ``train_raw``.

    Args:
        query_sent: raw, unsegmented query string.
        word_model: a word-vector model; either an object exposing keyed
            vectors as ``.wv`` or the keyed vectors themselves.
        top_k: number of nearest training titles to print.
    """
    # Tokenise the query like the corpus (split_words + doc2sent); passing
    # the raw string would make the distance operate on single characters.
    query_seg = doc2sent(" ".join(jieba.cut(query_sent)))
    # wmdistance lives on the keyed vectors; accept a full model as well.
    vectors = getattr(word_model, "wv", word_model)
    score = np.array([vectors.wmdistance(query_seg, sent) for sent in train_sents])
    # Stable sort keeps corpus order among equally distant titles.
    for i in np.argsort(score, kind="stable")[:top_k]:
        print(train_raw[i][0], train_raw[i][1])

In [None]:
# samples
# Run the WMD search for one sample headline using the skip-gram model.
Find_wmd('王者荣耀国际版入选东南亚运动会电竞项目', word_model1)