### project 1 level 1

##### step 1 : BoW

In [None]:
import pickle
import jieba
import numpy as np

# Topic name -> integer class id. Shared across split_words calls so that the
# same topic receives the same id in every dataset that is processed.
label2id = {}


def split_words(dataset):
    """Tokenize every title in *dataset* and collect parallel lists.

    Args:
        dataset: mapping from topic name to an iterable of records; each
            record is indexed with ``"title"`` to obtain the text.

    Returns:
        A ``(raw_docs, docs, labels)`` tuple where ``raw_docs`` holds
        ``[topic, title]`` pairs, ``docs`` holds the jieba tokens of each
        title joined by single spaces, and ``labels`` holds the integer id
        of each title's topic (ids are allocated in ``label2id``).
    """
    raw_docs, docs, labels = [], [], []
    for topic, records in dataset.items():
        # Unseen topics get the next free id; known topics keep theirs.
        topic_id = label2id.setdefault(topic, len(label2id))
        for record in records:
            title = record["title"]
            raw_docs.append([topic, title])
            docs.append(" ".join(jieba.cut(title)))
            labels.append(topic_id)
    return raw_docs, docs, labels

# Load the pickled train / validation datasets.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load dataset files that come from a trusted source.
with open("dataset/train.pkl", "rb") as f:
    train_data = pickle.load(f)
with open("dataset/valid.pkl", "rb") as f:
    valid_data = pickle.load(f)

# Read the stop-word list (one word per line) inside a context manager so the
# file handle is closed deterministically. It stays a list because it is
# passed to CountVectorizer(stop_words=...) further down.
with open("stopwords.txt", "r", encoding='utf-8') as stopword_file:
    stopwords = [line.strip('\n') for line in stopword_file]

# Tokenize both splits; label ids are shared through label2id.
train_raw, train_docs, train_labels = split_words(train_data)
valid_raw, valid_docs, valid_labels = split_words(valid_data)

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
import pandas as pd

# Learn a bag-of-words vocabulary (at most 1000 terms, stop words excluded)
# from the training documents only.
bowModel = CountVectorizer(stop_words=stopwords, max_features=1000)
bowModel.fit(train_docs)

# Vectorize both splits with the vocabulary learned above.
train_x, valid_x = (bowModel.transform(docs) for docs in (train_docs, valid_docs))

# Baseline classifier: multinomial naive Bayes on the raw counts
# (fit() returns the estimator itself).
model = MultinomialNB().fit(train_x, train_labels)

# Validation accuracy = share of predictions equal to the true label.
prediction = model.predict(valid_x)
n_correct = sum(prediction == valid_labels)
print('acc = %.4f' % (n_correct / len(valid_labels)))

##### step 2 : extract 2-word tokens

In [None]:
# simple process to reduce search space

# convert a doc (space-joined tokens) into a sentence (list of words)
def doc2sent(doc, stop_words=None):
    """Split a space-joined document into words and drop stop words.

    The previous implementation removed items from the list while iterating
    over it, which skips the element following every removal and therefore
    left stop words in the result (e.g. two stop words in a row). A new list
    is built instead so every occurrence is filtered.

    Args:
        doc: string of tokens separated by single spaces.
        stop_words: optional iterable of words to drop. Defaults to the
            module-level ``stopwords`` list.

    Returns:
        List of the remaining words in their original order. Empty-string
        tokens produced by ``split(' ')`` are kept unless they are listed as
        stop words.
    """
    if stop_words is None:
        stop_words = stopwords
    # A set makes each membership test O(1) instead of scanning the list.
    blocked = set(stop_words)
    return [word for word in doc.split(' ') if word not in blocked]

# One stop-word-filtered word list per training document, in document order.
train_sents = [doc2sent(doc) for doc in train_docs]

# count frequency
def sents2freq(sents):
    """Count word occurrences over a collection of sentences.

    Args:
        sents: iterable of sentences, each an iterable of hashable words.

    Returns:
        ``(freq, word_num)`` where ``freq`` maps each word to its number of
        occurrences (keys in first-seen order) and ``word_num`` is the total
        number of words seen.
    """
    freq = {}
    for sent in sents:
        for word in sent:
            freq[word] = freq.get(word, 0) + 1
    # Every word contributed exactly one count, so the counts sum to the total.
    return freq, sum(freq.values())

# Per-word occurrence counts over the training sentences, plus the total
# number of words counted.
word_freq, total_words = sents2freq(train_sents)

In [None]:
# filter by freq: keep words seen at least 40 times, excluding the empty
# token and the two punctuation marks below (order follows word_freq).
words_hf = [
    word
    for word, count in word_freq.items()
    if count >= 40 and word not in ('', '，', '？')
]

def extract_token(sents, window=2, vocab=None, min_count=60):
    """Find frequent pairs of adjacent high-frequency words.

    Two consecutive words of a sentence form a pair only when both belong to
    the vocabulary; an out-of-vocabulary word breaks the adjacency.

    Compared with the previous version, words are looked up through a dict
    (instead of ``in`` / ``.index`` scans of the vocabulary list for every
    token) and only observed pairs are counted (instead of a dense
    ``len(vocab) x len(vocab)`` matrix). The returned list is unchanged.

    Args:
        sents: iterable of sentences, each an iterable of words.
        window: currently unused; only directly adjacent words are paired.
            Kept so existing callers keep working.
        vocab: sequence of words eligible for pairing. Defaults to the
            module-level ``words_hf`` list.
        min_count: minimum number of occurrences (expected to be >= 1) for a
            pair to be returned. Defaults to 60.

    Returns:
        List of ``(first_word, second_word)`` tuples, ordered by the position
        of ``first_word`` and then of ``second_word`` in the vocabulary.
    """
    if vocab is None:
        vocab = words_hf

    # word -> position in the vocabulary; setdefault keeps the first position
    # of a repeated word, matching list.index().
    index_of = {}
    for position, word in enumerate(vocab):
        index_of.setdefault(word, position)

    pair_counts = {}
    for sent in sents:
        prev_idx = None
        for word in sent:
            cur_idx = index_of.get(word)
            if prev_idx is not None and cur_idx is not None:
                key = (prev_idx, cur_idx)
                pair_counts[key] = pair_counts.get(key, 0) + 1
            prev_idx = cur_idx

    # Sorting the index pairs reproduces the row-major order in which the
    # dense count matrix used to be scanned.
    frequent = sorted(key for key, count in pair_counts.items() if count >= min_count)
    return [(vocab[i], vocab[j]) for i, j in frequent]

# Frequent adjacent word pairs found in the training sentences; each pair
# becomes one extra feature column in add_feature.
token2_feature = extract_token(train_sents)

In [None]:
def add_feature(mat):
    newf_num = len(token2_feature)
    sents_num = mat.shape[0]
    feats_num = mat.shape[1] + newf_num
    maty = np.zeros((sents_num, feats_num))
    for i in range(sents_num):
        for j in range(mat.shape[1]):
            maty[i, j] = mat[i, j]
        prev_word = None
        for word in train_sents:
            if prev_word != None:
                if (prev_word, word) in token2_feature:
                    nf = token2_feature.index((prev_word, word))
                    maty[i, mat.shape[1] + nf] += 1
            prev_word = word
    return maty

##### step 3 : prediction

In [None]:
train_y = add_feature(train_x)
valid_y = add_feature(valid_x)

model1 = MultinomialNB()
model1.fit(train_y, train_labels)

prediction1 = model1.predict(valid_y)
print('acc = %.4f' % (sum(prediction1 == valid_labels) / len(valid_labels)))

In [None]:
def Find(query_str, train_set):
    """Print the 20 training titles closest to *query_str* (L1 distance).

    The query is tokenized with jieba and vectorized with ``bowModel``. When
    *train_set* has more columns than the bag-of-words vocabulary (i.e. it
    carries the extra ``token2_feature`` columns), the query's own adjacent
    word-pair counts are appended so both sides have the same width. The
    previous version compared a bag-of-words-only query against such
    matrices, and wrapped scipy sparse matrices in ``np.array`` (which yields
    0-d object arrays), so it could not handle the dense, widened matrix.

    Args:
        query_str: raw query text.
        train_set: 2-D feature matrix (scipy sparse or ndarray) whose rows
            are aligned with ``train_raw``.

    Raises:
        ValueError: if the query vector cannot be matched to the number of
            columns of *train_set*.
    """
    query_doc = " ".join(jieba.cut(query_str))
    vec = bowModel.transform([query_doc]).toarray().ravel().astype(float)

    rows_num, cols_num = train_set.shape
    if cols_num > vec.shape[0]:
        # Build the query's pair-count columns the same way the training
        # sentences are built: stop-word filtering, then adjacent pairs.
        pair_index = {}
        for offset, pair in enumerate(token2_feature):
            pair_index.setdefault(pair, offset)
        pair_counts = np.zeros(len(token2_feature))
        words = doc2sent(query_doc)
        for pair in zip(words, words[1:]):
            offset = pair_index.get(pair)
            if offset is not None:
                pair_counts[offset] += 1
        vec = np.concatenate([vec, pair_counts])
    if vec.shape[0] != cols_num:
        raise ValueError(
            "Find: query has %d features but train_set has %d columns"
            % (vec.shape[0], cols_num)
        )

    # L1 distance to every row, computed in row chunks so a sparse train_set
    # is never densified all at once.
    score = np.zeros(rows_num)
    chunk = 4096
    for start in range(0, rows_num, chunk):
        block = train_set[start:start + chunk]
        if hasattr(block, "toarray"):
            block = block.toarray()
        score[start:start + chunk] = np.abs(np.asarray(block) - vec).sum(axis=1)

    # Stable sort: ties keep their original row order, like list.sort().
    for i in np.argsort(score, kind="stable")[:20]:
        print(train_raw[i][0], train_raw[i][1])

In [None]:
# samples: run the same query against the plain bag-of-words matrix and then
# against the matrix extended with word-pair features.
for feature_matrix in (train_x, train_y):
    Find("王者荣耀国际版入选东南亚运动会电竞项目", feature_matrix)