# 分类器实现

- 准备工作：载入需要的库

In [1]:
import pickle
from pandas.core.frame import DataFrame

import sqlite3
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

import numpy as np
import pandas as pd

- 准备工作：定义在训练中用到的且将要复用的函数

In [3]:
def split_spaces(data): ## 输入未处理的数据，第二列为文本，返回分割好第二列的列表。
    data_list = []
    for line in data:
        data_list.append(list(line))
        
    for item in data_list:
        item[1] = ' '.join(list(item[1]))
    return data_list

def desymbol(data):
    import re
    data_desymbol = []
    for line in data:
        data_desymbol.append(list(line))
    for row in data_desymbol:
        row[1] = ' '.join(list(re.sub("[\s+\.\!\/_,$%^*)(+\"\']+|[+——！，。？：、~@#￥%……&*（）“”]", "",row[1])))
    return data_desymbol

def add_start_end(data):
    data_with_se = []
    for line in data:
        data_with_se.append(list(line))
    for row in data_with_se:
        row[1] = "<start> "+row[1]+" <end>"
    return data_with_se

def countV(x_test):
    xtest_cv = cv.transform(x_test)
    return xtest_cv

def tfIdfV(x_test):
    x_test_tfidf = tv.transform(x_test)
    
    return x_test_tfidf

## Ⅰ 数据读入与预处理

In [4]:
def load_from_db():
    conn = sqlite3.connect("../data/database/texts.db")
    print ('Opened database successfully')
    c = conn.cursor()

    ci_data=[]
    cursor = c.execute("SELECT * from ci order by id desc limit 10")
    for row in cursor:
        ci_data.append(row)

    poet_data = []
    cursor=c.execute("SELECT * FROM poet order by id desc limit 10")
    for row in cursor:
        poet_data.append(row)

    classical_data = []
    cursor=c.execute("SELECT * FROM classical order by id desc limit 10")
    for row in cursor:
        classical_data.append(row)

    journal_data = []
    cursor=c.execute("SELECT * FROM journal order by id desc limit 10")
    for row in cursor:
        journal_data.append(row)
    
    news_data = []
    cursor=c.execute("SELECT * FROM news order by id asc limit 10")
    for row in cursor:
        news_data.append(row)

    print("所有类型数据读取成功，各10条。")
    data = ci_data + poet_data + classical_data + journal_data + news_data
    
    return add_start_end(desymbol(split_spaces(data)))

def load_from_file(addr):
    with open(addr,mode='r',encoding='UTF-8') as file:
        content = ''
        for line in file:
            content += str(line)
        file.close()
    print("通过文件读入的内容为：\n"+content)
    data = ['1',content,'unknown']
    return data

def load_from_input(text):
    data = ['1',text,'unknown']
    return data
    

## Ⅱ 模型文件载入

In [5]:
file = open("../model/NB-CV.pickle", "rb")
model_NBCV = pickle.load(file)
file.close()

file = open("../model/NB-TFIDF.pickle", "rb")
model_NBTFIDF = pickle.load(file)
file.close()

file = open("../model/LR-CV.pickle", "rb")
model_LRCV = pickle.load(file)
file.close()

file = open("../model/LR-TFIDF.pickle", "rb")
model_LRTFIDF = pickle.load(file)
file.close()

file = open("../model/CV.pkl", "rb")
cv = pickle.load(file)
file.close()

file = open("../model/tv.pkl", "rb")
tv = pickle.load(file)
file.close()

## Ⅲ 模型预测

In [6]:
def NBCV(textlist):
    s = model_NBCV.predict(countV(textlist))
    return s

def NBTFIDF(textlist):
    s = model_NBTFIDF.predict(tfIdfV(textlist))
    return s

def LRCV(textlist):
    s = model_LRCV.predict(countV(textlist))
    return s

def LRTFIDF(textlist):
    s = model_LRTFIDF.predict(tfIdfV(textlist))
    return s


## Ⅳ 实操与结果输出

In [7]:
def get_text(data):
    textlist = []
    if(len(np.array(data).shape) > 1):
        for row in data:
            textlist.append(row[1])
    else: textlist.append(data[1])
    return textlist

数据库读入测试：

In [8]:
textlist = get_text(load_from_db())
c = {"text":textlist,"type_NBCV":list(NBCV(textlist)),"type_NBCVTFIDF":list(NBTFIDF(textlist)),"type_LRCV":list(LRCV(textlist)),"type_LRTFIDF":list(LRTFIDF(textlist))}
df = pd.DataFrame(c)
display(df)

Opened database successfully
所有类型数据读取成功，各2000条。


Unnamed: 0,text,type_NBCV,type_NBCVTFIDF,type_LRCV,type_LRTFIDF
0,<start> 乍 卷 珠 帘 新 燕 入 <end>,ci,ci,poet,poet
1,<start> 楼 台 里 春 风 淡 荡 <end>,ci,poet,poet,poet
2,<start> 剪 新 幡 儿 斜 插 真 珠 髻 <end>,ci,poet,poet,poet
3,<start> 露 着 桂 枝 晓 霜 护 菊 篱 秋 无 尘 玉 宇 南 极 一 点 瑞 ...,ci,ci,ci,ci
4,<start> 十 日 後 重 阳 甘 菊 阶 前 满 意 黄 生 日 无 钱 留 贺 客 ...,ci,ci,ci,ci
5,<start> 龙 角 辉 春 蛾 春 惊 晓 梦 阑 金 翠 屏 开 异 芬 薰 室 风 ...,ci,ci,ci,ci
6,<start> 薰 梅 染 柳 借 得 东 君 手 柳 色 梅 香 到 樽 前 摅 写 才 ...,ci,ci,ci,ci
7,<start> 雪 坞 霜 林 一 夜 报 春 归 消 息 看 是 处 春 回 柳 眼 粉 ...,ci,ci,ci,ci
8,<start> 春 玉 苍 山 屏 星 暖 佳 辰 难 得 看 柳 眼 梅 金 全 似 海 ...,ci,ci,ci,ci
9,<start> 金 关 五 云 里 玉 座 太 微 间 凌 虚 新 就 燕 间 宣 唤 侍 ...,ci,ci,ci,ci


输入文本测试：

In [9]:
textlist = get_text(load_from_input("湖滨七舍群作为武汉大学湖滨七舍学子共同创建的宿舍交流群，致力于为学子提供一个便捷、良好的交流平台。同学们可以在群内进行学术交流、问题反映、发布通知等。同时，也希望同学们注意措辞，合理合法地参与公共事务。"))
c = {"text":textlist,"type_NBCV":list(NBCV(textlist)),"type_NBCVTFIDF":list(NBTFIDF(textlist)),"type_LRCV":list(LRCV(textlist)),"type_LRTFIDF":list(LRTFIDF(textlist))}
df = pd.DataFrame(c)
display(df)

Unnamed: 0,text,type_NBCV,type_NBCVTFIDF,type_LRCV,type_LRTFIDF
0,湖滨七舍群作为武汉大学湖滨七舍学子共同创建的宿舍交流群，致力于为学子提供一个便捷、良好的交流...,poet,poet,classical,classical


尴尬，全错