In [26]:
import pandas as pd # 读取文件
import numpy as np # 数值计算
import nagisa # 日文分词
from sklearn.feature_extraction.text import TfidfVectorizer # 文本特征提取
from sklearn.linear_model import LogisticRegression # 逻辑回归
from sklearn.pipeline import make_pipeline # 组合流水线

[dynet] random seed: 1234
[dynet] allocating memory: 32MB
[dynet] memory allocation done.


In [9]:
import pandas as pd

train_cn = pd.read_excel('data/汽车领域多语种迁移学习挑战赛初赛训练集/中文_trian.xlsx')
train_ja = pd.read_excel('data/汽车领域多语种迁移学习挑战赛初赛训练集/日语_train.xlsx')
train_en = pd.read_excel('data/汽车领域多语种迁移学习挑战赛初赛训练集/英文_train.xlsx')
test_ja = pd.read_excel('data/testA.xlsx', sheet_name='日语_testA')
test_en = pd.read_excel('data/testA.xlsx', sheet_name='英文_testA')

In [21]:
# 查看每列字段类型
train_obj = train_en.columns
print(train_obj)
for col in train_obj:
    print(col+' : '+str(type(train_en[col][0])))

Index(['原始文本', '中文翻译', '意图', '槽值1', '槽值2'], dtype='object')
原始文本 : <class 'str'>
中文翻译 : <class 'str'>
意图 : <class 'str'>
槽值1 : <class 'float'>
槽值2 : <class 'float'>


In [18]:
# 查看每列字段类型
test_obj = test_en.columns
print(test_obj)
for col in test_obj:
    print(col+' : '+str(type(test_en[col][0])))

Index(['原始文本'], dtype='object')
原始文本 : <class 'str'>


In [22]:
import jieba
import jieba.posseg as pseg

words = jieba.lcut("阿水是一个好同志。")
print(words)
# ['阿水', '是', '一个', '好', '同志', '。']

words = pseg.lcut("阿水是一个好同志。")
# [pair('阿水', 'nr'), pair('是', 'v'), pair('一个', 'm'), pair('好', 'a'), pair('同志', 'n'), pair('。', 'x')]

Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
Loading model cost 0.869 seconds.
Prefix dict has been built successfully.


['阿水', '是', '一个', '好', '同志', '。']


In [28]:
# 文本分词
train_ja['words'] = train_ja['原始文本'].apply(lambda x: ' '.join(nagisa.tagging(x).words))
train_en['words'] = train_en['原始文本'].apply(lambda x: x.lower())

test_ja['words'] = test_ja['原始文本'].apply(lambda x: ' '.join(nagisa.tagging(x).words))
test_en['words'] = test_en['原始文本'].apply(lambda x: x.lower())

In [29]:
# 训练TFIDF和逻辑回归
pipline = make_pipeline(
    TfidfVectorizer(),
    LogisticRegression()
)
pipline.fit(
    train_ja['words'].tolist() + train_en['words'].tolist(),
    train_ja['意图'].tolist() + train_en['意图'].tolist()
)

Pipeline(memory=None,
         steps=[('tfidfvectorizer',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('logisticregression',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercep

In [30]:
# 模型预测
test_ja['意图'] = pipline.predict(test_ja['words'])
test_en['意图'] = pipline.predict(test_en['words'])
test_en['槽值1'] = np.nan
test_en['槽值2'] = np.nan

test_ja['槽值1'] = np.nan
test_ja['槽值2'] = np.nan

# 写入提交文件
writer = pd.ExcelWriter('submit.xlsx')
test_en.drop(['words'], axis=1).to_excel(writer, sheet_name='英文_testA', index=None)
test_ja.drop(['words'], axis=1).to_excel(writer, sheet_name='日语_testA', index=None)
writer.save()
writer.close()