In [3]:
#import keras
#import gensim
import jieba
import pandas as pd
#import tensorflow as tf
import numpy as np
import re
import sys
import codecs
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline

#统计频率
def count(list):
    """Count how many times each distinct item occurs in an iterable.

    Args:
        list: Iterable of hashable items. (The parameter name shadows the
            builtin ``list``; it is kept because it is part of the
            function's existing signature.)

    Returns:
        A plain ``dict`` mapping each item to its number of occurrences,
        with keys in first-seen order.
    """
    tally = {}
    for item in list:
        # A missing key counts as zero, so first and later sightings share one path.
        tally[item] = tally.get(item, 0) + 1
    return tally

#计算百分比
def percentage(data):
    """Convert a mapping of counts into a mapping of relative shares.

    Args:
        data: Mapping from key to a numeric count.

    Returns:
        A ``dict`` with the same keys, where each value is the original
        count divided (via ``np.divide``) by the sum of all counts.
    """
    total = np.sum(list(data.values()))
    return {key: np.divide(value, total) for key, value in data.items()}
    
#分析训练数据
def analyse_traindata(train_data):
    """Print the label distribution of the training data.

    For the "subject" column and then the "sentiment_value" column, the
    share of each distinct value is computed with ``count`` and
    ``percentage`` and printed.

    Args:
        train_data: Object supporting ``train_data[column]`` lookup for the
            two columns above, each yielding an iterable of labels.

    Returns:
        Always ``0``.
    """
    for column in ("subject", "sentiment_value"):
        print(percentage(count(train_data[column])))
    return 0

#a="因为森林人即将换代，这套系统没必要装在一款即将换代的车型上，因为肯定会影响价格"

#只保留中文    
def chinese_remained(line):
    """Strip every character outside the range U+4E00..U+9FA5 from *line*.

    Args:
        line: Input text.

    Returns:
        The text with only characters in that code-point range kept
        (digits, Latin letters, punctuation and whitespace are removed).
    """
    return re.sub("[^\u4e00-\u9fa5]", "", line)
#b=chinese_remained(a)
#print(b)

#分词 直接用结巴分词
def participle(line):
    """Segment *line* into words with jieba and join them with single spaces.

    Args:
        line: Text to segment.

    Returns:
        The tokens produced by ``jieba.cut(line, cut_all=False)``, excluding
        tokens equal to a tab character, joined by one space each, with
        leading and trailing whitespace stripped.
    """
    tokens = (token for token in jieba.cut(line, cut_all=False) if token != '\t')
    return " ".join(tokens).strip()
#c=participle(b)
#print(c)  

#删除停用词 停用词库的选择？
# Parsed stop-word sets keyed by file path. delete_stopwords() is called once
# per row during preprocessing, so the file is read only on first use.
_STOPWORDS_CACHE = {}


def _load_stopwords(stop_words_source):
    """Return the stop words (one per line) in *stop_words_source* as a frozenset."""
    cached = _STOPWORDS_CACHE.get(stop_words_source)
    if cached is None:
        # The context manager closes the handle, which the old code leaked.
        with codecs.open(stop_words_source, 'r', encoding='utf-8') as handle:
            cached = frozenset(entry.strip() for entry in handle.readlines())
        _STOPWORDS_CACHE[stop_words_source] = cached
    return cached


def delete_stopwords(line, stop_words_source):
    """Remove stop words from a space-separated token string.

    Args:
        line: Tokens separated by single spaces (as produced by
            ``participle``).
        stop_words_source: Path to a UTF-8 text file with one stop word per
            line. The file is read on the first call for a given path and
            the parsed set is reused afterwards.

    Returns:
        The tokens that are not stop words, each stripped of surrounding
        whitespace, joined by single spaces, with leading and trailing
        whitespace removed.

    Raises:
        OSError: If the stop-word file cannot be opened.
    """
    stopwords = _load_stopwords(stop_words_source)
    stripped = (token.strip() for token in line.split(' '))
    kept = [word for word in stripped if word not in stopwords]
    return " ".join(kept).strip()
#d=delete_stopwords(c,stop_words_source)
#print(d)

#数据预处理 主要处理content
def pretreatment(train_data, stop_words_path=None):
    """Clean the "content" column of a DataFrame in place.

    Each entry is passed through ``chinese_remained``, then ``participle``,
    then ``delete_stopwords``, and the result replaces the original text.

    The column is transformed with ``Series.map`` rather than by indexing
    rows with a positional counter, so the function also works on frames
    whose index is not the default 0..n-1 range (e.g. after filtering).

    Args:
        train_data: DataFrame with a "content" column of strings.
        stop_words_path: Optional path to the stop-word file. When omitted,
            the module-level name ``stop_words_source`` is used, which the
            script entry point sets before calling this function.

    Returns:
        The same DataFrame object, with its "content" column replaced.
    """
    if stop_words_path is None:
        # Backward-compatible fallback to the global set by the script entry point.
        stop_words_path = stop_words_source

    def _clean(text):
        return delete_stopwords(participle(chinese_remained(text)), stop_words_path)

    train_data["content"] = train_data["content"].map(_clean)
    return train_data

#训练两个多分类器 ？sentiment_word不用管了
#svm
def classification(x_train, x_validation, y_train, y_validation, test_data):
    """Train a text classifier, report validation accuracy, and predict on test data.

    The model is a pipeline of ``CountVectorizer`` -> ``TfidfTransformer`` ->
    ``SGDClassifier`` with hinge loss and L2 penalty (per the scikit-learn
    documentation, hinge loss yields a linear SVM).

    Args:
        x_train: Training texts.
        x_validation: Validation texts.
        y_train: Training labels.
        y_validation: Validation labels, compared element-wise with the
            predictions to compute accuracy.
        test_data: Object exposing ``.values`` with the texts to predict
            (presumably a pandas Series — verify against caller).

    Returns:
        The predictions of the fitted pipeline for ``test_data.values``.
    """
    steps = [
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, max_iter=5)),
    ]
    model = Pipeline(steps)
    model.fit(x_train, y_train)

    # Fraction of validation samples whose predicted label matches the true one.
    accuracy = np.mean(model.predict(x_validation) == y_validation)
    print('SVM correct prediction: {:4.4f}'.format(accuracy))

    return model.predict(test_data.values)


if __name__ == '__main__':
    # NOTE: pretreatment() reads this module-level name, so it must be set
    # before any preprocessing happens.
    stop_words_source = './stopWord.txt'

    train_data = pd.read_csv('./train.csv', encoding='utf-8')
    test_data = pd.read_csv('./test_public.csv', encoding='utf-8')
    # Original author's note: only the sentiment_word column has missing
    # values in the training data (7778 rows).
    print("load data:")
    print(train_data.head())
    print(test_data.head())

    print("analyse data:")
    analyse_traindata(train_data)

    print("pretreatment:")
    train_data_p = pretreatment(train_data)
    print(train_data_p.head())
    test_data_p = pretreatment(test_data)
    print(test_data_p.head())

    # Hold out one fifth of the training data as a validation set. The two
    # targets are split independently of each other.
    xsub_train, xsub_validation, ysub_train, ysub_validation = train_test_split(
        train_data_p['content'].values, train_data_p['subject'].values, test_size=0.2)
    xsen_train, xsen_validation, ysen_train, ysen_validation = train_test_split(
        train_data_p['content'].values, train_data_p['sentiment_value'].values, test_size=0.2)

    # Train one classifier per target. The preprocessed test frame is passed
    # explicitly rather than relying on pretreatment() having mutated
    # test_data in place.
    print("train subject:")
    subject_predicted = classification(
        xsub_train, xsub_validation, ysub_train, ysub_validation, test_data_p['content'])
    print("train sentiment_value:")
    sentiment_value_predicted = classification(
        xsen_train, xsen_validation, ysen_train, ysen_validation, test_data_p['content'])

    # Write the predictions as CSV; the sentiment_word column is left empty.
    print("output")
    with codecs.open('./output.csv', "w", "utf-8") as outfile:
        outfile.write("content_id,subject,sentiment_value,sentiment_word\n")
        for content_id, subject, sentiment_value in zip(
                test_data_p["content_id"], subject_predicted, sentiment_value_predicted):
            outfile.write("{},{},{},\n".format(content_id, subject, sentiment_value))
    print("finished")

load data:
         content_id                                            content  \
0  vUXizsqexyZVRdFH           因为森林人即将换代，这套系统没必要装在一款即将换代的车型上，因为肯定会影响价格。   
1  4QroPd9hNfnCHVt7      四驱价格貌似挺高的，高的可以看齐XC60了，看实车前脸有点违和感。不过大众的车应该不会差。   
2  QmqJ2AvM5GplaRyz  斯柯达要说质量，似乎比大众要好一点，价格也低一些，用料完全一样。我听说过野帝，但没听说过你说...   
3  KMT1gFJiU4NWrVDn           这玩意都是给有钱任性又不懂车的土豪用的，这价格换一次我妹夫EP020可以换三锅了   
4  nVIlGd5yMmc37t1o                            17价格忒高，估计也就是14-15左右。      

  subject  sentiment_value sentiment_word  
0      价格                0             影响  
1      价格               -1              高  
2      价格                1              低  
3      价格               -1           有钱任性  
4      价格               -1              高  
         content_id                          content
0  XuPwKCnA2fqNh5vm             欧蓝德，价格便宜，森林人太贵啦！    
1  2jNbDn85goX3IuPE                楼主什么时候提的车，南昌优惠多少啊
2  hLgEADQ8sUnvGFK9         吉林，2.5优惠20000，送三年九次保养，贴膜
3  nZmM7LQsfr03wUaz     便宜2万的豪华特装，实用配制提升，优惠还给力，确实划算。
4  pwd8MnrthDqL