## 模型配置

In [111]:
import sagemaker
from sagemaker import get_execution_role
import json
import boto3
from random import shuffle
import jieba
import re
import pandas as pd
import os
import sys

sess = sagemaker.Session()
role = get_execution_role()
bucket = sess.default_bucket()
prefix = 'blazingtext/Sentiment_classification'

## 将金融情感词典导入jieba库

In [112]:
with open('jieba.txt', 'r',encoding='gbk') as f:
    jieba_list = f.read().splitlines()
jieba.load_userdict(jieba_list)

## 处理通联数据 

In [113]:
# # 数据读取
# data = pd.read_csv('news_part.csv')
# data.drop_duplicates(subset=['news_title'],keep='first',inplace=True)
# data = data.applymap((lambda x:"".join(x.split()) if type(x) is str else x))
# data['lable'] =['__label__%i' % i for i in data['SENTIMENT']]
# data['aws_format'] = data['lable'] + '  ' + data['news_title']

# #取各类情感数据各5000条
# positive = data[data['SENTIMENT']==1].head(5000)
# negative = data[data['SENTIMENT']==-1].head(5000)
# neutral = data[data['SENTIMENT'] == 0].head(5000)

# #整理并打乱数据
# result = positive.append(negative)
# result = result.append(neutral)
# result = result.sample(frac=1)

# #将数据处理成AWS需要的format，保存到data_new中
# result['aws_format'].to_csv('data.txt',index=False,header=None,encoding='utf_8_sig')
# lines = open('data.txt').readlines() #打开文件，读入每一行
# file = open('data_new.txt','w') #打开你要写得文件pp2.txt
# for s in lines:
#     file.write(s.replace('\"','')) # replace是替换，write是写入
# file.close()

## 使用jieba将文本数据进行分词

In [114]:
file  = 'data_new.txt'
with open(file,encoding='utf-8') as f:
    lines = f.readlines()

labels = []
for line in lines:
    label = []
    line = line.split('  ')
    label.append(line[0])
    line[1] = re.sub(r"[\s+\.\!\/_,$%^*()?;；:-【】+\"\']+|[+——一！，;:：。？、~@#￥%……&*（）]+", "", line[1])
    label.extend(jieba.cut(line[1],cut_all=False))
    labels.append(label)
    
shuffle(labels)

## 将数据分割成训练集和验证集

In [115]:
t_train_data = labels[0:int(len(labels)*0.8)]
t_validation_data = labels[int(len(labels)*0.8):]

In [116]:
import csv
t_train_file = 'tt.train'
t_validation_file = 'tt.validation'

with open(t_train_file, 'w') as csvoutfile:
    csv_writer = csv.writer(csvoutfile, delimiter=' ', lineterminator='\n')
    csv_writer.writerows(t_train_data)
    
with open(t_validation_file, 'w') as csvoutfile:
    csv_writer = csv.writer(csvoutfile, delimiter=' ', lineterminator='\n')
    csv_writer.writerows(t_validation_data)

In [117]:
%%time

t_train_channel = prefix + '/train'
t_validation_channel = prefix + '/validation'

sess.upload_data(path='tt.train', bucket=bucket, key_prefix=t_train_channel)
sess.upload_data(path='tt.validation', bucket=bucket, key_prefix=t_validation_channel)

s3_train_data = 's3://{}/{}'.format(bucket, t_train_channel)
s3_validation_data = 's3://{}/{}'.format(bucket, t_validation_channel)

s3_output_location = 's3://{}/{}/output'.format(bucket, prefix)

CPU times: user 194 ms, sys: 25.6 ms, total: 219 ms
Wall time: 761 ms


In [118]:
s3_output_location = 's3://{}/{}/output'.format(bucket, prefix)

In [119]:
region_name = boto3.Session().region_name
container = sagemaker.amazon.amazon_estimator.get_image_uri(region_name, "blazingtext", "latest")
print('Using SageMaker BlazingText container: {} ({})'.format(container, region_name))

'get_image_uri' method will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.


Using SageMaker BlazingText container: 501404015308.dkr.ecr.ap-northeast-1.amazonaws.com/blazingtext:latest (ap-northeast-1)


## 超参数配置 

In [120]:
t_bt_model = sagemaker.estimator.Estimator(container,
                                         role, 
                                         train_instance_count=1, 
                                         train_instance_type='ml.c4.4xlarge',
                                         train_volume_size = 30,
                                         train_max_run = 360000,
                                         input_mode= 'File',
                                         output_path=s3_output_location,
                                         sagemaker_session=sess)
t_bt_model.set_hyperparameters(mode="supervised",
                            buckets=3396748,
                            epochs=9,
                            min_count=5,
                            learning_rate=0.05,
                            vector_dim=84,
                            early_stopping=False,
                            min_epochs=5,
                            word_ngrams=2)

Parameter image_name will be renamed to image_uri in SageMaker Python SDK v2.


## 数据导入

In [121]:
t_train_data = sagemaker.inputs.s3_input(s3_train_data, distribution='FullyReplicated', 
                        content_type='text/plain', s3_data_type='S3Prefix')
t_validation_data = sagemaker.inputs.s3_input(s3_validation_data, distribution='FullyReplicated', 
                             content_type='text/plain', s3_data_type='S3Prefix')
t_data_channels = {'train': t_train_data, 'validation': t_validation_data}

's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.
's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.


## 模型训练

In [122]:
t_bt_model.fit(inputs=t_data_channels, logs=True)

2020-11-07 02:28:15 Starting - Starting the training job...
2020-11-07 02:28:17 Starting - Launching requested ML instances......
2020-11-07 02:29:32 Starting - Preparing the instances for training......
2020-11-07 02:30:33 Downloading - Downloading input data
2020-11-07 02:30:33 Training - Downloading the training image..[34mArguments: train[0m
[34m[11/07/2020 02:30:49 INFO 140164474562368] nvidia-smi took: 0.0251748561859 secs to identify 0 gpus[0m
[34m[11/07/2020 02:30:49 INFO 140164474562368] Running single machine CPU BlazingText training using supervised mode.[0m
[34m[11/07/2020 02:30:49 INFO 140164474562368] Processing /opt/ml/input/data/train/tt.train . File size: 13 MB[0m
[34m[11/07/2020 02:30:49 INFO 140164474562368] Processing /opt/ml/input/data/validation/tt.validation . File size: 3 MB[0m
[34mRead 2M words[0m
[34mNumber of words:  24798[0m
[34m##### Alpha: 0.0444  Progress: 11.14%  Million Words/sec: 21.78 #####[0m
[34m##### Alpha: 0.0389  Progress: 22.28%

## 模型部署

In [123]:
t_text_classifier = t_bt_model.deploy(initial_instance_count = 1,instance_type = 'ml.t2.medium')

Parameter image will be renamed to image_uri in SageMaker Python SDK v2.


-------------------!

## 情感预测

In [202]:
def Sentiment_classification(sentences):
    sentences = sentences
    tokenized_sentences = [' '.join(jieba.cut(sentences,cut_all=False))]
    payload = {"instances" : tokenized_sentences}
    t_response = t_text_classifier.predict(json.dumps(payload))
    t_predictions = json.loads(t_response)
    predictions = re.sub(r"[\s+\!\/_$%^*()?;；:-【】+\"\']+|[+——一！，;:：。？、~@#￥%……&*（）]+", "",json.dumps(t_predictions, indent=0)).split(',')
    result = {}
    result['Sentence'] = sentences
    result['Probility'] = float(predictions[0])
    result['Category'] = int(predictions[1])
    return result

## 预测集样例观看 

In [287]:
news = pd.read_csv('news_prediction.csv')
news.head()    #查看news_title样例内容

Unnamed: 0,news_titles
0,2019年湘江金融发展峰会银行科技论坛落幕
1,易纲:金融风险整体收敛货币政策工具手段充足
2,电价形成机制发生“质变”下半年电力领域混改或成重头戏
3,三盛赶在五一给冯劲义送了一份礼物：三盛总裁
4,是谁夺走了我们的注意力？BBC：警惕“数码黑帮”


## 预测结果输出

In [288]:
for news_title in news['news_titles']:
    result = Sentiment_classification(news_title)
    if result['Probility'] >= 0.9 and result['Category'] == -1:
        print(result['Sentence'])
        print(result['Probility'])
        print(result['Category'])

康美300亿造假退市还是罚60万？
0.9991288781166077
-1
戴姆勒计划裁员至少1万人，占全球员工的3.3%
0.9934340715408325
-1
第一上海(00227.HK)2018年度纯利减少46%至3340.3万港元
0.9937437176704407
-1
特宝生物研发味淡:曾因业绩差终止辅导却想上科创板
0.9998844861984253
-1
德邦股份一季度净利亏损4905万元现金流量净额为-2.3亿元
0.9994333982467651
-1
经营现金流锐减3.1亿，华强方特与预收款不得不说的事
0.9636446237564087
-1
钴曾被疯抢如今价格腰斩连累寒锐钴业市值蒸发275亿
0.9908749461174011
-1
电解铜合同引纠纷海航期货计提7千多万坏账准备
0.9732795357704163
-1
网易有道递交招股书：丁磊持股30%上半年净亏1.68亿元
0.997651219367981
-1
康健国际医疗(03886)将继续与证监会沟通寻求尽快复牌
0.9511836767196655
-1
天安金交所产品逾期理赔无门天安财险履约险成幌子?
0.98860764503479
-1
汇创控股(08202.HK)延迟刊发年度业绩10月2日上午起停牌
0.9961422681808472
-1
5.9亿债务未还，天夏智慧16个银行账户被冻结
0.998969316482544
-1
法国巴黎银行将在比利时裁员800至1000人
0.9053449630737305
-1
龙润茶(02898-HK)中期亏损扩大不派息
0.9999605417251587
-1
康美药业：昔日白马变黑天鹅股债两市集体讨伐
0.9940826296806335
-1
立昂技术股份有限公司关于特定股东股份减持计划提前终止的公告
0.9998213648796082
-1
第一上海(00227.HK)2018年度纯利减少46%至3340.3万港元
0.9937437176704407
-1
平安好医生：投资者群出现匿名恶意诋毁公司不实文章已报案
0.9940109848976135
-1
均安控股(01559-HK)中期少赚15%不派息
0.994805634021759
-1
财联社3月1日讯，绿城中国预计全年净利同比下降5