In [7]:
import time
import datetime
import pickle
import pandas as pd
import sqlalchemy as sq
import re

In [4]:
engine = sq.create_engine("mysql+pymysql://root:next++4e@localhost/Alternative_DB?host=localhost?port=3306")
conn = engine.connect()

In [55]:
class recommend_extracter:
    def __init__(self,conn,build_db_recommend = False):
        
        self.conn_extracter = conn
        if build_db_recommend:
            self.build_recommend_db()
    
    def build_recommend_db(self):
        # Function: Set up a database to store accuracy with the following setting. 
        # Note that: All functions in this class will follow this setting, pls set up ur database accordingly to avoid error
        self.conn_extracter.execute("CREATE TABLE `Alternative_DB`.`recommend`(`url` varchar(700) NOT NULL,`id` int(11) NOT NULL AUTO_INCREMENT,`company` varchar(20) DEFAULT NULL,`news type` varchar(20) CHARACTER SET utf8 COLLATE utf8_general_ci DEFAULT NULL,`date` datetime DEFAULT NULL,`title` varchar(100) DEFAULT NULL,`Cu_fact` mediumtext COMMENT '\n',`Cu_action` mediumtext CHARACTER SET utf8 COLLATE utf8_general_ci,`Zn_fact` mediumtext,`Zn_action` mediumtext,`Pb_fact` mediumtext,`Pb_action` mediumtext,`Al_fact` mediumtext,`Al_action` mediumtext,`Ni_action` mediumtext,`Ni_fact` mediumtext,`Xi_action` mediumtext,`Xi_fact` mediumtext,`Other` mediumtext CHARACTER SET utf8 COLLATE utf8_general_ci,PRIMARY KEY (`url`),KEY(`id`));")
    
    def cleaning(self,text,stop_words=[]):
        # Function: This function will split article into paragraph and replace all stop_words into empty string
        # Input: Text is the content, stop_words is list of use characters we want to replace into empty string
        
        # Split article into paragraphs by common split characther 
        command_split = '\n' + "|" +'；'
        paragraph_lst = re.split(command_split,text)
        result =[]
        
        # For each paragraphs, replace the stop_words
        for paragraph in paragraph_lst:
            for word in stop_words:
                paragraph = paragraph.replace(word,"")
            if len(paragraph)>0:
                result.append(paragraph)
        return result
    
    def classify(self,paragraph_lst):
        # Function: This function will classify each paragraph
        # Inputs: paragraph_lst is a list of paragraphs
        
        # Create a dictionary to store respective sentence
        metal = ['铜','镍','铝','锌','铅','锡']
        category ={}
        category['other']=[]
        for metal_type in metal:
            category[metal_type]=[]
        
        
        for paragraph in paragraph_lst:
            # If the paragraph mention specific metals, we record under the metal dictinoary
            for metal_type in metal:
                check = 0
                if metal_type in paragraph:
                    category[metal_type].append(paragraph)
                    check = 1
                    
            # If it does not record under any of the metals, we will record it in other
            if check == 0:
                category['other'].append(paragraph)
    
        # Combine paragraphs in other into one long paragraph
        new_other = ""
        for paragraph in category['other']:
            new_other = new_other+'\n'+paragraph
    
        category['other'] = new_other
        return category
    
    def extract_recommendation(self,paragraph_lst,split_word,key_words=[]):
        
        # Function: This function will extract recommendation and fact from given paragraph based on certain key_words
        # Inputs: paragraph_lst is list of paragraphs, split_word is list of chracters that can split paragraph into sentences,
        # keyword is list of words to identify recommendation
        
        fact = ""
        action = ""
        
        for index in range(0,len(paragraph_lst)):
            # Check each paragraph
            para = paragraph_lst[index]
            modification = []
            for key in key_words: 
                
                # Check whether the paragraph contain any keyword
                if key in para:
                    
                    # Split the para into sentences 
                    
                    command_split = split_word[0]
                    for split in split_word[1:]:
                        command_split = command_split + "|" +split
                    sentences = re.split(command_split,para)
                    for sentence in sentences:
                        # Check whether we got the action sentence already
                        if key in sentence and sentence not in modification:
                            action = action + '\n' + sentence
                            modification.append(sentence)
                            
            # Delete the action sentence from the paragraph
            for i in modification:
                para = para.replace(i,"")
            fact = fact + '\n'+para
        return (fact,action)
    
    def second_clean(self,fact_action_tuple, first_filter, second_filter,final_check):
        
        # Function: This fuction will further clean the action list generated from extract_recommendation
        # Inputs: fact_action_tuple is result gotten from extract_recommendation
        #  firs_filter, second_filter and final_check are list of keywords that action sentence must contain
        
        fact = fact_action_tuple[0]
        selected = fact_action_tuple[1].split('\n')
        action_lst = []
        result = ""
        
        # Check each potential action whehter contain keyword in filter
        for potential in selected:
            
            # status is to record whether it become action 
            status = 0
            
            # Previous is to record the number of inital recommendation sentences
            previous = len(action_lst)
            
            # Check whether each sentence contain first level keyword
            for key in first_filter:
                if key in potential:
                    action_lst.append(potential)
                    status = 1
                    break
                    
            # If nothing in first_filter can filter out, use second level
            if len(action_lst)== previous:
                for key in second_filter:
                    if key in potential:
                        action_lst.append(potential)
                        status = 1
                        break
                        
            if status==0:
                fact = fact + '\n'+ potential   
                
        for choosen in action_lst:
            status = 0
            for check in final_check:
                if check in choosen:
                    result = result+'\n'+choosen
                    status = 1
                    break
            if status==0:
                fact = fact + '\n'+ choosen
        return (fact,result)
    
    def extract(self,df_content,keyword,first,secondary,split,stop_words=[],update = True):
        # Function: This function will extract recommendation and fact from dataframe of 
        #           content (gotten from live html extracter). Moreover, it will classify
        #           the list of recommendation and facts according to metal type
        
            

        result = {}
        # Create dictionary input
        for key in ['url','company', 'news type', 'date', 'title', 'Cu_fact', 'Cu_action','Zn_fact', 
                    'Zn_action', 'Pb_fact', 'Pb_action', 'Al_fact', 'Al_action',
                    'Ni_action', 'Ni_fact', 'Xi_action', 'Xi_fact', 'Other']:
            result[key] = []
            
        metal_lst = ['铜','镍','铝','锌','铅','锡']
        fact_lst = ['Cu_fact','Ni_fact','Al_fact','Zn_fact','Pb_fact','Xi_fact']
        action_lst = ['Cu_action','Ni_action','Al_action','Zn_action','Pb_action','Xi_action']

        for url,com, news_type, date, title, content in zip(df_content['url'],df_content['company'],df_content['type'],
                                                            df_content['date'],df_content['title'],df_content['content']):
            result['url'].append(url)
            result['company'].append(com)
            result['news type'].append(news_type)
            result['date'].append(date)
            result['title'].append(title)
            categories = self.classify(self.cleaning(content,stop_words=stop_words))
            
            # extract recommendation and clean it for each metal
            for metal,fact,action in zip(metal_lst,fact_lst,action_lst):
                # First level keyword
                new_first = first +[metal+'价']
                
                # Extract recommendation
                processing = self.extract_recommendation(categories[metal],split,keyword)  
                
                # Clean recommendation
                cleaned_rec = self.second_clean(processing,new_first,secondary,keyword)
                
                # Record result
                result[fact].append(cleaned_rec[0])
                result[action].append(cleaned_rec[1])
                
            result['Other'].append(categories['other'])
        
        df_result = pd.DataFrame(result)
        if update:
            # Check whether table in database has been created 
            check = self.conn_extracter.execute("SHOW TABLES LIKE 'recommend';")
            if  not check.first():
                print('Database not exist, please use build_recommend_db function')
            else:
                df_result.to_sql(name='recommend', con=self.conn_extracter, if_exists='append',index=False)
            
        return df_result

In [56]:
recommend = recommend_extracter(conn)
df_content = pd.read_sql('Select * from content',conn)

In [60]:
keyword = ['震荡','偏强','观望','做多','轻仓','反弹','偏弱','上涨','企稳','承压','卖出','短线','短多','整理','止损',
           '多仓','突破','支持','上行','空间','回补','低位','悲观',
           '回落','弱势','抛售','回调','有望','走高','多单','上移','多头','走强','盘整','波动','上升','支撑','空单']
first = ['认为','预计','预测','预期','建议','观点','关注','强调','交易','铜价','多头','空头']
secondary = ['操作','短期','短线']
split = ['。','；']
stop_words = ['\t']
df_result = recommend.extract(df_content,keyword,first,secondary,split,stop_words=stop_words)

In [61]:
df_result

Unnamed: 0,url,company,news type,date,title,Cu_fact,Cu_action,Zn_fact,Zn_action,Pb_fact,Pb_action,Al_fact,Al_action,Ni_action,Ni_fact,Xi_action,Xi_fact,Other
0,http://www.gtaxqh.com/html/2017/mrsp_0831/1310...,国投安信,After Market,2019-05-28,国投安信期货每日收评\n2017-08-31,\n。。。。。铝持货商稳定出货，现货贴水小幅扩大，贸易商接货积极，为换月周期做准备，下游企业...,\n从盘面上看，黑色系整体趋势依然震荡向上，不过短期分化、波动仍较为剧烈，操作上谨慎为宜，关...,\n。。。。。铝持货商稳定出货，现货贴水小幅扩大，贸易商接货积极，为换月周期做准备，下游企业...,\n从盘面上看，黑色系整体趋势依然震荡向上，不过短期分化、波动仍较为剧烈，操作上谨慎为宜，关...,\n,,\n。。。。。铝持货商稳定出货，现货贴水小幅扩大，贸易商接货积极，为换月周期做准备，下游企业...,\n从盘面上看，黑色系整体趋势依然震荡向上，不过短期分化、波动仍较为剧烈，操作上谨慎为宜，关...,\n从盘面上看，黑色系整体趋势依然震荡向上，不过短期分化、波动仍较为剧烈，操作上谨慎为宜，关...,\n。。。。。铝持货商稳定出货，现货贴水小幅扩大，贸易商接货积极，为换月周期做准备，下游企业...,,\n,\n当前位置：首页>研究服务>综合类晨报>每日收评\n国投安信期货每日收评\n来源： 发布时...
1,http://www.gtaxqh.com/html/2017/mrsp_0901/1312...,国投安信,After Market,2019-05-28,国投安信期货每日收评\n2017-09-01,\n。。。。。现阶段陈豆库存所剩无几，以粮库出库价格指导，所以绝大部分时间市场缺乏指引性消息...,\n周五，沪胶震荡，主力1801合约成交量萎缩，持仓量略增，基差依然巨大，胶市极度悲观情绪有...,\n。。。。。现阶段陈豆库存所剩无几，以粮库出库价格指导，所以绝大部分时间市场缺乏指引性消息...,\n锌今炼厂出货正常，锌价维持区间震荡，贸易商出货积极，但接货者偏少，市场交投平平，福建下游...,\n,,\n。。。。。现阶段陈豆库存所剩无几，以粮库出库价格指导，所以绝大部分时间市场缺乏指引性消息...,\n周五，沪胶震荡，主力1801合约成交量萎缩，持仓量略增，基差依然巨大，胶市极度悲观情绪有...,\n从盘面上看，本钢高炉生产事故引发市场对供应进一步收缩的担忧，刺激做多热情再次释放，目前黑...,\n。。\n\n沪镍走势偏强，俄镍现货升水继续坚挺，钢厂按需采购为主，下游合金电镀厂采购不积...,\n周五，沪胶震荡，主力1801合约成交量萎缩，持仓量略增，基差依然巨大，胶市极度悲观情绪有...,\n。。。。。现阶段陈豆库存所剩无几，以粮库出库价格指导，所以绝大部分时间市场缺乏指引性消息...,\n当前位置：首页>研究服务>综合类晨报>每日收评\n国投安信期货每日收评\n来源： 发布时...
2,http://www.gtaxqh.com/html/2017/mrsp_0905/1315...,国投安信,After Market,2019-05-28,国投安信期货每日收评\n2017-09-05,\n,,\n,,\n,,\n,,,\n,,\n,\n当前位置：首页>研究服务>综合类晨报>每日收评\n国投安信期货每日收评\n来源： 发布时...
3,http://www.gtaxqh.com/html/2017/mrsp_0906/1317...,国投安信,After Market,2019-05-28,国投安信期货每日收评\n2017-09-06,\n。。。电解铝持货商出货态度积极，部分贸易商逢低接货，接货力度远不及昨日，下游企业接货意愿...,\n从盘面上看，短期市场节奏转换较快，波动依然较为剧烈，黑色系整体出现走弱迹象，关注螺纹能否...,\n。。。电解铝持货商出货态度积极，部分贸易商逢低接货，接货力度远不及昨日，下游企业接货意愿...,\n从盘面上看，短期市场节奏转换较快，波动依然较为剧烈，黑色系整体出现走弱迹象，关注螺纹能否...,\n,,\n。。。电解铝持货商出货态度积极，部分贸易商逢低接货，接货力度远不及昨日，下游企业接货意愿...,\n从盘面上看，短期市场节奏转换较快，波动依然较为剧烈，黑色系整体出现走弱迹象，关注螺纹能否...,\n周三，沥青期货盘面继续以震荡调整为主，在中石化下调价格之后，市场再度持稳，市场观望情绪再...,\n贸易商间询价交投明显积极，下游采购积极性提高，市场成交活跃。。周三郑糖中幅下挫，8月产销...,,\n,\n当前位置：首页>研究服务>综合类晨报>每日收评\n国投安信期货每日收评\n来源： 发布时...
4,http://www.gtaxqh.com/html/2017/mrsp_0907/1318...,国投安信,After Market,2019-05-28,国投安信期货每日收评\n2017-09-07,\n周四黑色系继续分化，钢矿惯性下挫，焦炭相对较强。。。。锌冶炼厂正常出货，因昨日下游已备较...,\n从盘面上看，市场风向转变，商品市场整体走弱，黑色系短期仍有回调压力，关注螺纹20日线支撑,\n周四黑色系继续分化，钢矿惯性下挫，焦炭相对较强。。。。锌冶炼厂正常出货，因昨日下游已备较...,\n从盘面上看，市场风向转变，商品市场整体走弱，黑色系短期仍有回调压力，关注螺纹20日线支撑,\n,,\n周四黑色系继续分化，钢矿惯性下挫，焦炭相对较强。。。。锌冶炼厂正常出货，因昨日下游已备较...,\n从盘面上看，市场风向转变，商品市场整体走弱，黑色系短期仍有回调压力，关注螺纹20日线支撑,\n从盘面上看，市场风向转变，商品市场整体走弱，黑色系短期仍有回调压力，关注螺纹20日线支撑,\n周四黑色系继续分化，钢矿惯性下挫，焦炭相对较强。。。。锌冶炼厂正常出货，因昨日下游已备较...,,\n,\n当前位置：首页>研究服务>综合类晨报>每日收评\n国投安信期货每日收评\n来源： 发布时...
5,http://www.gtaxqh.com/html/2017/mrsp_0908/1320...,国投安信,After Market,2019-05-28,国投安信期货每日收评\n2017-09-08,\n沪铜午后亦跳水下行，现货市场保值盘获利离场增多，升水逐步下调，下游成交整体转平淡。电解铝...,,\n沪铜午后亦跳水下行，现货市场保值盘获利离场增多，升水逐步下调，下游成交整体转平淡。电解铝...,\n对豆类油脂市场影响偏空，鉴于此，我们建议操作上以短空对待，中线来看整固区间尚待突破，风险...,\n,,\n沪铜午后亦跳水下行，现货市场保值盘获利离场增多，升水逐步下调，下游成交整体转平淡。电解铝...,,\n从盘面上看，市场情绪反复，黑色系出现剧烈波动，部分品种短期回调幅度较大，不宜过分看空，关...,\n。。周五沪镍午后大幅下挫，金川俄镍价差维持200元附近，俄镍升水继续坚挺，下游采购较为谨...,,\n,\n当前位置：首页>研究服务>综合类晨报>每日收评\n国投安信期货每日收评\n来源： 发布时...
6,http://www.gtaxqh.com/html/2017/mrsp_0912/1324...,国投安信,After Market,2019-05-28,国投安信期货每日收评\n2017-09-12,\n。铝持货商逢高出货积极，现货贴水较昨大幅扩大，中间商接货活跃，下游企业畏高选择按刚需采购...,,\n。铝持货商逢高出货积极，现货贴水较昨大幅扩大，中间商接货活跃，下游企业畏高选择按刚需采购...,,\n,,\n。铝持货商逢高出货积极，现货贴水较昨大幅扩大，中间商接货活跃，下游企业畏高选择按刚需采购...,,\n从盘面上看，市场情绪回暖，商品整体走强，钢矿持续下探后企稳反弹，煤焦再次冲击前高，预计黑...,\n周二黑色系普涨，原料表现相对较好。。\n\n周二沪镍震荡盘整，现货方面，金川俄镍升水较昨...,,\n,\n当前位置：首页>研究服务>综合类晨报>每日收评\n国投安信期货每日收评\n来源： 发布时...
7,http://www.gtaxqh.com/html/2017/mrsp_0913/1325...,国投安信,After Market,2019-05-28,国投安信期货每日收评\n2017-09-13,\n。。。。沪铜升水继续回升，现货市场上下游询价积极，市场成交整体活跃。。锌炼厂正常出货，贸...,\n周三沪镍震荡回调，俄镍金川现货升水持稳，下游普遍观望情绪较浓，采购意愿较低，国内部分不锈...,\n。。。。沪铜升水继续回升，现货市场上下游询价积极，市场成交整体活跃。。锌炼厂正常出货，贸...,\n周三沪镍震荡回调，俄镍金川现货升水持稳，下游普遍观望情绪较浓，采购意愿较低，国内部分不锈...,\n,,\n。。。。沪铜升水继续回升，现货市场上下游询价积极，市场成交整体活跃。。锌炼厂正常出货，贸...,\n周三沪镍震荡回调，俄镍金川现货升水持稳，下游普遍观望情绪较浓，采购意愿较低，国内部分不锈...,\n周三沪镍震荡回调，俄镍金川现货升水持稳，下游普遍观望情绪较浓，采购意愿较低，国内部分不锈...,\n。。。。沪铜升水继续回升，现货市场上下游询价积极，市场成交整体活跃。。锌炼厂正常出货，贸...,,\n,\n当前位置：首页>研究服务>综合类晨报>每日收评\n国投安信期货每日收评\n来源： 发布时...
8,http://www.gtaxqh.com/html/2017/mrsp_0914/1327...,国投安信,After Market,2019-05-28,国投安信期货每日收评\n2017-09-14,\n沪铜现货升水继续回升，下游入市接货意愿较浓，市场供需均较为活跃。锌炼厂积极出货\n,,\n沪铜现货升水继续回升，下游入市接货意愿较浓，市场供需均较为活跃。锌炼厂积极出货\n,,\n,,\n。整体成交情况与昨日基本持平。。豆类油脂今日整体摆脱USDA供需报告利空压制，纷纷强劲拉...,\n再考虑到临池豆油季节性走升的潜在利多支撑，我们短期对于油脂类维持震荡偏强的判断\n周四，...,\n从盘面上看，经济数据不及预期，黑色系整体面临下行压力，铁矿石由于供应端宽松依然偏弱，预计...,\n。。\n\n周四黑色系全线回调，铁矿石跌幅较大,,\n,\n当前位置：首页>研究服务>综合类晨报>每日收评\n国投安信期货每日收评\n来源： 发布时...
9,http://www.gtaxqh.com/html/2017/mrsp_0915/1328...,国投安信,After Market,2019-05-28,国投安信期货每日收评\n2017-09-15,\n。。。锌炼厂虽持续保持正常出货，但国产普通品牌流通货源仍偏少，主要以长单交投为主，下游多...,\n沪铜偏弱震荡运行，最后交易日铜现货升水持稳高位，现货市场成交整体尚可\n豆类油脂今日整体...,\n。。。锌炼厂虽持续保持正常出货，但国产普通品牌流通货源仍偏少，主要以长单交投为主，下游多...,\n沪铜偏弱震荡运行，最后交易日铜现货升水持稳高位，现货市场成交整体尚可\n豆类油脂今日整体...,\n,,\n。。。锌炼厂虽持续保持正常出货，但国产普通品牌流通货源仍偏少，主要以长单交投为主，下游多...,\n沪铜偏弱震荡运行，最后交易日铜现货升水持稳高位，现货市场成交整体尚可\n豆类油脂今日整体...,\n周五沪镍继续下挫，俄镍升水较昨日持稳，市场上观望情绪较浓，部分下游多看跌后期镍价，成交整...,\n周五黑色系惯性下挫，焦煤、焦炭跌幅较大。。\n,\n沪铜偏弱震荡运行，最后交易日铜现货升水持稳高位，现货市场成交整体尚可\n豆类油脂今日整体...,\n。。。锌炼厂虽持续保持正常出货，但国产普通品牌流通货源仍偏少，主要以长单交投为主，下游多...,\n当前位置：首页>研究服务>综合类晨报>每日收评\n国投安信期货每日收评\n来源： 发布时...
