In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import seaborn as sns
import sqlalchemy as sq
from scipy.stats import norm
import matplotlib.pyplot as plt
from IPython.display import display
from sklearn.metrics import accuracy_score
pd.set_option('display.max_columns', None)

In [2]:
# Directory holding the price CSV files (one '<ticker>.csv' per metal, see metal_dict).
metal_data_path = '../big/NExT/Data/Version 1/LME/'
# Alternative location of the same data, kept for reference:
#metal_data_path = '../NExT/Data/Version 1/LME/'
# Directory holding the per-metal '<symbol>_sentiment.csv' files.
sentiment_path = './metal_score/'
# Directory where '<metal>_score.csv' (aggregated scores) is written and read.
score_path = './score/'
# Directory where '<metal>_accur.csv' (per-article accuracy records) is written and read.
accur_path = './accur/'
# Metal symbols; each one is a key of metal_dict.
metal_list = ['Cu', 'Zn', 'Pb', 'Al', 'Ni', 'Xi']
# Return horizons (in rows of the price file) for which returns, rolling stds and
# discrete labels are computed.
window_lst = [1, 3, 5, 7, 10, 15, 20]

In [3]:
# metal_dict maps a metal symbol to a three-item list used later on:
#   [name of the price column, path of the price CSV, path of the sentiment CSV]
metal_dict = {
    symbol: [price_column,
             metal_data_path + price_column + '.csv',
             sentiment_path + symbol + '_sentiment.csv']
    for symbol, price_column in [('Cu', 'LMCADY'),
                                 ('Al', 'LMAHDY'),
                                 ('Zn', 'LMZSDY'),
                                 ('Pb', 'LMPBDY'),
                                 ('Ni', 'LMNIDY'),
                                 ('Xi', 'LMSNDY')]
}

# For experiments: every '<symbol>_test' key refers to the very same list object
# as '<symbol>', so both keys resolve to identical column and file names.
for met in metal_list:
    metal_dict[met+'_test'] = metal_dict[met]
    

In [4]:
# Binning configuration per return window.
# Each value is [cut_points, number_of_tiers]; the cut points are cumulative
# probabilities that `discrete` turns into return thresholds with norm.ppf.
tier_dict = {
    '1d':  [[0.159, 0.359, 0.661, 0.841], 5],
    '3d':  [[0.159, 0.353, 0.661, 0.841], 5],
    '5d':  [[0.159, 0.329, 0.661, 0.841], 5],
    '7d':  [[0.159, 0.327, 0.663, 0.841], 5],
    '10d': [[0.159, 0.329, 0.661, 0.841], 5],
    '15d': [[0.159, 0.289, 0.681, 0.841], 5],
    '20d': [[0.159, 0.279, 0.671, 0.841], 5],
}

In [5]:
def get_price(metal_path, metal_columns, period, windows=None):
    """Load a price CSV and derive trailing and forward percentage returns.

    Parameters
    ----------
    metal_path : str
        Path of the CSV file. It must contain an ``Index`` column (parsed as
        dates) and the column named by ``metal_columns``.
    metal_columns : str
        Name of the price column.
    period : int
        Length of the rolling window used for the ``Std{w}d_20`` columns.
    windows : iterable of int, optional
        Return horizons, counted in rows. When omitted, the module-level
        ``window_lst`` is used, which is what the function always did before
        this parameter existed.

    Returns
    -------
    (price, price_forward) : tuple of DataFrame
        ``price`` holds, for every window ``w``:
          * ``return_{w}d`` - percentage change versus the row ``w`` rows earlier;
          * ``Std{w}d_20`` - rolling std over ``period`` rows of that return,
            lagged by one row so the current row's return is not included.
        ``price_forward`` is a copy in which every ``return_{w}d`` is shifted up
        by ``w`` rows, i.e. each row carries the return over the *following*
        ``w`` rows.
    """
    if windows is None:
        windows = window_lst
    windows = list(windows)

    price = pd.read_csv(metal_path)
    price['Index'] = pd.to_datetime(price['Index'])

    # A first 1-row return is computed only so that dropna() removes the leading
    # row (which has no previous price) together with any other incomplete rows.
    price['return_1d'] = (price[metal_columns] / price[metal_columns].shift(1) - 1) * 100
    price.dropna(inplace=True)

    # Returns are (re)computed on the cleaned frame; the first `w` rows of each
    # return column are NaN again after this step.
    for w in windows:
        price['return_{}d'.format(w)] = ((price[metal_columns] / price[metal_columns].shift(w)) - 1) * 100

    # The '_20' suffix is fixed regardless of `period`: get_discrete_score looks
    # the column up under exactly this name.
    for w in windows:
        price['Std{}d_20'.format(w)] = price['return_{}d'.format(w)].shift(1).rolling(period).std()

    price_forward = price.copy()
    for w in windows:
        price_forward['return_{}d'.format(w)] = price_forward['return_{}d'.format(w)].shift(-w)

    return price, price_forward

In [6]:
def get_sentiment(metal_sentiment_path):
    """Read a sentiment CSV and return it cleaned and ordered by day.

    The ``Sentiment_article`` column is multiplied by 100, every ``date`` value
    is parsed and floored to midnight, rows containing any missing value are
    removed and the remaining rows are sorted by ``date`` (the original row
    labels are kept).
    """
    def _to_day(raw_stamp):
        # Parse one value and discard its time-of-day part.
        return pd.to_datetime(raw_stamp).floor('D')

    articles = pd.read_csv(metal_sentiment_path)
    articles['Sentiment_article'] = articles['Sentiment_article'] * 100
    articles['date'] = articles['date'].apply(_to_day)

    return articles.dropna().sort_values('date', axis=0)

In [7]:
def get_discrete_score(price_sentiment):
    """Add one ``discrete_{w}d`` column of signs (-1, 0, 1) per window in ``window_lst``.

    For each window the row's ``return_{w}d`` is binned by ``discrete`` using the
    cut points and tier count stored in ``tier_dict``, a normal distribution
    centred on 0 and the row's own ``Std{w}d_20`` as scale. The tier index is
    shifted by 2 and then reduced to its sign with ``give_sign``.

    The frame is modified in place and also returned.
    """
    centre = 0
    for window in window_lst:
        window_key = '{}d'.format(window)
        return_col = 'return_{}d'.format(window)
        label_col = 'discrete_{}d'.format(window)
        std_col = 'Std{}d_20'.format(window)

        binning = tier_dict[window_key]
        cut_probs = binning[0]
        n_tiers = binning[1]

        def tier_of(row):
            # Tier index of this row's return, scaled by the row's own volatility.
            return discrete(row[return_col], n_tiers, cut_probs, centre, row[std_col])

        # With the five tiers configured in tier_dict, subtracting 2 maps the
        # indices 0..4 onto -2..2 so the middle tier becomes 0.
        price_sentiment[label_col] = price_sentiment.apply(tier_of, axis=1) - 2
        price_sentiment[label_col] = price_sentiment[label_col].apply(give_sign)

    return price_sentiment

In [8]:
def discrete(target,num_tier,tier_lst,mean,std):
    """Return the index of the tier that ``target`` falls into.

    Every cumulative probability in ``tier_lst`` is converted into a threshold
    with the normal quantile function (``loc=mean``, ``scale=std``). The result
    is the index of the first of the leading ``num_tier - 1`` thresholds that is
    greater than or equal to ``target``; when none is (which includes NaN
    comparisons), ``num_tier - 1`` is returned.
    """
    thresholds = [norm.ppf(prob, loc=mean, scale=std) for prob in tier_lst]

    for tier_index in range(num_tier - 1):
        if target <= thresholds[tier_index]:
            return tier_index

    return num_tier - 1

In [9]:
def give_sign (data):
    """Return 1 for a positive value, -1 for a negative one and 0 otherwise.

    Values that are neither greater nor smaller than zero (0 itself, NaN) give 0.
    """
    return 1 if data > 0 else (-1 if data < 0 else 0)

In [10]:
def division_method(x, y):
    """Divide ``x`` by ``y``, returning 0 instead of failing when ``y`` equals 0."""
    return 0 if y == 0 else x / y

In [11]:
import pandas as pd
class Score(object):
    """Reliability-weighted sentiment scoring for one metal.

    The class works with two CSV tables:

    * ``accur_path + '<metal>_accur.csv'`` - one row per (article, horizon)
      telling whether the discretised sentiment agreed with, or was the exact
      opposite of, the discretised return (columns: ``accur_names``);
    * ``score_path + '<metal>_score.csv'`` - aggregated scores per date and
      horizon (columns: ``score_names``), read by ``compute_mean_std``.

    It uses the module-level helpers ``discrete`` and ``division_method`` and
    the module-level path constants ``accur_path`` and ``score_path``.
    """

    def __init__(self, metal, build_db_accur = False,build_db_sent = False, sentiment_data = None):
        """Set up the discretisation parameters and, optionally, empty tables.

        Parameters
        ----------
        metal : str
            Metal symbol used in the CSV file names.
        build_db_accur : bool
            When true, (over)write the accuracy CSV with an empty table.
        build_db_sent : bool
            When true, (over)write the score CSV with an empty table.
        sentiment_data : DataFrame, optional
            Frame with a ``Sentiment_article`` column from which the raw mean
            and standard deviation are taken. When omitted, the module-level
            ``price_sentiment`` frame is used (the original behaviour).
        """
        self.metal = metal
        self.period_list = [1, 3, 5, 7, 10, 15, 20]

        # Sentiment is cut into three tiers at these cumulative probabilities.
        self.snt_tier = 3
        self.snt_tier_lst = [0.309,0.691]

        if sentiment_data is None:
            # Backward-compatible fallback on the notebook-level frame.
            sentiment_data = price_sentiment

        self.raw_snt_mean = sentiment_data['Sentiment_article'].mean()
        self.raw_snt_std  = sentiment_data['Sentiment_article'].std()

        # Until compute_mean_std() (or cal_score with adjust_* arguments)
        # replaces them, the adjusted statistics of every horizon equal the raw ones.
        self.adjusted_snt_mean = {}
        self.adjusted_snt_std = {}
        for i in self.period_list:
            self.adjusted_snt_mean['{}d'.format(i)] = self.raw_snt_mean
            self.adjusted_snt_std['{}d'.format(i)] = self.raw_snt_std

        self.accur_names = ['url', 'date', 'company', 'score', 'discrete_score',
                            'accur_same_pos', 'accur_same_neg', 'accur_neu', 'accur_rev_pos',
                            'accur_rev_neg', 'prec_horizon']

        self.score_names = ['date', 'score', 'discrete_score', 'horizon']

        if build_db_accur:
            self.build_accur_db(self.metal)

        if build_db_sent:
            self.build_sent_db(self.metal)

    def build_sent_db(self, metal):
        """Write an empty score table (header only) for ``metal``."""
        result = pd.DataFrame(columns=self.score_names)
        result.to_csv(score_path + '{}_score.csv'.format(metal))

    def build_accur_db(self, metal):
        """Write an empty accuracy table (header only) for ``metal``."""
        result = pd.DataFrame(columns=self.accur_names)
        result.to_csv(accur_path + '{}_accur.csv'.format(metal))

    def compute_mean_std(self):
        """Recompute the per-horizon mean and std of ``score`` from the score CSV.

        Returns
        -------
        (adjusted_snt_mean, adjusted_snt_std) : tuple of dict
            The instance's own dictionaries, keyed by ``'<horizon>d'``.
        """
        # The file is the same for every horizon, so it is read only once.
        scores = pd.read_csv(score_path + '{}_score.csv'.format(self.metal))
        for i in self.period_list:
            horizon_scores = scores.loc[scores['horizon'] == i, 'score']
            self.adjusted_snt_mean['{}d'.format(i)] = horizon_scores.mean()
            self.adjusted_snt_std['{}d'.format(i)] = horizon_scores.std()
        return self.adjusted_snt_mean, self.adjusted_snt_std

    def update_tier_lst(self,num_lst):
        """Replace the sentiment cut points.

        ``num_lst`` must contain exactly ``self.snt_tier - 1`` values; they are
        stored sorted in ascending order.

        Raises
        ------
        Exception
            If the length of ``num_lst`` does not match.
        """
        if len(num_lst)!= (self.snt_tier-1):
            raise Exception('len(num_lst) does not match (tier class -1)')
        else:
            self.snt_tier_lst = sorted(num_lst)
            print('updated')

    def _load_history(self, date, prec_horizon):
        """Read the accuracy CSV and keep rows for ``prec_horizon`` dated strictly before ``date``."""
        history = pd.read_csv(accur_path + '{}_accur.csv'.format(self.metal))
        history['date'] = pd.to_datetime(history['date'])
        keep = (history['prec_horizon'] == prec_horizon) & (history['date'] < date)
        return history[keep]

    @staticmethod
    def _hit_rate(history, column):
        """Share of true values among the non-missing entries of ``column`` (0 when there are none)."""
        return division_method(history[column].sum(), history[column].count())

    def cal_score(self,com,score,date,accur_horizon,prec_horizon,adjust_mean=False, adjust_std=False, threshold = 2,update = True):
        """Combine the recommendations of one day into a single score.

        Parameters
        ----------
        com, score : sequences of equal length
            Company names and their sentiment scores for the day.
        date : date-like
            Day being scored; only accuracy records strictly before it are used.
        accur_horizon : int
            Minimum number of matching history rows a company needs in order
            to contribute.
        prec_horizon : int
            Prediction horizon; ``'<prec_horizon>d'`` must be a key of the
            adjusted mean/std dictionaries.
        adjust_mean, adjust_std : dict or False
            When truthy, they replace the instance's adjusted dictionaries
            (the replacement persists after the call).
        threshold : int
            A score is produced only when ``len(com) > threshold``.
        update : bool
            Accepted for compatibility; not used.

        Returns
        -------
        DataFrame
            One row with the columns in ``score_names``. ``score`` and
            ``discrete_score`` are None when there are too few recommendations.

        Raises
        ------
        Exception
            If the prediction horizon has no adjusted mean/std entry.
        """
        date = pd.to_datetime(date).strftime('%Y-%m-%d')

        if adjust_mean:
            self.adjusted_snt_mean = adjust_mean

        if adjust_std:
            self.adjusted_snt_std = adjust_std

        final_score = None
        final_discrete_score = None

        if len(com) > threshold:
            # Loading and date/horizon filtering do not depend on the company,
            # so they are done once per call instead of once per recommendation.
            history = self._load_history(date, prec_horizon)

            realibility_lst = []
            score_lst = []

            for cur_com,cur_score in zip(com,score):
                dis_score = discrete(cur_score,self.snt_tier,self.snt_tier_lst,self.raw_snt_mean,self.raw_snt_std)-1
                df_history = history[(history['company'] == cur_com) &
                                     (history['discrete_score'] == dis_score)]

                if len(df_history)< accur_horizon:
                    continue

                # "same": how often this kind of call was right;
                # "rev": how often the exact opposite happened.
                realibility_same = 0
                realibility_rev = 0
                if dis_score == 0:
                    realibility_same = self._hit_rate(df_history, 'accur_neu')
                elif dis_score>0:
                    realibility_same = self._hit_rate(df_history, 'accur_same_pos')
                    realibility_rev = self._hit_rate(df_history, 'accur_rev_pos')
                else:
                    realibility_same = self._hit_rate(df_history, 'accur_same_neg')
                    realibility_rev = self._hit_rate(df_history, 'accur_rev_neg')

                # Keep the score, neutralise it or flip its sign, depending on
                # which rate dominates; the weight is exp(dominant rate).
                if realibility_same>realibility_rev:
                    realibility_lst.append(np.exp(realibility_same))
                    score_lst.append(cur_score)
                elif realibility_same==realibility_rev:
                    realibility_lst.append(np.exp(realibility_same))
                    score_lst.append(0)
                else:
                    realibility_lst.append(np.exp(realibility_rev))
                    score_lst.append(-cur_score)

            total_real = np.sum(realibility_lst)
            weighted_sum = sum(cur_score*cur_real
                               for cur_real,cur_score in zip(realibility_lst,score_lst))
            # division_method yields 0 when no company qualified (total weight 0).
            final_score = division_method(weighted_sum, total_real)

            key = '{}d'.format(prec_horizon)
            if key not in self.adjusted_snt_mean:
                raise Exception('The prediction horizon is not included')
            mean = self.adjusted_snt_mean[key]
            std = self.adjusted_snt_std[key]

            final_discrete_score = discrete(final_score,self.snt_tier,self.snt_tier_lst,mean,std)-1

        ans = [date, final_score, final_discrete_score, prec_horizon]
        return pd.DataFrame([ans], columns=self.score_names)

    def update_accur(self,url,date,com,horizon,score,target):
        """Build accuracy records for a batch of (article, horizon) pairs.

        Parameters
        ----------
        url, date, com, horizon, score, target : sequences of equal length
            ``com`` is the company name, ``horizon`` the prediction horizon,
            ``score`` the sentiment score and ``target`` the discretised
            return to compare with.

        Returns
        -------
        DataFrame
            One row per input element with the columns in ``accur_names``.
            Nothing is written to disk here.
        """
        df_value = []
        for cur_url,cur_date, cur_com, cur_horizon, cur_score,cur_target in zip(url,date,com,horizon,score,target):

            cur_date = pd.to_datetime(cur_date).floor('D')
            dis_score = discrete(cur_score,self.snt_tier,self.snt_tier_lst,self.raw_snt_mean,self.raw_snt_std)-1

            # Slots follow accur_names:
            #   0 accur_same_pos, 1 accur_same_neg, 2 accur_neu,
            #   3 accur_rev_pos,  4 accur_rev_neg
            # Only the slots matching the sign of the prediction are filled;
            # the others stay None.
            accur_value = [None]*5

            if dis_score == 0:
                accur_value[2] = bool(dis_score == cur_target)
            elif dis_score<0:
                accur_value[1] = bool(dis_score == cur_target)
                accur_value[4] = bool(dis_score == -cur_target)
            elif dis_score>0:
                accur_value[0] = bool(dis_score == cur_target)
                accur_value[3] = bool(dis_score == -cur_target)

            df_value.append([cur_url, cur_date, cur_com, cur_score, dis_score]
                            + accur_value
                            + [cur_horizon])

        return pd.DataFrame(df_value, columns=self.accur_names)

In [None]:
# Grid search: for every metal, rolling-std period and recommendation threshold,
# rebuild the accuracy table, score every date twice (first with the raw, then
# with the adjusted mean/std) and report the directional accuracy per horizon.
#
# NOTE(review): 'Cu' appears twice and 'Zn' is absent in metal_wait_lst, and 20
# appears twice in period_lst. Both are kept unchanged - confirm whether intended.
metal_wait_lst =  ['Cu', 'Pb', 'Al', 'Ni', 'Xi', 'Cu']
threshold_lst = [8, 10]
period_lst = [20, 15, 20, 25]

# Horizons must match the columns created by get_price / get_discrete_score.
horizon_lst = list(window_lst)
discrete_cols = ['discrete_{}d'.format(i) for i in horizon_lst]


def _score_all_dates(score, price_sentiment, date_lst, horizon_lst, threshold, desc,
                     adjust_mean=False, adjust_std=False):
    """Call ``score.cal_score`` for every (date, horizon) pair and stack the one-row results."""
    frames = []
    # The `with` block closes the progress bar on normal exit and on interruption.
    with tqdm(date_lst, desc=desc) as progress:
        for current_date in progress:
            current_rec = price_sentiment[price_sentiment['date']==current_date]
            for horizon in horizon_lst:
                frames.append(score.cal_score(current_rec['company'],
                                              current_rec['Sentiment_article'],
                                              current_date, 20, horizon,
                                              adjust_mean, adjust_std,
                                              threshold = threshold))
    return pd.concat(frames)


for met in metal_wait_lst:
    print('current metal is : {}'.format(met))
    metal = met
    metal_columns = metal_dict[metal][0]
    metal_path = metal_dict[metal][1]
    metal_sentiment_path = metal_dict[metal][2]

    print('getting sentiment score')
    sentiment = get_sentiment(metal_sentiment_path)

    # Column names in the sentiment file use the plain symbol, without '_test'.
    mf = metal if '_test' not in metal else metal.split('_')[0]

    for period in period_lst:
        print('merging score and price')
        price, price_forward = get_price(metal_path, metal_columns, period)

        # Targets come from price_forward: each article is paired with the
        # returns realised over the rows that follow its date.
        price_sentiment = price_forward.merge(sentiment, left_on='Index', right_on='date',how='inner')
        price_sentiment = price_sentiment.drop(['Index','title','{}_fact'.format(mf),'{}_action'.format(mf),'news type','Sentiment'],axis=1)
        price_sentiment = get_discrete_score(price_sentiment)
        price_sentiment = price_sentiment.dropna()
        print('the original length of the dataframe : {}'.format(len(price_sentiment)))
        price_sentiment = price_sentiment.drop_duplicates(keep='first')
        print('the processsed length of the dataframe : {}'.format(len(price_sentiment)))

        for threshold in threshold_lst:

            # Score.__init__ reads the module-level `price_sentiment` assigned
            # above; both build flags reset the CSV tables of this metal.
            score = Score(met, build_db_accur = True, build_db_sent = True)

            # --- accuracy table: one record per (article, horizon) ---------
            error = []
            total_df = []
            for idx in tqdm(price_sentiment.index, desc='calculating the accur of {}'.format(met)):
                row = price_sentiment.loc[idx]
                try:
                    tmp_df = score.update_accur([row['url']] * len(horizon_lst),
                                                [row['date']] * len(horizon_lst),
                                                [row['company']] * len(horizon_lst),
                                                horizon_lst,
                                                [row['Sentiment_article']] * len(horizon_lst),
                                                list(row[discrete_cols]))
                    total_df.append(tmp_df)
                except Exception as e:
                    # Best effort: failing rows are skipped and reported below.
                    error.append(e)
            print(error)
            df = pd.concat(total_df).reset_index(drop=True)
            df.to_csv(accur_path + '{}_accur.csv'.format(met), index=False)

            date_lst = list(price_sentiment['date'].unique())

            # --- pass 1: discretise with the raw sentiment mean/std --------
            df = _score_all_dates(score, price_sentiment, date_lst, horizon_lst, threshold,
                                  'getting the score for raw mean and std')
            df.to_csv(score_path + '{}_score.csv'.format(met), index=False)

            # --- pass 2: discretise with the mean/std of the pass-1 scores -
            adjust_mean, adjust_std = score.compute_mean_std()
            df = _score_all_dates(score, price_sentiment, date_lst, horizon_lst, threshold,
                                  'getting score for adjusted mean and std',
                                  adjust_mean, adjust_std)
            df.to_csv(score_path + '{}_score.csv'.format(met), index=False)

            # --- evaluation -------------------------------------------------
            result = pd.read_csv(score_path + '{}_score.csv'.format(metal))
            result = result.dropna()
            result['date'] = pd.to_datetime(result['date'])
            # The scores are forecasts for the rows *after* each date, so they
            # are compared with the forward returns in `price_forward`. (The
            # trailing returns in `price` describe what happened before the
            # date and were never used to build the targets.)
            combine_df = pd.merge(result, price_forward, left_on='date', right_on='Index')

            accuracy_lst = []
            for i in horizon_lst:
                tmp_df = combine_df[combine_df['horizon'] == i]

                y_pred = tmp_df['discrete_score']
                y_true = tmp_df['return_{}d'.format(i)].apply(give_sign)
                acc = accuracy_score(y_true=y_true, y_pred=y_pred)
                accuracy_lst.append(acc)
                print(acc)
            avg_acc = np.mean(accuracy_lst)

            print('period : {}, threshold : {}, avg_acc : {}'.format(period, threshold, avg_acc))

            print('#############################################################################')


current metal is : Cu
getting sentiment score
merging score and price
the original length of the dataframe : 10438
the processsed length of the dataframe : 10391


calculating the accur of Cu: 100%|██████| 10391/10391 [01:40<00:00, 103.71it/s]


[]


getting the score for raw mean and std:  90%|▉| 1705/1887 [1:30:15<1:06:51, 22.04s/it]  

In [None]:
# Show `result`: the score table most recently re-read (rows with missing values
# dropped, 'date' parsed) by the grid-search cell above.
result

In [None]:
# Per-horizon accuracies of the last (metal, period, threshold) combination
# evaluated by the grid-search cell; the list is reset for every combination.
accuracy_lst

In [None]:
# Keep only rows that have a value in every column; cal_score emits None for
# score/discrete_score when a date has too few recommendations.
# Note: this overwrites the `df` left behind by the grid-search cell.
df = df.dropna()

In [None]:
# Display `df` as it currently stands in the kernel.
df