In [None]:
# Mount Google Drive at /content/drive; the path constants defined below point into this mount.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# **Load Libraries**

In [None]:
#!pip install yahoofinancials
#!pip install yfinance
#!pip install vaderSentiment
#!pip install --upgrade vaderSentiment
#!pip install catboost


import os
import pandas as pd
from pandas import option_context
import numpy as np
from pathlib import Path
import xml.etree.ElementTree as ET
import datetime
from pandas.tseries.offsets import BDay

import glob

from io import StringIO
import re
import string
from unidecode import unidecode
from collections import Counter
from string import digits
from copy import deepcopy

import torch
from transformers import BertTokenizer, BertForSequenceClassification
from yahoofinancials import YahooFinancials
import yfinance as yf


from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split
import catboost as ctb
from sklearn.metrics import confusion_matrix, accuracy_score
import pickle 
from sklearn.model_selection import GridSearchCV



# File paths
# NOTE(review): the folder name 'Seminar ' contains a trailing space -- confirm it matches the Drive folder.

import_path='/content/drive/MyDrive/Seminar /Test Folder/sample'  # folder with the input transcript XML files
export_path='/content/drive/MyDrive/Seminar /Test Folder/output'  # folder for general.csv and the ticker/return CSVs
export_path_body='/content/drive/MyDrive/Seminar /Test Folder/body'  # folder for the per-transcript Presentation / Q&A CSVs
export_path_speakers='/content/drive/MyDrive/Seminar /Test Folder/speakers'  # folder for the per-transcript speaker CSVs

# **Extract Features from XML File**

In [None]:
def extract_file(import_path,export_path):
    """Collect per-transcript metadata from the XML files and save it as ``general.csv``.

    Every regular file in ``import_path`` is parsed as XML. The text of the six
    direct children of the root element is read positionally as
    EventStory, eventTitle, city, companyName, companyTicker, startDate.

    Parameters
    ----------
    import_path : str
        Folder containing the transcript XML files.
    export_path : str
        Folder in which ``general.csv`` is written.

    Returns
    -------
    str
        The name of the file that was written (``'general.csv'``).

    Raises
    ------
    ValueError
        If a file's root element does not have exactly six children.

    Notes
    -----
    The working directory is left at ``export_path`` on return, as before.
    """
    columns = ['EventStory', 'eventTitle', 'city', 'companyName', 'companyTicker', 'startDate']
    output_name = 'general.csv'

    # Sorted so the row order of general.csv does not depend on directory listing order.
    files = sorted(child.name for child in Path(import_path).iterdir() if child.is_file())

    rows = []
    for file in files:
        root = ET.parse(os.path.join(import_path, file)).getroot()
        values = [child.text for child in root]
        if len(values) != len(columns):
            # A flat reshape over all files would only report a size mismatch;
            # name the offending file instead.
            raise ValueError(
                f'{file}: expected {len(columns)} child elements under the root, found {len(values)}')
        rows.append(values)

    df = pd.DataFrame(rows, columns=columns)
    # Clean the columns from newline symbols.
    df = df.replace(r'\n', ' ', regex=True)
    df['FileNo'] = files
    df = df[['FileNo', 'companyName', 'companyTicker', 'startDate']]

    os.chdir(export_path)
    df.to_csv(output_name, index=False)

    # Return the name that was actually written; os.listdir()[0] is arbitrary
    # as soon as the output folder contains any other file.
    return output_name

In [None]:
# Build the metadata CSV from the XML files; the bare name on the last line displays the returned file name.
filename=extract_file(import_path,export_path)
filename

'general.csv'

# **Preprocess Tickers**

In [None]:
def check_tickers(tickers_data_no_dups):
    """Check, row by row, whether price data can be fetched for each ticker.

    Parameters
    ----------
    tickers_data_no_dups : pandas.DataFrame
        Must contain the columns 'companyTicker', 'startDate' and 'EndDate'.
        Only the first whitespace-separated token of each date is used.

    Returns
    -------
    list of bool
        One entry per row, in row order: True when the historical price
        request returned a 'prices' entry for the ticker, False when the
        request or the lookup failed.
    """
    is_in_yf = []

    for row_ind in tickers_data_no_dups.index:
        ticker = tickers_data_no_dups.loc[row_ind, 'companyTicker']
        start_date = str(tickers_data_no_dups.loc[row_ind, 'startDate']).split()[0]
        end_date = str(tickers_data_no_dups.loc[row_ind, 'EndDate']).split()[0]
        print(row_ind)
        print(ticker)

        try:
            yahoo_financials = YahooFinancials(ticker)
            historical_stock_prices = yahoo_financials.get_historical_price_data(start_date, end_date, 'daily')
            # The lookup itself is the check: it raises when no price data came back.
            historical_stock_prices[ticker]['prices']
            found = True
        except Exception:
            # `Exception` rather than a bare except, so that Ctrl-C /
            # KeyboardInterrupt can still stop a long loop.
            found = False

        is_in_yf.append(found)

    return is_in_yf

In [None]:
def extract_tickers(export_path,filename):
    """Keep only transcripts whose ticker can be priced and save the selection.

    Reads the metadata CSV ``filename`` from ``export_path``, parses the start
    dates, drops tickers that are missing, empty or contain a '.', checks each
    remaining distinct ticker with ``check_tickers`` and writes two files into
    ``export_path`` (``<folder>`` is the last component of ``export_path``):

    * ``<folder>_valid_tickers.csv`` -- one row per distinct ticker with an
      ``is_in_yf`` flag;
    * ``<folder>_new.csv`` -- every transcript whose ticker passed the check,
      with ``startDate`` and ``EndDate`` (start + 5 business days).

    Parameters
    ----------
    export_path : str
        Folder holding ``filename``; also the output folder.
    filename : str
        Name of the metadata CSV (columns FileNo, companyName, companyTicker,
        startDate).

    Returns
    -------
    (list, str)
        The 'FileNo' values of the retained transcripts and the name of the
        ``<folder>_new.csv`` file.
    """
    os.chdir(export_path)
    prefix = Path(export_path).name

    summary_df = pd.read_csv(filename).reset_index()
    # Only the first whitespace-separated token of 'startDate' is the date.
    date = summary_df['startDate'].str.split(expand=True)[0]
    summary_df['startDate'] = pd.to_datetime(date, format='%d-%b-%y')

    # Work on an explicit copy: adding a column to a slice of summary_df is
    # what triggered pandas' SettingWithCopyWarning.
    summary = summary_df[['FileNo', 'companyName', 'companyTicker', 'startDate']].copy()
    # Add 5 business days to the start date to get the end of the price window.
    summary['EndDate'] = summary['startDate'] + BDay(5)

    # Drop tickers containing '.', missing tickers (na=True marks them for
    # removal) and empty tickers.
    has_dot = summary['companyTicker'].str.contains('.', regex=False, na=True)
    summary_files_usa = summary[~has_dot & (summary['companyTicker'] != '')]
    tickers_data_no_dups = summary_files_usa.drop_duplicates(subset=['companyTicker']).copy()

    # Check whether the tickers exist in Yahoo Finance.
    tickers_data_no_dups['is_in_yf'] = np.array(check_tickers(tickers_data_no_dups), dtype=bool)
    tickers_data_no_dups.to_csv(f'{prefix}_valid_tickers.csv', index=False)

    # Use only the tickers that were found; the inner merge brings back every
    # transcript (row of summary_df) belonging to those tickers.
    valid = tickers_data_no_dups.loc[tickers_data_no_dups['is_in_yf'], ['companyTicker']]
    df_relevant_tickers = valid.merge(summary_df, on='companyTicker')
    df_relevant_tickers['EndDate'] = df_relevant_tickers['startDate'] + BDay(5)

    valid_tickers_filename = f'{prefix}_new.csv'
    df_relevant_tickers.to_csv(valid_tickers_filename)

    files = df_relevant_tickers['FileNo'].tolist()

    return files,valid_tickers_filename

In [None]:
# Validate the tickers; `files` lists the transcripts to process further, `valid_tickers_filename` is the CSV written for them.
files,valid_tickers_filename =extract_tickers(export_path,filename)

0
BLK
{'eventsData': {}, 'firstTradeDate': {'formatted_date': '1999-10-01', 'date': 938784600}, 'currency': 'USD', 'instrumentType': 'EQUITY', 'timeZone': {'gmtOffset': -14400}, 'prices': [{'date': 1589895000, 'high': 515.5800170898438, 'low': 498.8299865722656, 'open': 512.4600219726562, 'close': 501.20001220703125, 'volume': 2099700, 'adjclose': 476.2796936035156, 'formatted_date': '2020-05-19'}, {'date': 1589981400, 'high': 513.0, 'low': 503.4599914550781, 'open': 509.55999755859375, 'close': 508.739990234375, 'volume': 1496400, 'adjclose': 483.44476318359375, 'formatted_date': '2020-05-20'}, {'date': 1590067800, 'high': 509.6300048828125, 'low': 504.1499938964844, 'open': 508.0799865722656, 'close': 508.510009765625, 'volume': 1007600, 'adjclose': 483.2262268066406, 'formatted_date': '2020-05-21'}, {'date': 1590154200, 'high': 513.5700073242188, 'low': 503.0, 'open': 506.0, 'close': 513.2999877929688, 'volume': 632700, 'adjclose': 487.77801513671875, 'formatted_date': '2020-05-22'}

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [None]:
# Display the name of the CSV returned by extract_tickers.
valid_tickers_filename

'Output_new.csv'

# **Obtain closing price from yf and calculate the return**

In [None]:
def get_return(export_path,valid_tickers_filename,end_price_index=1):
    """Download prices for every retained transcript and compute its return.

    For each row of ``valid_tickers_filename`` the daily prices between
    'startDate' and 'EndDate' are requested and the relative change of the
    adjusted close is stored in a new 'return' column:
    ``(adjclose[end_price_index] - adjclose[0]) / adjclose[0]``.

    Parameters
    ----------
    export_path : str
        Folder holding the input CSV; also the output folder.
    valid_tickers_filename : str
        CSV with the columns 'companyTicker', 'startDate' and 'EndDate'.
    end_price_index : int, default 1
        Position, within the downloaded price list, of the price used as the
        end price. The default keeps the previous behaviour (second price in
        the window).
        NOTE(review): EndDate is five business days after startDate, so
        confirm that a return to the second price is the intended horizon;
        -1 would use the last price in the window.

    Returns
    -------
    str
        Name of the written file, ``<folder>_return.csv``. Rows for which the
        download or the calculation failed have an empty 'return'.
    """
    os.chdir(export_path)

    df_relevant_tickers = pd.read_csv(valid_tickers_filename, index_col=0).reset_index()
    stock_dif = []

    for row_ind in df_relevant_tickers.index:
        ticker = df_relevant_tickers.loc[row_ind, 'companyTicker']
        ticker = ticker.split('.')[0]
        start_date = str(df_relevant_tickers.loc[row_ind, 'startDate']).split()[0]
        end_date = str(df_relevant_tickers.loc[row_ind, 'EndDate']).split()[0]
        print(row_ind)
        print(ticker)
        print(start_date)
        print(end_date)

        try:
            yahoo_financials = YahooFinancials(ticker)
            historical_stock_prices = yahoo_financials.get_historical_price_data(start_date, end_date, 'daily')
            stock_prices = historical_stock_prices[ticker]['prices']
        except Exception:
            # Download or lookup failed; keep the row, without a return.
            # `Exception` (not a bare except) keeps Ctrl-C working.
            print('error')
            stock_dif.append(None)
            continue

        try:
            start_price = stock_prices[0]['adjclose']
            end_price = stock_prices[end_price_index]['adjclose']
            stock_dif.append((end_price - start_price) / start_price)
        except (IndexError, KeyError, TypeError, ZeroDivisionError):
            # Too few prices in the window, or a missing/zero adjusted close.
            stock_dif.append(None)

    # A float column (None -> NaN) instead of an object column mixing floats and None.
    df_relevant_tickers['return'] = np.array(stock_dif, dtype=float)

    yield_filename = f'{Path(export_path).name}_return.csv'
    df_relevant_tickers.to_csv(yield_filename)

    return  yield_filename


In [None]:
 # Download prices and write the CSV with the 'return' column; keeps the name of that CSV.
 yield_filename=get_return(export_path,valid_tickers_filename)

0
BLK
2020-05-19
2020-05-26
[{'date': 1589895000, 'high': 515.5800170898438, 'low': 498.8299865722656, 'open': 512.4600219726562, 'close': 501.20001220703125, 'volume': 2099700, 'adjclose': 476.2796325683594, 'formatted_date': '2020-05-19'}, {'date': 1589981400, 'high': 513.0, 'low': 503.4599914550781, 'open': 509.55999755859375, 'close': 508.739990234375, 'volume': 1496400, 'adjclose': 483.4447326660156, 'formatted_date': '2020-05-20'}, {'date': 1590067800, 'high': 509.6300048828125, 'low': 504.1499938964844, 'open': 508.0799865722656, 'close': 508.510009765625, 'volume': 1007600, 'adjclose': 483.2262268066406, 'formatted_date': '2020-05-21'}, {'date': 1590154200, 'high': 513.5700073242188, 'low': 503.0, 'open': 506.0, 'close': 513.2999877929688, 'volume': 632700, 'adjclose': 487.77801513671875, 'formatted_date': '2020-05-22'}]
1
CCL
2020-06-08
2020-06-15
[{'date': 1591623000, 'high': 25.280000686645508, 'low': 23.40999984741211, 'open': 24.56999969482422, 'close': 24.90999984741211, 

# **Sentiment Analysis Models: Vader and Finbert**

In [None]:
def vader_senti(List):
    """Label every sentence as 'Positive', 'Negative' or 'Neutral' with VADER.

    Parameters
    ----------
    List : iterable of str
        Sentences to score.

    Returns
    -------
    pandas.Series
        Named 'Sentiment', with a 0..n-1 index and exactly one label per input
        sentence, in input order. A compound score above 0.20 is 'Positive',
        a score of -0.2 or below is 'Negative', anything else 'Neutral'.
    """
    sentences = list(List)
    if not sentences:
        # Nothing to score: return an empty, correctly named Series instead of
        # failing on a missing 'compound' column.
        return pd.Series([], dtype=object, name="Sentiment")

    analyzer = SentimentIntensityAnalyzer()
    sentiment = []
    # Score sentence by sentence. Collecting the scores in a dict keyed by the
    # sentence text would merge repeated sentences and return fewer labels
    # than inputs, shifting every label after the first repeat.
    for sentence in sentences:
        score = analyzer.polarity_scores(sentence)["compound"]
        if score > 0.20:
            sentiment.append('Positive')
        elif score <= -0.2:
            sentiment.append('Negative')
        else:
            sentiment.append('Neutral')

    return pd.Series(sentiment, name="Sentiment")

In [None]:
# Pretrained FinBERT tone classifier (3 output classes) and its matching tokenizer.
finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone',num_labels=3)
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')
# Maps the index of the largest model output to a sentiment label (looked up in finbert_senti).
labels = {0:'Neutral', 1:'Positive',2:'Negative'}



def finbert_senti(X):
    """Label every sentence as 'Neutral', 'Positive' or 'Negative' with FinBERT.

    Uses the module-level ``tokenizer``, ``finbert`` model and ``labels`` map.

    Parameters
    ----------
    X : iterable of str
        Sentences to classify, one model call per sentence.

    Returns
    -------
    list of str
        One label per input sentence, in input order.
    """
    sent_val = []
    # Inference only: no_grad avoids building the autograd graph for every call.
    with torch.no_grad():
        for x in X:
            # Truncate to 512 tokens so that a very long "sentence" cannot
            # exceed the model's position embeddings and crash the run.
            inputs = tokenizer(x, return_tensors="pt", padding=True, truncation=True, max_length=512)
            outputs = finbert(**inputs)[0]
            sent_val.append(labels[int(outputs.argmax())])

    return sent_val

# **Cleaning and Splitting Text**

In [None]:
# Regular expression used by GetSentences to locate sentence boundaries.
#
# The pattern is an alternation of four branches:
#   1-3. "Yum!" followed by Brand(s), "inc./corp./ltd." followed by "(", and a list of
#        abbreviations ("mrs.", "prof.", "gov.", ...) followed by a capital letter, quote or
#        "(". These branches contain no capture group, so group(1) is empty when they
#        match; GetSentences skips such matches, i.e. they are NOT sentence boundaries.
#   4.   A token (a word of 3+ letters, one of the listed short words, "'s", "A&B", "(ABC)",
#        a number, ...) followed by optional quotes/parentheses, one or more of ! . ?,
#        optional quotes/parentheses and a run of spaces, and then (lookahead only)
#        something that looks like the start of a sentence (capitalised word, Q1-Q4, a year, ...).
#        The run of spaces is capture group 1: it is the gap that GetSentences cuts out.
reSentenceSplitter = re.compile(
    "(?i:\\bYum!\\s*(?=Brands?\\b))|" +
    "(?i:\\b(?:inc|corp|ltd)\\.\\s*\\()|" +
    "(?i:\\b(?:mrs|messrs|sen|esq|adv|prof|rev|gov|gen|rep|hon|adj|oblig|tbk|dev|inv|invs|opn|constr|conces|(?-i:Med|Cap|Develop|Met))\\.\\s*(?=[A-Z'\"\\(]))|" +
    "\\b(?i:[a-z][a-z][a-z]+(?:'t)?|" +
    "(?-i:we|us|in|to|at|of|on|as|be|by|do|go|he|is|it|me|so|up|no|US|UK|EU|MW|)|" +
    "'s|" +
    "[A-Z]&(?:amp;)?[A-Z]+|" +
    "\\s*\\([A-Z]+\\)|" +
    "\\d+[%$sx]?|" +
    "\\)(?=\\.)|" +
    "[A-Z]+\\d+)" +
    "[\"'\\)]*[!\\.\\?]+[\"'\\)]*( +)" +
    "(?=" + "(?:<[^\\>][^>]*>)*" +
    "(?:[\"']+(?:<[^\\>][^>]*>)*)?" +
    "\\(?" +
    "(?:[A-Z][a-z]|Q[1-4]|\\d+[A-Z]+ |[AI] |I'|U\\.S\\b|[A-Z]+\\b|(?:19|20)\\d\\d ))") 

def NormalizeString(s, doUnescape=False):
    """Clean one piece of transcript text.

    Steps, in order: drop ``[[...]]`` spans, transliterate to ASCII, turn
    ``<``/``{`` and ``>``/``}`` into square brackets, delete parentheses,
    ``*``, ``=`` and ``-``, drop bracketed spans, remove greeting/filler
    phrases and collapse whitespace.

    Parameters
    ----------
    s : str
        Text to clean.
    doUnescape : bool, optional
        Accepted for compatibility with existing calls; it is not used.

    Returns
    -------
    str
        The cleaned text, stripped, with single spaces between words.
    """
    filler_words=['Hello','hello','Thanks','Thank you','thank you','good morning','everyone','Good morning','Good day','good day',
                  'bye','Bye','hey','Hi','welcome','Welcome','Good afternoon','Good evening','good afternoon','good evening','yea','yeah',"We'll talk to you soon.","ladies and gentlemen",'Have a good day.','Have a nice day','Have a','Okay']
    s = re.sub(r'\[\[.*?\]\]', '', s)
    s = unidecode(s)
    s = re.sub(r'[<{]', '[', s)
    s = re.sub(r'[>}]', ']', s)
    s = s.replace('(', '')
    s = s.replace(')', '')
    s = s.replace(' .', '.')
    s = s.replace('*', '')
    s = s.replace('=','')
    s = s.replace('-','')
    # Raw string: same pattern as before, without invalid "\(" escapes in a normal string.
    s = re.sub(r"[\(\[].*?[\)\]]", "", s)
    s = s.replace('[]','')
    # Remove filler phrases only where they stand as whole words. A plain
    # substring replace also hits the inside of other words ('yea' in "year",
    # 'hey' in "they", 'Hi' in "History") and corrupts the text. Longest
    # phrases go first so that e.g. 'Have a good day.' wins over 'Have a'.
    alternatives = '|'.join(re.escape(word) for word in sorted(filler_words, key=len, reverse=True))
    s = re.sub(r'(?<!\w)(?:' + alternatives + r')(?!\w)', '', s)
    return re.sub(r'\s+', ' ', s).strip()


def GetSentences(text):
    """Split raw text into a flat list of cleaned sentences.

    The text is processed line by line. Each non-blank line is cleaned with
    NormalizeString and then cut at every reSentenceSplitter match that
    captured a gap (group 1); matches without a captured gap are ignored.
    Empty pieces are never returned.
    """
    collected = []
    for raw_line in text.replace('\n\n', '\n').split('\n'):
        stripped = raw_line.strip()
        if stripped:
            normalized = NormalizeString(stripped)
            cursor = 0  # start of the sentence currently being collected
            for match in reSentenceSplitter.finditer(normalized):
                if match.group(1):
                    piece = normalized[cursor:match.start(1)].strip()
                    cursor = match.end(1)
                    if piece:
                        collected.append(piece)
            # Whatever follows the last boundary is the final sentence of the line.
            tail = normalized[cursor:].strip()
            if tail:
                collected.append(tail)
    return collected

# **Divide each text into two parts: Presentation and Q&A**

In [None]:
def text_divisions(df):
    """Split a transcript's sentences into Presentation and Q&A columns.

    The rows of ``df['Text']`` that are exactly 'Presentation' and
    'Questions and Answers' act as section markers (the first occurrence of
    each is used). Sentences after the 'Presentation' marker and before the
    'Questions and Answers' marker go to the 'Presentation' column; sentences
    after the 'Questions and Answers' marker go to the 'Questions and Answers'
    column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'Text' column with one sentence per row.

    Returns
    -------
    pandas.DataFrame
        One column per section that was found, side by side, with a 0..n-1
        index; the shorter column is padded with empty strings.

    Raises
    ------
    IndexError
        If neither marker is present.
    """
    text = df['Text'].reset_index(drop=True)
    pres_hits = np.flatnonzero((text == 'Presentation').to_numpy())
    qa_hits = np.flatnonzero((text == 'Questions and Answers').to_numpy())
    has_pres = len(pres_hits) > 0
    has_qa = len(qa_hits) > 0

    if not has_pres and not has_qa:
        raise IndexError("neither a 'Presentation' nor a 'Questions and Answers' marker was found")

    sections = []
    if has_pres:
        stop = qa_hits[0] if has_qa else len(text)
        sections.append(text.iloc[pres_hits[0] + 1:stop].reset_index(drop=True).rename('Presentation'))
    if has_qa:
        sections.append(text.iloc[qa_hits[0] + 1:].reset_index(drop=True).rename('Questions and Answers'))

    # Every section starts at index 0, also when only one section exists, so
    # that values assigned to the result later line up row by row.
    dfnew = pd.concat(sections, axis=1)
    dfnew = dfnew.fillna("")

    return dfnew

# **Divide the text according to participants: corporate and conference call participants**

In [None]:
def speakers_lists(text,files):
    """Build the lists of corporate and conference-call speaker names.

    The names are read from the 'Corporate Participants' and
    'Conference Call Participants' sections of ``text``. A corporate line is
    cut at the first word of the company name; the remaining text is kept
    both as is and reduced to its first and last word. For conference lines
    the combinations first+second, first+third and first+second+third word
    are kept. Fixed 'Unidentified ...' labels are appended to each list.

    Parameters
    ----------
    text : str
        Transcript body.
    files : list of str
        XML files whose ``companyName`` elements are read. Only the first
        word of the first company name found is used, so pass the file that
        ``text`` comes from.

    Returns
    -------
    (list of str, list of str)
        Normalised corporate names and normalised conference names.
    """
    def _section_lines(start_marker, end_markers):
        # Lines between the heading `start_marker` and the first of
        # `end_markers` found after it. A missing heading gives no lines
        # (str.find's -1 would otherwise silently produce a bogus slice).
        start = text.find(start_marker)
        if start == -1:
            return []
        start += len(start_marker) + 1  # also skip the newline after the heading
        for marker in end_markers:
            end = text.find(marker, start)
            if end != -1:
                return text[start:end].split('\n')
        return text[start:].split('\n')

    corp_txt = _section_lines('Corporate Participants',
                              ['\nConference Call Participants', 'Presentation', 'Questions and Answers'])
    conf_txt = _section_lines('Conference Call Participants',
                              ['Presentation', 'Questions and Answers'])

    comp_name = []
    for file in files:
        root = ET.parse(file).getroot()
        comp_name.extend(elem.text for elem in root.iter('companyName') if elem.text)
    company_words = ' '.join(comp_name).split()

    if company_words:
        corp_name = [x.split(company_words[0])[0] for x in corp_txt]
    else:
        # No company name available: keep the lines whole.
        corp_name = list(corp_txt)
    corp_name = [x.strip() for x in corp_name if x.strip() != '']
    corp_speaker1 = corp_name     # for cases where first name, initial. last name
    corp_speaker2 = [x.split()[0] + ' ' + x.split()[-1] for x in corp_name]
    corp_speaker3 = ['Unidentified Company Representative']

    conf_speaker1 = [x.split()[0] + ' ' + x.split()[1] for x in conf_txt if len(x.split()) >= 2]
    conf_speaker2 = [x.split()[0] + ' ' + x.split()[2] for x in conf_txt if len(x.split()) >= 3]    # accounts for speakers with middle name
    conf_speaker3 = [x.split()[0] + ' ' + x.split()[1] + ' ' + x.split()[2] for x in conf_txt if len(x.split()) >= 3]    # accounts for speakers with middle name
    conf_speaker4 = ['Unidentified Participant', 'Unidentified Analyst', 'Unidentified Audience Member']

    corp_list = corp_speaker1 + corp_speaker2 + corp_speaker3
    conf_list = conf_speaker1 + conf_speaker2 + conf_speaker3 + conf_speaker4

    list1 = [NormalizeString(s, doUnescape=False) for s in corp_list]
    list1 = [ele for ele in list1 if ele.strip()]
    list2 = [NormalizeString(s, doUnescape=False) for s in conf_list]
    list2 = [ele for ele in list2 if ele.strip()]

    # The generic conference labels must never count as corporate speakers.
    not_corporate = {'Unidentified Audience Member', 'Unidentified Analyst', 'Unidentified Participant'}
    list1 = [value for value in list1 if value not in not_corporate]

    return list1,list2

In [None]:
def split_speakers(body,files):
    """Split a transcript body into one row per speaker turn.

    ``body[0]`` is cut at the dashed separator lines. The piece before the
    first separator is ignored; after it the pieces alternate between a
    speaker line and the text spoken.

    Parameters
    ----------
    body : list of str
        The transcript body; only ``body[0]`` is split, while all items
        joined together are used to find the participant lists.
    files : list of str
        Passed on to ``speakers_lists``.

    Returns
    -------
    pandas.DataFrame
        Columns 'speaker_details' (normalised speaker line), 'speaker_name'
        (text before the first comma), 'speaker_position' (text between the
        first and second comma, or 'Unidentified' when no speaker line has a
        comma), 'is_corp' (name found in the corporate list) and 'Text'
        (normalised text of the turn).
    """
    columns = ['speaker_details', 'speaker_name', 'speaker_position', 'is_corp', 'Text']

    txt = ' '.join(body)
    corp_particip, conf_particip = speakers_lists(txt, files)

    body_separated_list = body[0].split('--------------------------------------------------------------------------------')

    # Pair every speaker line with the text that follows it. Stopping one
    # short of the end drops a trailing speaker line without text instead of
    # indexing past the end of the list.
    turns = [(body_separated_list[i], body_separated_list[i + 1])
             for i in range(1, len(body_separated_list) - 1, 2)]
    if not turns:
        return pd.DataFrame(columns=columns)

    # Newlines become spaces before normalising, because NormalizeString's
    # bracket removal does not match across line breaks.
    speakers_df = pd.DataFrame({
        'speaker_details': [NormalizeString(speaker.replace('\n', ' '), doUnescape=False) for speaker, _ in turns],
        'Text': [NormalizeString(content.replace('\n', ' '), doUnescape=False) for _, content in turns],
    })

    name_parts = speakers_df['speaker_details'].str.split(',', expand=True)
    speakers_df['speaker_name'] = name_parts[0]
    # Column 1 exists only if at least one speaker line contains a comma.
    speakers_df['speaker_position'] = name_parts[1] if 1 in name_parts.columns else 'Unidentified'
    speakers_df['is_corp'] = speakers_df['speaker_name'].isin(corp_particip)

    return speakers_df[columns]

In [None]:
def split_speakers_text(df,filename):
    
        
    df_sentences_speakers = pd.DataFrame(columns=list(df.columns))
    
    for speaker in df.index:

        tmp = deepcopy(df.loc[speaker])
    
        try:
            tmp_sents = GetSentences(tmp['Text'])
            tmp_df = pd.DataFrame(columns=df_sentences_speakers.columns)
            tmp_df['Text'] = tmp_sents
            tmp_df['speaker_details'] = tmp['speaker_details']
            tmp_df['speaker_name'] = tmp['speaker_name']
            tmp_df['speaker_position'] = tmp['speaker_position']
            tmp_df['is_corp'] = tmp['is_corp']
            df_sentences_speakers = pd.concat([df_sentences_speakers, tmp_df])


        except:

            df_list=df_sentences_speakers['Text'].tolist()
            df_sentences_speakers['Finbert'] = finbert_senti(df_list)
            df_sentences_speakers['compound'] = vader_senti(df_list)
            df_sentences_speakers.reset_index(drop=True, inplace=True)
   




    df_list=df_sentences_speakers['Text'].tolist()
    df_sentences_speakers['Finbert'] = finbert_senti(df_list)
    df_sentences_speakers['compound'] = vader_senti(df_list)
    df_sentences_speakers.reset_index(drop=True, inplace=True)
   

    return df_sentences_speakers


# **Calling all the previous functions**

In [None]:
def split_clean_speakers_divisions_scores(import_path,export_path_body,export_path_speakers,files):
    """Process every transcript: per-speaker sentences and per-section sentences, both scored.

    For each ``Body`` element of each XML file two CSV files named after the
    XML file are written:

    * in ``export_path_speakers``: one row per sentence with its speaker and
      the FinBERT / VADER labels (``split_speakers`` + ``split_speakers_text``);
    * in ``export_path_body``: the 'Presentation' and/or
      'Questions and Answers' sentences (``text_divisions``), each followed by
      its VADER and FinBERT label columns.

    Parameters
    ----------
    import_path : str
        Folder containing the XML files; becomes the working directory.
    export_path_body : str
        Output folder for the section CSVs.
    export_path_speakers : str
        Output folder for the speaker CSVs.
    files : list of str
        Names of the XML files to process (the tickers of the other files
        could not be validated).

    Returns
    -------
    None
    """
    def _labels_for(sentences, scorer):
        # One label per sentence, in order: score each distinct sentence once
        # and map back by text, so the result never depends on the index or
        # length of what the scorer returns.
        distinct = list(dict.fromkeys(sentences))
        if not distinct:
            return []
        lookup = dict(zip(distinct, scorer(distinct)))
        return [lookup[sentence] for sentence in sentences]

    os.chdir(import_path)

    for file in files:
        print(file,'start')
        root = ET.parse(file).getroot()
        bname = os.path.basename(file).replace('.xml', '')

        for elem in root.iter('Body'):
            # Speaker-level output. Only the current file is passed on, so the
            # company name used to parse the participant list is this
            # transcript's own and not the first file's.
            filename = os.path.join(f'{export_path_speakers}', bname)
            speakers = split_speakers([elem.text],[file])
            df = split_speakers_text(speakers,filename)
            df.to_csv(f'{filename}.csv')

            # Section-level output.
            filename = os.path.join(f'{export_path_body}', bname)
            df = pd.DataFrame(GetSentences(elem.text), columns=['Text'])
            df = text_divisions(df).copy()

            ordered_columns = []
            for column, prefix in (('Presentation', 'Pres'), ('Questions and Answers', 'QA')):
                if column not in df.columns:
                    continue
                sentences = df[column].tolist()
                df[f'{prefix}_Vader'] = _labels_for(sentences, vader_senti)
                df[f'{prefix}_Finbert'] = _labels_for(sentences, finbert_senti)
                ordered_columns += [column, f'{prefix}_Vader', f'{prefix}_Finbert']

            df = df[ordered_columns]
            df.to_csv(f'{filename}.csv')
            print(file,'end')
            
            

In [None]:
# Process every validated transcript; writes one CSV per file into the speakers folder and one into the body folder.
split_clean_speakers_divisions_scores(import_path,export_path_body,export_path_speakers,files)

13105267_T.xml start
13105267_T.xml end
12958669_T.xml start
12958669_T.xml end


# **Participants Features**

In [None]:
def positive(val):
    """Return how many entries of ``val`` equal 'Positive' (0 when there are none)."""
    # Count the matches; summing the matching strings would concatenate them.
    return int((val == 'Positive').sum())
def negative(val):
    """Return how many entries of ``val`` equal 'Negative' (0 when there are none)."""
    return int((val == 'Negative').sum())

In [None]:
def speaker_features(export_path_speakers, output_path=None):
    """Compute one row of speaker sentiment features per transcript.

    Every CSV in ``export_path_speakers`` is read (first column as index) and
    its sentences are split by 'is_corp'. For each group and each label column
    the feature is ``(positives - negatives) / (positives + negatives + 1)``:

    * CV  -- corporate speakers, 'compound' (VADER label) column
    * CF  -- corporate speakers, 'Finbert' column
    * CNV -- conference participants, 'compound' column
    * CNF -- conference participants, 'Finbert' column

    A feature is NaN when the transcript has no sentence for that group.

    Parameters
    ----------
    export_path_speakers : str
        Folder containing the per-transcript speaker CSVs.
    output_path : str, optional
        Folder in which 'Speakers_Features.csv' is written. Defaults to the
        module-level ``export_path``, as before.

    Returns
    -------
    pandas.DataFrame
        Columns 'FileNo' (the CSV name with '.csv' replaced by '.xml'), 'CV',
        'CF', 'CNV', 'CNF'; one row per transcript, 0..n-1 index. The same
        frame is written to 'Speakers_Features.csv'.
    """
    def _tone(rows, column):
        # (positives - negatives) / (positives + negatives + 1) for one label column.
        pos = int((rows[column] == 'Positive').sum())
        neg = int((rows[column] == 'Negative').sum())
        return (pos - neg) / (pos + neg + 1)

    if output_path is None:
        output_path = export_path

    os.chdir(export_path_speakers)

    records = []
    for file in sorted(os.listdir()):
        if not file.endswith('.csv'):
            continue
        print(file)
        df = pd.read_csv(file, index_col=0)

        corporate = df[df['is_corp'] == True]
        conference = df[df['is_corp'] == False]

        record = {'FileNo': file.replace('.csv', '.xml'),
                  'CV': np.nan, 'CF': np.nan, 'CNV': np.nan, 'CNF': np.nan}

        if len(corporate) > 0:
            record['CV'] = _tone(corporate, 'compound')
            record['CF'] = _tone(corporate, 'Finbert')
        else:
            print('file:',file, ' dosent have corporate participants')

        if len(conference) > 0:
            # Positives and negatives are counted the same way as for the
            # corporate group; swapping the two flips the sign of CNV.
            record['CNV'] = _tone(conference, 'compound')
            record['CNF'] = _tone(conference, 'Finbert')
        else:
            print('file:',file, ' dosent have conference participants')

        records.append(record)

    df = pd.DataFrame(records, columns=['FileNo', 'CV', 'CF', 'CNV', 'CNF'])
    os.chdir(output_path)
    df.to_csv('Speakers_Features.csv')

    return df

In [None]:
# Compute the per-transcript speaker sentiment features; also writes 'Speakers_Features.csv'.
df_speaker_features=speaker_features(export_path_speakers)

13197438_T.csv
file: 13197438_T.csv  dosent have corporate participants
error
negatives: 245 
positives: 55
Conf Vader feature: -0.6312292358803987
negatives: 80 
positives: 103
Conf Finbert feature: 0.125
13215635_T.csv
negatives: 8 
positives: 90
Corp Vader feature: 0.8282828282828283
negatives: 11 
positives: 20
Corp Finbert feature: 0.28125
negatives: 25 
positives: 1
Conf Vader feature: -0.8888888888888888
negatives: 3 
positives: 3
Conf Finbert feature: 0.0
12958669_T.csv
negatives: 7 
positives: 191
Corp Vader feature: 0.9246231155778895
negatives: 63 
positives: 109
Corp Finbert feature: 0.2658959537572254
negatives: 54 
positives: 0
Conf Vader feature: -0.9818181818181818
negatives: 14 
positives: 14
Conf Finbert feature: 0.0
13105267_T.csv
negatives: 0 
positives: 31
Corp Vader feature: 0.96875
negatives: 1 
positives: 0
Corp Finbert feature: -0.5
negatives: 0 
positives: 31
Corp Vader feature: 0.96875
negatives: 1 
positives: 0
Corp Finbert feature: -0.5
file: 13105267_T.csv

# **Body Features**

In [None]:
def body_features(export_path_body):
    """Compute presentation and Q&A sentiment-balance features for each body CSV.

    Every file found in ``export_path_body`` is read, its rows are grouped by
    the ``Presentation`` text and (separately) by the ``Questions and Answers``
    text, and for each Vader / FinBERT label column the number of groups with a
    non-zero ``positive`` / ``negative`` aggregate is counted.  Each feature is
    ``(pos - neg) / (pos + neg + 1)``.

    Relies on names defined elsewhere in the notebook: the ``negative`` and
    ``positive`` aggregation functions and the ``export_path`` output folder.

    Parameters
    ----------
    export_path_body : str
        Directory holding the per-transcript body CSV files.

    Returns
    -------
    pandas.DataFrame
        One row per processed file with ``FileNo`` (file name with ``.csv``
        replaced by ``.xml``) plus whichever of ``PV``, ``PF``, ``QAV``,
        ``QAF`` could be computed; missing sections end up as NaN after the
        concatenation.

    Raises
    ------
    ValueError
        If no file in ``export_path_body`` produced a feature row.

    Side effects
    ------------
    Changes the working directory (first to ``export_path_body``, then to the
    notebook-level ``export_path``) and writes ``Body_Features.csv`` there.
    """
    os.chdir(export_path_body)
    file_name = os.listdir()

    features = []

    for file in file_name:
        print(file)
        files = glob.glob(file)

        df_new = None

        for i, f in enumerate(files):
            if i == 0:
                df_new = pd.read_csv(f, index_col=0)
                df_new['fname'] = f.replace('.csv', '.xml')
            else:
                # Was a bare `read_csv` (NameError) followed by the removed
                # DataFrame.append; pd.concat is the supported equivalent.
                tmp = pd.read_csv(f)
                tmp['fname'] = f
                df_new = pd.concat([df_new, tmp])

            features.append(_body_feature_row(df_new, file))

    if not features:
        raise ValueError('no body feature rows were produced from ' + str(export_path_body))

    all_features = pd.concat(features)
    # `export_path` is the notebook-level output folder, not a parameter.
    os.chdir(export_path)
    all_features.to_csv('Body_Features.csv')

    return all_features


def _sentiment_balance(grouped, column, neg_name, pos_name, label):
    """Print and return ``(pos - neg) / (pos + neg + 1)`` for one label column."""
    # `negative` / `positive` are notebook-level aggregators defined outside
    # this cell; presumably they return non-zero for a group carrying that
    # polarity -- confirm against their definitions.
    counts = grouped[column].agg([(neg_name, negative), (pos_name, positive)])
    n_neg = counts[counts[neg_name] != 0][neg_name].count()
    n_pos = counts[counts[pos_name] != 0][pos_name].count()
    print('negatives:', n_neg, '\npositives:', n_pos)

    # The +1 keeps the denominator non-zero when both counts are 0.
    score = (n_pos - n_neg) / (n_pos + n_neg + 1)
    print(label, score)
    return score


def _qa_scores(df):
    """Return ``(QA_vader, QA_finbert)`` computed over the Q&A groups."""
    # The original built this grouping (as `groupRes1`) but then aggregated
    # over the Presentation grouping, so Q&A rows without a presentation
    # sentence were dropped from the counts.
    qa_groups = df.groupby(df['Questions and Answers'])
    qa_vader = _sentiment_balance(qa_groups, 'QA_Vader', 'Vader_neg', 'Vader_pos',
                                  'Q&A Vader feature:')
    qa_finbert = _sentiment_balance(qa_groups, 'QA_Finbert', 'finbert_neg', 'finbert_pos',
                                    'QA Finbert feature:')
    return qa_vader, qa_finbert


def _feature_frame(df, **values):
    """Build the one-row frame: ``FileNo`` from the first row of *df*, then *values*."""
    row = pd.DataFrame({'FileNo': df['fname'].head(1)})
    for column, value in values.items():
        row[column] = value
    return row


def _body_feature_row(df, file):
    """Compute the feature row for one transcript, degrading when a section is missing."""
    try:
        pres_groups = df.groupby(df['Presentation'])
        pres_vader = _sentiment_balance(pres_groups, 'Pres_Vader', 'Vader_neg', 'Vader_pos',
                                        'Presentation Vader feature:')
        pres_finbert = _sentiment_balance(pres_groups, 'Pres_Finbert', 'finbert_neg', 'finbert_pos',
                                          'Presentation Finbert feature:')
    except Exception:
        # Best effort, as before: fall back to Q&A-only features.  A failure
        # in the Q&A part as well is left to propagate to the caller.
        print('file:', file, 'doesnt have Presentation part')
        qa_vader, qa_finbert = _qa_scores(df)
        return _feature_frame(df, QAV=qa_vader, QAF=qa_finbert)

    try:
        qa_vader, qa_finbert = _qa_scores(df)
    except Exception:
        print('file:', file, 'doesnt have Q&A part')
        return _feature_frame(df, PV=pres_vader, PF=pres_finbert)

    return _feature_frame(df, PV=pres_vader, PF=pres_finbert, QAV=qa_vader, QAF=qa_finbert)


In [None]:
# Build the per-file presentation / Q&A sentiment features from the CSVs under
# export_path_body; the call also writes Body_Features.csv.
df_body_features=body_features(export_path_body)

13197438_T.csv
negatives: 0 
positives: 3
Presentation Vader feature: 0.75
negatives: 0 
positives: 2
Presentation Finbert feature: 0.6666666666666666
negatives: 1 
positives: 1
Q&A Vader feature: 0.0
negatives: 0 
positives: 0
QA Finbert feature: 0.0
13215635_T.csv
negatives: 0 
positives: 2
Presentation Vader feature: 0.6666666666666666
negatives: 0 
positives: 0
Presentation Finbert feature: 0.0
negatives: 0 
positives: 0
Q&A Vader feature: 0.0
negatives: 0 
positives: 0
QA Finbert feature: 0.0
12958669_T.csv
negatives: 8 
positives: 73
Presentation Vader feature: 0.7926829268292683
negatives: 35 
positives: 60
Presentation Finbert feature: 0.2604166666666667
negatives: 6 
positives: 65
Q&A Vader feature: 0.8194444444444444
negatives: 18 
positives: 18
QA Finbert feature: 0.0
13105267_T.csv
negatives: 1 
positives: 22
Presentation Vader feature: 0.875
negatives: 1 
positives: 0
Presentation Finbert feature: -0.5
file: 13105267_T.csv doesnt have Q&A part
negatives: 1 
positives: 22
P

# **Merge all the feature files (participant features, body features, returns)**

In [None]:
def merge(export_path,df_body_features,df_speaker_features,yield_filename):
  """Join body features, speaker features and returns into one table.

  The returns CSV ``yield_filename`` is read from ``export_path`` (the working
  directory is switched there).  Body features are right-joined onto the
  speaker features, and that result is right-joined onto the returns, both on
  ``FileNo``; the output therefore has one row per row of the returns file.

  Returns the merged frame restricted to, and ordered by, the feature and
  return columns listed below.
  """
  ordered_columns = [
      'FileNo',
      'CV', 'CF', 'CNV', 'CNF',
      'PV', 'PF', 'QAV', 'QAF',
      'companyName', 'companyTicker', 'startDate', 'EndDate', 'return',
  ]

  os.chdir(export_path)
  returns_table = pd.read_csv(yield_filename, index_col=0)

  combined = df_body_features.merge(df_speaker_features, on="FileNo", how="right")
  combined = combined.merge(returns_table, on="FileNo", how="right")

  return combined[ordered_columns]

In [None]:
# Combine the body features, speaker features and the returns file into one table.
# NOTE(review): yield_filename is not assigned in the cells shown here; it must
# already exist in the kernel -- confirm it is defined by an earlier cell.
final=merge(export_path,df_body_features,df_speaker_features,yield_filename)
final.head()

Unnamed: 0,FileNo,CV,CF,CNV,CNF,PV,PF,QAV,QAF,companyName,companyTicker,startDate,EndDate,return
0,13197438_T.xml,,,-0.631229,0.125,0.75,0.666667,0.0,0.0,BlackRock Inc,BLK,2020-05-19,2020-05-26,0.015044
1,13215635_T.xml,0.828283,0.28125,-0.888889,0.0,0.666667,0.0,0.0,0.0,Carnival Corp,CCL,2020-06-08,2020-06-15,-0.07507
2,12958669_T.xml,0.924623,0.265896,-0.981818,0.0,0.792683,0.260417,0.819444,0.0,Carnival Corp,CCL,2019-12-20,2019-12-27,0.020912
3,13105267_T.xml,0.96875,-0.5,,,0.875,-0.5,,,Verisk Analytics Inc,VRSK,2020-05-20,2020-05-27,-0.016924
4,12302678_T.xml,0.987342,0.928571,-0.971429,0.357143,,,0.860215,0.839506,Ingersoll-Rand PLC,IR,2019-02-21,2019-02-28,-0.01146


# **Label the future change in return as classes**

In [None]:
 def map_return_to_classes(export_path,final):
   """Add a five-way ``Sentiment`` label derived from the ``return`` column.

   Buckets (checked in this order):
     * ``return >= 0.03``             -> 'Large Increase'
     * ``return <= -0.03``            -> 'Large Decrease'
     * ``0.015 <= return <= 0.03``    -> 'Increase'
     * ``-0.03 <= return <= -0.015``  -> 'Decrease'
     * anything else                  -> 'No Change'

   A NaN return fails every comparison and therefore lands in 'No Change'.

   Side effects: switches the working directory to ``export_path``, adds the
   ``Sentiment`` column to ``final`` in place and writes ``final.csv``.
   Returns the same ``final`` object.
   """
   def _bucket(change):
     # Guard clauses mirror the threshold order documented above.
     if change >= 0.03:
       return 'Large Increase'
     if change <= -0.03:
       return 'Large Decrease'
     if 0.015 <= change <= 0.03:
       return 'Increase'
     if -0.03 <= change <= -0.015:
       return 'Decrease'
     return 'No Change'

   os.chdir(export_path)
   final['Sentiment'] = [_bucket(value) for value in final["return"].values]
   final.to_csv('final.csv')

   return final


In [None]:
# Label each row's return with a Sentiment class; the call also writes final.csv
# into export_path and returns the same (now extended) frame.
final=map_return_to_classes(export_path,final)
final.head()

Unnamed: 0,FileNo,CV,CF,CNV,CNF,PV,PF,QAV,QAF,companyName,companyTicker,startDate,EndDate,return,Sentiment
0,13197438_T.xml,,,-0.631229,0.125,0.75,0.666667,0.0,0.0,BlackRock Inc,BLK,2020-05-19,2020-05-26,0.015044,Increase
1,13215635_T.xml,0.828283,0.28125,-0.888889,0.0,0.666667,0.0,0.0,0.0,Carnival Corp,CCL,2020-06-08,2020-06-15,-0.07507,Large Decrease
2,12958669_T.xml,0.924623,0.265896,-0.981818,0.0,0.792683,0.260417,0.819444,0.0,Carnival Corp,CCL,2019-12-20,2019-12-27,0.020912,Increase
3,13105267_T.xml,0.96875,-0.5,,,0.875,-0.5,,,Verisk Analytics Inc,VRSK,2020-05-20,2020-05-27,-0.016924,Decrease
4,12302678_T.xml,0.987342,0.928571,-0.971429,0.357143,,,0.860215,0.839506,Ingersoll-Rand PLC,IR,2019-02-21,2019-02-28,-0.01146,No Change


# **Build Catboost Model**

In [None]:
def build_catboost_model(final, random_state=None):
  """Train a CatBoost classifier on the sentiment features and predict a hold-out set.

  Parameters
  ----------
  final : pandas.DataFrame
      Merged table containing the feature columns, the identifier / return
      columns dropped below, and the ``Sentiment`` label.
  random_state : int or None, optional
      Seed forwarded to ``train_test_split``.  The default ``None`` keeps the
      previous behaviour (a different split on every call); pass an integer
      for a reproducible split.

  Returns
  -------
  tuple
      ``(X_test, y_test, predicted_y, model_CBC)`` where ``X_test`` is the
      hold-out feature frame extended with the ``Sentiment`` (actual) and
      ``predicted_y`` columns, ``y_test`` is the one-column label frame,
      ``predicted_y`` is the raw output of ``model.predict`` and
      ``model_CBC`` is the fitted classifier.
  """
  labels = final.loc[:, ['Sentiment']]
  feature_frame = final.drop(
      ['FileNo', 'companyName', 'companyTicker', 'startDate', 'EndDate', 'return', 'Sentiment'],
      axis=1)
  X_train, X_test, y_train, y_test = train_test_split(
      feature_frame, labels, test_size=0.2, random_state=random_state)

  model_CBC = ctb.CatBoostClassifier()
  model_CBC.fit(X_train, y_train, plot=True)
  print(model_CBC)

  # Predict before the label columns are attached so the model only sees features.
  predicted_y = model_CBC.predict(X_test)

  # The split hands back a slice of `feature_frame`; copy it before adding
  # columns so pandas does not emit SettingWithCopyWarning.
  X_test = X_test.copy()
  X_test['Sentiment'] = y_test['Sentiment']
  X_test['predicted_y'] = np.ravel(predicted_y)

  return X_test,y_test,predicted_y,model_CBC


In [None]:
# Train the classifier. X_test and model_CBC are kept as notebook-level names
# because the confusion-matrix and prediction-probability cells below read them.
X_test,y_test,predicted_y,model_CBC=build_catboost_model(final)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Learning rate set to 0.060955
0:	learn: 1.0864585	total: 391us	remaining: 392ms
1:	learn: 1.0744604	total: 862us	remaining: 430ms
2:	learn: 1.0647700	total: 1.08ms	remaining: 360ms
3:	learn: 1.0551748	total: 1.3ms	remaining: 324ms
4:	learn: 1.0457175	total: 1.53ms	remaining: 304ms
5:	learn: 1.0363963	total: 1.76ms	remaining: 292ms
6:	learn: 1.0228736	total: 1.93ms	remaining: 273ms
7:	learn: 1.0137321	total: 2.11ms	remaining: 261ms
8:	learn: 1.0026768	total: 2.33ms	remaining: 256ms
9:	learn: 0.9917664	total: 2.62ms	remaining: 259ms
10:	learn: 0.9809995	total: 2.86ms	remaining: 257ms
11:	learn: 0.9703747	total: 3.1ms	remaining: 255ms
12:	learn: 0.9618229	total: 3.29ms	remaining: 249ms
13:	learn: 0.9487067	total: 3.47ms	remaining: 244ms
14:	learn: 0.9385097	total: 3.7ms	remaining: 243ms
15:	learn: 0.9258104	total: 3.93ms	remaining: 242ms
16:	learn: 0.9177251	total: 4.12ms	remaining: 239ms
17:	learn: 0.9097515	total: 4.36ms	remaining: 238ms
18:	learn: 0.9009093	total: 4.54ms	remaining: 234

# **Confusion Matrix for CatBoost**

In [None]:
#Making the Confusion Matrix for CatBoost
def confusion_matrix(y_test,predicted_y, features=None, model=None):
  """Print the confusion matrix, accuracy and class probabilities for a hold-out set.

  Note: this definition shadows ``sklearn.metrics.confusion_matrix`` imported
  at the top of the notebook.

  Parameters
  ----------
  y_test : array-like
      Actual labels of the hold-out rows (a one-column frame is accepted).
  predicted_y : array-like
      Predicted labels for the same rows, in the same order.
  features : pandas.DataFrame, optional
      Hold-out feature frame used for ``predict_proba``.  ``Sentiment`` and
      ``predicted_y`` columns are dropped if present.  Defaults to the
      notebook-level ``X_test``.
  model : fitted classifier, optional
      Defaults to the notebook-level ``model_CBC``.

  Returns
  -------
  None
      Everything is printed.
  """
  # Fall back to the notebook globals the original implementation relied on.
  if features is None:
    features = X_test
  if model is None:
    model = model_CBC

  actual = np.ravel(y_test)
  predicted = np.ravel(predicted_y)

  # Built from the arguments (not from the global X_test) so the table always
  # matches the labels that were passed in.
  table = pd.crosstab(actual, predicted, rownames=['Actual'], colnames=['Predicted'])
  print (table)
  print("\naccuracy_score:")
  # The score used to be computed and thrown away, leaving this section empty.
  print(accuracy_score(actual, predicted))
  print("\nprediction probability:")
  pred_probs = model.predict_proba(
      features.drop(columns=['Sentiment', 'predicted_y'], errors='ignore'))
  print(pred_probs)


In [None]:
# Print the confusion matrix, accuracy header and class probabilities for the hold-out rows.
confusion_matrix(y_test,predicted_y)

Predicted       No Change
Actual                   
Decrease                1
Large Decrease          1

accuracy_score:

prediction probability:
[[0.01121592 0.01390391 0.97488018]
 [0.01282354 0.01938792 0.96778855]]


# **Prediction Probability**

In [None]:
# Class probabilities for the hold-out rows; the two label columns that were
# attached to X_test are removed so the model only receives feature columns.
pred_probs = model_CBC.predict_proba(X_test.drop(['Sentiment', 'predicted_y'], axis=1))
pred_probs

array([[0.01121592, 0.01390391, 0.97488018],
       [0.01282354, 0.01938792, 0.96778855]])

# **Save the Model**

In [None]:
def save_model(model_CBC, filename="classifier_example.pkl"):
  """Pickle ``model_CBC`` to ``filename``.

  Parameters
  ----------
  model_CBC : object
      The object to serialise (the fitted classifier in this notebook).
  filename : str, optional
      Target path; a relative path is resolved against the current working
      directory.  Defaults to ``"classifier_example.pkl"``.
  """
  # `with` guarantees the handle is closed even if pickle.dump raises.
  with open(filename, mode = "wb") as pickle_out:
    pickle.dump(model_CBC, pickle_out)

In [None]:
# Persist the fitted classifier as a pickle file in the current working directory.
save_model(model_CBC)

# **Optimal parameters for CatBoost using GridSearchCV**

In [None]:
 def optimal_parameters(final):
   """Grid-search CatBoost hyper-parameters and refit a model with the best ones.

   Rows whose ``return`` is missing are discarded, the remaining rows are
   split 80/20, and a 5-fold ``GridSearchCV`` is run on the training part
   over depth, number of estimators, learning rate and L2 regularisation.

   Parameters
   ----------
   final : pandas.DataFrame
       Must contain the eight feature columns plus ``return`` and ``Sentiment``.

   Returns
   -------
   tuple
       ``(y_test_pred, y_test, X_test)``: predictions of the refitted model
       on the hold-out rows, the hold-out labels and the hold-out features.
   """
   columns = ['CV', 'CF','CNV','CNF','PV','PF','QAV','QAF', 'return', 'Sentiment']
   data = final[columns]
   data = data[data['return'].notna()].reset_index(drop=True)

   labels = data.loc[:, ['Sentiment']]
   feature_frame = data.drop(['Sentiment','return'], axis=1)
   X_train, X_test, y_train, y_test = train_test_split(feature_frame, labels, test_size=0.2)

   parameters = {'depth'         : [5,6,7],
                 'n_estimators'  : [100, 200, 300, 500, 700],
                 'learning_rate' : [0.01, 0.05, 0.1, 0.15, 0.2],
                 'l2_leaf_reg'   : [3,5,10,30]
                }

   grid = GridSearchCV(estimator=ctb.CatBoostClassifier(), param_grid = parameters, cv = 5, n_jobs=-1)
   grid.fit(X_train, y_train)

   print(" Results from Grid Search " )
   print("\n The best estimator across ALL searched params:\n", grid.best_estimator_)
   print("\n The best score across ALL searched params:\n", grid.best_score_)
   print("\n The best parameters across ALL searched params:\n", grid.best_params_)

   # Use the parameters the search actually selected; they used to be
   # hard-coded (depth=7, l2_leaf_reg=3, learning_rate=0.05, n_estimators=100).
   final_CB_w_parameters = ctb.CatBoostClassifier(**grid.best_params_)
   # NOTE(review): the final model is fitted on train + test, so the
   # predictions below are made on rows it has already seen and must not be
   # read as an out-of-sample evaluation.
   final_CB_w_parameters.fit(pd.concat([X_train, X_test]), pd.concat([y_train, y_test]))

   y_test_pred = final_CB_w_parameters.predict(X_test)

   # The original returned the misspelled names `y_tes` / `X_testt` (NameError).
   return y_test_pred,y_test,X_test

# **Main  Function**

In [None]:
def execute(import_path,export_path,export_path_body,export_path_speakers):
  """Run the whole pipeline end to end and print 'Done!' when finished.

  Steps, in order: extract the XML files, resolve tickers, fetch returns,
  split/clean/score the transcripts, build speaker and body features, merge
  them with the returns, label the returns, train the CatBoost model, print
  its confusion matrix and pickle the model.  Returns ``None``.
  """
  # Stage 1: raw extraction and market data.
  extracted_name = extract_file(import_path, export_path)
  transcript_files, tickers_name = extract_tickers(export_path, extracted_name)
  returns_name = get_return(export_path, tickers_name)

  # Stage 2: per-transcript sentiment scoring and feature tables.
  split_clean_speakers_divisions_scores(import_path, export_path_body, export_path_speakers, transcript_files)
  speaker_table = speaker_features(export_path_speakers)
  body_table = body_features(export_path_body)

  # Stage 3: assemble the modelling table and train.
  labelled = map_return_to_classes(
      export_path,
      merge(export_path, body_table, speaker_table, returns_name))
  holdout_features, holdout_labels, holdout_predictions, classifier = build_catboost_model(labelled)

  # NOTE(review): confusion_matrix is only given the labels and predictions;
  # if it reads the hold-out features / model from notebook-level names, those
  # are not the objects created in this function -- confirm.
  confusion_matrix(holdout_labels, holdout_predictions)
  save_model(classifier)

  print('Done!')
  return None

In [None]:
# Run the full pipeline with the folder paths configured at the top of the notebook.
execute(import_path,export_path,export_path_body,export_path_speakers)

0
BLK
{'eventsData': {}, 'firstTradeDate': {'formatted_date': '1999-10-01', 'date': 938784600}, 'currency': 'USD', 'instrumentType': 'EQUITY', 'timeZone': {'gmtOffset': -14400}, 'prices': [{'date': 1589895000, 'high': 515.5800170898438, 'low': 498.8299865722656, 'open': 512.4600219726562, 'close': 501.20001220703125, 'volume': 2099700, 'adjclose': 476.27960205078125, 'formatted_date': '2020-05-19'}, {'date': 1589981400, 'high': 513.0, 'low': 503.4599914550781, 'open': 509.55999755859375, 'close': 508.739990234375, 'volume': 1496400, 'adjclose': 483.4447326660156, 'formatted_date': '2020-05-20'}, {'date': 1590067800, 'high': 509.6300048828125, 'low': 504.1499938964844, 'open': 508.0799865722656, 'close': 508.510009765625, 'volume': 1007600, 'adjclose': 483.2261657714844, 'formatted_date': '2020-05-21'}, {'date': 1590154200, 'high': 513.5700073242188, 'low': 503.0, 'open': 506.0, 'close': 513.2999877929688, 'volume': 632700, 'adjclose': 487.77801513671875, 'formatted_date': '2020-05-22'}

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


0
BLK
2020-05-19
2020-05-26
[{'date': 1589895000, 'high': 515.5800170898438, 'low': 498.8299865722656, 'open': 512.4600219726562, 'close': 501.20001220703125, 'volume': 2099700, 'adjclose': 476.2796325683594, 'formatted_date': '2020-05-19'}, {'date': 1589981400, 'high': 513.0, 'low': 503.4599914550781, 'open': 509.55999755859375, 'close': 508.739990234375, 'volume': 1496400, 'adjclose': 483.44476318359375, 'formatted_date': '2020-05-20'}, {'date': 1590067800, 'high': 509.6300048828125, 'low': 504.1499938964844, 'open': 508.0799865722656, 'close': 508.510009765625, 'volume': 1007600, 'adjclose': 483.2261962890625, 'formatted_date': '2020-05-21'}, {'date': 1590154200, 'high': 513.5700073242188, 'low': 503.0, 'open': 506.0, 'close': 513.2999877929688, 'volume': 632700, 'adjclose': 487.77801513671875, 'formatted_date': '2020-05-22'}]
1
CCL
2020-06-08
2020-06-15
[{'date': 1591623000, 'high': 25.280000686645508, 'low': 23.40999984741211, 'open': 24.56999969482422, 'close': 24.90999984741211,

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Learning rate set to 0.060955
0:	learn: 1.3722609	total: 422us	remaining: 422ms
1:	learn: 1.3601372	total: 702us	remaining: 351ms
2:	learn: 1.3442035	total: 916us	remaining: 305ms
3:	learn: 1.3305995	total: 1.14ms	remaining: 283ms
4:	learn: 1.3133558	total: 1.34ms	remaining: 266ms
5:	learn: 1.2963741	total: 1.53ms	remaining: 253ms
6:	learn: 1.2832696	total: 1.77ms	remaining: 251ms
7:	learn: 1.2749076	total: 1.98ms	remaining: 245ms
8:	learn: 1.2600469	total: 2.19ms	remaining: 242ms
9:	learn: 1.2534809	total: 2.35ms	remaining: 232ms
10:	learn: 1.2408332	total: 2.59ms	remaining: 233ms
11:	learn: 1.2328518	total: 2.75ms	remaining: 227ms
12:	learn: 1.2202707	total: 2.92ms	remaining: 221ms
13:	learn: 1.2061979	total: 3.07ms	remaining: 216ms
14:	learn: 1.1940643	total: 3.44ms	remaining: 226ms
15:	learn: 1.1863375	total: 3.62ms	remaining: 222ms
16:	learn: 1.1743526	total: 3.77ms	remaining: 218ms
17:	learn: 1.1594215	total: 3.97ms	remaining: 217ms
18:	learn: 1.1492184	total: 4.18ms	remaining: 2