# **Load Libraries**

In [None]:
import streamlit as st
import os 
import pandas as pd
import re
import pickle
import catboost as ctb

from pandas import option_context
import numpy as np
from pathlib import Path
import xml.etree.ElementTree as ET
import datetime
from pandas.tseries.offsets import BDay

from io import StringIO
import re
import string
from unidecode import unidecode
from collections import Counter
from string import digits
from copy import deepcopy

import torch
from transformers import BertTokenizer, BertForSequenceClassification
from yahoofinancials import YahooFinancials
import yfinance as yf


from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from PIL import Image


from gensim.summarization import summarize
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer

import matplotlib.pyplot as plt
from wordcloud import WordCloud
from wordcloud import STOPWORDS

import spacy
# spaCy pipeline 'en_core_web_sm', loaded once at module level; text_analyzer() calls it to tokenize and tag text.
nlp=spacy.load('en_core_web_sm')
from spacy import displacy
import seaborn as sns
import base64
import time
# Timestamp (YYYYMMDD-HHMMSS) taken once when this line runs; csv_downloader() embeds it in the download file name.
timestr = time.strftime("%Y%m%d-%H%M%S")

# **Text Preprocessing**

In [None]:
# Sentence-boundary pattern used by GetSentences().
#
# The pattern has four top-level alternatives:
#   1. "Yum!" followed by "Brand(s)"
#   2. "inc." / "corp." / "ltd." followed by "("
#   3. a list of abbreviations (mrs, prof, gov, ...) followed by "." and then
#      a capital letter, quote or "("
#   4. a word-like token, optional closing quotes/parentheses, one or more of
#      "!", "." or "?", and then a run of spaces captured as group 1 -- but
#      only when the lookahead sees something that resembles the start of a
#      new sentence (capitalised word, Q1-Q4, a year, ...).
# Only alternative 4 contains capture group 1.  GetSentences() cuts the text
# at group 1 and skips matches where group 1 is empty, so alternatives 1-3
# presumably exist to consume abbreviations before alternative 4 can treat
# their punctuation as a sentence end -- TODO confirm with the author.
reSentenceSplitter = re.compile(
    "(?i:\\bYum!\\s*(?=Brands?\\b))|" +
    "(?i:\\b(?:inc|corp|ltd)\\.\\s*\\()|" +
    "(?i:\\b(?:mrs|messrs|sen|esq|adv|prof|rev|gov|gen|rep|hon|adj|oblig|tbk|dev|inv|invs|opn|constr|conces|(?-i:Med|Cap|Develop|Met))\\.\\s*(?=[A-Z'\"\\(]))|" +
    "\\b(?i:[a-z][a-z][a-z]+(?:'t)?|" +
    "(?-i:we|us|in|to|at|of|on|as|be|by|do|go|he|is|it|me|so|up|no|US|UK|EU|MW|)|" +
    "'s|" +
    "[A-Z]&(?:amp;)?[A-Z]+|" +
    "\\s*\\([A-Z]+\\)|" +
    "\\d+[%$sx]?|" +
    "\\)(?=\\.)|" +
    "[A-Z]+\\d+)" +
    "[\"'\\)]*[!\\.\\?]+[\"'\\)]*( +)" +
    "(?=" + "(?:<[^\\>][^>]*>)*" +
    "(?:[\"']+(?:<[^\\>][^>]*>)*)?" +
    "\\(?" +
    "(?:[A-Z][a-z]|Q[1-4]|\\d+[A-Z]+ |[AI] |I'|U\\.S\\b|[A-Z]+\\b|(?:19|20)\\d\\d ))") 

# Greetings and sign-off phrases stripped from the text before sentiment
# analysis.  Matching is case-sensitive, exactly as spelled here.
_FILLER_WORDS = (
    'Hello', 'hello', 'Thanks', 'Thank you', 'thank you', 'good morning',
    'everyone', 'Good morning', 'Good day', 'good day', 'bye', 'Bye', 'hey',
    'Hi', 'welcome', 'Welcome', 'Good afternoon', 'Good evening',
    'good afternoon', 'good evening', 'yea', 'yeah',
    "We'll talk to you soon.", 'ladies and gentlemen', 'Have a good day.',
    'Have a nice day', 'Have a', 'Okay',
)

# Longest phrases first so that e.g. 'Have a good day.' wins over 'Have a'.
# The lookarounds restrict matches to whole words/phrases: a plain substring
# replacement would turn "year" into "r" (filler 'yea') and "they" into "t"
# (filler 'hey').
_FILLER_RE = re.compile(
    r'(?<!\w)(?:'
    + '|'.join(re.escape(phrase)
               for phrase in sorted(_FILLER_WORDS, key=len, reverse=True))
    + r')(?!\w)'
)

# Anything enclosed in round or square brackets (non-greedy).
_BRACKETED_RE = re.compile(r'[\(\[].*?[\)\]]')


def NormalizeString(s, doUnescape=False):
    """Clean one piece of transcript text.

    Steps, in order: drop ``[[...]]`` markers; turn ``<``/``{`` into ``[`` and
    ``>``/``}`` into ``]``; delete parentheses; glue a detached period to the
    preceding word; delete ``*``, ``=`` and ``-``; delete bracketed spans;
    delete filler phrases (whole-word, case-sensitive); collapse whitespace.

    Args:
        s: Text to clean.
        doUnescape: Unused; kept so existing callers keep working.

    Returns:
        The cleaned text with single spaces and no leading/trailing blanks.
    """
    s = re.sub(r'\[\[.*?\]\]', '', s)
    s = re.sub(r'[<{]', '[', s)
    s = re.sub(r'[>}]', ']', s)
    s = s.replace('(', '')
    s = s.replace(')', '')
    s = s.replace(' .', '.')
    s = s.replace('*', '')
    s = s.replace('=', '')
    s = s.replace('-', '')
    s = _BRACKETED_RE.sub('', s)
    s = s.replace('[]', '')
    s = _FILLER_RE.sub('', s)
    return re.sub(r'\s+', ' ', s).strip()


def GetSentences(text):
    """Split *text* into a flat list of cleaned sentences.

    Every line of *text* is treated as a paragraph: it is stripped, cleaned
    with NormalizeString() and then cut wherever reSentenceSplitter matches
    with a non-empty group 1 (the spaces after sentence-ending punctuation).
    Empty fragments are discarded.
    """
    collected = []
    for raw_line in text.split('\n'):
        stripped = raw_line.strip()
        if stripped:
            cleaned = NormalizeString(stripped)
            cursor = 0
            fragments = []
            for match in reSentenceSplitter.finditer(cleaned):
                # Matches without group 1 are abbreviation guards, not boundaries.
                if match.group(1):
                    fragments.append(cleaned[cursor:match.start(1)])
                    cursor = match.end(1)
            # Whatever follows the last boundary is the final sentence.
            fragments.append(cleaned[cursor:])
            for fragment in fragments:
                sentence = fragment.strip()
                if sentence:
                    collected.append(sentence)
    return collected

# **Divide the text according to participants: corporate and conference participants**

In [None]:
def speakers_lists(text,comp_name):
    """Extract the speaker names listed in the transcript header.

    The corporate names are read from the lines between
    'Corporate Participants' and 'Conference Call Participants'; each line is
    cut at the first word of the company name.  The conference-call names are
    read from the lines between 'Conference Call Participants' and
    'Presentation'.  Several spellings are generated per person (full text,
    first + last word, first + second word, ...), and every entry is cleaned
    with NormalizeString().

    Args:
        text: Full transcript body.
        comp_name: Iterable of strings that is joined with spaces to form the
            company name (the caller passes a one-element pandas Series).

    Returns:
        ``(corporate_names, conference_names)`` as two lists of strings.  Both
        lists also contain the generic 'Unidentified ...' labels.
    """
    corp_start = text.find('Corporate Participants') + len('Corporate Participants\n')
    corp_end = text.find('\nConference Call Participants')
    conf_start = text.find('Conference Call Participants') + len('Conference Call Participants\n')
    conf_end = text.find('Presentation')

    company_first_word = ' '.join(comp_name).split()[0]

    # Keep only what precedes the company name on each corporate line.
    corp_full = []
    for line in text[corp_start:corp_end].split('\n'):
        candidate = line.split(company_first_word)[0].strip()
        if candidate:
            corp_full.append(candidate)
    # First + last word, for names written with a middle initial.
    corp_short = [words[0] + ' ' + words[-1]
                  for words in (name.split() for name in corp_full)]

    conf_words = [line.split() for line in text[conf_start:conf_end].split('\n')]
    conf_first_second = [w[0] + ' ' + w[1] for w in conf_words if len(w) >= 2]
    # The next two variants account for speakers listed with a middle name.
    conf_first_third = [w[0] + ' ' + w[2] for w in conf_words if len(w) >= 3]
    conf_first_three = [w[0] + ' ' + w[1] + ' ' + w[2] for w in conf_words if len(w) >= 3]

    corp_list = corp_full + corp_short + ['Unidentified Company Representative']
    conf_list = (conf_first_second + conf_first_third + conf_first_three
                 + ['Unidentified Participant', 'Unidentified Participant',
                    'Unidentified Analyst', 'Unidentified Audience Member'])

    # Generic non-corporate labels must never count as corporate speakers.
    not_corporate = ('Unidentified Audience Member', 'Unidentified Analyst',
                     'Unidentified Participant')

    list1 = [NormalizeString(s, doUnescape=False) for s in corp_list]
    list1 = [ele for ele in list1 if ele.strip() and ele not in not_corporate]
    list2 = [NormalizeString(s, doUnescape=False) for s in conf_list]
    list2 = [ele for ele in list2 if ele.strip()]

    return list1,list2

In [None]:
def split_speakers(body,comp_name):
    """Split the transcript body into one row per speaker turn.

    The body is cut at the dashed separator lines.  The first segment is the
    preamble; after it, speaker headers and speech texts alternate.  A
    trailing header without a following speech segment is ignored.

    Args:
        body: Full transcript body.
        comp_name: Company name, forwarded to speakers_lists().

    Returns:
        DataFrame with the columns 'speaker_details', 'speaker_name',
        'speaker_position', 'is_corp' and 'Text'.  'speaker_position' is the
        text after the first comma of the header, or 'Unidentified' for every
        row when no header contains a comma.
    """
    corp_particip, _conf_particip = speakers_lists(body, comp_name)

    segments = body.split('--------------------------------------------------------------------------------')

    def _clean(raw):
        # Newlines become spaces before the usual normalisation.
        return NormalizeString(raw.replace('\n', ' '), doUnescape=False)

    # zip() stops at the shorter list, so an unpaired final header is dropped
    # instead of raising IndexError.
    turns = list(zip(segments[1::2], segments[2::2]))

    speakers_df = pd.DataFrame({
        'speaker_details': pd.Series([_clean(header) for header, _ in turns], dtype=object),
        'Text': pd.Series([_clean(speech) for _, speech in turns], dtype=object),
    })

    header_parts = speakers_df['speaker_details'].str.split(',', expand=True)
    speakers_df['speaker_name'] = header_parts[0] if 0 in header_parts.columns else ''
    # Column 1 only exists when at least one header contains a comma.
    speakers_df['speaker_position'] = header_parts[1] if 1 in header_parts.columns else 'Unidentified'
    speakers_df['is_corp'] = speakers_df['speaker_name'].isin(corp_particip)

    return speakers_df[['speaker_details', 'speaker_name', 'speaker_position', 'is_corp', 'Text']]

In [None]:
def split_speakers_text(df):
    """Explode speaker turns into one row per sentence and label each sentence.

    Args:
        df: Output of split_speakers(): one row per speaker turn with the
            columns 'speaker_details', 'speaker_name', 'speaker_position',
            'is_corp' and 'Text'.

    Returns:
        DataFrame with the same columns (one row per sentence, index
        0..n-1) plus 'Finbert' and 'Vader' sentiment labels.  Turns whose
        'Text' is not a string are skipped.
    """
    meta_columns = ['speaker_details', 'speaker_name', 'speaker_position', 'is_corp']

    rows = []
    for _, turn in df.iterrows():
        turn_text = turn['Text']
        if not isinstance(turn_text, str):
            # Nothing to split (e.g. a missing value).
            continue
        for sentence in GetSentences(turn_text):
            row = {column: turn[column] for column in meta_columns}
            row['Text'] = sentence
            rows.append(row)

    # Built in one go, so the index is already a clean 0..n-1 range.  That
    # matters because vader_senti() returns a Series and pandas aligns it on
    # the index when assigning.
    df_sentences_speakers = pd.DataFrame(rows, columns=list(df.columns))

    df_list = df_sentences_speakers['Text'].tolist()
    if df_list:
        df_sentences_speakers['Finbert'] = finbert_senti(df_list)
        df_sentences_speakers['Vader'] = vader_senti(df_list)
    else:
        df_sentences_speakers['Finbert'] = []
        df_sentences_speakers['Vader'] = []

    return df_sentences_speakers


# **Sentiment Analysis Models: Vader and Finbert**

In [None]:
# Sequence-classification model and tokenizer loaded from the
# 'yiyanghkust/finbert-tone' checkpoint (fetched on first use); both are used
# by finbert_senti().
finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone',num_labels=3)
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')
# Maps the index of the largest model output to a sentiment label.
labels = {0:'Neutral', 1:'Positive',2:'Negative'}



def finbert_senti(X):
    """Label every text in *X* with the FinBERT tone model.

    Args:
        X: Iterable of strings.

    Returns:
        A list with one label per input ('Neutral', 'Positive' or
        'Negative'), taken from the module-level ``labels`` mapping.
    """
    # Inputs longer than the model's position limit would make the forward
    # pass fail, so they are truncated to that limit.
    max_tokens = finbert.config.max_position_embeddings

    sent_val = []
    # Inference only: no gradients are needed.
    with torch.no_grad():
        for x in X:
            inputs = tokenizer(x, return_tensors="pt", padding=True,
                               truncation=True, max_length=max_tokens)
            outputs = finbert(**inputs)[0]
            sent_val.append(labels[int(torch.argmax(outputs))])

    return sent_val

Downloading:   0%|          | 0.00/533 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/419M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/221k [00:00<?, ?B/s]

In [None]:
def vader_senti(List):
    """Label every sentence in *List* with VADER.

    A sentence is 'Positive' when its compound score is above 0.20,
    'Negative' when it is -0.2 or below, and 'Neutral' otherwise.

    Args:
        List: Iterable of strings.

    Returns:
        pandas Series named 'Sentiment' with exactly one label per input
        sentence, in input order, indexed 0..n-1.  Repeated sentences each
        get their own entry, and an empty input yields an empty Series.
    """
    analyzer = SentimentIntensityAnalyzer()

    sentiment = []
    for sentence in List:
        compound = analyzer.polarity_scores(sentence)["compound"]
        if compound > 0.20:
            sentiment.append('Positive')
        elif compound <= -0.2:
            sentiment.append('Negative')
        else:
            sentiment.append('Neutral')

    return pd.Series(sentiment, name="Sentiment", dtype=object)

# **Divide each text to two parts: Presentation and Q&A**

In [None]:
def text_divisions(df):
    """Split a sentence table into its Presentation and Q&A sections.

    The first row whose 'Text' equals 'Presentation' and the first row whose
    'Text' equals 'Questions and Answers' act as section markers.  The
    presentation runs from its marker to the Q&A marker (or to the end when
    there is no Q&A); the Q&A runs from its marker to the end.

    Args:
        df: DataFrame with a 'Text' column holding one sentence per row.

    Returns:
        DataFrame indexed 0..n-1 with a 'Presentation' column, a
        'Questions and Answers' column, or both.  When both exist the shorter
        one is padded with empty strings.

    Raises:
        IndexError: If neither marker is present.
    """
    texts = df['Text'].tolist()

    def _marker_position(marker):
        return texts.index(marker) if marker in texts else None

    pres_at = _marker_position('Presentation')
    qa_at = _marker_position('Questions and Answers')

    sections = {}
    if pres_at is not None:
        pres_end = qa_at if qa_at is not None else len(texts)
        sections['Presentation'] = texts[pres_at + 1:pres_end]
    if qa_at is not None:
        sections['Questions and Answers'] = texts[qa_at + 1:]

    if not sections:
        raise IndexError(
            "neither a 'Presentation' nor a 'Questions and Answers' marker was found"
        )

    # Side-by-side columns; the shorter section is padded with NaN, which is
    # then replaced by empty strings.
    dfnew = pd.concat(
        [pd.Series(values, name=name, dtype=object) for name, values in sections.items()],
        axis=1,
    )
    return dfnew.fillna("")

In [None]:
def text_sentiment(text):
    """Label the Presentation and Q&A sentences of a transcript.

    The text is split into sentences, divided into sections with
    text_divisions(), and every section that exists is labelled with both
    VADER and FinBERT.

    Args:
        text: Full transcript body.

    Returns:
        DataFrame with, for each section present, the sentence column
        followed by its Vader and Finbert label columns:
        'Presentation', 'Pres_Vader', 'Pres_Finbert',
        'Questions and Answers', 'QA_Vader', 'QA_Finbert'.
    """
    sentences = GetSentences(text)
    # A 0..n-1 index keeps the Series returned by vader_senti() aligned with
    # the rows when it is assigned as a column.
    df = text_divisions(pd.DataFrame(sentences, columns=['Text'])).reset_index(drop=True)

    ordered_columns = []
    for section, prefix in (('Presentation', 'Pres'), ('Questions and Answers', 'QA')):
        if section not in df.columns:
            continue
        section_sentences = df[section].tolist()
        df[prefix + '_Vader'] = vader_senti(section_sentences)
        df[prefix + '_Finbert'] = finbert_senti(section_sentences)
        ordered_columns += [section, prefix + '_Vader', prefix + '_Finbert']

    return df[ordered_columns]

# **Participants Features**

In [None]:
def positive(val):
    """Return how many entries of the Series *val* equal 'Positive' (0 if none)."""
    # Counting the boolean mask; summing the labels themselves would
    # concatenate the strings.
    return int((val == 'Positive').sum())
def negative(val):
    """Return how many entries of the Series *val* equal 'Negative' (0 if none)."""
    # Counting the boolean mask; summing the labels themselves would
    # concatenate the strings.
    return int((val == 'Negative').sum())
def neutral(val):
    """Return how many entries of the Series *val* equal 'Neutral' (0 if none)."""
    # Counting the boolean mask; summing the labels themselves would
    # concatenate the strings.
    return int((val == 'Neutral').sum())

In [None]:
def _speaker_label_count(frame, column, label):
    """Count the rows of *frame* whose *column* equals *label*."""
    return (frame[column] == label).sum()


def _speaker_balance(plus, minus):
    """Return (plus - minus) / (plus + minus + 1); the +1 avoids dividing by zero."""
    return (plus - minus) / (plus + minus + 1)


def speakers_features(df):
    """Build the four speaker-level sentiment features.

    Args:
        df: Sentence table from split_speakers_text() with the columns
            'is_corp', 'Vader' and 'Finbert'.

    Returns:
        One-row DataFrame with the columns
        'CV' / 'CF' (corporate speakers, Vader / FinBERT) and
        'CNV' / 'CNF' (conference-call participants, Vader / FinBERT).
        Without corporate rows 'CV' and 'CF' are NaN; without conference rows
        'CNV' and 'CNF' are 0.

    Raises:
        KeyError: If *df* contains neither corporate nor conference rows.
    """
    corp = df[df['is_corp'].eq(True)]
    conf = df[df['is_corp'].eq(False)]

    if corp.empty and conf.empty:
        raise KeyError("no rows with is_corp equal to True or False")

    if corp.empty:
        corp_vader = float('nan')
        corp_finbert = float('nan')
    else:
        corp_vader = _speaker_balance(
            _speaker_label_count(corp, 'Vader', 'Positive'),
            _speaker_label_count(corp, 'Vader', 'Negative'),
        )
        corp_finbert = _speaker_balance(
            _speaker_label_count(corp, 'Finbert', 'Positive'),
            _speaker_label_count(corp, 'Finbert', 'Negative'),
        )

    if conf.empty:
        conf_vader = 0
        conf_finbert = 0
    else:
        # NOTE(review): for conference participants the Vader balance is
        # negative-minus-positive, i.e. the opposite sign of the other three
        # features.  This looks like a copy/paste slip, but it is kept on
        # purpose: the pickled classifier was presumably fitted on this
        # definition, so flipping it here would silently change its input.
        # Confirm against the training code before changing it.
        conf_vader = _speaker_balance(
            _speaker_label_count(conf, 'Vader', 'Negative'),
            _speaker_label_count(conf, 'Vader', 'Positive'),
        )
        conf_finbert = _speaker_balance(
            _speaker_label_count(conf, 'Finbert', 'Positive'),
            _speaker_label_count(conf, 'Finbert', 'Negative'),
        )

    return pd.DataFrame({
        'CV': [corp_vader],
        'CF': [corp_finbert],
        'CNV': [conf_vader],
        'CNF': [conf_finbert],
    })

# **Body Features**

In [None]:
def _grouped_label_balance(df, key_column, label_column):
    """Return (pos - neg) / (pos + neg + 1) over the groups of *key_column*.

    ``pos`` is the number of distinct *key_column* values that have at least
    one 'Positive' entry in *label_column*; ``neg`` is the same for
    'Negative'.
    """
    keys = df[key_column]
    pos = (df[label_column] == 'Positive').groupby(keys).any().sum()
    neg = (df[label_column] == 'Negative').groupby(keys).any().sum()
    return (pos - neg) / (pos + neg + 1)


def body_features(df):
    """Build the four section-level sentiment features.

    Args:
        df: Output of text_sentiment(): 'Presentation', 'Pres_Vader',
            'Pres_Finbert' and/or 'Questions and Answers', 'QA_Vader',
            'QA_Finbert'.

    Returns:
        One-row DataFrame with the columns 'PV', 'PF' (presentation, Vader /
        FinBERT) and 'QAV', 'QAF' (Q&A, Vader / FinBERT).  Without a
        presentation 'PV' and 'PF' are NaN; without a Q&A section 'QAV' and
        'QAF' are 0.

    Raises:
        KeyError: If *df* has neither a 'Presentation' nor a
            'Questions and Answers' column.
    """
    has_presentation = 'Presentation' in df.columns
    has_qa = 'Questions and Answers' in df.columns

    if not has_presentation and not has_qa:
        raise KeyError("neither 'Presentation' nor 'Questions and Answers' column found")

    if has_presentation:
        pres_vader = _grouped_label_balance(df, 'Presentation', 'Pres_Vader')
        pres_finbert = _grouped_label_balance(df, 'Presentation', 'Pres_Finbert')
    else:
        pres_vader = float('nan')
        pres_finbert = float('nan')

    if not has_qa:
        QA_vader = 0
        QA_finbert = 0
    elif has_presentation:
        # NOTE(review): when both sections exist, the Q&A labels are grouped
        # by the *Presentation* sentence of the same row, not by the Q&A
        # sentence.  That is almost certainly unintended, but it is kept so
        # that the features stay identical to what the pickled classifier
        # was presumably fitted on.  Confirm against the training code
        # before switching the key to 'Questions and Answers'.
        QA_vader = _grouped_label_balance(df, 'Presentation', 'QA_Vader')
        QA_finbert = _grouped_label_balance(df, 'Presentation', 'QA_Finbert')
    else:
        # Q&A only: group by the Q&A sentences themselves.
        QA_vader = _grouped_label_balance(df, 'Questions and Answers', 'QA_Vader')
        QA_finbert = _grouped_label_balance(df, 'Questions and Answers', 'QA_Finbert')

    return pd.DataFrame({
        'PV': [pres_vader],
        'PF': [pres_finbert],
        'QAV': [QA_vader],
        'QAF': [QA_finbert],
    })

# **File Details Extraction**

In [None]:
def extract_file(uploaded_file):
    """Read the transcript body and company details from an event XML file.

    The root element must have exactly six children; their text is mapped,
    in document order, to 'EventStory', 'eventTitle', 'city', 'companyName',
    'companyTicker' and 'startDate'.  The body is the concatenated text of
    every 'Body' element in the document.

    Args:
        uploaded_file: Path or file-like object accepted by
            ``xml.etree.ElementTree.parse``.

    Returns:
        ``(body, df, comp_name)`` where *body* is a string, *df* is a one-row
        DataFrame with 'companyName', 'companyTicker' and 'startDate'
        (newlines replaced by spaces), and *comp_name* is its 'companyName'
        column.

    Raises:
        ValueError: If the root element does not have exactly six children.
    """
    field_names = ['EventStory', 'eventTitle', 'city',
                   'companyName', 'companyTicker', 'startDate']

    # NOTE(review): the file comes from a user upload; ElementTree is not
    # hardened against maliciously constructed XML.
    root = ET.parse(uploaded_file).getroot()

    # An empty <Body/> has text None; treat it as an empty string.
    body = ''.join(elem.text or '' for elem in root.iter('Body'))

    texts = [child.text for child in root]
    if len(texts) != len(field_names):
        raise ValueError(
            "expected {} child elements under the XML root, found {}".format(
                len(field_names), len(texts))
        )

    df = pd.DataFrame([texts], columns=field_names)
    # Clean the columns from newline symbols.
    df = df.replace(r'\n', ' ', regex=True)
    df = df[['companyName', 'companyTicker', 'startDate']]
    comp_name = df['companyName']

    return body,df,comp_name

# **Summarize**

In [None]:
def sumy_summarizer(input_text):
    """Summarise *input_text* with sumy's LexRank summarizer.

    The text is parsed with the "english" tokenizer, the summarizer is asked
    for 3 sentences, and the result is returned as one space-joined string.
    """
    document = PlaintextParser.from_string(input_text, Tokenizer("english")).document
    ranked_sentences = LexRankSummarizer()(document, 3)
    return ' '.join(map(str, ranked_sentences))

# **Visualization**

In [None]:
def _show_speaker_label_bars(label_series, title):
    """Render a bar chart of the label counts in *label_series* in Streamlit."""
    fig = plt.figure(figsize=(6,6), dpi=100)
    label_series.value_counts().plot.bar(color=[ 'red', 'green', 'blue'])
    plt.title(title)
    st.pyplot(fig)
    # Release the figure so that reruns do not pile up open figures.
    plt.close(fig)


def sentiment_visualization_speakers(df):
    """Plot sentiment label counts per speaker group.

    Corporate speakers (``is_corp`` True) get a FinBERT chart followed by a
    Vader chart; conference-call participants (``is_corp`` False) get a Vader
    chart followed by a FinBERT chart.  A group without rows is skipped.

    Args:
        df: Sentence table with the columns 'is_corp', 'Finbert' and 'Vader'.
    """
    corporate = df[df['is_corp'].eq(True)]
    conference = df[df['is_corp'].eq(False)]

    if not corporate.empty:
        _show_speaker_label_bars(
            corporate['Finbert'],
            "Corporate Participants Talk -Sentiment Analysis by Finbert ")
        _show_speaker_label_bars(
            corporate['Vader'],
            "Corporate Participants Talk -Sentiment Analysis by Vader ")

    if not conference.empty:
        _show_speaker_label_bars(
            conference['Vader'],
            "Conference Call Participants Talk -Sentiment Analysis by Vader ")
        _show_speaker_label_bars(
            conference['Finbert'],
            "Conference Call Participants Talk-Sentiment Analysis by Finbert ")
        
        


In [None]:
def _show_section_label_pie(df, label_column, title):
    """Render a pie chart of the label shares in *label_column* in Streamlit."""
    fig = plt.figure(figsize=(6,6), dpi=100)
    ax = plt.subplot(111)
    df.groupby(df[label_column]).size().plot.pie(ax=ax, autopct='%1.1f%%', startangle=270, fontsize=12, label="")
    ax.set_title(title)
    st.pyplot(fig)
    # Release the figure so that reruns do not pile up open figures.
    plt.close(fig)


def sentiment_visualization_text(df):
    """Plot sentiment label shares for the Presentation and Q&A sections.

    One pie chart is drawn for each of 'Pres_Vader', 'Pres_Finbert',
    'QA_Vader' and 'QA_Finbert', in that order; columns missing from *df*
    are skipped.

    Args:
        df: Output of text_sentiment().
    """
    charts = (
        ('Pres_Vader', "Presentation Part -Sentiment Analysis by Vader "),
        ('Pres_Finbert', "Presentation Part -Sentiment Analysis by Finbert "),
        ('QA_Vader', "Questions and Answers Part -Sentiment Analysis by Vader "),
        ('QA_Finbert', "Questions and Answers Part -Sentiment Analysis by Finbert "),
    )
    for label_column, title in charts:
        if label_column in df.columns:
            _show_section_label_pie(df, label_column, title)

    
    
    

In [None]:
def body_sentiment(text):
    """Return one row per sentence of *text* with its sentiment labels.

    The result has the columns 'Text', 'Vader' and 'Finbert', in that order.
    """
    sentence_array = np.asarray(GetSentences(text)).transpose()
    return pd.DataFrame(sentence_array, columns=['Text']).assign(
        Vader=lambda frame: vader_senti(frame['Text']),
        Finbert=lambda frame: finbert_senti(frame['Text']),
    )

In [None]:
def visualization(text):
    """Show two pie charts with the sentiment label shares of *text*.

    The sentences are labelled with body_sentiment(); the FinBERT chart is
    rendered first, then the Vader chart.
    """
    df=body_sentiment(text)

    charts = (
        ('Finbert', 'Sentiment Analysis by Finbert'),
        ('Vader', 'Sentiment Analysis by Vader'),
    )
    for label_column, title in charts:
        sentiment_counts = df.groupby(df[label_column]).size()
        fig = plt.figure(figsize=(6,6), dpi=100)
        ax = plt.subplot(111)
        sentiment_counts.plot.pie(ax=ax, autopct='%1.1f%%', startangle=270, fontsize=12, label="")
        ax.set_title(title)
        st.pyplot(fig)
        # Release the figure so that reruns do not pile up open figures.
        plt.close(fig)

In [None]:
def pos_neg_wordcloud(text):
    """Show word clouds for the FinBERT-positive and FinBERT-negative sentences.

    The sentences of *text* are labelled with body_sentiment().  For each of
    the two labels the matching sentences are joined into one string and fed
    to WordCloud; when WordCloud cannot build a cloud (no usable words) a
    short message is shown instead.
    """
    df=body_sentiment(text)
    stop_words = ["https", "co", "RT"] + list(STOPWORDS)

    clouds = (
        ('Positive', "Positive - Wordcloud", {}),
        ('Negative', "Negative - Wordcloud", {'colormap': 'Set2'}),
    )
    for label, title, extra_options in clouds:
        # Join the actual sentences: str(Series) would yield the truncated
        # repr, including row numbers and "dtype: object".
        corpus = ' '.join(df.loc[df["Finbert"] == label, 'Text'])
        try:
            cloud = WordCloud(max_font_size=50, max_words=100, background_color="white",
                              stopwords=stop_words, **extra_options).generate(corpus)
        except ValueError:
            # WordCloud raises ValueError when there is no word to plot.
            st.write("No {} sentences to plot.".format(label.lower()))
            continue

        fig = plt.figure()
        plt.title(title,fontsize=18)
        plt.imshow(cloud, interpolation="bilinear")
        plt.axis("off")
        st.pyplot(fig)
        plt.close(fig)

In [None]:
def text_analyzer(text):
    """Run the spaCy pipeline on *text* and tabulate its tokens.

    Returns a DataFrame with one row per token and the columns 'Token',
    'Shape', 'Pos', 'Tag', 'Lemma', 'IsAlpha' and 'Is_Stopword'.
    """
    column_names = ['Token','Shape','Pos','Tag','Lemma','IsAlpha','Is_Stopword']
    token_rows = [
        [tok.text, tok.shape_, tok.pos_, tok.tag_, tok.lemma_, tok.is_alpha, tok.is_stop]
        for tok in nlp(text)
    ]
    return pd.DataFrame(token_rows, columns=column_names)


In [None]:
def plot_probability(loaded_model,X_test):
    """Show the model's class probabilities for *X_test* as a bar chart.

    The probabilities from ``predict_proba`` are transposed so that each
    class becomes a row; the single resulting column is named 'Classes',
    which requires *X_test* to hold exactly one sample.
    """
    per_class = pd.DataFrame(
        data=loaded_model.predict_proba(X_test),
        columns=loaded_model.classes_,
    ).T
    per_class.columns = ['Classes']
    st.bar_chart(per_class)


# **CSV Downloader**

In [None]:
def csv_downloader(data):
    """Render a 'Download CSV' link for the DataFrame *data*.

    The CSV text is base64-encoded into a data URI; the suggested file name
    contains the module-level ``timestr`` timestamp.
    """
    b64 = base64.b64encode(data.to_csv().encode()).decode()
    new_filename = "new_text_file_{}_.csv".format(timestr)
    st.markdown(
        f'<a href="data:file/csv;base64,{b64}" download="{new_filename}"> Download CSV </a>',
        unsafe_allow_html=True,
    )

# **Main Function**

In [None]:
def main():
    """Render the Streamlit app.

    The sidebar selects one of four pages:

    * Home      - shows the 'Streamlit About.jpg' image.
    * EDA       - upload an XML transcript; view its details, word clouds,
                  the cleaned sentences, and participant / section sentiment.
    * Summarize - upload an XML transcript and summarise it with gensim or
                  sumy LexRank.
    * Predict   - upload an XML transcript and run the pickled classifier on
                  the speaker and section sentiment features.
    """
    activity1 = ["Home","EDA","Summarize","Predict"]
    choice = st.sidebar.selectbox("Select Function",activity1)

    if choice =='Home':

        os.chdir('/content/drive/MyDrive/Seminar /')
        image= Image.open('Streamlit About.jpg')
        st.image(image,width=700)

    if choice == 'EDA':

        st.title("Exploratory Data Analysis")
        uploaded_file = st.file_uploader("Upload xml file",type=['xml'])
        if uploaded_file:

            body,df,comp_name=extract_file(uploaded_file)
            df['Original Text']=body

            with st.expander('Text Details'):
                st.write(df)

                if st.button('Plot Wordcloud'):
                    pos_neg_wordcloud(body)

            clean_text=st.sidebar.checkbox("Text Preprocessing")
            with st.expander("Clean and Split Text"):

                if clean_text:

                    out = GetSentences(body)
                    np_array=np.asarray(out)
                    reshaped_array=np_array.transpose()
                    df = pd.DataFrame (reshaped_array, columns = ['Processed Text'])
                    st.dataframe(df)
                    csv_downloader(df)

                    if st.button('Plot POS Tag'):
                        token_result_df=text_analyzer(body)
                        fig= plt.figure()
                        sns.countplot(token_result_df['Pos'])
                        plt.xticks(rotation = 45)
                        st.pyplot(fig)

            with st.expander("Participants Talk Sentiment"):

                participants_talk_sentiment=st.sidebar.checkbox("Participants Talk Sentiment")
                if participants_talk_sentiment:

                    df = split_speakers_text(split_speakers(body,comp_name))
                    if st.button('Plot participants sentiment'):
                        sentiment_visualization_speakers(df)

                # Shows the most recently built table, i.e. an earlier one
                # when the checkbox above is not ticked.
                st.write(df)

            with st.expander("Text Sentiment"):

                text_sentiments=st.sidebar.checkbox("Text Sentiment")

                if text_sentiments:

                    df = text_sentiment(body)
                    if st.button('Plot text sentiment'):
                        sentiment_visualization_text(df)

                # Same as above: shows whichever table was built last.
                st.write(df)

    if choice == 'Summarize':

        st.title("Summary with NLP")
        uploaded_file = st.file_uploader("Upload xml file ",type=['xml'])
        summary_choice = st.selectbox("Summary Choice",["Genism","Sumy Lex Rank"])
        if uploaded_file:

            body,df,comp_name=extract_file(uploaded_file)

            if st.button("Summarize"):

                if summary_choice == "Genism":
                    summary_result = summarize(body)

                elif summary_choice == "Sumy Lex Rank":
                    summary_result = sumy_summarizer(body)

                st.write(summary_result)

    if choice == 'Predict':

        os.chdir('/content/drive/MyDrive/Seminar /')

        st.title("Stock Price Changes Forcasting ")
        uploaded_file = st.file_uploader("Upload xml file",type=['xml'])

        if st.button('Predict'):

            if not uploaded_file:
                # Without a file there is no transcript to build features from.
                st.warning("Please upload an xml file before predicting.")
            else:
                body,df,comp_name=extract_file(uploaded_file)

                # NOTE(review): pickle executes arbitrary code on load; only
                # ever load a classifier file from a trusted location.
                with open('classifier.pkl', 'rb') as model_file:
                    loaded_model = pickle.load(model_file)

                X_test=pd.concat([speakers_features(split_speakers_text(split_speakers(body,comp_name))),body_features(text_sentiment(body))], axis=1)
                out = loaded_model.predict(X_test)
                out=''.join(str(x) for x in out)

                st.write(out)

                with st.expander("Prediction Probability"):

                    plot_probability(loaded_model,X_test)
        
            
               
                    
                
        


In [None]:
# Entry point: build the page whenever this script is executed.
main()

# **Note**

#  1. To run the application, save the notebook as a Python script (e.g. app.py).

#  2. Open a terminal and run the command: streamlit run app.py