In [None]:
import os
import numpy as np
import pandas as pd

In [None]:
#Import the local annuals.py module (it provides the PDF2text() and cleaning_pipeline() helpers used below)
import annuals as an


In [None]:
#Working directory of the notebook: the project folder that holds the "Banks" folder.
#Expected layout below it:
#   Banks/<bank name>/Auditor Report/
#   Banks/<bank name>/Director Report/
#where <bank name> is the bank's folder name, e.g. Dhanalakshmi or Lakshmi Vilas.

project_dir = r"C:\Users\goura\Documents\000. Research\Catching the sentiments of the Annual Reports"
os.chdir(project_dir)

### Sentiment Scores using the Harvard-IV Dictionary

In [None]:
#Path should contain "\\" instead of "\" or "/"
import pysentiment2 as ps

def sentiscore_harvard(path, bank_name, report):
    """
    Score every PDF report of one bank with the Harvard-IV dictionary.

    path = The path of the "Banks" folder
    bank_name = Name of the Bank of interest (same as its folder name)
    report = "Auditor Report" or "Director Report"

    Every file ending in ".pdf" inside <path>/<bank_name>/<report> is converted
    with an.PDF2text(), cleaned with an.cleaning_pipeline(), then tokenized and
    scored with ps.HIV4(). The year is read from the file name: the characters
    between the first "_" and the first "-" (e.g. "Bank_2015-16.pdf" -> 2015).

    Returns a DataFrame with one row per PDF (in file-name order) and the
    columns Year, Length (number of tokens), NPositive, NNegative, PPositive,
    PNegative (counts divided by Length), Polarity, Subjectivity, Bank, Report
    and Dictionary. Ratios and scores are rounded to 4 decimals. A PDF that
    yields no text or no tokens gets NaN in every score column.

    Raises ValueError when a PDF's file name holds no readable year.
    The caller's working directory is restored before returning.
    """
    score_columns = ["Length", "NPositive", "NNegative", "PPositive", "PNegative", "Polarity", "Subjectivity"]

    #os.path.join inserts the separator, so the folder is no longer tied to "\\"
    link = os.path.join(path, bank_name, report)

    #One dict per PDF; converted to a data frame in a single step at the end
    rows = []

    #The helpers are handed bare file names, so the work is done from inside the
    #folder; the previous working directory is put back afterwards, even on error
    previous_dir = os.getcwd()
    os.chdir(link)
    try:
        #The analyzer does not depend on the file, so it is built only once
        hiv4 = ps.HIV4()

        #sorted() makes the row order independent of the file system
        for file in sorted(os.listdir()):
            #Only real PDF files (a name such as "notes.pdf.txt" is skipped)
            if not file.lower().endswith(".pdf"):
                continue

            #Year
            try:
                year = int(file[file.find("_") + 1:file.find("-")])
            except ValueError as err:
                raise ValueError('Cannot read the year from the file name "{}"'.format(file)) from err

            document = an.PDF2text(file)
            cleaned_file = an.cleaning_pipeline(document)
            tokens = hiv4.tokenize(cleaned_file)

            if len(cleaned_file) == 0 or len(tokens) == 0:
                #Nothing to score; this also avoids dividing by a zero token count
                row = dict.fromkeys(score_columns, np.nan)
            else:
                score = hiv4.get_score(tokens)
                row = {
                    "Length": len(tokens),
                    "NPositive": score['Positive'],
                    "NNegative": score['Negative'],
                    "PPositive": round(score['Positive'] / len(tokens), 4),
                    "PNegative": round(score['Negative'] / len(tokens), 4),
                    "Polarity": round(score['Polarity'], 4),
                    "Subjectivity": round(score['Subjectivity'], 4),
                }
            row["Year"] = year
            rows.append(row)
            print(file)
    finally:
        os.chdir(previous_dir)

    #Converting the collected rows to a dataframe (explicit columns keep the
    #layout stable even when the folder holds no PDF at all)
    df = pd.DataFrame(rows, columns=["Year"] + score_columns)
    df["Bank"] = bank_name
    df["Report"] = report
    df["Dictionary"] = "Harvard-IV"
    return df


### Sentiment Scores using the Loughran & McDonald's Dictionary

In [None]:
#Path should contain "\\" instead of "\" or "/"
import pysentiment2 as ps

def sentiscore_Loughran(path, bank_name, report):
    """
    Score every PDF report of one bank with the Loughran & McDonald dictionary.

    path = The path of the "Banks" folder
    bank_name = Name of the Bank of interest (same as its folder name)
    report = "Auditor Report" or "Director Report"

    Every file ending in ".pdf" inside <path>/<bank_name>/<report> is converted
    with an.PDF2text(), cleaned with an.cleaning_pipeline(), then tokenized and
    scored with ps.LM(). The year is read from the file name: the characters
    between the first "_" and the first "-" (e.g. "Bank_2015-16.pdf" -> 2015).

    Returns a DataFrame with one row per PDF (in file-name order) and the
    columns Year, Length (number of tokens), NPositive, NNegative, PPositive,
    PNegative (counts divided by Length), Polarity, Subjectivity, Bank, Report
    and Dictionary. Ratios and scores are rounded to 4 decimals. A PDF that
    yields no text or no tokens gets NaN in every score column.

    Raises ValueError when a PDF's file name holds no readable year.
    The caller's working directory is restored before returning.
    """
    score_columns = ["Length", "NPositive", "NNegative", "PPositive", "PNegative", "Polarity", "Subjectivity"]

    #os.path.join inserts the separator, so the folder is no longer tied to "\\"
    link = os.path.join(path, bank_name, report)

    #One dict per PDF; converted to a data frame in a single step at the end
    rows = []

    #The helpers are handed bare file names, so the work is done from inside the
    #folder; the previous working directory is put back afterwards, even on error
    previous_dir = os.getcwd()
    os.chdir(link)
    try:
        #The analyzer does not depend on the file, so it is built only once
        lm = ps.LM()

        #sorted() makes the row order independent of the file system
        for file in sorted(os.listdir()):
            #Only real PDF files (a name such as "notes.pdf.txt" is skipped)
            if not file.lower().endswith(".pdf"):
                continue

            #Year
            try:
                year = int(file[file.find("_") + 1:file.find("-")])
            except ValueError as err:
                raise ValueError('Cannot read the year from the file name "{}"'.format(file)) from err

            document = an.PDF2text(file)
            cleaned_file = an.cleaning_pipeline(document)
            tokens = lm.tokenize(cleaned_file)

            if len(cleaned_file) == 0 or len(tokens) == 0:
                #Nothing to score; this also avoids dividing by a zero token count
                row = dict.fromkeys(score_columns, np.nan)
            else:
                score = lm.get_score(tokens)
                row = {
                    "Length": len(tokens),
                    "NPositive": score['Positive'],
                    "NNegative": score['Negative'],
                    "PPositive": round(score['Positive'] / len(tokens), 4),
                    "PNegative": round(score['Negative'] / len(tokens), 4),
                    "Polarity": round(score['Polarity'], 4),
                    "Subjectivity": round(score['Subjectivity'], 4),
                }
            row["Year"] = year
            rows.append(row)
            print(file)
    finally:
        os.chdir(previous_dir)

    #Converting the collected rows to a dataframe (explicit columns keep the
    #layout stable even when the folder holds no PDF at all)
    df = pd.DataFrame(rows, columns=["Year"] + score_columns)
    df["Bank"] = bank_name
    df["Report"] = report
    df["Dictionary"] = "Loughran & McDonald"
    return df


In [None]:
#Usage of the sentiscore_harvard() function
#Raw string: the earlier literal mixed "\\" with a bare "\D", which is an invalid
#escape sequence (a SyntaxWarning on recent Python versions). The value is unchanged.
#NOTE(review): the setup comment near the top of the notebook calls this folder
#"Banks" while this path ends in "Bank" - confirm which spelling exists on disk.
path = r"C:\Users\goura\Documents\000. Research\Catching the sentiments of the Annual Reports\Bank"
bank = "Lakshmi Vilas"

#1 selects the Director Report; any other whole number selects the Auditor Report
choice = int(input("1 or 0: "))
report = "Director Report" if choice == 1 else "Auditor Report"

#Last expression of the cell, so the resulting data frame is displayed
sentiscore_harvard(path, bank, report)