In [1]:
import pandas as pd
import re
import requests
from bs4 import BeautifulSoup
from urllib import parse
from nltk.corpus import cmudict,stopwords
from utilities import clean_text,report_constraints,score,category,syllable_count,readability,personal_pronoun,PassiveWords

In [2]:
#list of stopwords(Generic)
# The stop-word file has no header row: its first line is itself a stop word
# ("ABOUT"). Reading it with the default header would consume that word as the
# column name and drop it from the list, so the column is named explicitly.
# keep_default_na=False keeps entries such as "NULL" as text instead of NaN.
stopword_frame=pd.read_csv("StopWords_Generic.txt",header=None,names=["word"],keep_default_na=False)
stopword_gen=list(stopword_frame["word"])
#create positive negative dictionary from master dictionary
# Keep only the word and its two sentiment flags, drop words that are neither
# negative nor positive, then drop any word that is also a generic stop word.
df=pd.read_csv("LoughranMcDonald_MasterDictionary_2018.csv")
df=df.loc[:,["Word","Negative","Positive"]]
df=df[(df.Negative!=0) | (df.Positive!=0)]
df=df[~df.Word.isin(stopword_gen)].reset_index(drop=True)

In [3]:
# Read the constraining and uncertainty dictionaries (first sheet of each
# workbook) and keep a lower-cased copy of every entry in the "Word" column.
const_file=pd.read_excel("constraining_dictionary.xlsx",sheet_name=0)
uncert_file=pd.read_excel("uncertainty_dictionary.xlsx",sheet_name=0)

constraints=[]
for entry in const_file["Word"]:
    constraints.append(entry.lower())

uncertainty=[]
for entry in uncert_file["Word"]:
    uncertainty.append(entry.lower())

In [4]:
# Base URL that the relative filing paths are joined onto.
abs_url="https://www.sec.gov/Archives/"

# Section titles to extract from each filing. Each entry is a regular-expression
# fragment (note the alternation in the second one) that is appended to an
# item-heading pattern, which is why every title starts with a space.
sections=[
    " MANAGEMENTS DISCUSSION AND ANALYSIS",
    " QUANTITATIVE AND QUALITATIVE DISCLOSUR(?:ES|E) ABOUT MARKET RISK",
    " RISK FACTORS",
]

In [None]:
# Build one output row per filing listed in cik_list.xlsx.
# Each row holds METRICS_PER_SECTION values for every entry in `sections`
# (NaN when the section is not found or a ratio is undefined), followed by the
# constraint count for the whole report.
df1=pd.read_excel("cik_list.xlsx",sheet_name=0)
exl=list(df1["SECFNAME"])

METRICS_PER_SECTION=14   # values appended to a row for each section
REQUEST_TIMEOUT=30       # seconds before a download is abandoned

# Loop-invariant resources, built once instead of once per section per filing.
d=cmudict.dict()
stopword=set(stopwords.words("english"))
constraint_set=set(constraints)
uncertainty_set=set(uncertainty)
# The optional letter after ITEM lets lettered headings ("Item A." for items
# such as 1A / 7A) open and close a section.
# NOTE(review): this assumes clean_text strips the item digits but keeps the
# letter, as the single-filing experiment at the end of the notebook suggests.
section_patterns=[
    re.compile(r"\. (?:ITEM|Item) [A-Za-z]?\."+section+r"(.+?)(?:ITEM|Item) [A-Za-z]?\.",flags=re.DOTALL)
    for section in sections
]

row=[]
count=0
for count,link in enumerate(exl,start=1):
    temp=[]
    rel_url=link
    url=parse.urljoin(abs_url,rel_url)
    # NOTE(review): no User-Agent header is sent; confirm sec.gov accepts these
    # requests. Rejected or failed downloads are reported and recorded as NaN
    # instead of parsing an error page or hanging without a timeout.
    try:
        response=requests.get(url,timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
    except requests.RequestException as err:
        print("skipping {}: {}".format(url,err))
        row.append([float("NaN")]*(METRICS_PER_SECTION*len(sections)+1))
        continue

    #parse html using beautiful soup
    soup=BeautifulSoup(response.content,"lxml")
    text=soup.get_text()
    cleaned_text=clean_text(text)

    #number of constraints in the report
    cleaned_text_=cleaned_text.lower()
    total_const=report_constraints(cleaned_text_,constraints)

    for section_re in section_patterns:
        found=section_re.search(cleaned_text)
        if found is None:
            #section heading not present in this filing
            temp.extend([float("NaN")]*METRICS_PER_SECTION)
            continue
        substring=found.group(1)
        try:
            #strip everything but letters from each token and drop empty tokens
            clean_list=[re.sub(r"[^a-zA-Z]","",token) for token in substring.split()]
            clean_list=[token for token in clean_list if token]
            #lower-cased copy: the constraint/uncertainty dictionaries were
            #lower-cased when loaded and the NLTK stop words are lower-case,
            #so membership tests must not depend on the case used in the filing
            lowered=[token.lower() for token in clean_list]
            #polarity
            pos,neg,pol_score,sub_score=score(clean_list,df,stopword_gen)
            #syllable count per word
            syllables=[syllable_count(token,d) for token in clean_list]
            #complex word count (more than two syllables)
            complex_words=sum(i>2 for i in syllables)
            #fog index and average sentence length
            fog_id,sent_len,frac=readability(substring,clean_list,complex_words)
            #word count excluding nltk stopwords
            word_count=sum(token not in stopword for token in lowered)
            #constraint and uncertainty scores
            const_score=sum(token in constraint_set for token in lowered)
            uncert_score=sum(token in uncertainty_set for token in lowered)
            #proportions
            pos_prop=pos/word_count
            neg_prop=neg/word_count
            const_prop=const_score/word_count
            uncert_prop=uncert_score/word_count
            #the values below are computed but are not part of the output row
            categ=category(pol_score)
            avg_syllable=sum(syllables)/len(clean_list)
            p_pronoun=personal_pronoun(substring)
            pass_word=PassiveWords(substring)
            avg_word_length=sum(len(token) for token in clean_list)/len(clean_list)
            metrics=[pos,neg,pol_score,sent_len,frac,fog_id,complex_words,word_count,uncert_score,const_score,pos_prop,neg_prop,uncert_prop,const_prop]
            #round into a separate list first so a failure cannot leave a partial section in temp
            rounded=[round(num,2) for num in metrics]
            temp.extend(rounded)
        except (AttributeError,ZeroDivisionError):
            #empty section or zero word count; AttributeError is still caught so
            #a failure inside a helper yields NaNs instead of aborting the run
            temp.extend([float("NaN")]*METRICS_PER_SECTION)
    temp.append(total_const)
    row.append(temp)

In [None]:
# Column names for the computed metrics are taken from the first row of the
# output template, starting at the seventh column.
column_df=pd.read_excel("Output Data Structure.xlsx",header=None)
template_header=column_df.iloc[0]
column=list(template_header.iloc[6:])

# Build a frame from the collected rows and place it to the right of the
# input columns (the two frames are aligned on their row index).
df2=pd.DataFrame(data=row,columns=column)
result=pd.concat([df1,df2],axis="columns")

In [None]:
# Write the combined table to output.xlsx without the index column.
# The context manager saves and closes the workbook on exit; it replaces the
# explicit writer.save() call, which is not available in pandas 2.x.
with pd.ExcelWriter('output.xlsx') as writer:
    result.to_excel(writer, index = False)

In [None]:
"""
rel_url="edgar/data/4962/0001193125-14-167067.txt"
url=parse.urljoin(abs_url,rel_url)
text=requests.get(url)
    
#parse html using beautiful soup
soup=BeautifulSoup(text.content,"lxml")
for tag in soup(["sec-header"]):
    tag.decompose()
text=soup.get_text()
cleaned_text=clean_text(text)
print(text)
#print(cleaned_text)
sections=[" MANAGEMENTS DISCUSSION AND ANALYSIS", " QUANTITATIVE AND QUALITATIVE DISCLOSUR(?:ES|E) ABOUT MARKET RISK", " RISK FACTORS"]
pattern=r"\. (?:ITEM|Item) [A-Za-z]?\."+sections[0]+r"(.+?)(?:ITEM|Item) [A-Za-z]?\."
match=re.compile(pattern,flags=re.DOTALL)
substring=match.search(cleaned_text)
substring=substring.group(1)
#print(substring)
"""