# Generate Word Frequency in Audit Reports

## Steps Taken

1. Download the folder with audit reports from OneDrive
2. Iterate through the folder and get the folders with each country's reports
3. For each country's folder, get the country's reports
4. Lemmatize the words in each report and remove non-English words
5. Filter the words to remove noise
6. Get the frequency distribution of each text
7. Enter the frequency distribution into a pandas dataframe
8. Merge each country's frequency distributions into one dataframe with the words as rows and the report names as columns
9. Write each country's dataframe to its own worksheet in an Excel workbook
10. Delete the downloaded documents

In [1]:
# Publicly accessible share link to the audit_reports folder.
audit_reports_link = (
    "https://stir-my.sharepoint.com/:f:/g/personal/fkc3_stir_ac_uk/"
    "Esgp-VMQyzBClY5vpTP9TsYBTCb16iA3NvelLEJM53VEgQ?e=7ejaR3"
)

### 2. Iterate through the folder and create a dictionary to match each country to files within it

In [2]:
import os
from pdfminer.high_level import extract_text
import nltk
from nltk.corpus import stopwords, words
import pandas as pd

In [3]:
# Root folder that is walked with os.walk to find the report files.
audit_reports_file_path = "./audit_reports"

In [4]:
# Map every leaf folder (one with no sub-folders) to the sorted names of the
# files it contains. Folders that still have sub-folders are skipped.
country_audit_report_dict = {
    folder_path: sorted(file_names)
    for folder_path, subfolders, file_names in os.walk(audit_reports_file_path)
    if not subfolders
}

In [None]:

def save_reports_as_text():
    """Extract every PDF report to a lower-cased ``.txt`` file stored next to it.

    Iterates over ``country_audit_report_dict`` (folder path -> file names).
    For each PDF whose text file does not exist yet, the text is extracted
    with ``extract_text``, lower-cased and written alongside the PDF. Files
    that are not PDFs, and PDFs that already have a text file, are skipped,
    so the function can safely be run again.

    A report that cannot be extracted or written is reported on stdout and
    skipped; it does not abort the run.
    """
    for country_folder_path, filenames in country_audit_report_dict.items():
        for filename in filenames:
            file_path = f'{country_folder_path}/{filename}'
            print(f'Doing {file_path}')

            # Swap only the real extension: str.replace would also rewrite a
            # '.pdf' that happens to appear elsewhere in the path.
            path_without_extension, extension = os.path.splitext(file_path)
            if extension.lower() != '.pdf':
                continue  # e.g. the .txt files produced by an earlier run

            text_file_path = f'{path_without_extension}.txt'
            if os.path.isfile(text_file_path):
                continue  # already converted

            try:
                text = extract_text(file_path).lower()

                # Explicit UTF-8 so characters outside the platform's default
                # encoding cannot make the write fail.
                with open(text_file_path, 'w', encoding='utf-8') as text_file:
                    text_file.write(text)
            except Exception as error:
                # Exception rather than a bare except, so an interrupt
                # (Ctrl-C) still stops the run; the cause is shown.
                print(f'{filename} has a problem: {error}')


save_reports_as_text()

Doing ./audit_reports/Zambia/ZAM_CDF_2022.pdf
Doing ./audit_reports/Zambia/ZAM_CDF_2022.txt
ZAM_CDF_2022.txt has a problem
Doing ./audit_reports/Zambia/ZAM_GenGov_2003.pdf
Doing ./audit_reports/Zambia/ZAM_GenGov_2003.txt
ZAM_GenGov_2003.txt has a problem
Doing ./audit_reports/Zambia/ZAM_GenGov_2004.pdf
ZAM_GenGov_2004.pdf has a problem
Doing ./audit_reports/Zambia/ZAM_GenGov_2004.txt
ZAM_GenGov_2004.txt has a problem
Doing ./audit_reports/Zambia/ZAM_GenGov_2005.pdf
ZAM_GenGov_2005.pdf has a problem
Doing ./audit_reports/Zambia/ZAM_GenGov_2005.txt
ZAM_GenGov_2005.txt has a problem
Doing ./audit_reports/Zambia/ZAM_GenGov_2006.pdf
ZAM_GenGov_2006.pdf has a problem
Doing ./audit_reports/Zambia/ZAM_GenGov_2006.txt
ZAM_GenGov_2006.txt has a problem
Doing ./audit_reports/Zambia/ZAM_GenGov_2007.pdf
ZAM_GenGov_2007.pdf has a problem
Doing ./audit_reports/Zambia/ZAM_GenGov_2007.txt
ZAM_GenGov_2007.txt has a problem
Doing ./audit_reports/Zambia/ZAM_GenGov_2008.pdf
ZAM_GenGov_2008.pdf has a proble

### 3. For each country's folder, get the country's reports

In [None]:
# Extract the lower-cased text of every report, grouped by country folder:
# {country_folder_path: {filename: text}}. Reports that yield no text, or that
# cannot be read, are left out.
country_texts_dict = {}

for country_folder_path, filenames in country_audit_report_dict.items():
    for filename in filenames:
        # Text files stored next to the reports are not PDFs; trying to
        # parse them would only produce spurious "has a problem" messages.
        if filename.endswith('.txt'):
            continue

        file_path = f'{country_folder_path}/{filename}'
        print(f'Doing {file_path}')
        try:
            text = extract_text(file_path).lower()
        except Exception as error:
            # Exception rather than a bare except, so an interrupt (Ctrl-C)
            # still stops the run; the cause is shown.
            print(f'{filename} has a problem: {error}')
            continue

        if text:
            country_texts_dict.setdefault(country_folder_path, {})[filename] = text


Doing ./audit_reports/Zambia/ZAM_CDF_2022.pdf
Doing ./audit_reports/Zambia/ZAM_GenGov_2003.pdf
Doing ./audit_reports/Zambia/ZAM_GenGov_2004.pdf
Doing ./audit_reports/Zambia/ZAM_GenGov_2005.pdf
Doing ./audit_reports/Zambia/ZAM_GenGov_2006.pdf
Doing ./audit_reports/Zambia/ZAM_GenGov_2007.pdf
Doing ./audit_reports/Zambia/ZAM_GenGov_2008.pdf
Doing ./audit_reports/Zambia/ZAM_GenGov_2009.pdf
Doing ./audit_reports/Zambia/ZAM_GenGov_2010.pdf
Doing ./audit_reports/Zambia/ZAM_GenGov_2011.pdf
Doing ./audit_reports/Zambia/ZAM_GenGov_2012.pdf
Doing ./audit_reports/Zambia/ZAM_GenGov_2013.pdf
Doing ./audit_reports/Zambia/ZAM_GenGov_2014.pdf
Doing ./audit_reports/Zambia/ZAM_GenGov_2015.pdf
Doing ./audit_reports/Zambia/ZAM_GenGov_2016.pdf
Doing ./audit_reports/Zambia/ZAM_GenGov_2017.pdf
Doing ./audit_reports/Zambia/ZAM_GenGov_2018.pdf
Doing ./audit_reports/Zambia/ZAM_GenGov_2019.pdf
Doing ./audit_reports/Zambia/ZAM_GenGov_2020.pdf
Doing ./audit_reports/Zambia/ZAM_GenGov_2021.pdf
Doing ./audit_reports/Z

### 4. Lemmatize the words in each report and remove non-English words

In [None]:
# Tokenize every report and lemmatize each token, replacing the raw text in
# country_texts_dict with the resulting list of tokens.
# NOTE: nothing in this cell removes non-English words; tokens are only
# lemmatized.
lemmatizer = nltk.WordNetLemmatizer()

for reports in country_texts_dict.values():
    for report_name, report_text in reports.items():
        # Only raw text is processed: empty reports are left alone, and
        # reports already converted to token lists (cell run twice) are not
        # tokenized again.
        if not report_text or not isinstance(report_text, str):
            continue

        tokens = nltk.word_tokenize(report_text.lower())

        # Not stored in a variable called `words`, so the `words` corpus
        # imported from nltk.corpus is not overwritten.
        reports[report_name] = [lemmatizer.lemmatize(token) for token in tokens]

### 6. Get the frequency distribution of each text (run step 5 below first so the counts use the filtered words)

In [None]:
# Count single words, bigrams and trigrams of every report in one FreqDist:
# {country_path: {report_name: FreqDist}}. Single words are keyed by the word
# itself, bigrams and trigrams by tuples of words.
# NOTE: the counts are taken over whatever country_texts_dict holds when this
# cell runs, so the noise filter (step 5) has to run before this cell for the
# filtering to be reflected in the counts.
country_report_freq_dist_dict = {}

for country_path, reports in country_texts_dict.items():
    # The loop variable is not called `words`, so the `words` corpus imported
    # from nltk.corpus is not overwritten.
    for report_name, report_words in reports.items():
        fdist = nltk.FreqDist(report_words)
        fdist.update(nltk.bigrams(report_words))
        fdist.update(nltk.trigrams(report_words))

        country_report_freq_dist_dict.setdefault(country_path, {})[report_name] = fdist

### 5. Filter the words to remove noise

In [None]:
# Remove words made up of numbers or symbols, and stop words.

# Built once as a set so the stop-word list is not rebuilt and scanned for
# every single token.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def is_ideal_word(word):
    """Return True when *word* is purely alphabetic and not an English stop word.

    Tokens for which ``str.isalpha`` is false (they contain digits or
    symbols) and NLTK's English stop words are treated as noise.
    """
    return word.isalpha() and word not in _ENGLISH_STOPWORDS


for reports in country_texts_dict.values():
    # The loop variable is not called `words`, so the `words` corpus imported
    # from nltk.corpus is not overwritten.
    for report_name, report_words in reports.items():
        reports[report_name] = [word for word in report_words if is_ideal_word(word)]

### 7. Enter the frequency distributions into a pandas dataframe

In [None]:
# Turn each report's frequency distribution into a one-column dataframe: the
# counted terms form the index and the single column is named after the report.
# Result: {country_path: {report_name: dataframe}}.
country_report_df_dict = {}

for country_path, report_freq_dists in country_report_freq_dist_dict.items():
    for report_name, freq_dist in report_freq_dists.items():
        report_df = pd.DataFrame.from_dict(
            dict(freq_dist),
            orient='index',
            columns=[report_name],
        )
        country_report_df_dict.setdefault(country_path, {})[report_name] = report_df

### 8. Merge each country's frequency distributions into one dataframe with the words as rows and the report names as columns

In [None]:
# Outer-join the per-report dataframes of each country on their index, giving
# one dataframe per country with a column for each merged report.
country_df_dict = {}

for country_path, report_dfs in country_report_df_dict.items():
    merged_df = pd.DataFrame()
    for report_df in report_dfs.values():
        # While nothing has been accumulated yet, the report's dataframe is
        # taken as the starting point instead of being merged.
        merged_df = (
            report_df
            if merged_df.empty
            else pd.merge(
                merged_df,
                report_df,
                how='outer',
                left_index=True,
                right_index=True,
            )
        )
    country_df_dict[country_path] = merged_df

In [None]:
# Preview the first 100 rows of Nigeria's merged frequency table.
country_df_dict['./audit_reports/Nigeria'].iloc[:100]

In [None]:
# Show the 'transparency' row of Malawi's merged frequency table.
malawi_df = country_df_dict['./audit_reports/Malawi']
malawi_df.loc['transparency']

### 9. Write each country's dataframe to its own worksheet in an Excel workbook

In [None]:
# Write one worksheet per country into a single workbook.
with pd.ExcelWriter('word-frequency.xlsx') as writer:
    for country_path, df in country_df_dict.items():
        # Name the sheet after the country's own folder. Taking the last path
        # component works for any reports root, nesting depth and path
        # separator, unlike stripping a hard-coded "./audit_reports/" prefix.
        # Excel sheet names are limited to 31 characters, hence the cut.
        sheet_name = os.path.basename(os.path.normpath(country_path))[:31]
        df.to_excel(writer, sheet_name=sheet_name)

In [None]:
# Scratch example: build a one-column dataframe from a word -> count mapping,
# with the words as the index and '2022' as the only column.
d = {'report': 42, 'auditor': 143, 'general': 185}
p = pd.DataFrame(list(d.values()), index=list(d.keys()), columns=['2022'])
p

In [None]:
# Pull out the single-report dataframe built for ZAM_GenGov_2015.pdf.
zambia_report_dfs = country_report_df_dict['./audit_reports/Zambia']
zam_2015 = zambia_report_dfs['ZAM_GenGov_2015.pdf']
zam_2015