# Generate Word Frequency in Audit Reports

## Steps Taken

1. Download the folder with audit reports from OneDrive
2. Iterate through the folder and get the folders with each country's reports
3. For each country's folder, get the country's reports
4. Lemmatize the words in each report
5. Get the frequency distribution of each text
6. Filter the words to remove noise
7. Enter the frequency distribution into a pandas DataFrame
8. Merge each country's frequency distributions into one DataFrame, with the words as rows and the report names as columns
9. Write each country's DataFrame to its own sheet in an Excel workbook
10. Delete the downloaded documents

In [1]:
# Sharing link to the folder of audit reports (step 1: download from OneDrive); publicly accessible.
# NOTE(review): nothing in the visible cells reads this variable -- the reports are loaded
# from a local folder instead, so the download is presumably done by hand. Confirm.
audit_reports_link = "https://stir-my.sharepoint.com/:f:/g/personal/fkc3_stir_ac_uk/Esgp-VMQyzBClY5vpTP9TsYBTCb16iA3NvelLEJM53VEgQ?e=7ejaR3"

### 2. Iterate through the folder and create a dictionary to match each country to files within it

In [2]:
import os
from pdfminer.high_level import extract_text
import nltk
from nltk.corpus import stopwords, words
import pandas as pd

In [3]:
# Local root folder of the downloaded reports; it is walked with os.walk to find the
# per-country folders (presumably one sub-folder per country -- see step 2).
audit_reports_file_path = "./audit_reports"

In [4]:
# Map every leaf folder (a folder without sub-folders, i.e. a country folder)
# to the alphabetically sorted list of file names it contains. Folders that
# still have children -- such as the root itself -- are left out.
country_audit_report_dict = {
    folder_path: sorted(file_names)
    for folder_path, sub_folders, file_names in os.walk(audit_reports_file_path)
    if not sub_folders
}

In [5]:
def save_reports_as_text():
    """Extract the text of every PDF report and save it next to the PDF as ``.txt``.

    Iterates over ``country_audit_report_dict`` (folder path -> file names).
    For each file with a ``.pdf`` extension (case-insensitive) whose ``.txt``
    sibling does not exist yet, the text is extracted with ``extract_text``,
    lower-cased and written to ``<same name>.txt`` in the same folder.
    Files that are not PDFs, and PDFs that were already converted, are skipped.

    A failure on one report is printed and does not stop the remaining
    reports from being processed. Returns ``None``.
    """
    for country_folder_path, filenames in country_audit_report_dict.items():
        for filename in filenames:
            file_path = f'{country_folder_path}/{filename}'

            # Swap only the real extension: str.replace would also rewrite any
            # '.pdf' occurring in a folder name or in the middle of a file name.
            path_without_extension, extension = os.path.splitext(file_path)
            if extension.lower() != '.pdf':
                continue

            text_file_path = f'{path_without_extension}.txt'  # text path of pdf
            if os.path.isfile(text_file_path):
                continue  # already converted on an earlier run

            print(f'Doing {file_path}')
            try:
                text = extract_text(file_path).lower()

                with open(text_file_path, 'w') as text_file:
                    text_file.write(text)

            except Exception as error:
                # The text file did not exist before this attempt, so anything
                # present now is an empty/partial result; remove it so that a
                # later run retries this report instead of skipping it.
                if os.path.isfile(text_file_path):
                    os.remove(text_file_path)
                print(f'{filename} has a problem: {error}')


### 3. For each country's folder, get the country's reports as text files

In [6]:
# Maps each country folder path to {text file name: full text of that report}.
country_texts_dict = {}

for country_folder_path, filenames in country_audit_report_dict.items():
    for filename in filenames:
        # Only the extracted text versions of the reports are loaded.
        if not filename.endswith('.txt'):
            continue

        try:
            with open(f'{country_folder_path}/{filename}') as text:
                report_text = text.read()
        except (OSError, UnicodeError) as error:
            # Report the unreadable or undecodable file and carry on with the rest.
            print(f'{filename} has a problem: {error}')
            continue

        # A country's entry is created on its first successfully read report.
        country_texts_dict.setdefault(country_folder_path, {})[filename] = report_text


### 4. Lemmatize the words in each report

In [7]:
lemmatizer = nltk.WordNetLemmatizer()

# Replace, in place, the text of every non-empty report with the list of its
# lower-cased, tokenized and lemmatized words.
for country_reports in country_texts_dict.values():
    for report_name, report_text in country_reports.items():
        if not report_text:
            continue  # empty reports are left exactly as they were

        words_in_report = [
            lemmatizer.lemmatize(token)
            for token in nltk.word_tokenize(report_text.lower())
        ]
        country_reports[report_name] = words_in_report

### 5–6. Get the frequency distribution of each text and filter out noise

In [16]:
# Maps each country folder path to {report name: word/n-gram frequency dict};
# filled by the loop over country_texts_dict below.
country_report_freq_dist_dict = {}

# remove words with numbers and those that are stopwords
def clean_fdist(fdist, stop_words=None):
    """Return a plain dict holding only the meaningful words of ``fdist``.

    A word is kept when it consists solely of alphabetic characters (which
    drops numbers, punctuation and mixed tokens such as ``q4``) and is not a
    stopword.

    Args:
        fdist: mapping of word -> frequency (e.g. an ``nltk.FreqDist``).
        stop_words: optional collection of words to exclude. Defaults to
            NLTK's English stopword list.

    Returns:
        dict of word -> frequency for the words that passed both filters.
    """
    if stop_words is None:
        # Build the set once per call; doing it inside the comprehension
        # reloads and re-hashes the whole stopword list for every word.
        stop_words = set(stopwords.words('english'))

    return {word: freq for word, freq in fdist.items() if word.isalpha() and word not in stop_words}

# For every report, combine the cleaned single-word frequencies with the
# frequencies of its bigrams and trigrams, and store the result per country.
for country_path, country_reports in country_texts_dict.items():
    for report_name, report_words in country_reports.items():
        fdist = clean_fdist(nltk.FreqDist(report_words))

        # n-grams are keyed by their words joined with a space and are kept
        # only when every word in them is purely alphabetic. Unlike single
        # words, they are not filtered against the stopword list.
        for ngrams in (nltk.bigrams(report_words), nltk.trigrams(report_words)):
            fdist.update({
                ' '.join(ngram): count
                for ngram, count in nltk.FreqDist(ngrams).items()
                if ''.join(ngram).isalpha()
            })

        # A country's entry is created on its first report.
        country_report_freq_dist_dict.setdefault(country_path, {})[report_name] = fdist

### 7–9. Merge the frequency distributions into one DataFrame per country and write each to an Excel sheet

In [17]:
# Write one worksheet per country to 'word-frequency.xlsx': rows are the
# words/n-grams, columns are the country's reports, cells are frequencies
# (empty where a term does not occur in a report).
with pd.ExcelWriter('word-frequency.xlsx', engine='xlsxwriter') as writer:
    for country_path, report_freq_dists in country_report_freq_dist_dict.items():
        print(f'Writing {country_path} to file')

        # Building the frame from a dict of dicts aligns all reports on the
        # union of their terms in one step, and keeps a column even for a
        # report whose distribution is empty (the previous `df.empty` test
        # silently dropped such a report when it came first).
        df = pd.DataFrame(
            {report_name: dict(freq_dist) for report_name, freq_dist in report_freq_dists.items()}
        ).sort_index()

        # Use the folder's own name as the sheet name rather than stripping a
        # hard-coded prefix, so other roots and path separators also work;
        # Excel limits sheet names to 31 characters.
        sheet_name = os.path.basename(os.path.normpath(country_path))[:31]

        df.to_excel(writer, sheet_name=sheet_name)
        print(f'Completed writing to {country_path}')

Writing ./audit_reports/Zambia to file
Completed writing to ./audit_reports/Zambia
Writing ./audit_reports/South-Africa to file
Completed writing to ./audit_reports/South-Africa
Writing ./audit_reports/Nigeria to file
Completed writing to ./audit_reports/Nigeria
Writing ./audit_reports/Kenya to file
Completed writing to ./audit_reports/Kenya
Writing ./audit_reports/Ghana to file
Completed writing to ./audit_reports/Ghana
Writing ./audit_reports/Malawi to file
Completed writing to ./audit_reports/Malawi
Writing ./audit_reports/Tanzania to file
Completed writing to ./audit_reports/Tanzania
Writing ./audit_reports/Uganda to file
Completed writing to ./audit_reports/Uganda
