In [None]:
import io
import logging
import pyarrow.dataset as ds
import numpy as np
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
from typing import Dict, List
from bs4 import BeautifulSoup

In [None]:
# Write all messages from DEBUG level upwards to log.txt, each line prefixed
# with a timestamp and the level name.
logging.basicConfig(
    filename='log.txt',
    level=logging.DEBUG,
    format='%(asctime)s %(levelname)s %(message)s',
)

In [None]:
# Folder that holds the parquet tables (absolute, machine-specific path).
PATH_DATA = Path(r'C:/Git/HonoursProject/ipw-classifier/ipw_classifier/data/')
# Extension appended to a table name to form the parquet file name.
PARQUET_SUFFIX = '.parquet'

In [None]:
def _parse(table_name: str) -> pd.DataFrame:
    """Load one parquet table from PATH_DATA and strip its metadata columns.

    Args:
        table_name: File stem of the table; the file that is read is
            ``PATH_DATA / (table_name + PARQUET_SUFFIX)``.

    Returns:
        The table as a DataFrame. If the table has a ``case_id`` column, its
        ``id`` column is renamed to ``<table_name>_id``. Any of the columns
        listed in ``metadata_columns`` that are present are removed.

    Raises:
        ValueError: If the table has a ``case_id`` column with duplicates.
    """
    source_file = PATH_DATA / f"{table_name}{PARQUET_SUFFIX}"
    frame = ds.dataset(source_file).to_table().to_pandas()

    logging.info(f'Number of records in table {table_name}: {len(frame)}')

    if 'case_id' in frame.columns:
        if not frame['case_id'].is_unique:
            raise ValueError(f'Duplicate values found in "case_id" column in table {table_name}.')

        # Give the table's own id a table-specific name so the ids of
        # different tables stay distinguishable once the tables are merged.
        frame = frame.rename(columns={'id': f'{table_name}_id'})

    metadata_columns = {
        'author',
        'rights',
        'created',
        'updated',
        'deleted',
        'owners',
        'source',
        'closed',
    }
    # Only drop what is actually there; tables differ in which of these they carry.
    present = [name for name in frame.columns if name in metadata_columns]
    return frame.drop(columns=present)

In [None]:
def main():
    """Load the case, situation and plan tables and combine the closed records.

    The plan and situation tables are joined on ``case_id`` with an outer
    join, the case table is attached with a left join (``case_id`` against
    the case table's ``id``), and only rows whose ``status`` equals
    ``'closed'`` are kept.

    Returns:
        pd.DataFrame: The combined, filtered rows without the ``case_id``
        column.
    """
    case = _parse('case')
    situation = _parse('situation')
    plan = _parse('plan')

    # Outer join: keep records that have only a plan or only a situation.
    df_sit_pln = plan.merge(situation, on='case_id', how='outer', suffixes=('_pln', '_sit'))
    df = df_sit_pln.merge(case, left_on='case_id', right_on='id', how='left')
    logging.info(f'Number of records in combined table: {len(df)}')

    closed = df[df['status'] == 'closed']
    logging.info(f'Number of closed records in combined table: {len(closed)}')

    # Return a new frame instead of dropping in place: `closed` is a filtered
    # selection of `df`, and mutating such a selection in place is what
    # triggers pandas' SettingWithCopyWarning.
    return closed.drop(columns='case_id')

In [None]:
# Build the combined frame of closed records; the following cells derive `df` from it.
df_in = main()

In [None]:
# Columns that are excluded from the analysis below.
to_drop = ['plan_id', 'situation_id', 'title', 'status', 'collection_id', 'author_id']

# Derive the working frame from df_in (which itself is left untouched) and
# index it by the `id` column.
df = df_in.drop(columns=to_drop).set_index('id')

In [None]:
def clean_string(string:str) -> str:
    """Strip HTML markup from one cell value and return its plain text.

    Args:
        string: Cell value to clean. Anything that is not text (``None``,
            ``NaN`` and other floats, ``pd.NA``, integers, ...) is treated
            as empty instead of being handed to the HTML parser.

    Returns:
        The text content with the HTML tags removed and every literal
        backslash-n sequence deleted; ``''`` for non-text input.
    """
    # Only text can be parsed. Missing values show up as None, NaN (a float)
    # or pd.NA depending on the column dtype, and the parser raises on all
    # non-text objects, so every non-text value maps to the empty string.
    if not isinstance(string, (str, bytes)):
        return ''

    # parse html and keep only the text nodes
    text = BeautifulSoup(string, 'html.parser').get_text()

    # Deletes the two-character sequence backslash + "n" (an escaped newline
    # that is part of the text itself), not real newline characters.
    return text.replace('\\n', '')

In [None]:
# Replace every column by its cleaned version, one cell value at a time.
for column in list(df):
    cleaned = df[column].apply(clean_string)
    df[column] = cleaned

In [None]:
# Number of whitespace-separated words in every cell; missing values count as 0 words.
df_stats = df.apply(lambda values: values.fillna('').str.split().apply(len))

# describe() extended with two rows per column: the number of records without
# any word ('empty') and the number with at least one word ('not_empty').
summary_stats = df_stats.describe()
for col in df_stats.columns:
    n_empty = (df_stats[col] == 0).sum()
    summary_stats.loc['empty', col] = n_empty
    summary_stats.loc['not_empty', col] = summary_stats.loc['count', col] - summary_stats.loc['empty', col]
display(summary_stats)

In [None]:
# Box-and-whisker plot with one box per column of df_stats (words per record).
plt.boxplot(df_stats)

# Label the boxes with the column names, rotated so they stay readable.
tick_positions = range(1, len(df_stats.columns) + 1)
plt.xticks(tick_positions, df_stats.columns, rotation=45)
plt.show()

In [None]:
def word_count(string:str, returnvalue: Dict[str, int]) -> Dict[str, int]:
    """Add the words of ``string`` to a running word tally.

    Args:
        string: Text that is split on whitespace into words.
        returnvalue: Tally mapping lower-cased word -> number of occurrences.
            It is updated in place.

    Returns:
        The same ``returnvalue`` object, after the update.
    """
    for token in string.split():
        key = token.lower().strip()
        returnvalue[key] = returnvalue.get(key, 0) + 1
    return returnvalue

In [None]:
# Tally every word of every column into one dict: word -> number of occurrences.
words_dict = {}

# Iterate over each column's values directly. Running df.iterrows() inside the
# column loop built a full row Series for every record once per column, only
# to read a single cell from it.
for column in df:
    for text in df[column]:
        words_dict = word_count(text, words_dict)

In [None]:
# Number of distinct (lower-cased) words found across all columns.
len(words_dict)

In [None]:
# Display the complete word -> count mapping (one entry per distinct word).
words_dict

In [None]:
# Histogram bin edges according to Sturges' rule.
def bins_sturge(data: List[int]) -> np.ndarray:
    """Return equally spaced bin edges with the bin count given by Sturges' rule.

    Sturges' rule uses ``ceil(log2(n)) + 1`` bins for ``n`` data points.

    Args:
        data: The values to be binned (any sequence or array of numbers).

    Returns:
        Float array of ``ceil(log2(n)) + 2`` edges running from the minimum
        to the maximum of ``data``.

    Raises:
        ValueError: If ``data`` is empty.
    """
    values = np.asarray(data, dtype=float).ravel()
    if values.size == 0:
        raise ValueError('bins_sturge() requires at least one data point.')

    n_bins = int(np.ceil(np.log2(values.size))) + 1
    # n_bins bins are delimited by n_bins + 1 edges.
    return np.linspace(values.min(), values.max(), n_bins + 1)

# Histogram bin edges according to the Freedman-Diaconis rule.
def bins_freedman_draconis(data: List[int]) -> np.ndarray:
    """Return bin edges whose width follows the Freedman-Diaconis rule.

    The bin width is ``2 * IQR / n ** (1/3)``. Edges start at the minimum of
    ``data`` and continue in steps of that width until the maximum is covered.

    Args:
        data: The values to be binned (any sequence or array of numbers).

    Returns:
        Float array of bin edges. The first edge is the minimum of ``data``
        and the last edge is greater than or equal to its maximum. When the
        interquartile range is zero, a single bin ``[min, max]`` is returned.

    Raises:
        ValueError: If ``data`` is empty.
    """
    values = np.asarray(data, dtype=float).ravel()
    if values.size == 0:
        raise ValueError('bins_freedman_draconis() requires at least one data point.')

    lower, upper = values.min(), values.max()
    q25, q75 = np.percentile(values, [25, 75])
    iqr = q75 - q25
    bin_width = (2 * iqr) / (values.size ** (1 / 3))

    if bin_width <= 0:
        # No spread between the quartiles: a zero step cannot be used to
        # generate edges, so fall back to one bin spanning all the data.
        return np.array([lower, upper])

    # Enough bins to reach the maximum; a half-open range would stop short of
    # it and leave the largest values outside every bin.
    n_bins = max(int(np.ceil((upper - lower) / bin_width)), 1)
    edges = lower + bin_width * np.arange(n_bins + 1)
    # Guard against rounding leaving the last edge marginally below the maximum.
    edges[-1] = max(edges[-1], upper)
    return edges

In [None]:
# Occurrence count of every distinct word (raw counts; the log transform is
# applied only while computing the bin edges below).
data = list(words_dict.values())

# Bin edges are computed on the log-transformed counts and mapped back with
# exp(), so the edges are equally spaced on the logarithmic x axis.
# bins_sturge(np.log(data)) can be used here instead of bins_freedman_draconis.
bins = np.exp(bins_freedman_draconis(np.log(data)))

# Draw the histogram of the raw counts with those edges.
alpha = 1
plt.hist(data, bins=bins, align='left', color='blue', alpha=alpha)

# Both axes are shown on a logarithmic scale.
plt.xscale('log')
plt.yscale('log')

# Axis labels and title.
plt.xlabel("Frequency (log scale)")
plt.ylabel("Occurrences (log scale)")
plt.title("Word Frequency Histogram")

plt.show()

