# Metadata Exploration

For each metadata table:
- Read the table
- Create a bar chart with the count of unique values for each column
- Show the table head
- For columns with fewer than 10 unique values, show the unique values and their counts
- Count the number of missing values for each column
- Generate report
- Read the report
  - Decide which columns to keep
  - Save the selected data into a new csv file.
  - Make a brief description of the table and the columns that were kept.


In [8]:
import pandas as pd
from pathlib import Path
# Apply seaborn's default theme to matplotlib, which pandas uses for plotting
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_theme()

def get_tables_paths():
    """
    Get the paths of all csv files in the current directory and its subdirectories

    Returns:
        list: ``Path`` objects for every regular file matching ``*.csv``
        below the current working directory, in sorted order.
    """
    # rglob yields entries in filesystem order, which is arbitrary, and it
    # also matches directories whose name ends in ".csv". Sorting keeps the
    # processing order reproducible; the is_file filter keeps non-files out.
    return sorted(path for path in Path.cwd().rglob('*.csv') if path.is_file())

def load_table(path):
    """
    Read the csv file at ``path`` into a pandas DataFrame
    """
    table = pd.read_csv(path)
    return table

def count_unique_values(table):
    """
    Count the distinct values in each column, sorted from fewest to most
    """
    # nunique is called with its defaults, exactly as the per-column call did.
    unique_counts = table.nunique()
    return unique_counts.sort_values()

def count_missing_values(table):
    """
    Count the null entries in each column, sorted from fewest to most
    """
    # Build the boolean null mask once, then total it column by column.
    missing_counts = table.isnull().sum()
    return missing_counts.sort_values()

def create_and_save_barchart(data, title):
    """
    Draw a horizontal bar chart of ``data`` and save it as a png file

    Args:
        data: pandas Series; its index labels the bars and its values set
            the bar lengths (each value is also written next to its bar).
        title: Chart title. The file name is the title lower-cased with
            spaces replaced by underscores, followed by ``.png``.

    The figure is left open after saving. When generating many charts in a
    script, call ``plt.close()`` afterwards to release it.
    """
    # Series.plot draws onto the current axes when a figure is already open,
    # so each chart gets its own figure; otherwise successive calls would
    # stack their bars and labels on top of each other.
    fig, ax = plt.subplots()
    data.plot(kind='barh', title=title, ax=ax)
    ax.set_xlabel('Count')
    ax.set_ylabel('Column')
    # Bar number i is drawn at y == i; anchor its label at the bar's end.
    for position, value in enumerate(data):
        ax.text(value, position, str(value), ha='left', va='center')
    fig.tight_layout()
    fig.savefig(f'{title.replace(" ", "_").lower()}.png', dpi=300)



In [9]:
table_paths = get_tables_paths()

# For every csv that was found, print its file name followed by the
# per-column unique-value counts and then the per-column missing counts.
for table_path in table_paths:
    table = load_table(table_path)
    print(table_path.name)
    for summarize in (count_unique_values, count_missing_values):
        print(summarize(table))
    

2024-03-15_aact_data_elements_metadata.csv
Unnamed: 0        0
db schema         4
data type         8
table            50
enumerations     67
nlm doc         153
CTTI note       154
source          230
column          250
dtype: int64
db schema         0
table             0
column            0
data type         2
source          109
nlm doc         129
CTTI note       184
enumerations    386
Unnamed: 0      453
dtype: int64
2024-03-15_aact_tables_metadata.csv
Schema             2
Rows per Study     3
Domain             4
Row Count         43
Description       45
Name              51
dtype: int64
Schema            0
Name              0
Row Count         0
Description       5
Rows per Study    6
Domain            7
dtype: int64
2024-03-15_aact_views_functions_metadata.csv
Schema                 1
Example                1
Description           12
Data Returned         12
View/Function name    13
Source Data           13
dtype: int64
Schema                 0
View/Function name     0
Sourc