In [3]:
# These session notes from Will may provide answers if some Python functionality provided in Teams does not work...

# session_1_data_wrangling.py
# https://github.com/data-to-insight/ERN-sessions/blob/main/No%20Local%20Python/session_1_data_wrangling.py

# session_1_making_it_an_app.py
# https://github.com/data-to-insight/ERN-sessions/blob/main/No%20Local%20Python/session_1_making_it_an_app.py

# requirements:  plotly

In [4]:
# List comprehension: the numbers from 1 to 100 that are divisible by both 2 and 3.
# Plain boolean tests are combined with `and`; `&` is the bitwise operator and is
# only needed for element-wise tests on pandas/numpy objects.

lc_evens = [num for num in range(1, 101) if num % 2 == 0 and num % 3 == 0]
print(lc_evens)

# The same kind of filter written as an explicit loop: slower to write and takes more lines.
# Note that this version keeps every even number, not only those also divisible by 3.
evens = []
for num in range(1, 101):
    if num % 2 == 0:
        evens.append(num)

print(evens)

[6, 12, 18, 24, 30, 36, 42, 48, 54, 60, 66, 72, 78, 84, 90, 96]
[2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94, 96, 98, 100]


In [5]:
import pandas as pd
import glob
import sys

In [6]:
# The csv files for this session are stored in one of Will's Github folders (in the cloud),
# and accessing a folder full of csv files there is the tricky bit.
#
# glob.glob() does not work on a web location (an absolute reference starting https:),
# so don't try to make that work. The working solution is to copy all the csv files to an
# MN Github folder and point `path` at that folder; keeping our own copy also means the
# csv files are safely kept.
#
# Pointing directly at Will's Github folder has not worked so far, e.g.:
# path = r'/workspaces/data-to-insight/ERN-sessions/No Local Python/data'
# Advice from Ben/Declan is that you do have to copy the csv files to your own folder.
path = r'/workspaces/Python_ESCC_course/python_tutorial_code/course_2/data_mod2_8-9'

# Collect the path of every csv file in that folder.
files = glob.glob(f"{path}/*.csv")

print(files)

['/workspaces/Python_ESCC_course/python_tutorial_code/course_2/data_mod2_8-9/c4_children_in_need_section_47s_and_icpcs_2013_to_2024.csv', '/workspaces/Python_ESCC_course/python_tutorial_code/course_2/data_mod2_8-9/d1_child_protection_plans_2013_to_2024.csv', '/workspaces/Python_ESCC_course/python_tutorial_code/course_2/data_mod2_8-9/d6_cpps_reviewed_within_timescales_2013_to_2024.csv', '/workspaces/Python_ESCC_course/python_tutorial_code/course_2/data_mod2_8-9/a4_cin_by_ethnicity_and_age_gender_2018_to_2024.csv', '/workspaces/Python_ESCC_course/python_tutorial_code/course_2/data_mod2_8-9/d5_cpps_at31march_by_duration_2013_to_2024.csv', '/workspaces/Python_ESCC_course/python_tutorial_code/course_2/data_mod2_8-9/ons_mid-year_population_estimates_2012_to_2023.csv', '/workspaces/Python_ESCC_course/python_tutorial_code/course_2/data_mod2_8-9/c2_children_in_need_assessments_duration_2014_to_2024.csv', '/workspaces/Python_ESCC_course/python_tutorial_code/course_2/data_mod2_8-9/d3_cpps_subsequ

In [7]:
# Number of characters in the trailing "_YYYY_to_YYYY.csv" part of each file name
# (for example "_2013_to_2024.csv"), which is removed to build the dictionary key.
YEAR_SUFFIX_LENGTH = 17

# Read every csv file into a dataframe, keyed by its file name without the year-range suffix.
dfs = {}
key_string = None  # stays None when no csv files were found, so the final print cannot fail

for f in files:
    df = pd.read_csv(f)

    # Keep only the text after the last "/" (the file name) and drop the year-range suffix.
    key_string = f.split("/")[-1][:-YEAR_SUFFIX_LENGTH]

    dfs[key_string] = df

print(dfs)
print(key_string)

{'c4_children_in_need_section_47s_and_icpcs':       time_period time_identifier geographic_level country_code country_name  \
0            2024  Reporting year         National    E92000001      England   
1            2024  Reporting year         Regional    E92000001      England   
2            2024  Reporting year         Regional    E92000001      England   
3            2024  Reporting year         Regional    E92000001      England   
4            2024  Reporting year         Regional    E92000001      England   
...           ...             ...              ...          ...          ...   
1962         2013  Reporting year  Local authority    E92000001      England   
1963         2013  Reporting year  Local authority    E92000001      England   
1964         2013  Reporting year  Local authority    E92000001      England   
1965         2013  Reporting year  Local authority    E92000001      England   
1966         2013  Reporting year  Local authority    E92000001      Engla

In [8]:
# Rebuild the dictionary so that its entries are ordered by key.
dfs = dict(sorted(dfs.items(), key=lambda item: item[0]))

In [9]:
# The b1 table is the starting (left-hand) table; its first ten columns are used as merge keys.
# The dictionary key has no year range because that suffix was stripped when the files were loaded.
left_df = dfs['b1_children_in_need']
merge_cols = list(left_df.columns[:10])

# Prefix every non-key column with the table name; the merge-key columns keep their names.
new_col_names = [
    name if name in merge_cols else f'b1_children_in_need_{name}'
    for name in left_df.columns
]

In [10]:
# Rebuild the left-hand table from the untouched b1 dataframe rather than from left_df itself.
# left_df gains columns each time a table is merged onto it, so renaming left_df in place
# would fail with a length mismatch if this cell were run a second time.
left_df = dfs['b1_children_in_need'].set_axis(new_col_names, axis=1)

for key, df in dfs.items():
    # Skip the headline figures, the mid-year population estimates, the b1 table itself
    # (it is already the left-hand side) and every table whose key starts with "a".
    # startswith() is used instead of key[0] so an empty key cannot raise IndexError.
    if ('headline_figures' not in key
            and 'mid-year' not in key
            and 'b1' not in key
            and not key.startswith('a')):

        # Prefix the non-key columns with the table name; merge-key columns keep their names.
        df = df.set_axis(
            [name if name in merge_cols else f'{key}_{name}' for name in df.columns],
            axis=1,
        )

        left_df = left_df.merge(df, how='left', on=merge_cols)

# Save the wide merged table without the dataframe index.
left_df.to_csv('merged_cin.csv', index=False)
print(dfs)

{'a1_cin_assessments_episodes_referrals':      time_period time_identifier geographic_level country_code country_name  \
0           2024  Reporting year         National    E92000001      England   
1           2024  Reporting year         National    E92000001      England   
2           2024  Reporting year         National    E92000001      England   
3           2024  Reporting year         National    E92000001      England   
4           2024  Reporting year         National    E92000001      England   
..           ...             ...              ...          ...          ...   
175         2013  Reporting year         National    E92000001      England   
176         2013  Reporting year         National    E92000001      England   
177         2013  Reporting year         National    E92000001      England   
178         2013  Reporting year         National    E92000001      England   
179         2013  Reporting year         National    E92000001      England   

         

In [None]:
# This cell isn't working properly

# Go back to previous session where we used Streamlit, it took a while to get that working but it was successful.

# in terminal run:   pip install streamlit
# Warning: to view this Streamlit app on a browser, run it with the following command:
#     streamlit run /home/codespace/.local/lib/python3.12/site-packages/ipykernel_launcher.py [ARGUMENTS]

import streamlit as st
import pandas as pd

st.title('Benchmarking data pipeline')

# Ask the user to upload the benchmarking files; accept_multiple_files=True is passed
# so that more than one file can be uploaded.
files = st.file_uploader(
    label='Please upload benchmarking data',
    accept_multiple_files=True,
)

# The empty dictionary that will store our dataframes is set up at the top of the next cell (dfs = {}).




In [12]:
dfs = {}

if files:
    # Read each uploaded file, keyed by its name with the last 17 characters removed.
    for f in files:
        df = pd.read_csv(f)
        key_string = f.name.split("/")[-1][:-17]
        dfs[key_string] = df

    # Re-order the dictionary by key.
    dfs = {name: dfs[name] for name in sorted(dfs)}

    # The b1 table is the left-hand side; its first ten columns are the merge keys.
    left_df = dfs['b1_children_in_need']
    merge_cols = list(left_df.columns[:10])

    # Prefix the non-key b1 columns with the table name.
    new_col_names = [
        name if name in merge_cols else f'b1_children_in_need_{name}'
        for name in left_df.columns
    ]
    left_df = left_df.set_axis(new_col_names, axis=1)

    for key, df in dfs.items():
        # Leave out the headline figures, the mid-year estimates, b1 itself and the "a" tables.
        if 'headline_figures' in key or 'mid-year' in key or 'b1' in key or key[0] == 'a':
            continue

        # Prefix the non-key columns with the table name, then merge onto the wide table.
        df = df.set_axis(
            [name if name in merge_cols else f'{key}_{name}' for name in df.columns],
            axis=1,
        )
        left_df = left_df.merge(df, how='left', on=merge_cols)

In [13]:
def convert_df(df):
    """Return the dataframe written out as CSV text, encoded as UTF-8 bytes.

    The CSV is produced by ``df.to_csv()`` with its default settings.
    """
    csv_text = df.to_csv()
    return csv_text.encode("utf-8")
 
st.title('CIN benchmarking pipeline')

# Ask the user to upload the CIN csv files; several files can be uploaded together.
files = st.file_uploader(label='Please upload CIN data',
                         accept_multiple_files=True)

# Dataframes read from the uploaded files, keyed by file name without its last 17 characters.
dfs = {}

if files:
    for f in files:
        df = pd.read_csv(f)

        key_string = f.name.split("/")[-1][:-17]

        dfs[key_string] = df

    # Re-order the dictionary by key.
    dfs = {key: dfs[key] for key in sorted(dfs.keys())}

    if 'b1_children_in_need' not in dfs:
        # Every other table is merged onto b1, so without it there is nothing to build on.
        # Show a message in the app instead of failing with a KeyError.
        st.error('The b1_children_in_need file is missing from the upload, '
                 'so the data cannot be merged.')
    else:
        # The b1 table is the left-hand side; its first ten columns are the merge keys.
        left_df = dfs['b1_children_in_need']
        merge_cols = list(left_df.columns[:10])

        # Prefix the non-key b1 columns with the table name.
        new_col_names = [col if col in merge_cols else f'b1_children_in_need_{col}'
                         for col in left_df.columns]
        left_df = left_df.set_axis(new_col_names, axis=1)

        for key, df in dfs.items():
            # Skip the headline figures, the mid-year estimates, b1 itself and the "a" tables.
            # startswith() is used instead of key[0] so an empty key cannot raise IndexError.
            if ('headline_figures' not in key
                    and 'mid-year' not in key
                    and 'b1' not in key
                    and not key.startswith('a')):

                # Prefix the non-key columns with the table name; merge keys keep their names.
                df = df.set_axis([col if col in merge_cols else f'{key}_{col}'
                                  for col in df.columns], axis=1)

                left_df = left_df.merge(df, how='left', on=merge_cols)

        wide_csv = convert_df(left_df)

        # File name typo fixed: 'wide_benchamrking.csv' -> 'wide_benchmarking.csv'.
        st.download_button(label='Click to download wide merged data',
                           data=wide_csv,
                           file_name='wide_benchmarking.csv',
                           mime="text/csv")

