### Exploratory Data Analysis: NS-SEC 2021


### Setup

Import datasets into memory

In [1]:
import pandas as pd
import os

# File name
nssec_data = 'NS-SEC_2021.xlsx'

# Path
# Resolve the dataset folder from the kagglehub cache root instead of one
# machine's absolute path, so the notebook runs for any user. kagglehub stores
# downloads under ~/.cache/kagglehub unless KAGGLEHUB_CACHE overrides it; on the
# original machine this resolves to the same folder as before.
kagglehub_cache = os.environ.get(
    'KAGGLEHUB_CACHE',
    os.path.join(os.path.expanduser('~'), '.cache', 'kagglehub'),
)
path = os.path.join(
    kagglehub_cache, 'datasets', 'nathanhg', 'uk-gam-datasets', 'versions', '1'
)

# Read files into memory

nsec_df = pd.read_excel(os.path.join(path, nssec_data))


In [2]:
# Formatting columns: spaces become underscores and everything is lower-cased,
# using the vectorised string accessor on the column index.
nsec_df.columns = nsec_df.columns.str.replace(' ', '_', regex=False).str.lower()
nsec_df.columns

Index(['onsconstid', 'constituencyname', 'regnationid', 'regnationname',
       'natcomparator', 'variables', 'groups', 'con_num', 'con_pc', 'rn_pc',
       'nat_pc', 'ranking_total', 'rank'],
      dtype='object')

### Data Profiling

In [3]:
# Preview the first rows to sanity-check the renamed columns and value formats
nsec_df.head()

Unnamed: 0,onsconstid,constituencyname,regnationid,regnationname,natcomparator,variables,groups,con_num,con_pc,rn_pc,nat_pc,ranking_total,rank
0,E14000530,Aldershot,E12000008,South East,England & Wales,"Higher managerial, administrative and professi...","Managerial, administrative and professional oc...",11636,0.128602,0.156598,0.130829,0.337408,253
1,E14000530,Aldershot,E12000008,South East,England & Wales,"Lower managerial, administrative and professio...","Managerial, administrative and professional oc...",18893,0.208806,0.218946,0.199105,0.337408,253
2,E14000530,Aldershot,E12000008,South East,England & Wales,Intermediate occupations,Intermediate occupations,12310,0.136051,0.120724,0.114306,0.23038,245
3,E14000530,Aldershot,E12000008,South East,England & Wales,Small employers and own account workers,Intermediate occupations,8535,0.094329,0.116142,0.105661,0.23038,245
4,E14000530,Aldershot,E12000008,South East,England & Wales,Lower supervisory and technical occupations,Routine and manual occupations,5617,0.062079,0.050808,0.053959,0.303069,254


##### Overview

In [4]:
# from ydata_profiling import ProfileReport


# profile = ProfileReport(nsec_df, title="Pandas Profiling Report", type_schema={
#         "onsconstid": "categorical",
#         "constituencyname": "categorical",
#         "regnationid": "categorical",
#         "regnationname": "categorical",
#         "natcomparator": "categorical",
#         "groups":"categorical",
#         "variables": "categorical",
#         "con_num": "numeric",
#         "con_pc": "numeric",
#         "rn_pc": "numeric",
#         "nat_pc": "numeric",
#         "ranking_total":"numeric",
#         "rank":"numeric"
        
#     })
# profile

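❔ What are the key metrics?
+ 💡 `con_num` is the number of people in the constituency who fall into the given NS-SEC category (an integer count per category, not the whole constituency population)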

### Create a View (Reshape)


In [5]:
# Scratch strings: a fully-qualified table name and two "Not found" error
# messages. They are not used by any cell visible in this notebook section.
s = 'gambling-premises-data.silver_ew.clean_national_socio_economic_class'
s2 = 'Not found: Table gambling-premises-data:silver_ew.src_national_socio_economic_class was not found in location europe-west2.'
s3 = 'Not found: Table gambling-premises-data:silver_ew.clean_national_socio_economic_class was not found in location europe-west2.'

# Rows per NS-SEC variable. This is the expression that yields the output
# recorded under this cell, so the check is reproduced on a fresh run.
nsec_df['variables'].value_counts()

variables
Higher managerial, administrative and professional occupations    573
Lower managerial, administrative and professional occupations     573
Intermediate occupations                                          573
Small employers and own account workers                           573
Lower supervisory and technical occupations                       573
Semi-routine occupations                                          573
Routine occupations                                               573
Never worked / long-term unemployed                               573
Full-time students                                                573
Name: count, dtype: int64

In [6]:
# Combined label per row: the group name on its own when the variable repeats
# it, otherwise "<group>_<variable>".
nsec_df['group_variable'] = [
    grp if grp == var else f"{grp}_{var}"
    for grp, var in zip(nsec_df['groups'], nsec_df['variables'])
]

In [7]:
# Reshape long -> wide: one row per constituency, one column per
# group_variable label, cells holding con_num.
nsec_df_piv = nsec_df.pivot(
    index=["onsconstid", "constituencyname"],
    columns=["group_variable"],
    values='con_num',
)
# Bring the constituency keys back as ordinary columns.
nsec_df_piv = nsec_df_piv.reset_index()

Profile this view

In [8]:

# from ydata_profiling import ProfileReport


# profile = ProfileReport(nsec_df_piv, title="Pandas Profiling Report")
# profile

Use SQL with dbt for the following string manipulation (done here in pandas):

In [9]:
# Turn the pivoted labels into SQL-friendly identifiers: every comma, slash,
# space and hyphen becomes an underscore (one per character), then lower-case.
nsec_df_piv.columns = (
    nsec_df_piv.columns
    .str.replace(r'[,/ -]', '_', regex=True)
    .str.lower()
)


##### SQL Implementation

In [10]:
# Defining a uniqueness test

def test_distinct_id(df,id):
    """
    Function for testing uniqueness of a field within a dataframe
    params:
    df(pd.Dataframe): The dataframe being tested
    id: The field being tested for uniqueness
    """
    value_counts = df[id].value_counts().reset_index()
    value_counts.columns = [id, 'count']
    duplicates= value_counts[value_counts['count']>1]
    if duplicates.empty:
        print(f'{id} is distinct')
        return None
    else:
        print('Duplicate IDs found:')
        print(duplicates)
        return None

In [11]:
import duckdb 

# Build dim_nsec: one row per constituency carrying the NS-SEC category counts
# from the pivoted view, with onsconstid exposed as constituencyid.
# NOTE(review): `FROM nsec_df_piv` names the pandas DataFrame directly; this
# appears to rely on DuckDB resolving in-scope DataFrames by variable name —
# confirm the variable is visible wherever this cell is run.
# The selected column names must match the cleaned pivot labels exactly
# (e.g. the triple underscore in never_worked___long_term_unemployed).
da_query =  """ 
SELECT  
    onsconstid as constituencyid,
    constituencyname,
    full_time_students, 
    intermediate_occupations,
    intermediate_occupations_small_employers_and_own_account_workers,
    managerial__administrative_and_professional_occupations_higher_managerial__administrative_and_professional_occupations,
    managerial__administrative_and_professional_occupations_lower_managerial__administrative_and_professional_occupations,
    never_worked___long_term_unemployed,
    routine_and_manual_occupations_lower_supervisory_and_technical_occupations,
    routine_and_manual_occupations_routine_occupations,
    routine_and_manual_occupations_semi_routine_occupations
FROM nsec_df_piv
"""
dim_nsec = duckdb.query(da_query).to_df()

# Unique ID test
# constituencyid should identify exactly one row; the helper prints its verdict.

test_distinct_id(dim_nsec,'constituencyid')


constituencyid is distinct


In [12]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# category-count columns of dim_nsec
dim_nsec.describe()

Unnamed: 0,full_time_students,intermediate_occupations,intermediate_occupations_small_employers_and_own_account_workers,managerial__administrative_and_professional_occupations_higher_managerial__administrative_and_professional_occupations,managerial__administrative_and_professional_occupations_lower_managerial__administrative_and_professional_occupations,never_worked___long_term_unemployed,routine_and_manual_occupations_lower_supervisory_and_technical_occupations,routine_and_manual_occupations_routine_occupations,routine_and_manual_occupations_semi_routine_occupations
count,573.0,573.0,573.0,573.0,573.0,573.0,573.0,573.0,573.0
mean,6503.387435,9688.34904,8955.591623,11088.855148,16875.739965,7214.572426,4573.465969,10226.813264,9631.280977
std,4869.217877,1678.78043,2469.321916,5011.366438,3662.285527,3076.346716,965.056017,3214.569244,1916.699857
min,2134.0,4379.0,3794.0,3090.0,8335.0,3222.0,1969.0,3874.0,4742.0
25%,3889.0,8571.0,7168.0,7316.0,14139.0,5067.0,3986.0,7972.0,8375.0
50%,4678.0,9745.0,8754.0,10178.0,16975.0,6356.0,4597.0,9937.0,9668.0
75%,7160.0,10774.0,10566.0,13703.0,19270.0,8424.0,5154.0,12242.0,10904.0
max,34238.0,15258.0,17798.0,31398.0,25676.0,23567.0,7594.0,25072.0,18260.0
