Imported Libaries

In [1]:
import pandas as pd
import chardet
import os
from scipy.stats import zscore
pd.options.mode.chained_assignment = None
from sqlalchemy import create_engine, text, inspect
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
from IPython.display import display as original_display
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import matplotlib.colors as mcolors
import numpy as np
import seaborn as sns
import inspect
import re
import string
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)

Standard Functions

In [2]:
#Function to clean labels in any plot functions
def clean_label(label):
    return label.replace('_', ' ').title()
 
#Function for getting the name of a Dataframe
def get_var_name(var):
    for name, value in globals().items():
        if value is var:
            return name
 
#Function to validate the data in a Dataframe
def validate_data(df, show_counts=True):
    df_name = get_var_name(df)
    print(f'#########################################################################################################################################################################################\nDataFrame: {df_name}')
    #snapshot the dataset
    display(df)
    #check for unique values
    unique_counts = pd.DataFrame(df.nunique())
    unique_counts = unique_counts.reset_index().rename(columns={0:'No. of Unique Values', 'index':'Field Name'})
    print("Unique values per field:")
    pd.set_option('display.max_rows', None)
    display(unique_counts)
    pd.reset_option('display.max_rows')
    #checking for duplicates
    duplicate_count = df.duplicated().sum()
    print("\nNumber of duplicate rows:")
    print(duplicate_count,'\n')
    info = df.info(show_counts=show_counts)
    display(info)
    #summary stats
    print("\nSummary statistics:")
    display(df.describe())
    print('End of data validation\n#########################################################################################################################################################################################\n')
 
#Function to provide list for data sources as a dataframe when conducting analysis
def header_list(df):
    df_list_ = df.copy()
    df_list = df_list_.columns.tolist()
    df_list = pd.DataFrame(df_list)
    new_header = df_list.iloc[0]  # Get the first row for the header
    df_list = df_list[1:]  # Take the data less the header row
    df_list.columns = new_header  # Set the header row as the df header
    df_list.reset_index(drop=True, inplace=True)  # Reset index
   
    return df_list
 
def query_data(schema, data):
    # Define the SQL query
    query = f'SELECT * FROM [{schema}].[{data}]'
 
    # Load data into DataFrame
    df = pd.read_sql(query, engine)
 
    print(f'Successfully imported {data}')
    # Display the DataFrame
    return df

def display(df):
    # Attempt to get the name of the DataFrame from the caller's local variables
    frame = inspect.currentframe().f_back
    # Attempt to find the variable name corresponding to the DataFrame
    name = "Unnamed DataFrame"
    for var_name, var_value in frame.f_locals.items():
        if var_value is df:
            name = var_name
            break
 
    # If the name is not in the list to be excluded, print it
    if name not in {'df', 'Unnamed DataFrame', 'unique_counts'}:
        print(f"DataFrame: {name}")
    # Always display the DataFrame regardless of the name
    original_display(df)

def unique_values(df, display_df=True):
    # Extract unique values for each field and store them in a dictionary
    unique_values = {col: df[col].unique() for col in df.columns}
    # Find the maximum number of unique values
    max_length = max(len(values) for values in unique_values.values())
    # Create a dictionary for the new DataFrame with padded None values
    unique_df_data = {}
    for col, values in unique_values.items():
        unique_df_data[col] = list(values) + [None] * (max_length - len(values))
    # Create the new DataFrame
    unique_df = pd.DataFrame(unique_df_data)
    if display_df == True:
        # Set display options to show all rows and display the DataFrame
        pd.set_option('display.max_rows', None)
        display(unique_df.head(100))
        # Reset display options back to default
        pd.reset_option('display.max_rows')

def export_to_csv(df):
    df_name = get_var_name(df)
    if df_name is None:
        df_name = input('Dataframe not found in global variables. Please enter a name for the DataFrame: ')
    # Specify the directory and filename
    directory = r"C:\Users\jf79\OneDrive - Office Shared Service\Documents\H&F Analysis\Python CSV Repositry"
    file_path = f'{directory}\\{df_name}.csv'
    # Export the DataFrame to the specified directory
    df.to_csv(file_path, index=False)
    print(f'Successfully exported {df_name} to CSV')

In [145]:
def apply_features(df,date='count_date',**kwargs):
    
    df[f'{date}'] = pd.to_datetime(df[f'{date}'])
    df['year'] = df[f'{date}'].dt.year
    df['day_name'] = df[f'{date}'].dt.dayofweek

    day_dict = {
        '0':['Monday','Weekday'],
        '1':['Tuesday','Weekday'],
        '2':['Wednesday','Weekday'],
        '3':['Thursday','Weekday'],
        '4':['Friday','Weekday'],
        '5':['Saturday','Weekend'],
        '6':['Sunday','Weekend']
    }
    df['day_name'] = df['day_name'].astype(str)
    df['week_name'] = df['day_name'].map(lambda x: day_dict[x][1])
    df['day_name'] = df['day_name'].map(lambda x: day_dict[x][0])

    time = kwargs.get('time', False)
    if time:
        try:
            time_dict = {
                '00-03':'6pm-6am',
                '03-06':'6pm-6am',
                '06-09':'6am-6pm',
                '09-12':'6am-6pm',
                '12-15':'6am-6pm',
                '15-18':'6am-6pm',
                '18-21':'6pm-6am',
                '21-24':'6pm-6am'
            }

            df['day_night'] = df[f'{time}'].map(time_dict)
        except KeyError as e:
            print(f'Invalid time column: {e}')
            pass

    return df

def transform_to_daynight(df, **kwargs):
    category = kwargs.get('category',False)
    index = ['count_date','year','day_name','week_name']
    if category:
        index = index + [category]
    transform = df.pivot_table(
        index =index,
        columns='day_night',
        values='corrected_value_total'
    ).reset_index()

    return transform

def detect_anomalies(df,**kwargs):
    used_keys = {
        'footfall_type','day_night',
        'agg','std','primary_key'
    }
    redundant_kwargs = set(kwargs.keys()) - used_keys
    if redundant_kwargs:
        print(f'Redundant kwargs: {redundant_kwargs}')
        return

    kwargs = {key: kwargs.get(key, f'default_value_{key}') for key in used_keys}
    
    footfall_type = kwargs.get('footfall_type')
    agg = kwargs.get('agg')
    categories = [
        'count_date','year',f'{footfall_type}_{agg}',
        'zscore','is_anomaly?'
    ]

    keywords = [
        kwargs.get('primary_key'),
        'day_night' if kwargs.get('day_night') else None
    ]
    if None in keywords:
        keywords.remove(None)
    for keyword in keywords:
        if keyword:
            if type(keyword) is not list:
                keyword = [keyword]
            for word in keyword:
                categories = categories + [f'{word}']
    

    std = kwargs.get('std', 3)
    anomalies = df.copy()
    anomalies['zscore'] = anomalies.groupby(keywords)[f'{footfall_type}_{agg}'].transform(zscore)
    anomalies['is_anomaly?'] = (anomalies['zscore'] < -std) | (anomalies['zscore'] > std)
    anomalies = anomalies[categories]
    anomalies['moving_average'] = anomalies.groupby(keywords)[f'{footfall_type}_{agg}'].transform(lambda x: x.rolling(window=7).mean())
    
    # Assigning the Corrected values
    anomalies['corrected_value'] = np.where(
        anomalies['is_anomaly?'],
        anomalies['moving_average'],anomalies[f'{footfall_type}_{agg}']
    )
    anomalies['corrected_ma_monthly'] = anomalies.groupby(keywords)['corrected_value'].transform(lambda x: x.rolling(window=30).mean())
    anomalies['corrected_ma_weekly'] = anomalies.groupby(keywords)['corrected_value'].transform(lambda x: x.rolling(window=7).mean())    
    return anomalies

def agg_footfall_data(df,**kwargs):
    used_keys = {
        'category','day_night',
        'agg', 'footfall_type'
    }
    redundant_kwargs = set(kwargs.keys()) - used_keys
    if redundant_kwargs:
        print(f'Redundant kwargs: {redundant_kwargs}')
        return
    unused_keys = set(used_keys) - set(kwargs.keys())
    if unused_keys:
        print(f'Missing kwargs: {unused_keys}\nThese args will be set to default values')

    day_night = kwargs.get('day_night')
    if day_night:
        df = apply_features(df, time='time_indicator')
    merge_list = ['count_date']
    new_categories = [
        'count_date','corrected_ma_monthly',
        'corrected_ma_weekly','corrected_value'
    ]
    
    category = kwargs.get('category')
    
    keywords = [
        kwargs.get('category'),
        'day_night' if day_night else None
    ]
    for keyword in keywords:
        if keyword:
            if not isinstance(keyword, list):
                keyword = [keyword]
            for word in keyword:
                merge_list = [f'{word}'] + merge_list
                new_categories = new_categories + [f'{word}']
    
    agg = kwargs.get('agg','sum')
    agg_data = df.groupby(merge_list + ['year']).agg(
        residents_sum = ('resident',f'{agg}'),
        workers_sum = ('worker',f'{agg}'),
        visitors_sum = ('visitor',f'{agg}'),
        loyalty = ('loyalty_percentage','mean'),
        dwell_time = ('dwell_time',f'{agg}')
    )

    agg_data = agg_data.reset_index()
    agg_data = agg_data.sort_values(
        ['count_date'],
        ascending=False
    )

    footfall_type = ['residents','workers','visitors']
    anomalies = {}
    i = 0
    for footfall in footfall_type:
        i = i + 1
        anomalies[f'{footfall}_z'] = detect_anomalies(agg_data,footfall_type=footfall,std=2.6,primary_key=category,day_night=day_night,agg=agg)
        if i > 2:
            new_categories = new_categories + ['year']
        anomalies[f'{footfall}_z'] = anomalies[f'{footfall}_z'][new_categories]

    footfall_data = pd.merge(
        anomalies['residents_z'], anomalies['workers_z'],
        how='left', on=merge_list,
        suffixes=['_residents','_workers']
    ).merge(
        anomalies['visitors_z'],
        how='left', on=merge_list,
    ).rename(columns={
            'corrected_value':'corrected_value_visitors',
            'corrected_ma_monthly':'corrected_ma_monthly_visitors',
            'corrected_ma_weekly':'corrected_ma_weekly_visitors'
        }
    )

    footfall_type = kwargs.get('footfall_type', ['residents','workers','visitors'])
    for footfall in footfall_type:
        footfall_data['corrected_value_total'] = 0
        footfall_data['corrected_value_total'] = footfall_data['corrected_value_total'] + footfall_data[f'corrected_value_{footfall}']
    
    footfall_data['corrected_value_total'].fillna(0, inplace=True)
    footfall_data['corrected_ma_monthly_total'] = footfall_data.groupby(keywords)['corrected_value_total'].transform(lambda x: x.rolling(window=30).mean())
    footfall_data['corrected_ma_weekly_total'] = footfall_data.groupby(keywords)['corrected_value_total'].transform(lambda x: x.rolling(window=7).mean())

    return footfall_data



In [169]:
def typical_footfall(footfall_data, start, end, **kwargs):

    columns = [
        'OID_','Col_ID','Row_ID','Hex_ID',
        'Centroid_X','Centroid_Y','area',
        'Shape_Length','Shape_Area'
    ]
    for column in columns:
        if column in footfall_data.columns:
            footfall_data = footfall_data.drop(columns=column)

    footfall_data['count_date'] = pd.to_datetime(footfall_data['count_date'])

    footfall_data = footfall_data[
        (footfall_data['count_date'] <= pd.to_datetime(end)) &
        (footfall_data['count_date'] >= pd.to_datetime(start))]
    

    columns_to_fill = [
        'resident','worker','visitor',
        'loyalty_percentage','dwell_time'
    ]
    footfall_data.loc[:, columns_to_fill] = footfall_data[columns_to_fill].applymap(lambda x: np.nan if x < 0 else x)
    footfall_data[columns_to_fill] = footfall_data[columns_to_fill].fillna(0)
    footfall_data = footfall_data.sort_values(by=['count_date','time_indicator','hex_id'])
    
    footfall_data = agg_footfall_data(
        footfall_data,
        category='hex_id',
        day_night=True,
        agg='sum'
    )
    footfall_data = apply_features(footfall_data, time='time_indicator')
    
    if kwargs.get('day_night',False):
        footfall_data = transform_to_daynight(footfall_data, category='hex_id')
        averages = footfall_data.copy()
        averages = averages.groupby(['year','week_name','hex_id']).agg(
            Daytime_mean = ('6am-6pm','mean'),
            Nighttime_mean = ('6pm-6am','mean')
        ).reset_index()
        weekday = averages[averages['week_name'] == 'Weekday']
        weekend = averages[averages['week_name'] == 'Weekend']
        typical = footfall_data.groupby(['year','hex_id']).agg(
            Daytime_mean = ('6am-6pm','mean'),
            Nighttime_mean = ('6pm-6am','mean')
        ).reset_index()
    else:
        averages = footfall_data.copy()
        averages = averages.groupby(['year','week_name','hex_id']).agg(
            averages = ('corrected_value_total','mean'),
        ).reset_index()
        weekday = averages[averages['week_name'] == 'Weekday']
        weekend = averages[averages['week_name'] == 'Weekend']
        typical = footfall_data.groupby(['year','hex_id']).agg(
            averages = ('corrected_value_total','mean'),
        ).reset_index()


    return typical, weekday, weekend

Database and CWD setup and connection

In [6]:
# Database credentials
db_host = 'LBHHLWSQL0001.lbhf.gov.uk'
db_port = '1433'
db_name = 'IA_ODS'

# Create the connection string for SQL Server using pyodbc with Windows Authentication
connection_string = f'mssql+pyodbc://@{db_host}:{db_port}/{db_name}?driver=ODBC+Driver+17+for+SQL+Server&Trusted_Connection=yes'

# Create the database engine
engine = create_engine(connection_string)

# Define the current working directory
cwd = r'C:\Users\jf79\OneDrive - Office Shared Service\Documents\H&F Analysis\Footfall and Spend Analysis\Footfall Data\LSOA Based'
os.chdir(cwd)
files = os.listdir(os.getcwd())
print("Files in %r: %s" % (cwd, files))

Files in 'C:\\Users\\jf79\\OneDrive - Office Shared Service\\Documents\\H&F Analysis\\Footfall and Spend Analysis\\Footfall Data\\LSOA Based': ['lsoa_hourly_counts_2022_H1.csv', 'lsoa_hourly_counts_2022_H2.csv', 'lsoa_hourly_counts_2024_H2.csv']


In [7]:
footfall_2024_Hex = pd.read_csv('C:/Users/jf79/OneDrive - Office Shared Service/Documents/H&F Analysis/Footfall and Spend Analysis/Footfall Data/Hex Based/Footfall Counts/hex_3hourly_counts_2024.csv')
relevant_hexes = pd.read_csv('C:/Users/jf79/OneDrive - Office Shared Service/Documents/H&F Analysis/Footfall and Spend Analysis/Footfall Data/Hex Based/Relevant Hexes/Relevant Hexes.csv')

In [8]:
footfall_data_2024 = footfall_2024_Hex.copy()
relevant_hexes_data = relevant_hexes.copy()

footfall_data_2024 = pd.merge(
    relevant_hexes_data,
    footfall_2024_Hex,
    left_on='Hex_ID',
    right_on='hex_id',
    how='left'
)

In [19]:
footfall_data_2024

Unnamed: 0,OID_,Col_ID,Row_ID,Hex_ID,Centroid_X,Centroid_Y,area,Shape_Length,Shape_Area,hex_id,count_date,day,time_indicator,resident,visitor,worker,loyalty_percentage,dwell_time,day_name,week_name
0,1,1120,1234,11201234,521148.5154,179650.7975,106088.115,521.032154,5283.915821,11201234,2024-01-01,Mon,00-03,618.0,242.0,65.0,4.02,117.03,0,Monday
1,1,1120,1234,11201234,521148.5154,179650.7975,106088.115,521.032154,5283.915821,11201234,2024-01-01,Mon,03-06,560.0,184.0,76.0,5.31,95.22,0,Monday
2,1,1120,1234,11201234,521148.5154,179650.7975,106088.115,521.032154,5283.915821,11201234,2024-01-01,Mon,06-09,571.0,149.0,70.0,0.88,150.18,0,Monday
3,1,1120,1234,11201234,521148.5154,179650.7975,106088.115,521.032154,5283.915821,11201234,2024-01-01,Mon,09-12,683.0,191.0,56.0,3.09,137.77,0,Monday
4,1,1120,1234,11201234,521148.5154,179650.7975,106088.115,521.032154,5283.915821,11201234,2024-01-01,Mon,12-15,722.0,252.0,86.0,3.98,121.45,0,Monday
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
600208,205,1138,1218,11381218,526604.4754,176850.7975,106088.115,934.139762,57391.090939,11381218,2024-12-31,Tue,09-12,,,,14.46,88.88,1,Tuesday
600209,205,1138,1218,11381218,526604.4754,176850.7975,106088.115,934.139762,57391.090939,11381218,2024-12-31,Tue,12-15,,,11.0,10.73,88.77,1,Tuesday
600210,205,1138,1218,11381218,526604.4754,176850.7975,106088.115,934.139762,57391.090939,11381218,2024-12-31,Tue,15-18,,,,8.96,86.82,1,Tuesday
600211,205,1138,1218,11381218,526604.4754,176850.7975,106088.115,934.139762,57391.090939,11381218,2024-12-31,Tue,18-21,12.0,,,10.14,92.77,1,Tuesday


In [147]:
df = agg_footfall_data(footfall_data_2024, category='hex_id', day_night=True)
df.sort_values(by=['hex_id','count_date'], ascending=False, inplace=True)
df[
    (df['count_date'] > pd.to_datetime('2024-12-16')) &
    (df['hex_id'] != 11381218)
].head(30)


Missing kwargs: {'footfall_type', 'agg'}
These args will be set to default values


Unnamed: 0,count_date,corrected_ma_monthly_residents,corrected_ma_weekly_residents,corrected_value_residents,hex_id,day_night,corrected_ma_monthly_workers,corrected_ma_weekly_workers,corrected_value_workers,corrected_ma_monthly_visitors,corrected_ma_weekly_visitors,corrected_value_visitors,year,corrected_value_total,corrected_ma_monthly_total,corrected_ma_weekly_total
110,2024-12-31,,,981.0,11381216,6am-6pm,,,134.0,,,3030.0,2024,3030.0,,
153,2024-12-31,,,925.0,11381216,6pm-6am,,,78.0,,,2030.0,2024,2030.0,,
582,2024-12-30,,,972.0,11381216,6pm-6am,,,44.0,,,1265.0,2024,1265.0,,
701,2024-12-30,,,827.0,11381216,6am-6pm,,,92.0,,,2721.0,2024,2721.0,,
828,2024-12-29,,,928.0,11381216,6pm-6am,,,103.0,,,2829.0,2024,2829.0,,
886,2024-12-29,,,805.0,11381216,6am-6pm,,,118.0,,,2507.0,2024,2507.0,,
1383,2024-12-28,,,853.0,11381216,6pm-6am,,,0.0,,,,2024,0.0,,
1631,2024-12-28,,,813.0,11381216,6am-6pm,,,104.0,,,2698.0,2024,2698.0,,
1654,2024-12-27,,,613.0,11381216,6am-6pm,,,86.0,,,2331.0,2024,2331.0,,
1699,2024-12-27,,,741.0,11381216,6pm-6am,,,51.0,,,1828.0,2024,1828.0,,


In [None]:
london_footfall_2024 = footfall_2024_Hex.copy()
quarterly_averages = typical_footfall(
    london_footfall_2024, '2024-01-01', '2024-12-31'
)
for i in range(len(quarterly_averages)):
    export_to_csv(quarterly_averages[i].sort_values(by='Daytime_sum', ascending=False))

quarterly_averages = typical_footfall(
    footfall_data_2024, '2024-01-01', '2024-12-31',
)
for i in range(len(quarterly_averages)):
    export_to_csv(quarterly_averages[i].sort_values(by='averages', ascending=False))

Missing kwargs: {'footfall_type'}
These args will be set to default values
Invalid time column: 'time_indicator'
Successfully exported typical_hf to CSV
Successfully exported weekday to CSV
Successfully exported weekend to CSV


In [149]:
footfall_data_2024

Unnamed: 0,OID_,Col_ID,Row_ID,Hex_ID,Centroid_X,Centroid_Y,area,Shape_Length,Shape_Area,hex_id,count_date,day,time_indicator,resident,visitor,worker,loyalty_percentage,dwell_time,day_name,week_name,day_night,year
0,1,1120,1234,11201234,521148.5154,179650.7975,106088.115,521.032154,5283.915821,11201234,2024-01-01,Mon,00-03,618.0,242.0,65.0,4.02,117.03,Monday,Weekday,6pm-6am,2024
1,1,1120,1234,11201234,521148.5154,179650.7975,106088.115,521.032154,5283.915821,11201234,2024-01-01,Mon,03-06,560.0,184.0,76.0,5.31,95.22,Monday,Weekday,6pm-6am,2024
2,1,1120,1234,11201234,521148.5154,179650.7975,106088.115,521.032154,5283.915821,11201234,2024-01-01,Mon,06-09,571.0,149.0,70.0,0.88,150.18,Monday,Weekday,6am-6pm,2024
3,1,1120,1234,11201234,521148.5154,179650.7975,106088.115,521.032154,5283.915821,11201234,2024-01-01,Mon,09-12,683.0,191.0,56.0,3.09,137.77,Monday,Weekday,6am-6pm,2024
4,1,1120,1234,11201234,521148.5154,179650.7975,106088.115,521.032154,5283.915821,11201234,2024-01-01,Mon,12-15,722.0,252.0,86.0,3.98,121.45,Monday,Weekday,6am-6pm,2024
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
600208,205,1138,1218,11381218,526604.4754,176850.7975,106088.115,934.139762,57391.090939,11381218,2024-12-31,Tue,09-12,,,,14.46,88.88,Tuesday,Weekday,6am-6pm,2024
600209,205,1138,1218,11381218,526604.4754,176850.7975,106088.115,934.139762,57391.090939,11381218,2024-12-31,Tue,12-15,,,11.0,10.73,88.77,Tuesday,Weekday,6am-6pm,2024
600210,205,1138,1218,11381218,526604.4754,176850.7975,106088.115,934.139762,57391.090939,11381218,2024-12-31,Tue,15-18,,,,8.96,86.82,Tuesday,Weekday,6am-6pm,2024
600211,205,1138,1218,11381218,526604.4754,176850.7975,106088.115,934.139762,57391.090939,11381218,2024-12-31,Tue,18-21,12.0,,,10.14,92.77,Tuesday,Weekday,6pm-6am,2024
