In [1]:
import pandas as pd
import chardet

# Customer-account extract. The encoding is fixed to cp1252 below; it is not
# auto-detected by this cell.
file_path = r"./../data tables/STTM_CUST_ACCOUNT.csv"

# Loading only few required columns
cols_to_import = ["AC_OPEN_DATE", "ACCOUNT_CLASS", "ACY_CURR_BALANCE", "BRANCH_CODE", "CUST_AC_NO"]

# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk.
# NOTE(review): the original read emitted a warning on this line, presumably the
# mixed-dtype warning from chunked inference — confirm it is gone after this change.
df = pd.read_csv(
    file_path,
    usecols=cols_to_import,
    encoding="cp1252",
    low_memory=False,
)
df.head()

  df = pd.read_csv(file_path, usecols=cols_to_import, encoding="cp1252")


Unnamed: 0,AC_OPEN_DATE,ACCOUNT_CLASS,ACY_CURR_BALANCE,BRANCH_CODE,CUST_AC_NO
0,02/03/2003 00:00:00,LINAC,0,1.0,1000018285
1,02/03/2003 00:00:00,LINAC,534197,1.0,1000018308
2,02/03/2003 00:00:00,LINAC,0,1.0,1000018321
3,02/03/2003 00:00:00,LINAC,0,1.0,1000018354
4,10/09/2002 00:00:00,PERSSA,0,1.0,1000018591


In [2]:
# Branch reference table (default read_csv settings); it carries the
# BRANCH_NAME that belongs to each BRANCH_CODE.
branches_df = pd.read_csv(filepath_or_buffer=r"./../data tables/STTM_BRANCH.csv")
branches_df.head(n=5)

Unnamed: 0,BRANCH_CODE,BRANCH_NAME,BRANCH_ADDR1,BRANCH_ADDR2,BRANCH_ADDR3,REGIONAL_OFFICE
0,16,TUNDUMA BRANCH,AZANIA BANK LIMITED,TUNDUMA BRANCH,"MBEYA, TANZANIA",999
1,17,RWAGASORE AGENCY,AZANIA BANK LIMITED,RWAGASORE BRANCH,"MWANZA, TANZANIA",999
2,19,MOROGORO BRANCH,AZANIA BANK LIMITED,MOROGORO BRANCH,"MOROGORO, TANZANIA",999
3,18,SOKOINE BRANCH,AZANIA BANK LIMITED,SOKOINE BRANCH,"DODOMA, TANZANIA",999
4,0,AZANIA BANK LIMITED,"MASDO HOUSE, SAMORA AVENUE",P O BOX 9271,"DAR ES SALAAM, TANZANIA",999


In [3]:
# PRE-PROCESSING Main Registration Dataset

# Parse the opening dates. Values that cannot be parsed become NaT and are
# removed by the date-window filter below (comparisons against NaT are False).
# NOTE(review): raw values look like "02/03/2003 00:00:00", so the day/month
# order is ambiguous — confirm the source format and consider an explicit `format=`.
df["AC_OPEN_DATE"] = pd.to_datetime(df["AC_OPEN_DATE"], errors="coerce")

# Keep accounts opened January to June 2025. The upper bound is exclusive at
# 1 July so that openings time-stamped later than 00:00:00 on 30 June are kept;
# `<= "2025-06-30"` compares against midnight and would drop them.
mask_dates = (df["AC_OPEN_DATE"] >= "2025-01-01") & (df["AC_OPEN_DATE"] < "2025-07-01")
df = df[mask_dates]

# --- 2. Sort dates old → new ---
df = df.sort_values(by="AC_OPEN_DATE", ascending=True)

# Format BRANCH to 3-digit codes.
# Missing or non-numeric codes are turned into 0 and therefore end up as "000".
# NOTE(review): the branch table preview shows a real BRANCH_CODE 0, so such rows
# would presumably pick up that branch's name in a later join — confirm intended.
df["BRANCH_CODE"] = (
    pd.to_numeric(df["BRANCH_CODE"], errors="coerce")
    .fillna(0)
    .astype(int)
    .astype(str)
    .str.zfill(3)
)

# Format CUST_AC_NO to 12-digit codes (remove negatives).
# Missing or non-numeric account numbers become "000000000000", and a negative
# number is mapped onto its positive counterpart by abs().
df["CUST_AC_NO"] = (
    pd.to_numeric(df["CUST_AC_NO"], errors="coerce")
    .fillna(0)
    .astype(int)
    .abs()
    .astype(str)
    .str.zfill(12)
)
df.head()

Unnamed: 0,AC_OPEN_DATE,ACCOUNT_CLASS,ACY_CURR_BALANCE,BRANCH_CODE,CUST_AC_NO
389127,2025-01-02,PERSSA,4850,14,1115251419
394844,2025-01-02,KIBISA,5000,18,820316164
394845,2025-01-02,HOPES,5635,29,1064748189
398544,2025-01-02,KIBISA,0,15,2115295773
398545,2025-01-02,KIBISA,0,15,2115295806


In [4]:
# Cleaning and formatting Branch table dataset

# Ensure branch codes are padded to 3 digits on both sides of the join
df["BRANCH_CODE"] = df["BRANCH_CODE"].astype(str).str.zfill(3)
branches_df["BRANCH_CODE"] = branches_df["BRANCH_CODE"].astype(str).str.zfill(3)

# Keep one name per branch code: a repeated code in the lookup would duplicate
# account rows in the left join and inflate every count computed afterwards.
branch_names = (
    branches_df[["BRANCH_CODE", "BRANCH_NAME"]]
    .drop_duplicates(subset="BRANCH_CODE")
)

# Merge only the branch name column.
# A BRANCH_NAME column left over from an earlier run of this cell is dropped
# first, so re-running does not create BRANCH_NAME_x / BRANCH_NAME_y columns.
df = df.drop(columns="BRANCH_NAME", errors="ignore").merge(
    branch_names,
    on="BRANCH_CODE",
    how="left",
)

# Verify
df.head()

Unnamed: 0,AC_OPEN_DATE,ACCOUNT_CLASS,ACY_CURR_BALANCE,BRANCH_CODE,CUST_AC_NO,BRANCH_NAME
0,2025-01-02,PERSSA,4850,14,1115251419,LAMADI AGENCY
1,2025-01-02,KIBISA,5000,18,820316164,SOKOINE BRANCH
2,2025-01-02,HOPES,5635,29,1064748189,BANDARI
3,2025-01-02,KIBISA,0,15,2115295773,KAGONGWA AGENCY
4,2025-01-02,KIBISA,0,15,2115295806,KAGONGWA AGENCY


In [5]:
# Account products to exclude, stored trimmed and case-folded so they can be
# compared against the normalized ACCOUNT_CLASS values built below.
unwanted = [
    code.strip().casefold()
    for code in (
        'LINAC', 'NOSTRO', 'SPDED', 'SPDSS', 'SPDMG', 'SPDINS',
        'NSFBFC', 'CBCLLC', 'CBCLFC', 'NSFBFC', 'NSUPLC', 'NSUPFC',
        '1YTD', 'FLTD', 'TDNORM',
    )
]

# Normalize ACCOUNT_CLASS one step at a time: text type, NBSP → space,
# zero-width characters removed, outer whitespace trimmed, case folded.
ACCOUNT_CLASS_norm = df["ACCOUNT_CLASS"].astype(str)
ACCOUNT_CLASS_norm = ACCOUNT_CLASS_norm.str.replace("\u00A0", " ", regex=False)
ACCOUNT_CLASS_norm = ACCOUNT_CLASS_norm.str.replace(r"[\u200B-\u200D\uFEFF]", "", regex=True)
ACCOUNT_CLASS_norm = ACCOUNT_CLASS_norm.str.strip().str.casefold()

# True for rows whose class is on the exclusion list; the mask keeps the rest.
is_unwanted = ACCOUNT_CLASS_norm.isin(unwanted)
mask_ACCOUNT_CLASS = ~is_unwanted

# Apply the class filter on an independent copy of the kept rows.
df_filtered = df.loc[mask_ACCOUNT_CLASS].copy()

print("Rows removed by class filter:", is_unwanted.sum())
print(df_filtered["ACCOUNT_CLASS"].unique()[:20])

Rows removed by class filter: 5453
['PERSSA' 'KIBISA' 'HOPES' 'JISA' 'SALASA' 'MSTASA' 'AGBA' 'ASILSA' 'AGBC'
 'LUMBSA' 'HOSAA' 'PERSAS' 'PREPAS' 'SOCLCA' 'GEPGC' 'SOCLSA' 'CORPCA'
 'PERSCA' 'HOCSA' 'CHSCCA']


In [6]:
# Verify: preview the first rows of the class-filtered registrations.
df_filtered.head()

Unnamed: 0,AC_OPEN_DATE,ACCOUNT_CLASS,ACY_CURR_BALANCE,BRANCH_CODE,CUST_AC_NO,BRANCH_NAME
0,2025-01-02,PERSSA,4850,14,1115251419,LAMADI AGENCY
1,2025-01-02,KIBISA,5000,18,820316164,SOKOINE BRANCH
2,2025-01-02,HOPES,5635,29,1064748189,BANDARI
3,2025-01-02,KIBISA,0,15,2115295773,KAGONGWA AGENCY
4,2025-01-02,KIBISA,0,15,2115295806,KAGONGWA AGENCY


In [7]:
# Persist the filtered registrations as an Excel workbook (row index omitted).
output_path = r"./registration_clean_data.xlsx"
df_filtered.to_excel(excel_writer=output_path, index=False)

print(f"Filtered data exported to: {output_path}")

Filtered data exported to: ./registration_clean_data.xlsx


In [8]:
# Report the (rows, columns) size of the filtered dataset.
print("Shape of dataset:", df_filtered.shape)

Shape of dataset: (93061, 6)


In [9]:
import numpy as np

# --- Ensuring types are correct ---
# Coerce defensively; values that cannot be converted become NaT / NaN.
df_filtered["AC_OPEN_DATE"] = pd.to_datetime(df_filtered["AC_OPEN_DATE"], errors="coerce")
df_filtered["ACY_CURR_BALANCE"] = pd.to_numeric(df_filtered.get("ACY_CURR_BALANCE"), errors="coerce")


def _count_zero_balances(balances):
    """Return how many balances are zero, counting missing balances as zero."""
    return (balances.fillna(0) == 0).sum()


def _summarize_by(frame, key, total_regs, total_balance):
    """Build the per-group registration/balance summary for one grouping column.

    Parameters
    ----------
    frame : pd.DataFrame
        Registrations with ``CUST_AC_NO`` and numeric ``ACY_CURR_BALANCE`` columns.
    key : str
        Column to group by; missing values form their own group.
    total_regs : int
        Overall row count, used as the denominator of ``Reg_Share_%``.
    total_balance : float
        Overall balance, used as the denominator of ``Balance_Share_%``.

    Returns
    -------
    pd.DataFrame
        One row per group, sorted by ``Registrations`` then ``Total_Balance``,
        both descending.
    """
    grouped = (
        frame
          .groupby(key, dropna=False)
          .agg(Registrations=("CUST_AC_NO", "count"),
               Total_Balance=("ACY_CURR_BALANCE", "sum"),
               Avg_Balance=("ACY_CURR_BALANCE", "mean"),
               Median_Balance=("ACY_CURR_BALANCE", "median"),
               Zero_Balance_Accounts=("ACY_CURR_BALANCE", _count_zero_balances))
          .reset_index()
    )
    # max(..., 1) guards the division when there are no registrations at all.
    grouped["Reg_Share_%"] = (grouped["Registrations"] / max(total_regs, 1) * 100).round(2)
    if total_balance != 0:
        grouped["Balance_Share_%"] = (grouped["Total_Balance"] / total_balance * 100).round(2)
    else:
        # A share of a zero total is undefined.
        grouped["Balance_Share_%"] = np.nan
    return grouped.sort_values(["Registrations", "Total_Balance"], ascending=[False, False])


def _top_label(summary_table, label_col):
    """Return the label of the highest-ranked row whose label is not missing, else None."""
    labels = summary_table[label_col].dropna()
    return labels.iloc[0] if not labels.empty else None


# --- Global metrics ---
# NOTE(review): balances are summed as-is; ACY_* presumably means account
# currency — confirm all accounts share one currency before trusting the totals.
balances = df_filtered["ACY_CURR_BALANCE"]
open_dates = df_filtered["AC_OPEN_DATE"]
has_dates = open_dates.notna().any()

total_regs        = len(df_filtered)
total_accounts    = df_filtered["CUST_AC_NO"].nunique()
unique_classes    = df_filtered["ACCOUNT_CLASS"].nunique()
unique_branches   = df_filtered["BRANCH_NAME"].nunique()
period_start      = open_dates.min().date() if has_dates else None
period_end        = open_dates.max().date() if has_dates else None
total_balance     = balances.sum(skipna=True)
avg_bal           = balances.mean()
median_bal        = balances.median()
zero_bal_cnt      = _count_zero_balances(balances)
zero_bal_rate     = (zero_bal_cnt / total_regs) if total_regs else np.nan

# --- Branch-wise summary ---
branch_summary = _summarize_by(df_filtered, "BRANCH_NAME", total_regs, total_balance)

# --- Account-class-wise summary (with balances) ---
class_summary = _summarize_by(df_filtered, "ACCOUNT_CLASS", total_regs, total_balance)

# --- Monthly trend (registrations & balances) ---
# NOTE(review): "Unique_Customers" counts distinct CUST_AC_NO values, i.e.
# account numbers; no customer identifier is used here — confirm the label.
monthly = (
    df_filtered
      .set_index("AC_OPEN_DATE")
      .groupby(pd.Grouper(freq="MS"))
      .agg(Registrations=("CUST_AC_NO", "count"),
           Unique_Customers=("CUST_AC_NO", "nunique"),
           Total_Balance=("ACY_CURR_BALANCE", "sum"))
      .reset_index()
)
monthly["Month"] = monthly["AC_OPEN_DATE"].dt.strftime("%Y-%m")
monthly = monthly.drop(columns=["AC_OPEN_DATE"])

# --- High-level summary table ---
# The summaries include a group for missing labels (dropna=False); skip it so
# the "top" entry is always a real branch / class name.
top_branch_by_regs = _top_label(branch_summary, "BRANCH_NAME")
top_class_by_regs  = _top_label(class_summary, "ACCOUNT_CLASS")

summary = {
    "Total Registrations": total_regs,
    "Number of Account Classes": unique_classes,
    "Number of Branches": unique_branches,
    "Total Balance": total_balance,
    "Average Balance per Account": avg_bal,
    "Median Balance per Registration": median_bal,
    "Zero Balance Accounts": zero_bal_cnt,
    "Zero Balance Rate %": (round(zero_bal_rate * 100, 2) if pd.notna(zero_bal_rate) else np.nan),
    "Top Branch by Registrations": top_branch_by_regs,
    "Top Account Class by Registrations": top_class_by_regs,
    "Period Covered": f"{period_start} to {period_end}"
}
summary_df = pd.DataFrame(list(summary.items()), columns=["Metric", "Value"])

# --- (Optional) export to Excel with multiple sheets ---
out_path = r"./registration_summary_data.xlsx"
with pd.ExcelWriter(out_path, engine="openpyxl") as writer:
    summary_df.to_excel(writer, sheet_name="Summary", index=False)
    branch_summary.to_excel(writer, sheet_name="By Branch", index=False)
    class_summary.to_excel(writer, sheet_name="By Class", index=False)
    monthly.to_excel(writer, sheet_name="Monthly Trend", index=False)
print("Report saved to:", out_path)

# Quick peek
summary_df, branch_summary.head(10), class_summary.head(10), monthly.head()

Report saved to: ./registration_summary_data.xlsx


(                                Metric                     Value
 0                  Total Registrations                     93061
 1            Number of Account Classes                        47
 2                   Number of Branches                        39
 3                        Total Balance              413101984633
 4          Average Balance per Account            4439045.192218
 5      Median Balance per Registration                       0.0
 6                Zero Balance Accounts                     61360
 7                  Zero Balance Rate %                     65.94
 8          Top Branch by Registrations             KAHAMA BRANCH
 9   Top Account Class by Registrations                     HOPES
 10                      Period Covered  2025-01-02 to 2025-06-30,
                         BRANCH_NAME  Registrations  Total_Balance  \
 11                    KAHAMA BRANCH           7250      628564976   
 38                   TUNDUMA BRANCH           6366      423815078 