In [30]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
import os
from pathlib import Path

In [31]:
data=pd.read_csv(r'C:\Users\USER\Desktop\Inua360-The-Kenyan-SME-AI-Agent\data\external\african_sme_dataset.csv')
data.columns = data.columns.str.strip()
data = data.drop(columns=['company_id'], errors='ignore')
data.head()

Unnamed: 0,country,region,sector,subsector,years_in_operation,ownership_type,female_owned,annual_revenue,annual_profit,expenses_total,...,tax_compliance_status,business_license_validity,environmental_compliance,data_protection_compliance,audit_score,country_gdp_growth,sector_avg_growth,inflation_rate,internet_penetration_rate,ease_of_doing_business_rank
0,Tanzania,Peri-urban,Finance,Food,13,Limited,0,22847.261331,4338.714567,108934.421097,...,1,0,1,0,0.414355,0.05,0.051449,0.073079,0.539061,117
1,Ghana,Peri-urban,Agriculture,Machinery,21,Limited,1,100289.323806,11746.15828,61964.015629,...,1,0,1,0,0.504764,0.04,0.062719,0.131761,0.68453,38
2,Rwanda,Peri-urban,Manufacturing,Pharma,24,Partnership,1,67572.2395,32378.608765,68466.650386,...,1,1,1,1,0.291753,0.07,0.109072,0.04907,0.469032,143
3,Tanzania,Rural,Agriculture,Clothing,8,Partnership,0,155518.879067,33065.534483,85280.928035,...,0,1,1,0,0.745993,0.06,0.147247,0.081065,0.439221,128
4,South Africa,Peri-urban,Agriculture,Banking,12,Limited,0,145489.69464,17416.78807,66962.799441,...,1,1,1,1,0.405217,0.07,0.070909,0.121132,0.601534,198


In [32]:
data.columns.to_list()

['country',
 'region',
 'sector',
 'subsector',
 'years_in_operation',
 'ownership_type',
 'female_owned',
 'annual_revenue',
 'annual_profit',
 'expenses_total',
 'revenue_growth_rate',
 'funding_received',
 'funding_stage',
 'credit_access',
 'debt_ratio',
 'cashflow_stability_score',
 'num_employees',
 'employee_growth_rate',
 'avg_employee_salary',
 'training_investment_per_employee',
 'remote_work_policy',
 'tech_adoption_level',
 'digital_spending_ratio',
 'online_presence',
 'social_media_activity_score',
 'ecommerce_usage',
 'cloud_services_used',
 'cybersecurity_measures',
 'num_customers',
 'customer_growth_rate',
 'customer_retention_rate',
 'competition_intensity',
 'average_customer_ticket_size',
 'tax_compliance_status',
 'business_license_validity',
 'environmental_compliance',
 'data_protection_compliance',
 'audit_score',
 'country_gdp_growth',
 'sector_avg_growth',
 'inflation_rate',
 'internet_penetration_rate',
 'ease_of_doing_business_rank']

In [33]:
data['profit_margin'] = (data['annual_profit'] / data['annual_revenue']).replace([np.inf, -np.inf], 0).fillna(0)
data['expense_ratio'] = (data['expenses_total'] / data['annual_revenue']).replace([np.inf, -np.inf], 0).fillna(0)
data['employee_efficiency'] = data['annual_profit'] / (data['num_employees'] + 1)
data['financial_health_index'] = (
    data['cashflow_stability_score'] + (1 - data['debt_ratio']) + data['credit_access']
) / 3
data['compliance_score'] = (
    data['audit_score'] + data['environmental_compliance'] +
    data['data_protection_compliance'] + data['tax_compliance_status']
) / 4
data['market_resilience'] = (
    data['country_gdp_growth'] + data['sector_avg_growth'] -
    data['inflation_rate'] + data['ease_of_doing_business_rank']
) / 4

In [34]:
data.head()

Unnamed: 0,country,region,sector,subsector,years_in_operation,ownership_type,female_owned,annual_revenue,annual_profit,expenses_total,...,sector_avg_growth,inflation_rate,internet_penetration_rate,ease_of_doing_business_rank,profit_margin,expense_ratio,employee_efficiency,financial_health_index,compliance_score,market_resilience
0,Tanzania,Peri-urban,Finance,Food,13,Limited,0,22847.261331,4338.714567,108934.421097,...,0.051449,0.073079,0.539061,117,0.189901,4.767942,9.860715,0.537182,0.603589,29.257093
1,Ghana,Peri-urban,Agriculture,Machinery,21,Limited,1,100289.323806,11746.15828,61964.015629,...,0.062719,0.131761,0.68453,38,0.117123,0.617853,85.738382,0.749023,0.626191,9.492739
2,Rwanda,Peri-urban,Manufacturing,Pharma,24,Partnership,1,67572.2395,32378.608765,68466.650386,...,0.109072,0.04907,0.469032,143,0.47917,1.013236,121.268198,0.801761,0.822938,35.7825
3,Tanzania,Rural,Agriculture,Clothing,8,Partnership,0,155518.879067,33065.534483,85280.928035,...,0.147247,0.081065,0.439221,128,0.212614,0.548364,78.915357,0.253902,0.436498,32.031545
4,South Africa,Peri-urban,Agriculture,Banking,12,Limited,0,145489.69464,17416.78807,66962.799441,...,0.070909,0.121132,0.601534,198,0.119711,0.460258,217.709851,0.234617,0.851304,49.504944


In [35]:
targets = ['funding_stage', 'revenue_growth_rate', 'tax_compliance_status', 'compliance_score']

# Drop the target columns correctly
data_features = data.drop(columns=targets, errors='ignore')

data_features.head()


Unnamed: 0,country,region,sector,subsector,years_in_operation,ownership_type,female_owned,annual_revenue,annual_profit,expenses_total,...,country_gdp_growth,sector_avg_growth,inflation_rate,internet_penetration_rate,ease_of_doing_business_rank,profit_margin,expense_ratio,employee_efficiency,financial_health_index,market_resilience
0,Tanzania,Peri-urban,Finance,Food,13,Limited,0,22847.261331,4338.714567,108934.421097,...,0.05,0.051449,0.073079,0.539061,117,0.189901,4.767942,9.860715,0.537182,29.257093
1,Ghana,Peri-urban,Agriculture,Machinery,21,Limited,1,100289.323806,11746.15828,61964.015629,...,0.04,0.062719,0.131761,0.68453,38,0.117123,0.617853,85.738382,0.749023,9.492739
2,Rwanda,Peri-urban,Manufacturing,Pharma,24,Partnership,1,67572.2395,32378.608765,68466.650386,...,0.07,0.109072,0.04907,0.469032,143,0.47917,1.013236,121.268198,0.801761,35.7825
3,Tanzania,Rural,Agriculture,Clothing,8,Partnership,0,155518.879067,33065.534483,85280.928035,...,0.06,0.147247,0.081065,0.439221,128,0.212614,0.548364,78.915357,0.253902,32.031545
4,South Africa,Peri-urban,Agriculture,Banking,12,Limited,0,145489.69464,17416.78807,66962.799441,...,0.07,0.070909,0.121132,0.601534,198,0.119711,0.460258,217.709851,0.234617,49.504944


In [36]:
numerical_cols=data.select_dtypes(include=['int64','float64']).columns
numerical_cols

Index(['years_in_operation', 'female_owned', 'annual_revenue', 'annual_profit',
       'expenses_total', 'revenue_growth_rate', 'funding_received',
       'credit_access', 'debt_ratio', 'cashflow_stability_score',
       'num_employees', 'employee_growth_rate', 'avg_employee_salary',
       'training_investment_per_employee', 'digital_spending_ratio',
       'online_presence', 'social_media_activity_score', 'ecommerce_usage',
       'cloud_services_used', 'cybersecurity_measures', 'num_customers',
       'customer_growth_rate', 'customer_retention_rate',
       'competition_intensity', 'average_customer_ticket_size',
       'tax_compliance_status', 'business_license_validity',
       'environmental_compliance', 'data_protection_compliance', 'audit_score',
       'country_gdp_growth', 'sector_avg_growth', 'inflation_rate',
       'internet_penetration_rate', 'ease_of_doing_business_rank',
       'profit_margin', 'expense_ratio', 'employee_efficiency',
       'financial_health_index', 'c

In [37]:
categorical_cols = data.select_dtypes(include='object').columns
data = pd.get_dummies(data, columns=categorical_cols, drop_first=True)


In [39]:
numerical_cols = ['years_in_operation', 'female_owned', 'annual_revenue',
       'annual_profit', 'expenses_total', 'revenue_growth_rate',
       'funding_received', 'credit_access', 'debt_ratio',
       'cashflow_stability_score', 'num_employees', 'employee_growth_rate',
       'avg_employee_salary', 'training_investment_per_employee',
       'digital_spending_ratio', 'online_presence',
       'social_media_activity_score', 'ecommerce_usage', 'cloud_services_used',
       'cybersecurity_measures', 'num_customers', 'customer_growth_rate',
       'customer_retention_rate', 'competition_intensity',
       'average_customer_ticket_size', 'tax_compliance_status',
       'business_license_validity', 'environmental_compliance',
       'data_protection_compliance', 'audit_score', 'country_gdp_growth',
       'sector_avg_growth', 'inflation_rate', 'internet_penetration_rate',
       'ease_of_doing_business_rank', 'profit_margin', 'expense_ratio',
       'employee_efficiency', 'financial_health_index', 'compliance_score',
       'market_resilience']

scaler = StandardScaler()

for col in numerical_cols:
    data[numerical_cols] = scaler.fit_transform(data[numerical_cols])