In [20]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
import os
from pathlib import Path

In [21]:
PROJECT_ROOT = Path.cwd().parent
INTERIM_DIR = PROJECT_ROOT / "data" / "interim"
PROCESSED_DIR = PROJECT_ROOT / "data" / "processed"

INTERIM_DIR.mkdir(parents=True, exist_ok=True)
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

In [22]:
data=pd.read_csv(INTERIM_DIR / "Clean_data.csv")
data.head()

Unnamed: 0,company_id,country,sector,employees,annual_revenue,tech_adoption_level,main_challenges,digital_tools_used,growth_last_yr,funding_status,female_owned,remote_work_policy
0,1,Ghana,Education,130,386441,Low,Awareness,"CRM, WhatsApp, E-commerce",11,Seed,No,No Policy
1,2,Rwanda,Farming,367,383576,Low,Internet,WhatsApp,27,Series A,Yes,Partial
2,3,Kenya,Farming,87,496528,Low,"Awareness, Internet","Google My Business, Mobile money, POS",1,Bootstrapped,No,Full
3,4,Kenya,Manufacturing,131,360550,High,Regulation,"Google My Business, E-commerce, POS",6,Unknown,Yes,Full
4,5,Nigeria,Logistics,136,173801,High,"Regulation, Skills",E-commerce,44,Series A,Yes,Partial


In [23]:
data.columns = data.columns.str.strip()

In [24]:
data.columns.to_list()

['company_id',
 'country',
 'sector',
 'employees',
 'annual_revenue',
 'tech_adoption_level',
 'main_challenges',
 'digital_tools_used',
 'growth_last_yr',
 'funding_status',
 'female_owned',
 'remote_work_policy']

In [25]:
data['revenue_per_employee'] = data['annual_revenue']/data['employees']
data['revenue_per_employee'] = data['revenue_per_employee'].replace([np.inf, -np.inf], np.nan)
data.head()

Unnamed: 0,company_id,country,sector,employees,annual_revenue,tech_adoption_level,main_challenges,digital_tools_used,growth_last_yr,funding_status,female_owned,remote_work_policy,revenue_per_employee
0,1,Ghana,Education,130,386441,Low,Awareness,"CRM, WhatsApp, E-commerce",11,Seed,No,No Policy,2972.623077
1,2,Rwanda,Farming,367,383576,Low,Internet,WhatsApp,27,Series A,Yes,Partial,1045.166213
2,3,Kenya,Farming,87,496528,Low,"Awareness, Internet","Google My Business, Mobile money, POS",1,Bootstrapped,No,Full,5707.218391
3,4,Kenya,Manufacturing,131,360550,High,Regulation,"Google My Business, E-commerce, POS",6,Unknown,Yes,Full,2752.290076
4,5,Nigeria,Logistics,136,173801,High,"Regulation, Skills",E-commerce,44,Series A,Yes,Partial,1277.948529


In [26]:
'''if a company is tiny and has low technology use, then it's compliance risk is high and if it has ten to fifteen employees we assume medium risk and larger
companies with better digital tools are considered low risk'''

def compliance_risk(row):
    if row['employees'] < 10 and row["tech_adoption_level"] in ["Low", "None"]:
        return "High"
    elif 10 <= row["employees"] <= 50:
        return "Medium"
    else:
        return "Low"

#applies that function to evert row and createa a new column which is now the taret variaable for the risk model 
data['Compliance_risk'] = data.apply(compliance_risk, axis = 1)

In [27]:
categorical_cols=data.select_dtypes(include='object')
categorical_cols.columns.to_list()
label_encoders = {}

for col in categorical_cols:
    le = LabelEncoder()
    data[col]=le.fit_transform(data[col].astype(str))
    label_encoders[col] = le

In [28]:
numerical_cols = data.select_dtypes(include=['int64','float64'])
scaler=StandardScaler()

for col in numerical_cols:
    data[col] = scaler.fit_transform(data[[col]])

data.head()

Unnamed: 0,company_id,country,sector,employees,annual_revenue,tech_adoption_level,main_challenges,digital_tools_used,growth_last_yr,funding_status,female_owned,remote_work_policy,revenue_per_employee,Compliance_risk
0,-1.73032,-1.354299,-1.460091,-0.882336,1.013156,0.002425,-1.518684,-1.146463,-0.704946,-0.480279,-0.956927,0.030398,0.076187,-0.275917
1,-1.726856,0.797659,-0.879999,0.758852,0.992366,0.002425,-0.14976,1.297019,0.279014,0.422501,1.045012,1.246322,-0.246873,-0.275917
2,-1.723391,-0.63698,-0.879999,-1.180105,1.812008,0.002425,-1.244899,-0.183167,-1.31992,-1.383059,-0.956927,-1.185525,0.534532,-0.275917
3,-1.719927,-0.63698,0.860276,-0.875411,0.825277,-1.210256,0.534702,-0.300642,-1.012433,1.325281,1.045012,-1.185525,0.039258,-0.275917
4,-1.716463,0.08034,0.280184,-0.840787,-0.529877,-1.210256,1.082271,-1.052483,1.324471,0.422501,1.045012,1.246322,-0.207856,-0.275917


In [29]:
data.columns.to_list()

['company_id',
 'country',
 'sector',
 'employees',
 'annual_revenue',
 'tech_adoption_level',
 'main_challenges',
 'digital_tools_used',
 'growth_last_yr',
 'funding_status',
 'female_owned',
 'remote_work_policy',
 'revenue_per_employee',
 'Compliance_risk']

In [30]:
# Growth model
X_growth = data.drop(columns=['growth_last_yr'])
y_growth = data['growth_last_yr']

# Funding model
X_funding = data.drop(columns=['funding_status'])
y_funding = data['funding_status']

# Compliance risk model
X_risk = data.drop(columns=['Compliance_risk'])
y_risk = data['Compliance_risk']


In [31]:
#Growth Model Split ---
growth_train = Xg_train.copy()
growth_train["growth_last_yr"] = yg_train
growth_test = Xg_test.copy()
growth_test["growth_last_yr"] = yg_test
growth_train.to_csv(PROCESSED_DIR / "growth_train.csv", index=False)
growth_test.to_csv(PROCESSED_DIR / "growth_test.csv", index=False)


#Funding Model Split ---
Xf_train, Xf_test, yf_train, yf_test = train_test_split(X_funding, y_funding, test_size=0.2, random_state=42)
Xf_train.to_csv(PROCESSED_DIR / "funding_train.csv", index=False)
Xf_test.to_csv(PROCESSED_DIR / "funding_test.csv", index=False)

# Compliance Risk Model Split ---
Xr_train, Xr_test, yr_train, yr_test = train_test_split(X_risk, y_risk, test_size=0.2, random_state=42)
Xr_train.to_csv(PROCESSED_DIR / "risk_train.csv", index=False)
Xr_test.to_csv(PROCESSED_DIR / "risk_test.csv", index=False)

In [32]:
print(f'The shape of the training for growth is dataset is {Xg_train.shape}')
print(f'The shape of the training for funding is dataset is {Xf_train.shape}')
print(f'The shape of the training for risk is dataset is {Xr_train.shape}')

The shape of the training for growth is dataset is (800, 13)
The shape of the training for funding is dataset is (800, 13)
The shape of the training for risk is dataset is (800, 13)


In [33]:
import pandas as pd
df = pd.read_csv(PROCESSED_DIR / "Clean_data.csv")
df.columns=df.columns.str.strip()

df.head()



Unnamed: 0,company_id,country,sector,employees,annual_revenue,tech_adoption_level,main_challenges,digital_tools_used,growth_last_yr,funding_status,female_owned,remote_work_policy
0,1,Ghana,Education,130,386441,Low,Awareness,"CRM, WhatsApp, E-commerce",11,Seed,No,No Policy
1,2,Rwanda,Farming,367,383576,Low,Internet,WhatsApp,27,Series A,Yes,Partial
2,3,Kenya,Farming,87,496528,Low,"Awareness, Internet","Google My Business, Mobile money, POS",1,Bootstrapped,No,Full
3,4,Kenya,Manufacturing,131,360550,High,Regulation,"Google My Business, E-commerce, POS",6,Unknown,Yes,Full
4,5,Nigeria,Logistics,136,173801,High,"Regulation, Skills",E-commerce,44,Series A,Yes,Partial
