In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
import os
from pathlib import Path

In [None]:
# Folder that receives the train/test CSV files written later in this notebook.
# NOTE(review): absolute, machine-specific Windows path - adjust before running elsewhere.
processed_dir = Path(
    r"C:\Users\USER\Desktop\Inua360-The-Kenyan-SME-AI-Agent\data\processed"
)
# Create the folder together with any missing parents; an existing folder is left untouched.
processed_dir.mkdir(exist_ok=True, parents=True)

In [None]:
# Read the cleaned dataset from the interim data folder into a DataFrame.
# NOTE(review): absolute, machine-specific Windows path - adjust before running elsewhere.
data = pd.read_csv(
    r'C:\Users\USER\Desktop\Inua360-The-Kenyan-SME-AI-Agent\data\interim\Clean_data.csv'
)
# Preview the first rows as the cell output.
data.head()

In [None]:
# Remove leading/trailing whitespace from every column name so that later
# lookups such as data['employees'] match exactly.
data.columns = data.columns.map(str.strip)

In [None]:
# Show the cleaned column names as a plain Python list.
list(data.columns)

In [None]:
# Engineered feature: annual revenue divided by head count.
# Infinite results (e.g. a non-zero revenue divided by zero employees) are
# replaced with NaN so they do not propagate as +/-inf.
data['revenue_per_employee'] = (
    data['annual_revenue']
    .div(data['employees'])
    .replace([np.inf, -np.inf], np.nan)
)
data.head()

In [None]:
def compliance_risk(row):
    """Derive a rule-based compliance-risk label for one company.

    Rules, checked in order:

    * fewer than 10 employees AND low / no technology adoption -> "High"
    * between 10 and 50 employees (inclusive)                   -> "Medium"
    * everything else (more than 50 employees, or fewer than 10
      employees with a higher technology-adoption level)        -> "Low"

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the keys ``'employees'`` and ``'tech_adoption_level'``.

    Returns
    -------
    str
        One of ``"High"``, ``"Medium"`` or ``"Low"``.
    """
    tech_level = row["tech_adoption_level"]

    # pandas.read_csv treats the literal text "None" as a missing value by
    # default, so a company with no technology adoption can arrive here as
    # NaN/None instead of the string "None". Count a missing level as "None".
    low_or_no_tech = pd.isna(tech_level) or tech_level in ("Low", "None")

    if row["employees"] < 10 and low_or_no_tech:
        return "High"
    if 10 <= row["employees"] <= 50:
        return "Medium"
    return "Low"

# Evaluate compliance_risk on every row; the resulting column is the target
# variable for the risk model.
data['Compliance_risk'] = data.apply(compliance_risk, axis='columns')

In [None]:
# Sub-frame holding every object-dtype (text) column of the dataset.
categorical_cols = data.select_dtypes(include='object')

# Fitted encoder per column name, kept so the integer codes can be mapped
# back to the original labels later.
label_encoders = {}

# Iterating a DataFrame yields its column names.
for col in categorical_cols:
    encoder = LabelEncoder()
    # Values are cast to str first, so missing entries are encoded as their
    # own string category.
    data[col] = encoder.fit_transform(data[col].astype(str))
    label_encoders[col] = encoder

In [None]:
# Standardise only the genuinely numeric columns.
#
# The label-encoded categorical columns are integer-typed at this point, so a
# plain dtype filter would standardise their category codes as well. That
# includes the 'Compliance_risk' class label, which would become a continuous
# float. Every column recorded in `label_encoders` is therefore excluded.
encoded_cols = list(label_encoders)
numerical_cols = (
    data.select_dtypes(include=['int64', 'float64'])
    .drop(columns=encoded_cols, errors='ignore')
)

# Fit one scaler on all numeric columns at once. StandardScaler works
# column-wise, so the values match a per-column fit, but the fitted `scaler`
# now keeps the mean/scale of every column instead of only the last one.
# NOTE(review): the scaler is fit on the full dataset before the train/test
# split further down, so test rows influence the scaling statistics.
scaler = StandardScaler()

if len(numerical_cols.columns) > 0:
    data[list(numerical_cols.columns)] = scaler.fit_transform(numerical_cols)

data.head()

In [None]:
# Show the column names after feature engineering, encoding and scaling.
list(data.columns)

In [None]:
# Each model gets its own target column; the feature matrix is the full
# frame with only that one column removed.

# Growth model: target is last year's growth.
y_growth = data['growth_last_yr']
X_growth = data.drop('growth_last_yr', axis=1)

# Funding model: target is the funding status.
y_funding = data['funding_status']
X_funding = data.drop('funding_status', axis=1)

# Compliance risk model: target is the derived risk label.
y_risk = data['Compliance_risk']
X_risk = data.drop('Compliance_risk', axis=1)


In [None]:
def _split_and_save(features, target, prefix, test_size=0.2, random_state=42):
    """Split one feature/target pair and write every part to ``processed_dir``.

    Parameters
    ----------
    features : pandas.DataFrame
        Feature matrix for one model.
    target : pandas.Series
        Target column aligned with ``features``.
    prefix : str
        File-name prefix, e.g. ``"growth"``.
    test_size : float, default 0.2
        Fraction of rows placed in the test split.
    random_state : int, default 42
        Seed passed to ``train_test_split`` for a reproducible split.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` as returned by
        ``train_test_split``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=test_size, random_state=random_state
    )

    # Feature files keep their original names and contents.
    X_train.to_csv(processed_dir / f"{prefix}_train.csv", index=False)
    X_test.to_csv(processed_dir / f"{prefix}_test.csv", index=False)

    # The targets were previously computed but never written, leaving the
    # processed folder with features only. They go to separate files so the
    # existing feature files stay unchanged.
    y_train.to_csv(processed_dir / f"{prefix}_train_y.csv", index=False)
    y_test.to_csv(processed_dir / f"{prefix}_test_y.csv", index=False)

    return X_train, X_test, y_train, y_test


# Growth model split
Xg_train, Xg_test, yg_train, yg_test = _split_and_save(X_growth, y_growth, "growth")

# Funding model split
Xf_train, Xf_test, yf_train, yf_test = _split_and_save(X_funding, y_funding, "funding")

# Compliance risk model split
Xr_train, Xr_test, yr_train, yr_test = _split_and_save(X_risk, y_risk, "risk")

In [None]:
# Report the training-set shape of each model using that model's own split
# (previously all three lines printed the growth split's shape).
print(f'The shape of the training for growth is dataset is {Xg_train.shape}')
print(f'The shape of the training for funding is dataset is {Xf_train.shape}')
print(f'The shape of the training for risk is dataset is {Xr_train.shape}')