In [2]:
# import pandas as pd
from src.model import load_data
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.svm import LinearSVC

data = load_data("hackathon.csv")

In [88]:
data = data.query(
    "vulnerable.notnull()"
)  # remove columns where the target variable is not null
data.head()

Unnamed: 0,id,age,income_level,debt_level,region,loan_access,data_accuracy,misidentified_vulnerability,access_to_bank_account,access_to_credit_card,...,access_to_counselling,financial_shocks_annual,marital_status,no_dependents,creditworthy,monthly_expenses,access_financial_education,has_emergency_savings,financially_resilient,employment_status
1000,167e572d-2e8f-4acd-998e-f23c2638c7d3,49,22915.7,35145.79,Urban,False,,,True,False,...,,,,,,,,,,Unemployed
1001,b75dd204-4790-4a22-b67f-41d5a43f35e8,67,48055.16,33226.69,Rural,True,,,True,False,...,,,,,,,,,,Self-employed
1002,fe9b575b-80ea-4970-8bd0-19539d8fdac6,21,77593.44,34633.12,Suburban,True,,,False,True,...,,,,,,,,,,Retired
1003,b8b1cac4-65e2-4e35-a8f2-da360a2be376,81,46981.69,12117.45,Urban,True,,,True,False,...,,,,,,,,,,Unemployed
1004,079fd96b-5aac-4332-85aa-882d7fe9e0d7,71,66805.21,6379.73,Urban,True,,,True,False,...,,,,,,,,,,Unemployed


In [89]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1000 entries, 1000 to 1999
Data columns (total 30 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   id                           1000 non-null   object 
 1   age                          1000 non-null   int64  
 2   income_level                 1000 non-null   float64
 3   debt_level                   1000 non-null   float64
 4   region                       1000 non-null   object 
 5   loan_access                  1000 non-null   object 
 6   data_accuracy                0 non-null      object 
 7   misidentified_vulnerability  0 non-null      object 
 8   access_to_bank_account       1000 non-null   object 
 9   access_to_credit_card        1000 non-null   object 
 10  access_to_insurance          1000 non-null   object 
 11  financial_literacy_score     1000 non-null   float64
 12  vulnerable                   1000 non-null   object 
 13  savings_amount      

In [90]:
data = data.dropna(axis="columns", how="all")

In [91]:
data.head()

Unnamed: 0,id,age,income_level,debt_level,region,loan_access,access_to_bank_account,access_to_credit_card,access_to_insurance,financial_literacy_score,vulnerable,savings_amount,income_stability,credit_score,loan_history,late_payments_yearly,bankruptcies_filed,credit_utilisation_ratio,employment_status
1000,167e572d-2e8f-4acd-998e-f23c2638c7d3,49,22915.7,35145.79,Urban,False,True,False,True,73.0,True,107928.28,Unstable,519.0,True,2.0,0.0,0.77,Unemployed
1001,b75dd204-4790-4a22-b67f-41d5a43f35e8,67,48055.16,33226.69,Rural,True,True,False,True,10.0,False,59130.56,Stable,335.0,False,2.0,0.0,0.32,Self-employed
1002,fe9b575b-80ea-4970-8bd0-19539d8fdac6,21,77593.44,34633.12,Suburban,True,False,True,True,59.0,True,124376.24,Stable,480.0,True,6.0,0.0,0.88,Retired
1003,b8b1cac4-65e2-4e35-a8f2-da360a2be376,81,46981.69,12117.45,Urban,True,True,False,False,40.0,False,83136.27,Stable,495.0,True,9.0,0.0,0.36,Unemployed
1004,079fd96b-5aac-4332-85aa-882d7fe9e0d7,71,66805.21,6379.73,Urban,True,True,False,True,12.0,False,116517.84,Stable,797.0,True,1.0,0.0,0.84,Unemployed


In [101]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1000 entries, 1000 to 1999
Data columns (total 19 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   id                        1000 non-null   object 
 1   age                       1000 non-null   int64  
 2   income_level              1000 non-null   float64
 3   debt_level                1000 non-null   float64
 4   region                    1000 non-null   float64
 5   loan_access               1000 non-null   int64  
 6   access_to_bank_account    1000 non-null   int64  
 7   access_to_credit_card     1000 non-null   int64  
 8   access_to_insurance       1000 non-null   object 
 9   financial_literacy_score  1000 non-null   float64
 10  vulnerable                1000 non-null   int64  
 11  savings_amount            1000 non-null   float64
 12  income_stability          1000 non-null   int64  
 13  credit_score              1000 non-null   float64
 14  loan_histo

In [93]:
print("access_to insurance counts:", data["access_to_insurance"].value_counts())
print("income stability:", data["income_stability"].value_counts())

access_to insurance counts: access_to_insurance
True     523
False    477
Name: count, dtype: int64
income stability: income_stability
Stable      742
Unstable    258
Name: count, dtype: int64


In [94]:
# convert TRUE/FALSE dt to numeric
boolean_dt = [
    "loan_access",
    "access_to_bank_account",
    "access_to_credit_card",
    "vulnerable",
    "loan_history",
]

data[boolean_dt] = data[boolean_dt].astype(int)

# options are stable/unstable for "income_stability" change to 1 or 0
data["income_stability"] = data["income_stability"].replace(
    {"Stable": 1, "Unstable": 0}
)

  data['income_stability'] = data['income_stability'].replace({'Stable': 1, 'Unstable': 0})


In [None]:
# encode categorical data
categorical_data = ["region", "employment_status"]

enc = OrdinalEncoder()  # noqa: F821
enc.fit(data[categorical_data])
data[categorical_data] = enc.transform(data[categorical_data])

In [96]:
print("income stability:", data["income_stability"].value_counts())
print("region:", data["region"].value_counts())
print("employment_status:", data["employment_status"].value_counts())

income stability: income_stability
1    742
0    258
Name: count, dtype: int64
region: region
Urban       352
Rural       325
Suburban    323
Name: count, dtype: int64
employment_status: employment_status
Self-employed    290
Employed         257
Retired          234
Unemployed       219
Name: count, dtype: int64


Check for colinearity

In [97]:
trainX = data.drop(columns=["vulnerable"])

In [98]:
data["vulnerable"].value_counts()
# need to normalise the data to have equal numbers of TRUE and FALSE

vulnerable
1    647
0    353
Name: count, dtype: int64

In [99]:
X = trainX
y = data.vulnerable

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=123
)

clf = make_pipeline(StandardScaler(), LinearSVC(dual="auto", random_state=0, tol=1e-5))
models, predictions = clf.fit(
    X_train,
    y_train,
)

print(models)

ValueError: could not convert string to float: 'e830aa0e-028f-4526-b2cd-35930881d094'