In [1]:
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
df = pd.read_csv('./train.csv')
df = df.sort_index()
df.head()

Unnamed: 0,id,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status
0,0,37,35000,RENT,0.0,EDUCATION,B,6000,11.49,0.17,N,14,0
1,1,22,56000,OWN,6.0,MEDICAL,C,4000,13.35,0.07,N,2,0
2,2,29,28800,OWN,8.0,PERSONAL,A,6000,8.9,0.21,N,10,0
3,3,30,70000,RENT,14.0,VENTURE,B,12000,11.11,0.17,N,5,0
4,4,22,60000,RENT,2.0,MEDICAL,A,6000,6.92,0.1,N,3,0


# Data Handling

In [3]:
cat_cols = df.select_dtypes(include='object').columns.tolist()
df = pd.get_dummies(df, columns=cat_cols).astype(int)

In [4]:
scaler = StandardScaler()
cont_cols = df.select_dtypes(include=['int','float']).drop(columns=['id','loan_status']).columns.tolist()
df[cont_cols] = scaler.fit_transform(df[cont_cols])

In [37]:
df.head()

Unnamed: 0,id,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,loan_status,person_home_ownership_MORTGAGE,...,loan_intent_VENTURE,loan_grade_A,loan_grade_B,loan_grade_C,loan_grade_D,loan_grade_E,loan_grade_F,loan_grade_G,cb_person_default_on_file_N,cb_person_default_on_file_Y
0,0,1.5662,-0.765768,-1.1872,-0.578306,0.298034,0.0,2.031798,0,-0.856727,...,-0.4537,-0.746446,1.369217,-0.481461,-0.306429,-0.132312,-0.05047,-0.023728,0.417419,-0.417419
1,1,-0.920057,-0.212128,0.328047,-0.937775,0.95266,0.0,-0.946489,0,-0.856727,...,-0.4537,-0.746446,-0.730344,2.07701,-0.306429,-0.132312,-0.05047,-0.023728,0.417419,-0.417419
2,2,0.240196,-0.929223,0.83313,-0.578306,-0.683906,0.0,1.039036,0,-0.856727,...,-0.4537,1.339682,-0.730344,-0.481461,-0.306429,-0.132312,-0.05047,-0.023728,0.417419,-0.417419
3,3,0.405947,0.156966,2.348377,0.500101,0.298034,0.0,-0.201917,0,-0.856727,...,2.2041,-0.746446,1.369217,-0.481461,-0.306429,-0.132312,-0.05047,-0.023728,0.417419,-0.417419
4,4,-0.920057,-0.106673,-0.682117,-0.578306,-1.338532,0.0,-0.698298,0,-0.856727,...,-0.4537,1.339682,-0.730344,-0.481461,-0.306429,-0.132312,-0.05047,-0.023728,0.417419,-0.417419


# Feature Engineering

# Model Training

In [6]:
variables = cont_cols + df.iloc[:,9:].columns.tolist()
X = df[variables]
y = df['loan_status']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.01, random_state=42)

model = LogisticRegression()

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)

print(f'Acurácia: {accuracy * 100:.2f}%')

Acurácia: 91.65%


# Test dataset

In [7]:
df_test = pd.read_csv('./test.csv')

In [8]:
df_test = pd.get_dummies(df_test, columns=cat_cols).astype(int)

In [9]:
df_test[cont_cols] = scaler.transform(df_test[cont_cols])

In [10]:
df_test['loan_status'] = model.predict(df_test[variables])

In [11]:
df_test[['id', 'loan_status']].to_parquet('./result_v1.parquet')