In [9]:
import os
import subprocess
import polars as pl
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import pickle
import numpy as np

DATA_DIR = "../data"


if not os.path.exists("../data/cs-training.csv") or not os.path.exists("../data/cs-test.csv"):
    print("Downloading data...")
    # Set paths
    print(os.getcwd())
    os.environ["KAGGLE_CONFIG_DIR"] = "/home/mateo/.kaggle"

    # Download and unzip
    os.makedirs(DATA_DIR, exist_ok=True)
    !kaggle competitions download -c GiveMeSomeCredit -p {DATA_DIR}
    !unzip -o {DATA_DIR}/GiveMeSomeCredit.zip -d {DATA_DIR}


# Load data
df_train = pl.read_csv(f"{DATA_DIR}/cs-training.csv")
df_test = pl.read_csv(f"{DATA_DIR}/cs-test.csv")

df_train = df_train.select(df_train.columns[1:])
df_test = df_test.select(df_test.columns[1:])


def preprocessing(df):
    
    df = df.with_columns([
        pl.when(
            pl.col("MonthlyIncome").is_null() | (pl.col("MonthlyIncome") == "NA")
        ).then(0).otherwise(pl.col("MonthlyIncome")).cast(pl.Int32).alias("MonthlyIncome"),
        pl.when(
            pl.col("NumberOfDependents").is_null() | (pl.col("NumberOfDependents") == "NA")
        ).then(0).otherwise(pl.col("NumberOfDependents")).cast(pl.Int32).alias("NumberOfDependents"),
        pl.col("SeriousDlqin2yrs").fill_null(0).cast(pl.Int32).alias("SeriousDlqin2yrs")

    ])

    df = df.fill_null(0)  


    print(df.shape)
    x = df.drop("SeriousDlqin2yrs").to_numpy()
    y = df["SeriousDlqin2yrs"].to_numpy()

    return x,y

def train(x,y):

    lr = LogisticRegression()
    lr.fit(X=x, y=y)

    return lr

def predict(model, df):
    y_pred = model.predict(df) 
    return y_pred

def evaluate( y_pred, y_real):
    accuracy = accuracy_score(y_real, y_pred)
    return print(f'Accuracy score of {accuracy}')


def save_model(model, filename="../models/model_lr.pkl"):
    with open(filename, "wb") as f:
        pickle.dump(model, f)

x_train, y_train = preprocessing(df_train)
x_test, y_test = preprocessing(df_test)

lr = train(x_train,y_train)
y_pred = predict(lr,x_test)
evaluate(y_pred, y_test)
save_model(lr)


(150000, 11)
(101503, 11)
Accuracy score of 0.9971922012157276


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [10]:
df_train

SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
i64,f64,i64,i64,f64,str,i64,i64,i64,i64,str
1,0.766127,45,2,0.802982,"""9120""",13,0,6,0,"""2"""
0,0.957151,40,0,0.121876,"""2600""",4,0,0,0,"""1"""
0,0.65818,38,1,0.085113,"""3042""",2,1,0,0,"""0"""
0,0.23381,30,0,0.03605,"""3300""",5,0,0,0,"""0"""
0,0.9072394,49,1,0.024926,"""63588""",7,0,1,0,"""0"""
…,…,…,…,…,…,…,…,…,…,…
0,0.040674,74,0,0.225131,"""2100""",4,0,1,0,"""0"""
0,0.299745,44,0,0.716562,"""5584""",4,0,1,0,"""2"""
0,0.246044,58,0,3870.0,"""NA""",18,0,1,0,"""0"""
0,0.0,30,0,0.0,"""5716""",4,0,0,0,"""0"""
