In [32]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score


In [33]:
# Carregar dados
train_df = pd.read_csv("application_train.csv")
test_df = pd.read_csv("application_test.csv")

# Visualizar dados
train_df.head()


Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [34]:
# Seleção de algumas features numéricas simples
features = ["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3", "DAYS_BIRTH", "AMT_CREDIT"]

# Preencher valores ausentes com a mediana
X = train_df[features].fillna(train_df[features].median())
y = train_df["TARGET"]

X_test = test_df[features].fillna(train_df[features].median())


In [35]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)


In [36]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_valid_scaled = scaler.transform(X_valid)
X_test_scaled = scaler.transform(X_test)


In [37]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train_scaled, y_train)

# Previsões
y_pred_valid = model.predict_proba(X_valid_scaled)[:,1]


In [38]:
auc_score = roc_auc_score(y_valid, y_pred_valid)
print(f"Validation ROC-AUC score: {auc_score:.4f}")

Validation ROC-AUC score: 0.7204


In [39]:
submission = pd.DataFrame({
    "SK_ID_CURR": test_df["SK_ID_CURR"],
    "TARGET": model.predict_proba(X_test_scaled)[:,1]
})

submission.to_csv("baseline_submission.csv", index=False)


In [40]:
# Criar algumas features adicionais
train_df["CREDIT_INCOME_RATIO"] = train_df["AMT_CREDIT"] / train_df["AMT_INCOME_TOTAL"]
test_df["CREDIT_INCOME_RATIO"] = test_df["AMT_CREDIT"] / test_df["AMT_INCOME_TOTAL"]

features.append("CREDIT_INCOME_RATIO")

# Repetir treino e avaliação com nova feature
X = train_df[features].fillna(train_df[features].median())
X_test = test_df[features].fillna(train_df[features].median())

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)
X_train_scaled = scaler.fit_transform(X_train)
X_valid_scaled = scaler.transform(X_valid)
X_test_scaled = scaler.transform(X_test)

model.fit(X_train_scaled, y_train)
y_pred_valid = model.predict_proba(X_valid_scaled)[:,1]

auc_score = roc_auc_score(y_valid, y_pred_valid)
print(f"Validation ROC-AUC score after feature engineering: {auc_score:.4f}")


Validation ROC-AUC score after feature engineering: 0.7213


In [None]:
# Conclusion

# The baseline Logistic Regression model on simple numerical features achieved a validation ROC-AUC score of 0.7204, providing a solid reference point for further improvements.

# Adding a new feature (CREDIT_INCOME_RATIO) slightly improved the performance to 0.7213, demonstrating that even simple feature engineering can positively affect model accuracy.

# The workflow illustrates the essential steps in a machine learning project: data preprocessing, feature selection, model training, validation, and preparing predictions for submission.

# Further improvements can be made by:

# Incorporating more advanced feature engineering (e.g., aggregating external datasets, creating interaction features).

# Experimenting with more complex models such as Random Forest, XGBoost, or LightGBM.

# Performing hyperparameter tuning and cross-validation for better generalization.

# Practical takeaway:

# The baseline model is a starting point. It allows understanding of the pipeline and submission flow before moving to more sophisticated approaches.

# Small feature adjustments can lead to measurable improvements in ROC-AUC, highlighting the importance of feature engineering.