In [None]:
from google.colab import files

uploaded = files.upload()

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))

Saving kaggle.json to kaggle.json
User uploaded file "kaggle.json" with length 69 bytes


In [None]:
!mkdir -p ~/.kaggle/ && mv kaggle.json ~/.kaggle/ && chmod 600 ~/.kaggle/kaggle.json

In [None]:
# !kaggle datasets download "center-for-policing-equity/data-science-for-good"
!kaggle competitions download -c playground-series-s4e7

playground-series-s4e7.zip: Skipping, found more recently modified local copy (use --force to force download)


In [None]:
!unzip playground-series-s4e7.zip

Archive:  playground-series-s4e7.zip
replace sample_submission.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace test.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace train.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: n


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm

In [None]:
train = pd.read_csv('train.csv', index_col=[0])
test = pd.read_csv('test.csv', index_col=[0])

In [None]:
train.head()

Unnamed: 0_level_0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,Male,21,1,35.0,0,1-2 Year,Yes,65101.0,124.0,187,0
1,Male,43,1,28.0,0,> 2 Years,Yes,58911.0,26.0,288,1
2,Female,25,1,14.0,1,< 1 Year,No,38043.0,152.0,254,0
3,Female,35,1,1.0,0,1-2 Year,Yes,2630.0,156.0,76,0
4,Female,36,1,15.0,1,1-2 Year,No,31951.0,152.0,294,0


In [None]:
test.head()

Unnamed: 0_level_0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
11504798,Female,20,1,47.0,0,< 1 Year,No,2630.0,160.0,228
11504799,Male,47,1,28.0,0,1-2 Year,Yes,37483.0,124.0,123
11504800,Male,47,1,43.0,0,1-2 Year,Yes,2630.0,26.0,271
11504801,Female,22,1,47.0,1,< 1 Year,No,24502.0,152.0,115
11504802,Male,51,1,19.0,0,1-2 Year,No,34115.0,124.0,148


In [None]:
print(f'train size: {train.shape}')
print(f'test size: {test.shape}')

train size: (11504798, 11)
test size: (7669866, 10)


In [None]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 11504798 entries, 0 to 11504797
Data columns (total 11 columns):
 #   Column                Dtype  
---  ------                -----  
 0   Gender                object 
 1   Age                   int64  
 2   Driving_License       int64  
 3   Region_Code           float64
 4   Previously_Insured    int64  
 5   Vehicle_Age           object 
 6   Vehicle_Damage        object 
 7   Annual_Premium        float64
 8   Policy_Sales_Channel  float64
 9   Vintage               int64  
 10  Response              int64  
dtypes: float64(3), int64(5), object(3)
memory usage: 1.0+ GB


In [None]:
category_columns = ['Gender', 'Vehicle_Age', 'Vehicle_Damage']

def encoding(train):
    gender_mapping = {'Male': 0, 'Female': 1}
    vehicle_age_mapping = {'< 1 Year': 0, '1-2 Year': 1, '> 2 Years': 2}
    vehicle_damage_mapping = {'No': 0, 'Yes': 1}

    train['Gender'] = train['Gender'].map(gender_mapping)
    train['Vehicle_Age'] = train['Vehicle_Age'].map(vehicle_age_mapping)
    train['Vehicle_Damage'] = train['Vehicle_Damage'].map(vehicle_damage_mapping)

    return train

train = encoding(train)

In [None]:
train.head()

Unnamed: 0_level_0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,0,21,1,35.0,0,1,1,65101.0,124.0,187,0
1,0,43,1,28.0,0,2,1,58911.0,26.0,288,1
2,1,25,1,14.0,1,0,0,38043.0,152.0,254,0
3,1,35,1,1.0,0,1,1,2630.0,156.0,76,0
4,1,36,1,15.0,1,1,0,31951.0,152.0,294,0


In [None]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 11504798 entries, 0 to 11504797
Data columns (total 11 columns):
 #   Column                Dtype  
---  ------                -----  
 0   Gender                int64  
 1   Age                   int64  
 2   Driving_License       int64  
 3   Region_Code           float64
 4   Previously_Insured    int64  
 5   Vehicle_Age           int64  
 6   Vehicle_Damage        int64  
 7   Annual_Premium        float64
 8   Policy_Sales_Channel  float64
 9   Vintage               int64  
 10  Response              int64  
dtypes: float64(3), int64(8)
memory usage: 1.0 GB


In [None]:
train_sampled = train.sample(frac=0.01, random_state = 42)

In [None]:
X = train_sampled.drop(['Response'], axis=1)
y = train_sampled['Response']

In [None]:
print(f'X size: {X.shape}')
print(f'y size: {y.shape}')

X size: (115048, 10)
y size: (115048,)


In [None]:
def modeling(model, X, y):

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    skf = StratifiedKFold(n_splits = 5, shuffle = True, random_state =42)
    train_scores = []
    valid_scores = []

    for fold, (train_index, valid_index) in enumerate(tqdm(skf.split(X_train, y_train), total=skf.get_n_splits(), desc="Folds"), 1):
        X_skf_train, X_skf_valid = X_train.iloc[train_index], X_train.iloc[valid_index]
        y_skf_train, y_skf_valid = y_train.iloc[train_index], y_train.iloc[valid_index]

        scaler = MinMaxScaler()
        #scaler = StandardScaler()
        X_skf_train = scaler.fit_transform(X_skf_train)
        X_skf_valid = scaler.transform(X_skf_valid)

        model.fit(X_skf_train, y_skf_train)

        train_preds = model.predict_proba(X_skf_train)[:, 1]
        train_auc = roc_auc_score(y_skf_train, train_preds)
        train_scores.append(train_auc)

        valid_preds = model.predict_proba(X_skf_valid)[:, 1]
        valid_auc = roc_auc_score(y_skf_valid, valid_preds)
        valid_scores.append(valid_auc)

        print(f'Fold {fold}: Train ROC AUC: {train_auc:.4f}, Validation ROC AUC: {valid_auc:.4f}')

    print(f'Average Train ROC AUC: {sum(train_scores)/len(train_scores):.4f}')
    print(f'Average Validation ROC AUC: {sum(valid_scores)/len(valid_scores):.4f}')

    X_test_scaled = scaler.transform(X_test)
    test_preds = model.predict_proba(X_test_scaled)[:, 1]
    test_auc = roc_auc_score(y_test, test_preds)
    print(f'Test ROC AUC: {test_auc:.4f}')


    return train_scores, valid_scores, test_auc

NameError: name 'StratifiedKFold' is not defined

In [None]:
!pip install catboost xgboost



In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier

In [None]:
# Logistic Regression
lr_model = LogisticRegression(random_state=42)
print("Logistic Regression:")
lr_train_scores, lr_valid_scores, lr_test_auc = modeling(lr_model, X, y)

Logistic Regression:


Folds:  20%|██        | 1/5 [00:01<00:04,  1.14s/it]

Fold 1: Train ROC AUC: 0.8346, Validation ROC AUC: 0.8376


Folds:  40%|████      | 2/5 [00:02<00:04,  1.49s/it]

Fold 2: Train ROC AUC: 0.8361, Validation ROC AUC: 0.8307


Folds:  60%|██████    | 3/5 [00:04<00:03,  1.56s/it]

Fold 3: Train ROC AUC: 0.8339, Validation ROC AUC: 0.8407


Folds:  80%|████████  | 4/5 [00:06<00:01,  1.64s/it]

Fold 4: Train ROC AUC: 0.8354, Validation ROC AUC: 0.8344


Folds: 100%|██████████| 5/5 [00:08<00:00,  1.60s/it]

Fold 5: Train ROC AUC: 0.8363, Validation ROC AUC: 0.8313
Average Train ROC AUC: 0.8353
Average Validation ROC AUC: 0.8350
Test ROC AUC: 0.8357





In [None]:
# Random Forest
rf_model = RandomForestClassifier(random_state=42)
print("\nRandom Forest:")
rf_train_scores, rf_valid_scores, rf_test_auc = modeling(rf_model, X, y)


Random Forest:


Folds:  20%|██        | 1/5 [00:27<01:50, 27.63s/it]

Fold 1: Train ROC AUC: 1.0000, Validation ROC AUC: 0.8451


Folds:  40%|████      | 2/5 [00:40<00:56, 18.88s/it]

Fold 2: Train ROC AUC: 1.0000, Validation ROC AUC: 0.8406


Folds:  60%|██████    | 3/5 [00:55<00:34, 17.00s/it]

Fold 3: Train ROC AUC: 1.0000, Validation ROC AUC: 0.8485


Folds:  80%|████████  | 4/5 [01:13<00:17, 17.58s/it]

Fold 4: Train ROC AUC: 1.0000, Validation ROC AUC: 0.8387


Folds: 100%|██████████| 5/5 [01:27<00:00, 17.44s/it]

Fold 5: Train ROC AUC: 1.0000, Validation ROC AUC: 0.8384
Average Train ROC AUC: 1.0000
Average Validation ROC AUC: 0.8423





Test ROC AUC: 0.8436


In [None]:
# XGBoost
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
print("\nXGBoost:")
xgb_train_scores, xgb_valid_scores, xgb_test_auc = modeling(xgb_model, X, y)


XGBoost:


Folds:  20%|██        | 1/5 [00:06<00:25,  6.46s/it]

Fold 1: Train ROC AUC: 0.9225, Validation ROC AUC: 0.8644


Folds:  40%|████      | 2/5 [00:09<00:13,  4.40s/it]

Fold 2: Train ROC AUC: 0.9214, Validation ROC AUC: 0.8608


Folds:  60%|██████    | 3/5 [00:12<00:07,  3.79s/it]

Fold 3: Train ROC AUC: 0.9217, Validation ROC AUC: 0.8664


Folds:  80%|████████  | 4/5 [00:18<00:04,  4.53s/it]

Fold 4: Train ROC AUC: 0.9226, Validation ROC AUC: 0.8582


Folds: 100%|██████████| 5/5 [00:20<00:00,  4.20s/it]

Fold 5: Train ROC AUC: 0.9237, Validation ROC AUC: 0.8564
Average Train ROC AUC: 0.9224
Average Validation ROC AUC: 0.8613
Test ROC AUC: 0.8615





In [None]:
# CatBoost
cat_model = CatBoostClassifier(verbose=0, random_state=42)
print("\nCatBoost:")
cat_train_scores, cat_valid_scores, cat_test_auc = modeling(cat_model, X, y)


CatBoost:


Folds:  20%|██        | 1/5 [00:28<01:55, 28.90s/it]

Fold 1: Train ROC AUC: 0.9072, Validation ROC AUC: 0.8671


Folds:  40%|████      | 2/5 [00:56<01:24, 28.15s/it]

Fold 2: Train ROC AUC: 0.9081, Validation ROC AUC: 0.8620


Folds:  60%|██████    | 3/5 [01:25<00:56, 28.49s/it]

Fold 3: Train ROC AUC: 0.9062, Validation ROC AUC: 0.8707


Folds:  80%|████████  | 4/5 [02:10<00:34, 34.85s/it]

Fold 4: Train ROC AUC: 0.9090, Validation ROC AUC: 0.8602


Folds: 100%|██████████| 5/5 [02:42<00:00, 32.54s/it]

Fold 5: Train ROC AUC: 0.9087, Validation ROC AUC: 0.8597
Average Train ROC AUC: 0.9078
Average Validation ROC AUC: 0.8639
Test ROC AUC: 0.8663





In [None]:
train.head()

Unnamed: 0_level_0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,0,21,1,35.0,0,1,1,65101.0,124.0,187,0
1,0,43,1,28.0,0,2,1,58911.0,26.0,288,1
2,1,25,1,14.0,1,0,0,38043.0,152.0,254,0
3,1,35,1,1.0,0,1,1,2630.0,156.0,76,0
4,1,36,1,15.0,1,1,0,31951.0,152.0,294,0


In [None]:
cat_params = {
    'loss_function': 'Logloss',
    'eval_metric': 'AUC',
    'class_names': [0, 1],
    'learning_rate': 0.075,
    'iterations': 3000,
    'depth': 9,
    'random_strength': 0,
    'l2_leaf_reg': 0.5,
    'max_leaves': 512,
    'fold_permutation_block': 64,
    'task_type': 'GPU',
    'random_seed': 42,
    'verbose': False,
    'allow_writing_files': False
}

In [None]:
# Initialize CatBoost model with hyperparameters
cat_model = CatBoostClassifier(**cat_params)

print("\nCatBoost:")
cat_train_scores, cat_valid_scores, cat_test_auc = modeling(cat_model, X, y)