In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

# Load Data

Upload the files to your Google Drive and mount it in the Colab notebook using the code below.

Files: `promos.parquet`, `test_history.parquet`, `train_history.parquet`, and `transactions.parquet`.

In [2]:
# This will ask you to give Colab permission to access your Google Drive
# and enter an authorization code.
# See: https://colab.research.google.com/notebooks/io.ipynb#scrollTo=u22w3BFiOveA
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# MODIFY THIS LINE with the path to where you saved the datafiles on your Google drive
path = '/content/drive/MyDrive/RSM8421 Assignment 4'

In [4]:
promos = pd.read_parquet(path + '/promos.parquet')
promos.head()

Unnamed: 0,promo,category,promoqty,manufacturer,promoval,brand
0,209524,51006740,1,1593002228,1.0,189092190
1,176321,42760698,1,1957688344,0.75,1386413202
2,17125,56844954,1,629075831,1.5,1429623935
3,177412,56844954,1,629075831,1.5,1429623935
4,69839,56844954,1,629075831,1.5,1429623935


In [5]:
train_history = pd.read_parquet(path + '/train_history.parquet')
train_history.head()

Unnamed: 0,id,store,promo,region,promodate,active
0,115562959531,860548,176321,243820,2013-03-25,0.0
1,175261390705,625933,37568,191747,2013-03-15,0.0
2,273751574633,527828,209524,165933,2013-03-23,1.0
3,166923268906,1056730,214816,5506,2013-03-20,0.0
4,135410903443,241369,176321,278897,2013-03-25,0.0


In [6]:
test_history = pd.read_parquet(path + '/test_history.parquet')
test_history.head()

Unnamed: 0,id,store,promo,region,promodate,active
0,250462324156,457755,96852,191747,2013-04-22,
1,61214568335,740794,54956,203586,2013-04-23,
2,274776763999,394092,95011,42773,2013-04-02,
3,91546215019,876095,108237,323706,2013-04-24,
4,220718580189,625933,222088,191747,2013-04-18,


In [7]:
transactions = pd.read_parquet(path + '/transactions.parquet')
transactions.head()

Unnamed: 0,id,store,market_group,category,manufacturer,brand,date,productsize,measure,qty,amt
0,31924309471,96431,360548,10881580,707582354,109487351,2012-03-06,28.0,OZ,1,2.69
1,31924309471,96431,80472,75959720,2052533430,254982935,2012-03-06,12.0,OZ,2,8.48
2,31924309471,96431,194893,80209099,1640770606,1252530761,2012-03-06,6.0,OZ,1,1.69
3,31924309471,96431,27828,10942974,946555356,2133681340,2012-03-06,0.75,LT,1,11.41
4,31924309471,96431,135933,64737859,700393013,221618334,2012-03-06,33.84,OZ,1,3.19


# Extract Features

In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import roc_auc_score

In [9]:
# Merge promotion info
train_history = train_history.merge(promos, on="promo", how="left")
test_history = test_history.merge(promos, on="promo", how="left")

In [11]:
import pandas as pd
import numpy as np
import gc

def feature_engineering(df, transactions, train_df, promos=None):
    """
    Optimized memory-efficient, past-only feature engineering for promotion response classification.
    Focuses on high-impact features with minimal runtime.

    Args:
        df: train_history or test_history dataframe
        transactions: transaction history dataframe
        train_df: training set for computing target encodings
        promos: promos dataframe with promotion details (optional but recommended)
    """
    df = df.copy()
    transactions = transactions.copy()  # Prevent modifying original
    smooth_k = 5
    sparse_thresh = 10
    recent_window_days = 30

    print(f"Processing {len(df)} records...")

    # === MERGE PROMOTION DETAILS FIRST (CRITICAL) ===
    if promos is not None:
        promo_cols = ['promo']
        for col in ['promoqty', 'manufacturer', 'promoval']:
            if col in promos.columns and col not in df.columns:
                promo_cols.append(col)

        if len(promo_cols) > 1:
            promos_clean = promos[promo_cols].drop_duplicates('promo')
            df = df.merge(promos_clean, on='promo', how='left')

            # Promotion-level features
            if 'promoval' in df.columns and 'promoqty' in df.columns:
                df['promo_price_per_unit'] = (df['promoval'] / df['promoqty'].replace(0, np.nan)).astype(np.float32)

    # Handle rare categories (vectorized)
    for col, replace_val in [("brand","Other"), ("promo",-1), ("manufacturer","Other")]:
        if col in df.columns:
            rare_vals = train_df[col].value_counts()[lambda x: x < sparse_thresh].index
            df[col] = df[col].replace(rare_vals, replace_val)
            if col in train_df.columns:
                train_df[col] = train_df[col].replace(rare_vals, replace_val)

    # Convert types for memory efficiency
    for col in ["id", "store", "promo"]:
        if col in df.columns:
            df[col] = df[col].astype(np.int32)

    transactions["id"] = transactions["id"].astype(np.int32)
    transactions["store"] = transactions["store"].astype(np.int32)
    transactions["date"] = pd.to_datetime(transactions["date"])
    transactions["qty"] = transactions["qty"].astype(np.float32)
    transactions["amt"] = transactions["amt"].astype(np.float32)

    # Temporal features from promodate (vectorized)
    df["promo_dayofweek"] = df["promodate"].dt.dayofweek.astype(np.int8)
    df["promo_month"] = df["promodate"].dt.month.astype(np.int8)
    df["is_weekend"] = df["promo_dayofweek"].isin([5,6]).astype(np.int8)

    # Target encoding (optimized)
    def fold_safe_te(col):
        if col not in train_df.columns:
            return {}, train_df["active"].mean()
        agg = train_df.groupby(col, observed=True)["active"].agg(["mean","count"])
        global_mean = train_df["active"].mean()
        agg["encoded"] = (agg["mean"]*agg["count"] + global_mean*smooth_k)/(agg["count"] + smooth_k)
        return agg["encoded"].to_dict(), global_mean

    for col, new_col in [("id","item_response_rate"),
                         ("store","store_response_rate"),
                         ("promo","promo_response_rate"),
                         ("category","category_response_rate"),
                         ("brand","brand_response_rate"),
                         ("manufacturer","manufacturer_response_rate")]:
        if col in df.columns:
            te_map, global_mean = fold_safe_te(col)
            df[new_col] = df[col].map(te_map).fillna(global_mean).astype(np.float32)

    # Merge past-only transactions ONCE
    trans_fold = transactions[transactions["id"].isin(df["id"])].copy()
    trans_fold = trans_fold.merge(
        df[["id","store","category","brand","promodate"]].drop_duplicates(),
        on="id", how="inner", suffixes=("", "_promo")
    )
    trans_fold = trans_fold[trans_fold["date"] < trans_fold["promodate"]]

    if len(trans_fold) == 0:
        print("WARNING: No past transactions found! Returning basic features only.")
        # Return df with minimal features to avoid crash
        df["recency"] = 999.0
        df["tx_count"] = 0
        df.drop(columns=['promodate'], inplace=True, errors='ignore')
        return df

    trans_fold["days_diff"] = (trans_fold["promodate"] - trans_fold["date"]).dt.days.astype(np.int16)

    # SINGLE MEGA AGGREGATION for ID level (minimize groupby calls)
    id_agg = trans_fold.groupby("id", observed=True).agg(
        qty_sum=("qty","sum"),
        qty_mean=("qty","mean"),
        amt_sum=("amt","sum"),
        amt_mean=("amt","mean"),
        amt_std=("amt","std"),
        tx_count=("qty","count"),
        last_purchase=("date","max"),
        first_purchase=("date","min"),
        unique_categories=("category","nunique"),
        unique_brands=("brand","nunique"),
        recent_purchase_count=("days_diff", lambda x: (x <= recent_window_days).sum())
    ).reset_index()

    df = df.merge(id_agg, on="id", how="left")

    # Vectorized datetime conversions
    df["recency"] = (df["promodate"] - df["last_purchase"]).dt.days.astype(np.float32)
    df["customer_tenure_days"] = ((df["last_purchase"] - df["first_purchase"]).dt.days).astype(np.float32)

    # Diversity features (vectorized)
    df['category_diversity'] = (df['unique_categories'] / df['tx_count'].replace(0, 1)).astype(np.float32)
    df['brand_diversity'] = (df['unique_brands'] / df['tx_count'].replace(0, 1)).astype(np.float32)

    # Store aggregates (single pass)
    store_agg = trans_fold.groupby("store", observed=True).agg(
        store_qty_mean=("qty","mean"),
        store_amt_mean=("amt","mean"),
        store_tx_count=("qty","count")
    ).reset_index()
    df = df.merge(store_agg, on="store", how="left")

    # Store affinity (optimized)
    id_store_counts = trans_fold.groupby(['id','store'], observed=True).size().reset_index(name='id_store_tx')
    df = df.merge(id_store_counts, on=['id','store'], how='left')
    df['id_store_tx'] = df['id_store_tx'].fillna(0).astype(np.int16)
    df['customer_store_affinity'] = (df['id_store_tx'] / df['tx_count'].replace(0, 1)).astype(np.float32)

    # Brand aggregates (single pass)
    brand_agg = trans_fold.groupby(['id','brand'], observed=True).agg(
        brand_qty_sum=('qty','sum'),
        brand_amt_sum=('amt','sum'),
        brand_tx_count=('qty','count'),
        brand_last_purchase=('date','max')
    ).reset_index()
    df = df.merge(brand_agg, on=['id','brand'], how='left')

    df['brand_last_purchase_recency'] = ((df['promodate'] - df['brand_last_purchase']).dt.days
                                        .fillna(recent_window_days+1).astype(np.float32))
    df['brand_loyalty_score'] = (df['brand_tx_count'].fillna(0) / df['tx_count'].replace(0, 1)).astype(np.float32)

    # Category aggregates (single pass)
    category_agg = trans_fold.groupby(['id','category'], observed=True).agg(
        category_qty_sum=('qty','sum'),
        category_amt_sum=('amt','sum'),
        category_tx_count=('qty','count'),
        category_last_purchase=('date','max')
    ).reset_index()
    df = df.merge(category_agg, on=['id','category'], how='left')

    df['category_last_purchase_recency'] = ((df['promodate'] - df['category_last_purchase']).dt.days
                                            .fillna(recent_window_days+1).astype(np.float32))
    df['purchased_category_before'] = (df['category_tx_count'].fillna(0) > 0).astype(np.int8)

    # Category share
    df['brand_share_in_category'] = (df['brand_tx_count'].fillna(0) /
                                      df['category_tx_count'].fillna(1).replace(0, 1)).astype(np.float32)

    # Manufacturer aggregates (single pass, only if exists)
    if 'manufacturer' in df.columns:
        mfr_agg = trans_fold.groupby(['id','manufacturer'], observed=True).agg(
            mfr_tx_count=('qty','count'),
            mfr_last_purchase=('date','max')
        ).reset_index()
        df = df.merge(mfr_agg, on=['id','manufacturer'], how='left')
        df['mfr_last_purchase_recency'] = ((df['promodate'] - df['mfr_last_purchase']).dt.days
                                           .fillna(recent_window_days+1).astype(np.float32))
        df['mfr_tx_count'] = df['mfr_tx_count'].fillna(0).astype(np.int16)

    # Price sensitivity (vectorized)
    df['avg_amt_per_transaction'] = (df['amt_sum'] / df['tx_count'].replace(0, 1)).astype(np.float32)
    df['amt_coefficient_variation'] = (df['amt_std'] / df['amt_mean'].replace(0, np.nan)).fillna(0).astype(np.float32)

    if 'promoval' in df.columns:
        df['promoval_vs_customer_avg'] = (df['promoval'] / df['avg_amt_per_transaction'].replace(0, 1)).astype(np.float32)
    if 'promoqty' in df.columns:
        df['promoqty_vs_customer_avg_qty'] = (df['promoqty'] / df['qty_mean'].replace(0, 1)).astype(np.float32)

    # Day of week patterns (optimized with observed=True)
    dow_data = trans_fold[['id', 'date']].copy()
    dow_data['dayofweek'] = dow_data['date'].dt.dayofweek
    dow_counts = dow_data.groupby(['id', 'dayofweek'], observed=True).size().unstack(fill_value=0)
    dow_counts.columns = [f'dow_{int(i)}_count' for i in dow_counts.columns]
    df = df.merge(dow_counts.reset_index(), on='id', how='left')
    del dow_data

    for i in range(7):
        col = f'dow_{i}_count'
        if col in df.columns:
            df[col] = df[col].fillna(0).astype(np.int16)

    # Weekend shopper (vectorized)
    if 'dow_5_count' in df.columns and 'dow_6_count' in df.columns:
        df['is_weekend_shopper'] = ((df['dow_5_count'] + df['dow_6_count']) /
                                     df['tx_count'].replace(0, 1) > 0.5).astype(np.int8)

    # Log transforms (vectorized on key features only)
    for col in ["tx_count", "brand_tx_count", "category_tx_count", "recent_purchase_count"]:
        if col in df.columns:
            df[f"log1p_{col}"] = np.log1p(df[col].fillna(0)).astype(np.float32)

    # Ratios (vectorized, most important only)
    df["qty_sum_ratio_store"] = (df["qty_sum"] / df["store_qty_mean"].replace(0, 1)).astype(np.float32)
    df["amt_sum_ratio_store"] = (df["amt_sum"] / df["store_amt_mean"].replace(0, 1)).astype(np.float32)

    # Key interaction features (vectorized)
    df['recency_x_brand_loyalty'] = (df['recency'] * df['brand_loyalty_score']).astype(np.float32)
    df['category_x_brand_response'] = (df['category_response_rate'] * df['brand_response_rate']).astype(np.float32)

    # Competition features (optimized - single window)
    recent_window = trans_fold[trans_fold["days_diff"] <= 30]
    store_cat_comp = recent_window.groupby(["store","category"], observed=True).size().to_dict()
    df["competing_promos_store_30d"] = (
        df.set_index(["store","category"]).index.map(store_cat_comp).fillna(0).astype(np.int16)
    )

    datetime_cols = ['promodate','last_purchase','first_purchase','brand_last_purchase',
                     'category_last_purchase','mfr_last_purchase']
    df.drop(columns=[c for c in datetime_cols if c in df.columns], inplace=True, errors='ignore')

    # Fill any remaining NaN values with 0
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    df[numeric_cols] = df[numeric_cols].fillna(0)

    # Aggressive memory cleanup
    del trans_fold, id_agg, store_agg, brand_agg, category_agg, id_store_counts, recent_window
    if 'mfr_agg' in locals():
        del mfr_agg
    gc.collect()

    print(f"Completed! Generated {len(df.columns)} features.")

    return df

# Build Model

In [12]:
from sklearn.model_selection import train_test_split

X_train, X_val = train_test_split(train_history, test_size=0.2, random_state=42, stratify=train_history["active"])

In [13]:
# Prepare features
import gc
X_train_feat = feature_engineering(X_train, transactions, train_df=X_train, promos=promos)
y_train = X_train["active"]
X_val_feat = feature_engineering(X_val, transactions, train_df=X_train, promos=promos)
y_val = X_val["active"]
X_test = feature_engineering(test_history, transactions, train_df=train_history, promos=promos)

exclude_cols = ["id","promodate","active","last_purchase","first_purchase"] # Drop date-time items

# # Drop features with feature-importance <= 50
# exclude_cols += [ "dow_6_count", "qty_sum",
#           "amt_sum_ratio_store", "category_response_rate", "category_qty_sum", "mfr_last_purchase_recency",
#           "store_tx_count", "recency_x_brand_loyalty", "brand_share_in_category", "qty_sum_ratio_store",
#           "tx_count", "avg_amt_per_transaction", "mfr_tx_count", "log1p_category_tx_count",
#           "promo_dayofweek", "store", "brand_loyalty_score", "store_amt_mean",
#           "manufacturer", "region", "brand_tx_count", "promoval",
#           "store_qty_mean", "competing_promos_store_30d", "promo", "brand",
#           "brand_qty_sum", "brand_response_rate", "log1p_recent_purchase_count", "category",
#           "log1p_tx_count", "log1p_brand_tx_count","category_x_brand_response", "purchased_category_before",
#           "manufacturer_response_rate", "is_weekend", "is_weekend_shopper", "promoqty",
#           "customer_store_affinity", "promo_month"
#       ]

features = [c for c in X_train_feat.columns if c not in exclude_cols]

Processing 16000 records...
Completed! Generated 71 features.
Processing 4000 records...
Completed! Generated 71 features.
Processing 10000 records...
Completed! Generated 71 features.


In [14]:
# # Hyper-parameter tuning
# from lightgbm import LGBMClassifier, early_stopping, log_evaluation
# from sklearn.metrics import roc_auc_score
# from sklearn.model_selection import ParameterGrid, train_test_split

# X_tr, X_val, y_tr, y_val = train_test_split(
#     X_train_feat[features], y_train, test_size=0.2, random_state=42, stratify=y_train
# )

# # Parameter grid
# param_grid = {
#     'num_leaves': [20, 30, 40],
#     'max_depth': [10, 12],
#     'learning_rate': [0.01, 0.02],
#     'min_child_samples': [20, 30],
#     'lambda_l1': [0.1, 0.5],
#     'lambda_l2': [0.1, 0.5],
#     'feature_fraction': [0.7, 0.8]
# }

# best_auc = 0
# best_params = None

# for params in ParameterGrid(param_grid):
#     model = LGBMClassifier(
#         n_estimators=2000,
#         objective='binary',
#         random_state=42,
#         n_jobs=-1,
#         **params
#     )

#     # Train with early stopping using callbacks
#     model.fit(
#         X_tr, y_tr,
#         eval_set=[(X_val, y_val)],
#         eval_metric='auc',
#         callbacks=[early_stopping(stopping_rounds=50), log_evaluation(period=0)]
#     )

#     val_preds = model.predict_proba(X_val, num_iteration=model.best_iteration_)[:,1]
#     auc = roc_auc_score(y_val, val_preds)

#     print(f"Params: {params} | Val AUC: {auc:.4f}")

#     if auc > best_auc:
#         best_auc = auc
#         best_params = params

# print("\nBest AUC:", best_auc)
# print("Best params:", best_params)


[LightGBM] [Info] Number of positive: 2592, number of negative: 10208
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.068691 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 9954
[LightGBM] [Info] Number of data points in the train set: 12800, number of used features: 66
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.202500 -> initscore=-1.370742
[LightGBM] [Info] Start training from score -1.370742
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[350]	valid_0's auc: 0.709822	valid_0's binary_logloss: 0.45228
Params: {'feature_fraction': 0.7, 'lambda_l1': 0.1, 'lambda_l2': 0.1, 'learning_rate': 0.01, 'max_depth': 10, 'min_child_samples': 20, 'num_leaves': 20} | Val AUC: 0.7098
[LightGBM] [Info] Number of positive: 2592, number of negative: 10208
[LightGBM] [Info] Auto-choosing col-wise m

In [19]:
# Final Model
from lightgbm import LGBMClassifier, early_stopping, log_evaluation
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

X_tr, X_val, y_tr, y_val = train_test_split(
    X_train_feat[features], y_train,
    test_size=0.2,
    random_state=42,
    stratify=y_train
)


best_params = {
    'feature_fraction': 0.8,
    'lambda_l1': 0.1,
    'lambda_l2': 0.1,
    'learning_rate': 0.02,
    'max_depth': 12,
    'min_child_samples': 20,
    'num_leaves': 40
}

final_model = LGBMClassifier(
    n_estimators=2000,
    objective='binary',
    random_state=42,
    n_jobs=-1,
    **best_params
)

final_model.fit(
    X_tr, y_tr,
    eval_set=[(X_val, y_val)],
    eval_metric='auc',
    callbacks=[early_stopping(stopping_rounds=50),
               log_evaluation(period=50)]
)

train_preds = final_model.predict_proba(
    X_tr, num_iteration=final_model.best_iteration_
)[:,1]

val_preds = final_model.predict_proba(
    X_val, num_iteration=final_model.best_iteration_
)[:,1]

train_auc = roc_auc_score(y_tr, train_preds)
val_auc = roc_auc_score(y_val, val_preds)

print(f"Train AUC: {train_auc:.4f}")
print(f"Validation AUC: {val_auc:.4f}")


[LightGBM] [Info] Number of positive: 2592, number of negative: 10208
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009617 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9954
[LightGBM] [Info] Number of data points in the train set: 12800, number of used features: 66
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.202500 -> initscore=-1.370742
[LightGBM] [Info] Start training from score -1.370742
Training until validation scores don't improve for 50 rounds
[50]	valid_0's auc: 0.704308	valid_0's binary_logloss: 0.463714
[100]	valid_0's auc: 0.708366	valid_0's binary_logloss: 0.456065
[150]	valid_0's auc: 0.713195	valid_0's binary_logloss: 0.452058
[200]	valid_0's auc: 0.712723	valid_0's binary_logloss: 0.452164
Early stopping, best iteration is:
[179]	valid_0's auc: 0.713407	valid_0's binary_logloss: 0.451874
Train AUC: 0.8751
Validation AUC: 0.7134


In [23]:
# Feature Importance
import pandas as pd
import matplotlib.pyplot as plt

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

importance_df = pd.DataFrame({
    "feature": features,
    "importance": final_model.feature_importances_
})

importance_df = importance_df.sort_values(by="importance", ascending=False).reset_index(drop=True)

print(importance_df)

                           feature  importance
0              store_response_rate         346
1                 category_amt_sum         280
2         promoval_vs_customer_avg         275
3                      dow_0_count         249
4                         amt_mean         245
5                      dow_2_count         238
6                      dow_1_count         233
7                      dow_3_count         229
8        amt_coefficient_variation         222
9             customer_tenure_days         216
10                     dow_4_count         214
11                        qty_mean         212
12  category_last_purchase_recency         205
13                     dow_5_count         197
14                         amt_std         195
15              category_diversity         194
16                 brand_diversity         188
17                     dow_6_count         184
18               unique_categories         173
19             promo_response_rate         166
20       mfr_

# Generate Prediction

In [24]:
import pandas as pd
from lightgbm import LGBMClassifier, early_stopping, log_evaluation

X_train_feat = feature_engineering(train_history, transactions, train_df=train_history)
y_train = train_history["active"]

X_test_feat = feature_engineering(test_history, transactions, train_df=train_history)

final_model = LGBMClassifier(
    n_estimators=2000,
    objective='binary',
    random_state=42,
    n_jobs=-1,
    **best_params
)

final_model.fit(
    X_train_feat[features], y_train,
    eval_set=[(X_train_feat[features], y_train)],  # Optional early stopping on training
    eval_metric='auc',
    callbacks=[early_stopping(stopping_rounds=50),
               log_evaluation(period=100)]
)

test_preds = final_model.predict_proba(
    X_test_feat[features], num_iteration=final_model.best_iteration_
)[:,1]

predict = pd.DataFrame({
    "id": X_test_feat["id"],
    "active": test_preds
})

predict.to_csv("predict.csv", index=False)

print("Test predictions saved to 'test_predictions.csv'.")


Processing 20000 records...
Completed! Generated 71 features.
Processing 10000 records...
Completed! Generated 71 features.
[LightGBM] [Info] Number of positive: 4050, number of negative: 15950
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.015605 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10097
[LightGBM] [Info] Number of data points in the train set: 20000, number of used features: 66
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.202500 -> initscore=-1.370742
[LightGBM] [Info] Start training from score -1.370742
Training until validation scores don't improve for 50 rounds
[100]	valid_0's auc: 0.79702	valid_0's binary_logloss: 0.424108
[200]	valid_0's auc: 0.841406	valid_0's binary_logloss: 0.395228
[300]	valid_0's auc: 0.87996	valid_0's binary_logloss: 0.371911
[400]	valid_0's auc: 0.907964	valid_0's binary_logloss: 0.351475
[500]	valid_0's auc: 0.928303	valid_0's binary_logloss: 0.33326

In [25]:
from google.colab import files

files.download("predict.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>