<a href="https://colab.research.google.com/github/NH0917/amex/blob/main/model4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.5-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.5-cp310-cp310-manylinux2014_x86_64.whl (98.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 MB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.5


In [None]:
import polars as pl
import seaborn as sns
import matplotlib.pyplot as plt
import unicodedata
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from catboost import Pool,CatBoostClassifier
from sklearn.metrics import roc_auc_score
from imblearn.under_sampling import RandomUnderSampler
import itertools

In [None]:
# Locations of the competition CSVs on the mounted Google Drive.
train_path = "/content/drive/MyDrive/signate/cup/data/train.csv"
test_path = "/content/drive/MyDrive/signate/cup/data/test.csv"

In [None]:
# Load both splits as polars DataFrames; preprocessing() expects polars input.
train = pl.read_csv(train_path)
test = pl.read_csv(test_path)

In [None]:
# Decade lookup keyed by a single character: preprocessing() slices Age to its
# first character before applying this map. Keys cover kanji numerals, ASCII
# digits and full-width digits; note that "十" maps to 10.
age_map ={
    "一":10,
    "二":20,
    "三":30,
    "四":40,
    "五":50,
    "六":60,
    "七":70,
    "八":80,
    "九":90,
    "十":10,
    "1":10,
    "2":20,
    "3":30,
    "4":40,
    "5":50,
    "6":60,
    "7":70,
    "8":80,
    "9":90,
    "１":10,
    "２":20,
    "３":30,
    "４":40,
    "５":50,
    "６":60,
    "７":70,
    "８":80,
    "９":90,
}

# Binary gender encoding; keys are matched after preprocessing() has
# NFKC-normalised and lower-cased the Gender column.
gender_map ={
    "female":0,
    "male":1
}

# First customer_info token (marital status): 1 = single / divorced / unmarried,
# 0 = married.
customer_info1_map = {
    "独身":1,
"離婚済み":1,
"未婚":1,
"結婚済み":0}

# Second customer_info token (car ownership): 1 = owns a car, 0 = does not.
# Several spellings of each answer are listed explicitly.
customer_info2_map = {
    "車所持":1,
"車なし":0,
"自家用車なし":0,
"車未所持":0,
"自動車未所有":0,
"車あり":1,
"自家用車あり":1,
"乗用車なし":0,
"自動車所有":1,
"車保有":1,
"車保有なし":0,
"乗用車所持":1
}

# Third customer_info token (children): collapsed to a binary flag.
# 1 = has children (any count), 0 = no children or unknown.
customer_info3_map ={
    "子供の数不明":0,
"子の数不詳":0,
"2児":1,
"非育児家庭":0,
"子供有り(2人)":1,
"子供無し":0,
"こども2人":1,
"不明":0,
"子供なし":0,
"こども1人":1,
"こども3人":1,
"1児":1,
"わからない":0,
"子供3人":1,
"子供有り":1,
"子供有り(1人)":1,
"3児":1,
"子供1人":1,
"子育て状況不明":0,
"子供2人":1,
"無子":0,
"子供有り(3人)":1,
"子供ゼロ":0

}

# Canonical product names. Keys are matched against ProductPitched after
# preprocessing() has NFKC-normalised, lower-cased and space-stripped it; many
# keys contain look-alike (non-ASCII) characters, so edit them with care.
# Values not listed here are left unchanged by the replace step.
product_map ={
    "basiс":"basic",
    "вasic":"basic",
    "superdeluxe":"superdeluxe",
    "delu×e":"deluxe",
    "βasic":"basic",
    "de|uxe":"deluxe",
    "kıոg":"king",
    "տuperdeluxe":"superdeluxe",
    "standard":"standard",
    "kiոg":"king",
    "ꭰeluxe":"deluxe",
    "basιc":"basic",
    "superdεluxe":"superdeluxe",
    "superꭰeluxe":"superdeluxe",
    "𐊡asic":"basic",
    "king":"king",
    "ꓢuperdeluxe":"superdeluxe",
    "deluxε":"deluxe",
    "baꓢic":"basic",
    "baտic":"basic",
    "ѕuperdeluxe":"superdeluxe",
    "basıς":"basic",
    "տtandard":"standard",
    "basiς":"basic",
    "basic":"basic",
    "staոdard":"standard",
    "deluxe":"deluxe",
    "stanᗞard":"standard",
    "ᗞeluxe":"deluxe",
    "βasıc":"basic",
    "bαsic":"basic",
    "superde|uxe":"superdeluxe",
    "standarꭰ":"standard",
    "de|u×e":"deluxe",
    "kıng":"king",
    "ꓢtandard":"standard",
    "ꓢuperde|uxe":"superdeluxe",
    "superᗞeluxe":"superdeluxe",
    "ѕtandard":"standard",
    "βasiс":"basic",
    "dεluxε":"deluxe",
    "basıc":"basic"
}

# Contact channel encoding; preprocessing() fills missing values with 2.
typeofcontact_map = {
    "Self Enquiry":0,
    "Company Invited":1,
}

In [None]:
def preprocessing(df):
  """Clean the raw polars frame and derive the base modelling columns.

  Steps, in order:
    * Age: first character mapped to a decade through ``age_map``, cast to int.
    * DurationOfPitch: "秒"/"分" removed; rows that contained "秒" are divided
      by 60; nulls become 0. The helper column ``DurationFlag`` is kept.
    * Gender / ProductPitched: NFKC-normalised, lower-cased, all spaces removed,
      then mapped through ``gender_map`` / ``product_map``.
    * customer_info: separators unified to "-", split, and the first three
      parts mapped through ``customer_info1_map`` .. ``customer_info3_map``.
    * NumberOfFollowups: values >= 100 are divided by 100.
    * NumberOfTrips: the number in the string is multiplied by 2 ("半年に") or
      4 ("四半期に") and stored as ``NumberOfTrips_cvt``.
    * MonthlyIncome: strings containing "万" become (number after "月収") * 10000;
      all other values are cast to float.
    * TypeofContact: mapped through ``typeofcontact_map``, nulls become 2.
    * Finally ``df.fill_null(0)`` is applied to the whole frame.

  Args:
    df: polars DataFrame holding the raw columns named above.

  Returns:
    A new polars DataFrame with the cleaned and derived columns.
  """
  #Age cleansing: the leading character alone identifies the decade
  df = df.with_columns(pl.col("Age").fill_null(0))
  df = df.with_columns(pl.col("Age").str.slice(0,1))
  df = df.with_columns(pl.col("Age").replace(age_map))
  df = df.with_columns(pl.col("Age").cast(pl.Int64))

  #cvt duration of pitch sec->min
  # Remember which rows were given in seconds before the unit suffix is removed.
  df = df.with_columns(
      pl.when(pl.col("DurationOfPitch").str.contains("秒"))
      .then(pl.lit("Y"))
      .otherwise(pl.lit("N"))
      .alias("DurationFlag")
  )

  df = df.with_columns(pl.col("DurationOfPitch").str.replace("秒",""))
  df = df.with_columns(pl.col("DurationOfPitch").str.replace("分",""))
  df = df.with_columns(pl.col("DurationOfPitch").cast(pl.Int64))
  df = df.with_columns(
      pl.when(pl.col("DurationFlag")=="Y")
      .then(pl.col("DurationOfPitch")/60)
      .otherwise(pl.col("DurationOfPitch"))
      .alias("DurationOfPitch")
  )

  # Fill only this column; the original parenthesisation applied fill_null(0)
  # to the entire frame at this point.
  df = df.with_columns(pl.col("DurationOfPitch").fill_null(0))

  #cvt gender to one-hot
  # replace_all: str.replace would strip only the first space of a value.
  df = df.with_columns(pl.col("Gender").map_elements(lambda x: unicodedata.normalize("NFKC",x)).str.to_lowercase().str.replace_all(" ",""))
  df = df.with_columns(pl.col("Gender").replace(gender_map))

  #cvt customer_info
  # Unify every separator variant to "-" so a single split yields the three parts.
  df = df.with_columns(pl.col("customer_info").str.replace_all(",|/|／|、|　|\t|\n| ","-"))
  df = df.with_columns(pl.col("customer_info").str.split("-"))
  df = df.with_columns((pl.col("customer_info").list.get(0)).alias("customer_info1"))
  df = df.with_columns((pl.col("customer_info").list.get(1)).alias("customer_info2"))
  df = df.with_columns((pl.col("customer_info").list.get(2)).alias("customer_info3"))

  df = df.with_columns(pl.col("customer_info1").replace(customer_info1_map))
  df = df.with_columns(pl.col("customer_info2").replace(customer_info2_map))
  df = df.with_columns(pl.col("customer_info3").replace(customer_info3_map))

  #NumberOfFollowups
  df = df.with_columns(
      pl.when(pl.col("NumberOfFollowups")>=100)
      .then(pl.col("NumberOfFollowups")/100)
      .otherwise(pl.col("NumberOfFollowups"))
      .alias("NumberOfFollowups")
  )

  #cvt number of trips
  # Multiplier that scales a half-yearly / quarterly count to a yearly one.
  # Must be computed before the digits are extracted from the raw string.
  df = df.with_columns(
      pl.when(pl.col("NumberOfTrips").str.contains("半年に"))
      .then(pl.lit(2))
      .when(pl.col("NumberOfTrips").str.contains("四半期に"))
      .then(pl.lit(4))
      .otherwise(pl.lit(1))
      .alias("MultiNum")
  )

  #product
  df = df.with_columns(pl.col("ProductPitched").map_elements(lambda x: unicodedata.normalize("NFKC",x)).str.to_lowercase().str.replace_all(" ",""))
  df = df.with_columns(
      pl.col("ProductPitched").replace(product_map).fill_null("other")
  )

  # Capture the whole first run of digits. The previous pattern r".*(\d{1,2})"
  # let the greedy ".*" swallow leading digits, so "12" was read as 2 and "10" as 0.
  df = df.with_columns(pl.col("NumberOfTrips").str.extract(r"(\d+)"))
  df = df.with_columns(pl.col("NumberOfTrips").cast(pl.Int64))
  df = df.with_columns((pl.col("NumberOfTrips")*pl.col("MultiNum")).alias("NumberOfTrips_cvt"))

  # MonthlyIncome: flag the "万" rows first, because the marker disappears once
  # the number has been extracted.
  df = df.with_columns(
      pl.when(pl.col("MonthlyIncome").str.contains("万"))
      .then(pl.lit("Yes"))
      .otherwise(pl.lit("No"))
      .alias("MonthlyIncomeFlag")
  )
  # Normalise the strings before casting: both branches of when/then are
  # evaluated for every row, so a direct cast of the raw text would fail.
  # The pattern also keeps a fractional part if one is present.
  df = df.with_columns(
      pl.when(pl.col("MonthlyIncome").str.contains("万"))
      .then(pl.col("MonthlyIncome").str.extract(r"月収(\d+(?:\.\d+)?)"))
      .otherwise(pl.col("MonthlyIncome"))
      .alias("MonthlyIncome")
  )
  df = df.with_columns(
      pl.when(pl.col("MonthlyIncomeFlag").eq("Yes"))
      .then(pl.col("MonthlyIncome").cast(pl.Float64)*10000)
      .otherwise(pl.col("MonthlyIncome").cast(pl.Float64))
      .alias("MonthlyIncome")
  )
  df = df.drop("MonthlyIncomeFlag")

  df = df.with_columns(pl.col("TypeofContact").replace(typeofcontact_map).fill_null(2))

  df = df.fill_null(0)

  return df

In [None]:
# Columns kept after preprocessing. Despite the name, the list also holds
# numeric columns (MonthlyIncome, DurationOfPitch); it is redefined further
# down before modelling.
cat_features = [
    "Age",
    "TypeofContact",
    "CityTier",
    "Gender",
    "NumberOfPersonVisiting",
    "NumberOfFollowups",
    "PreferredPropertyStar",
    "ProductPitched",
    "NumberOfTrips_cvt",
    "Passport",
    "PitchSatisfactionScore",
    "customer_info1",
    "customer_info2",
    "customer_info3",
    "MonthlyIncome",
    "DurationOfPitch",
]
# Independent copy, so later changes to cat_features do not alter the selection list.
feature_col = list(cat_features)

In [None]:
# Clean each split with polars, keep only the modelling columns (plus the
# target for train), then switch to pandas for the rest of the notebook.
train = preprocessing(train).select(feature_col+["ProdTaken"]).to_pandas()
test = preprocessing(test).select(feature_col).to_pandas()



In [None]:
# Customer attributes that are combined (always together with Age and Gender)
# into concatenated key features.
customer_group = [
    "CityTier",
    "PreferredPropertyStar",
    "NumberOfTrips_cvt",
    "Passport",
    "MonthlyIncome_ceil_10_000",
    "MonthlyIncome_ceil_100_000",
    "customer_info1",
    "customer_info2",
    "customer_info3",
    "ProductPitched",
]

In [None]:
# Every combination of 1..9 customer_group columns, each extended with Age and
# Gender; one concatenated feature is later built per entry.
pair_col = [
  [*combo,"Age","Gender"]
  for size in range(1,10)
  for combo in itertools.combinations(customer_group,size)
]

In [None]:
def customer_pair(df,col):
  """Build one string key per row by joining the given columns with "_".

  Args:
    df: pandas DataFrame containing the columns in ``col``; it is not modified.
    col: list of column names, joined in the given order.

  Returns:
    A pandas Series (same index as ``df``) of the joined string values.
  """
  def _join_row(row):
    return "_".join(row)

  as_text = df[col].astype(str)
  return as_text.apply(_join_row,axis=1)

In [None]:
def create_monthlyincome_feature(df):
  """Add MonthlyIncome bucket columns to ``df`` in place and return it.

  Despite the "ceil" in the column names, values are rounded DOWN to the
  nearest multiple of 10,000 and 100,000 respectively.

  Args:
    df: pandas DataFrame with a numeric "MonthlyIncome" column.

  Returns:
    The same DataFrame object, with the two bucket columns added.
  """
  buckets = (
      ("MonthlyIncome_ceil_10_000",10_000),
      ("MonthlyIncome_ceil_100_000",100_000),
  )
  for name,step in buckets:
    df[name] = (df["MonthlyIncome"]//step)*step
  return df

In [None]:
# Add the income bucket columns that customer_group refers to; must run before
# the combination features are built.
train = create_monthlyincome_feature(train)
test = create_monthlyincome_feature(test)

In [None]:
import warnings
# Suppresses every warning category from here on.
# NOTE(review): presumably added to hide warnings from the column-by-column
# inserts in the following cells — confirm and consider narrowing the filter.
warnings.filterwarnings("ignore")

In [None]:
# One concatenated-key column per entry of pair_col, named after its parts.
create_col = ["_".join(cols) for cols in pair_col]
for cols,name in zip(pair_col,create_col):
  train[name] = customer_pair(train,cols)

In [None]:
# Build the same combination features on the test frame. A name is recorded in
# create_col only if it is not there yet, so names added by the train loop are
# not listed twice.
for col in pair_col:
  col_name = "_".join(col)
  test[col_name] = customer_pair(test,col)
  if col_name not in create_col:
    create_col.append(col_name)

In [None]:
# Interaction key: pitched product combined with the pitch satisfaction score.
for frame in (train,test):
  frame["ProductPitched_PitchSatisfactionScore"] = frame["ProductPitched"]+"_"+frame["PitchSatisfactionScore"].astype(str)

In [None]:
# Cast float64 columns of train to int (fractional parts are truncated).
for name in train.select_dtypes(include="float64").columns:
  train[name] = train[name].astype(int)

In [None]:
# Cast float64 columns of test to int (fractional parts are truncated).
for name in test.select_dtypes(include="float64").columns:
  test[name] = test[name].astype(int)

In [None]:
# Final feature set used for modelling; all of these are passed to CatBoost as
# categorical features. The order here is the column order the models see.
cat_features = [
    'ProductPitched',
    'ProductPitched_PitchSatisfactionScore',
    'Passport_customer_info1_Age_Gender',
    'Passport_Age_Gender',
    'MonthlyIncome_ceil_100_000',
    'Passport',
    'customer_info1_customer_info3_Age_Gender',
    'Passport_customer_info2_Age_Gender',
    'NumberOfPersonVisiting',
    'PreferredPropertyStar',
    'Age',
    'NumberOfTrips_cvt_Passport_Age_Gender',
    'NumberOfTrips_cvt',
    'Passport_customer_info1_customer_info2_Age_Gender',
    'Passport_MonthlyIncome_ceil_100_000_customer_info3_Age_Gender',
    'CityTier',
    'customer_info1_Age_Gender',
    'Passport_customer_info1_customer_info3_Age_Gender',
    'Passport_MonthlyIncome_ceil_100_000_Age_Gender',
    'PitchSatisfactionScore',
    'PreferredPropertyStar_Passport_customer_info1_Age_Gender',
]

train = train.loc[:,cat_features+["ProdTaken"]]
test = test.loc[:,cat_features]

In [None]:
random_state = 0

auc_list = []
models_list = []

# Column names for the five per-seed out-of-fold predictions. Later cells read
# these exact names, including the inconsistent "pred_1" vs "pred2" spelling.
oof_pred_cols = ["pred_1","pred2","pred3","pred4","pred5"]
fold_frames = []

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)
for i,(train_index,val_index) in enumerate(skf.split(train,train["ProdTaken"])):

    # skf.split yields positional indices, so rows are selected with iloc
    # rather than the label-based loc.
    train_df = train.iloc[train_index].reset_index(drop=True)
    val_df = train.iloc[val_index].reset_index(drop=True)

    # Both classes are undersampled to 80% of this fold's positive count;
    # only the sampler seed varies between the five models of a fold.
    pos_count = int(train_df["ProdTaken"].value_counts()[1]*0.8)

    fold_X = train_df.drop(columns="ProdTaken")
    fold_y = train_df["ProdTaken"]
    # The validation pool is identical for every seed, so it is built once per
    # fold, before any prediction column is added to val_df.
    val_pool = Pool(val_df.drop(columns="ProdTaken"),val_df["ProdTaken"],cat_features=cat_features)

    for seed,pred_col in enumerate(oof_pred_cols):
        rus = RandomUnderSampler(random_state=seed,sampling_strategy={0:pos_count,1:pos_count})
        train_X,train_y = rus.fit_resample(fold_X,fold_y)
        train_pool = Pool(train_X,train_y,cat_features=cat_features)
        model = CatBoostClassifier(random_seed=random_state,eval_metric="AUC",scale_pos_weight=0.85)
        # NOTE(review): early stopping is driven by the same fold that is
        # scored afterwards, so the out-of-fold AUC is somewhat optimistic.
        model.fit(train_pool,eval_set=val_pool,early_stopping_rounds=10,use_best_model=True,verbose=False)

        val_df[pred_col] = model.predict_proba(val_pool)[:,1]
        model.save_model(f"model_{i}_{seed}")
    fold_frames.append(val_df[oof_pred_cols+["ProdTaken"]])

# Concatenate once; the per-fold index (0..n-1 per fold) is kept as before.
oof_dfs = pd.concat(fold_frames,axis="index")


In [None]:
# oof_dfs has no "predict" column (the CV loop stores pred_1, pred2, ..., pred5),
# so score the plain average of the five per-seed predictions.
# NOTE(review): presumably what the old "predict" column held — confirm.
roc_auc_score(oof_dfs["ProdTaken"],oof_dfs[["pred_1","pred2","pred3","pred4","pred5"]].mean(axis=1))

0.8163281157533436

In [None]:
# oof_dfs has no "predict" column (the CV loop stores pred_1, pred2, ..., pred5),
# so score the plain average of the five per-seed predictions.
# NOTE(review): presumably what the old "predict" column held — confirm.
roc_auc_score(oof_dfs["ProdTaken"],oof_dfs[["pred_1","pred2","pred3","pred4","pred5"]].mean(axis=1))

0.8163281157533436

In [None]:
# Replace the repeated per-fold index with one running 0..n-1 index.
oof_dfs = oof_dfs.reset_index(drop=True)

In [None]:
# Spot-check the out-of-fold probabilities stored under "pred_1".
oof_dfs["pred_1"]

Unnamed: 0,pred_1
0,0.716883
1,0.325148
2,0.338005
3,0.636048
4,0.367850
...,...
3484,0.406039
3485,0.405346
3486,0.417471
3487,0.418308


In [None]:
# First rows of the out-of-fold table: five prediction columns plus the target.
oof_dfs.head()

Unnamed: 0,pred_1,pred2,pred3,pred4,pred5,ProdTaken
0,0.716883,0.73907,0.692156,0.709663,0.620307,1
1,0.325148,0.345831,0.37903,0.355938,0.414696,0
2,0.338005,0.3543,0.363594,0.321599,0.425877,0
3,0.636048,0.608678,0.565658,0.694815,0.576913,1
4,0.36785,0.333119,0.351033,0.351911,0.423099,0


In [None]:
# Ascending rank of each per-seed prediction (higher probability -> higher rank).
rank_df = pd.DataFrame({
    f"pred_{position}": oof_dfs[source].rank(ascending=True)
    for position,source in enumerate(["pred_1","pred2","pred3","pred4","pred5"],start=1)
})

In [None]:
# Average only the five rank columns, so re-running this cell does not fold
# the derived "pred"/"pre" columns into the mean.
rank_df["pred"] = rank_df[["pred_1","pred_2","pred_3","pred_4","pred_5"]].mean(axis=1)

In [None]:
# Scale the mean rank by the number of rows.
rank_df["pre"] = rank_df["pred"].div(len(rank_df))

In [None]:
# Inspect the normalised mean rank.
rank_df["pre"]

Unnamed: 0,pre
0,0.975007
1,0.330868
2,0.325824
3,0.914761
4,0.329149
...,...
3484,0.111178
3485,0.117598
3486,0.123388
3487,0.301691


In [None]:
# Row count minus the mean rank, so the highest-scored rows get the smallest
# value. Recomputed from the rank columns rather than from "pred" itself, so
# re-running the cell does not flip the values back.
rank_df["pred"] = rank_df.shape[0]-rank_df[["pred_1","pred_2","pred_3","pred_4","pred_5"]].mean(axis=1)

In [None]:
# Inspect the rank table: per-seed ranks, inverted mean rank ("pred") and
# normalised mean rank ("pre").
rank_df

Unnamed: 0,pred_1,pred_2,pred_3,pred_4,pred_5,pred,pre
0,3481.0,3410.0,3289.0,3429.0,3400.0,87.2,0.975007
1,282.0,1142.0,1704.0,1091.0,1553.0,2334.6,0.330868
2,339.0,1276.0,1518.0,740.0,1811.0,2352.2,0.325824
3,3396.0,3090.0,2828.0,3401.0,3243.0,297.4,0.914761
4,564.0,948.0,1449.0,1032.0,1749.0,2340.6,0.329149
...,...,...,...,...,...,...,...
3484,1143.5,264.0,313.0,130.0,89.0,3101.1,0.111178
3485,1131.5,254.0,287.0,195.0,184.0,3078.7,0.117598
3486,1358.0,432.0,226.0,44.0,92.5,3058.5,0.123388
3487,1388.0,953.0,615.0,1517.0,790.0,2436.4,0.301691


In [None]:
# Out-of-fold AUC of the rank-averaged ensemble.
roc_auc_score(y_true=oof_dfs["ProdTaken"],y_score=rank_df["pre"])

0.816615602707152

In [None]:
# Out-of-fold AUC of the rank-averaged ensemble (same computation as the previous cell).
roc_auc_score(y_true=oof_dfs["ProdTaken"],y_score=rank_df["pre"])

0.816615602707152

In [None]:
# Collect the feature importance of every saved model. The CV loop saves models
# as "model_{fold}_{seed}"; neither "model_{i}" nor "model_{i}.csv" is written
# anywhere in this notebook, which is why this cell used to fail.
features = []
for i in range(5):
    for seed in range(5):
        model = CatBoostClassifier()
        model.load_model(f"model_{i}_{seed}")
        # The default importance type is computed from the stored model, so no
        # training Pool is passed.
        features.append(model.get_feature_importance())

CatBoostError: /src/catboost/catboost/libs/model/model_import_interface.h:19: Model file doesn't exist: model_0

In [None]:
# The models were trained on the columns in cat_features (in that order), so
# the importance vectors are labelled with cat_features; feature_col is the
# earlier 16-column list and does not match their length.
temp = pd.DataFrame(features,columns=cat_features)
# Average over all models and reshape into (feature, importance) rows.
temp = temp.mean().reset_index(drop=False).rename(columns={
    "index":"feature",
    0:"importance"
})

In [None]:
# Horizontal bar chart of the mean importance, skipping zero-importance features.
nonzero_importance = temp.query("importance!=0")
sns.barplot(data=nonzero_importance,x="importance",y="feature")

In [None]:
# Inspect the final test feature frame before inference.
test

In [None]:
# Re-read the raw test CSV with pandas; only its "id" column is used for the submission.
sub = pd.read_csv(test_path)

In [None]:

test_pool = Pool(test,cat_features=cat_features)

# One probability column per saved (fold, seed) model, next to the row ids.
pred_df = pd.DataFrame({"id": sub["id"].copy()})

for fold,seed in itertools.product(range(5),range(5)):
  model = CatBoostClassifier()
  model.load_model(f"model_{fold}_{seed}")
  pred_df[f"model_{fold}_{seed}"] = model.predict_proba(test_pool)[:,1]

In [None]:
# Final score: mean of every column except the id.
pred_df["pred"] = pred_df.drop(columns="id").mean(axis=1)

In [None]:
# Write id and averaged prediction, without index column or header row.
pred_df[["id","pred"]].to_csv("submission.csv",index=False,header=False)

In [None]:
# Inspect the per-model predictions and their average.
pred_df