In [1]:
import pandas as pd
import numpy as np
import json
import os
from sklearn.model_selection import GridSearchCV, train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
import pyarrow.parquet as pa
import pyspark as ps
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import arrays_zip, explode, col
from tqdm import tqdm
from catboost import CatBoostClassifier

In [2]:
data = pd.read_parquet('train.parquet')

In [3]:
data.head()

Unnamed: 0,id,dates,values,label
0,19114,"[2016-01-01, 2016-02-01, 2016-03-01, 2016-04-0...","[-1.86, 0.79, 1.4, 0.15, 0.0, -1.24, -1.46, 3....",0.0
1,22769,"[2016-05-01, 2016-06-01, 2016-07-01, 2016-08-0...","[-1.04, -3.48, 0.05, -0.13, -0.01, 0.03, 0.27,...",1.0
2,76935,"[2017-03-01, 2017-04-01, 2017-05-01, 2017-06-0...","[0.28, 0.63, 0.06, 0.96, -1.4, -0.3, 1.62, 1.1...",0.0
3,66297,"[2016-01-01, 2016-02-01, 2016-03-01, 2016-04-0...","[-0.33, 0.58, 1.1, -0.56, -0.95, -0.61, -0.7, ...",0.0
4,2191,"[2016-01-01, 2016-02-01, 2016-03-01, 2016-04-0...","[1.31, 0.5, -0.54, 0.95, 0.65, 0.83, -1.55, -0...",0.0


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80000 entries, 0 to 79999
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   id      80000 non-null  int64  
 1   dates   80000 non-null  object 
 2   values  80000 non-null  object 
 3   label   80000 non-null  float64
dtypes: float64(1), int64(1), object(2)
memory usage: 2.4+ MB


In [5]:
data.isna().sum()

id        0
dates     0
values    0
label     0
dtype: int64

In [6]:
all_unique_dates = sorted(set([date for sublist in data['dates'] for date in sublist]))
for date in all_unique_dates:
    data[date] = np.nan
for index, row in data.iterrows():
    for date, value in zip(row['dates'], row['values']):
        data.at[index, date] = value

In [7]:
data = data.drop(['dates', 'values'], axis= 1)

In [8]:
data.head()

Unnamed: 0,id,label,2016-01-01,2016-02-01,2016-03-01,2016-04-01,2016-05-01,2016-06-01,2016-07-01,2016-08-01,...,2023-04-01,2023-05-01,2023-06-01,2023-07-01,2023-08-01,2023-09-01,2023-10-01,2023-11-01,2023-12-01,2024-01-01
0,19114,0.0,-1.86,0.79,1.4,0.15,0.0,-1.24,-1.46,3.49,...,,,,,,,,,,
1,22769,1.0,,,,,-1.04,-3.48,0.05,-0.13,...,,,,,,,,,,
2,76935,0.0,,,,,,,,,...,,,,,,,,,,
3,66297,0.0,-0.33,0.58,1.1,-0.56,-0.95,-0.61,-0.7,-1.35,...,,,,,,,,,,
4,2191,0.0,1.31,0.5,-0.54,0.95,0.65,0.83,-1.55,-0.28,...,,,,,,,,,,


In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80000 entries, 0 to 79999
Data columns (total 99 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   id          80000 non-null  int64  
 1   label       80000 non-null  float64
 2   2016-01-01  45687 non-null  float64
 3   2016-02-01  52783 non-null  float64
 4   2016-03-01  55387 non-null  float64
 5   2016-04-01  57194 non-null  float64
 6   2016-05-01  58400 non-null  float64
 7   2016-06-01  59168 non-null  float64
 8   2016-07-01  60195 non-null  float64
 9   2016-08-01  60813 non-null  float64
 10  2016-09-01  61464 non-null  float64
 11  2016-10-01  62159 non-null  float64
 12  2016-11-01  62769 non-null  float64
 13  2016-12-01  63264 non-null  float64
 14  2017-01-01  66743 non-null  float64
 15  2017-02-01  67595 non-null  float64
 16  2017-03-01  68268 non-null  float64
 17  2017-04-01  68874 non-null  float64
 18  2017-05-01  69410 non-null  float64
 19  2017-06-01  69982 non-nul

In [10]:
data.columns = data.columns.astype(str)

In [11]:
data.fillna(data.mean(), inplace= True)

In [12]:
data.head()

Unnamed: 0,id,label,2016-01-01,2016-02-01,2016-03-01,2016-04-01,2016-05-01,2016-06-01,2016-07-01,2016-08-01,...,2023-04-01,2023-05-01,2023-06-01,2023-07-01,2023-08-01,2023-09-01,2023-10-01,2023-11-01,2023-12-01,2024-01-01
0,19114,0.0,-1.86,0.79,1.4,0.15,0.0,-1.24,-1.46,3.49,...,-0.326724,-0.396081,-0.558423,-0.60824,-0.451414,-0.52876,-0.251898,-0.118455,0.105253,0.221075
1,22769,1.0,0.432776,0.340376,0.22714,0.01332,-1.04,-3.48,0.05,-0.13,...,-0.326724,-0.396081,-0.558423,-0.60824,-0.451414,-0.52876,-0.251898,-0.118455,0.105253,0.221075
2,76935,0.0,0.432776,0.340376,0.22714,0.01332,-0.166542,-0.197952,-0.260433,-0.134852,...,-0.326724,-0.396081,-0.558423,-0.60824,-0.451414,-0.52876,-0.251898,-0.118455,0.105253,0.221075
3,66297,0.0,-0.33,0.58,1.1,-0.56,-0.95,-0.61,-0.7,-1.35,...,-0.326724,-0.396081,-0.558423,-0.60824,-0.451414,-0.52876,-0.251898,-0.118455,0.105253,0.221075
4,2191,0.0,1.31,0.5,-0.54,0.95,0.65,0.83,-1.55,-0.28,...,-0.326724,-0.396081,-0.558423,-0.60824,-0.451414,-0.52876,-0.251898,-0.118455,0.105253,0.221075


In [13]:
x = data.drop(['label'], axis= 1)
y = data['label']
x.shape, y.shape

((80000, 98), (80000,))

In [14]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 12345)

In [15]:
params_rand = {
    'iterations' : [int(x) for x in np.linspace(start = 200, stop = 400, num = 20)],
    'depth' : [int(x) for x in np.linspace(start = 5, stop = 15, num = 5)],
    'learning_rate' : [x for x in np.linspace(start = 0.1, stop = 0.3, num = 5)],
    'l2_leaf_reg' : [int(x) for x in np.linspace(start = 2, stop = 30, num = 7)]
}

In [16]:
c_model = CatBoostClassifier(custom_metric='AUC')
rand_search = RandomizedSearchCV(c_model, params_rand, error_score='raise', cv= 3, verbose= 2)

In [17]:
rand_search.fit(x_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
0:	learn: 0.6441092	total: 354ms	remaining: 1m 21s
1:	learn: 0.6055923	total: 653ms	remaining: 1m 14s
2:	learn: 0.5756744	total: 963ms	remaining: 1m 13s
3:	learn: 0.5541635	total: 1.27s	remaining: 1m 11s
4:	learn: 0.5343543	total: 1.57s	remaining: 1m 11s
5:	learn: 0.5175093	total: 1.87s	remaining: 1m 10s
6:	learn: 0.5032111	total: 2.17s	remaining: 1m 9s
7:	learn: 0.4920597	total: 2.46s	remaining: 1m 8s
8:	learn: 0.4826183	total: 2.76s	remaining: 1m 8s
9:	learn: 0.4737802	total: 3.06s	remaining: 1m 7s
10:	learn: 0.4664775	total: 3.36s	remaining: 1m 7s
11:	learn: 0.4593226	total: 3.66s	remaining: 1m 6s
12:	learn: 0.4527954	total: 3.97s	remaining: 1m 6s
13:	learn: 0.4469378	total: 4.28s	remaining: 1m 6s
14:	learn: 0.4389407	total: 4.6s	remaining: 1m 6s
15:	learn: 0.4327253	total: 4.91s	remaining: 1m 5s
16:	learn: 0.4267304	total: 5.21s	remaining: 1m 5s
17:	learn: 0.4217061	total: 5.51s	remaining: 1m 5s
18:	learn: 0.4172065	total

In [18]:
rand_search.best_params_

{'learning_rate': 0.3, 'l2_leaf_reg': 30, 'iterations': 347, 'depth': 7}

In [19]:
print(12)

12
