In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.decomposition import PCA
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler, StandardScaler

In [2]:
# Read the training table keyed by "id", keep the "rainfall" column aside as
# the target, and build the feature frame without "day" and "rainfall".
train_df = pd.read_csv("./inputs/train.csv").set_index("id")
target = train_df["rainfall"]
train_df = train_df.drop(columns=["day", "rainfall"])
train_df.head()

Unnamed: 0_level_0,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,1017.4,21.2,20.6,19.9,19.4,87.0,88.0,1.1,60.0,17.2
1,1019.5,16.2,16.9,15.8,15.4,95.0,91.0,0.0,50.0,21.9
2,1024.1,19.4,16.1,14.6,9.3,75.0,47.0,8.3,70.0,18.1
3,1013.4,18.1,17.8,16.9,16.8,95.0,95.0,0.0,60.0,35.6
4,1021.8,21.3,18.4,15.2,9.6,52.0,45.0,3.6,40.0,24.8


In [3]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    train_df,
    target,
    test_size=0.2,
    random_state=42,
)

In [4]:
# Fit the scaler on the training split only, then apply that same fitted
# scaler to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [9]:
# Train (or continue training) the CatBoost classifier, persist it, and report
# ROC AUC on the hold-out split.
MODEL_PATH = "./outputs/catboost.model"

clf = CatBoostClassifier(n_estimators=1000, learning_rate=0.01, max_depth=5, random_state=42)

# Warm-start from the previously saved model only when one exists, so the cell
# also runs on a fresh checkout instead of failing inside load_model. The path
# is handed to fit() directly rather than loading into `clf` and passing the
# estimator as its own init_model.
# NOTE(review): every re-run with a saved model present trains further on top
# of it, so the score is not reproducible across runs unless the file is removed.
init_model = MODEL_PATH if os.path.exists(MODEL_PATH) else None
clf.fit(X_train_scaled, y_train, init_model=init_model)

# Make sure the output directory exists before writing the model file.
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)
clf.save_model(MODEL_PATH)

# Hard class labels, kept under the original name for any later cell that uses them.
y_pred = clf.predict(X_test_scaled)

# ROC AUC measures ranking quality, so it must be scored on a continuous score
# rather than thresholded labels. Column 1 is the probability of clf.classes_[1]
# (assumes a binary target whose positive class sorts second — TODO confirm).
y_proba = clf.predict_proba(X_test_scaled)[:, 1]
score = roc_auc_score(y_test, y_proba)
print(score)

0:	learn: 0.2218952	total: 16.4ms	remaining: 16.4s
1:	learn: 0.2217626	total: 26.5ms	remaining: 13.2s
2:	learn: 0.2216700	total: 76.9ms	remaining: 25.6s
3:	learn: 0.2215699	total: 92.7ms	remaining: 23.1s
4:	learn: 0.2215290	total: 108ms	remaining: 21.4s
5:	learn: 0.2214575	total: 116ms	remaining: 19.2s
6:	learn: 0.2213387	total: 128ms	remaining: 18.1s
7:	learn: 0.2211323	total: 161ms	remaining: 20s
8:	learn: 0.2210848	total: 182ms	remaining: 20s
9:	learn: 0.2210585	total: 201ms	remaining: 19.9s
10:	learn: 0.2210305	total: 213ms	remaining: 19.1s
11:	learn: 0.2210034	total: 226ms	remaining: 18.6s
12:	learn: 0.2209129	total: 263ms	remaining: 20s
13:	learn: 0.2208869	total: 330ms	remaining: 23.2s
14:	learn: 0.2208609	total: 368ms	remaining: 24.1s
15:	learn: 0.2207625	total: 378ms	remaining: 23.3s
16:	learn: 0.2206653	total: 399ms	remaining: 23.1s
17:	learn: 0.2206307	total: 408ms	remaining: 22.3s
18:	learn: 0.2205870	total: 423ms	remaining: 21.8s
19:	learn: 0.2205072	total: 433ms	remaining

In [8]:
# Pair each feature column with the importance reported by the fitted
# classifier, then list the features from most to least important.
importance_df = pd.DataFrame(
    {"Feature": train_df.columns, "Importance": clf.feature_importances_}
)
importance_df = importance_df.sort_values("Importance", ascending=False)

importance_df

Unnamed: 0,Feature,Importance
6,cloud,29.945982
7,sunshine,14.298193
5,humidity,10.578702
0,pressure,8.76096
9,windspeed,8.505058
4,dewpoint,7.340127
8,winddirection,5.893225
3,mintemp,5.785122
1,maxtemp,4.857001
2,temparature,4.03563
