In [30]:
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
from sklearn.decomposition import PCA
from sklearn.metrics import roc_auc_score

In [31]:
# Load the training table keyed by "id", split off the "rainfall" label,
# and discard the "day" column so only the remaining columns are used as features.
train_df = pd.read_csv("./inputs/train.csv").set_index("id")
target = train_df.pop("rainfall")  # removes the label column and returns it
train_df = train_df.drop(columns="day")
train_df.head()

Unnamed: 0_level_0,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,1017.4,21.2,20.6,19.9,19.4,87.0,88.0,1.1,60.0,17.2
1,1019.5,16.2,16.9,15.8,15.4,95.0,91.0,0.0,50.0,21.9
2,1024.1,19.4,16.1,14.6,9.3,75.0,47.0,8.3,70.0,18.1
3,1013.4,18.1,17.8,16.9,16.8,95.0,95.0,0.0,60.0,35.6
4,1021.8,21.3,18.4,15.2,9.6,52.0,45.0,3.6,40.0,24.8


In [32]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical across re-runs.
X_train, X_test, y_train, y_test = train_test_split(
    train_df,
    target,
    test_size=0.2,
    random_state=42,
)

In [33]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted scaler to both splits so no test-set information leaks in.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [37]:
# Warm-start training: each run of this cell continues boosting from the
# checkpoint written by the previous run (if one exists) and then overwrites
# it, so repeated runs keep adding trees to the saved model.
MODEL_PATH = Path("./outputs/catboost.model")

clf = CatBoostClassifier(n_estimators=1000, learning_rate=0.01, max_depth=5, random_state=42)

# On a fresh checkout there is no checkpoint yet: train from scratch instead of
# crashing in load_model. Passing the path lets fit() load the initial model
# itself, which also keeps the hyper-parameters declared above in effect.
previous_checkpoint = str(MODEL_PATH) if MODEL_PATH.exists() else None
clf.fit(X_train_scaled, y_train, init_model=previous_checkpoint)

# Make sure the output directory exists before writing the checkpoint.
MODEL_PATH.parent.mkdir(parents=True, exist_ok=True)
clf.save_model(str(MODEL_PATH))

# Hard class labels, kept for any other cell that reads `y_pred`.
y_pred = clf.predict(X_test_scaled)

# ROC AUC measures how well the scores rank the samples, so it must be fed
# continuous scores; hard labels collapse the curve to a single threshold.
# NOTE(review): column 1 is taken as the positive class, which assumes
# `rainfall` is a binary 0/1 label -- confirm against the data.
y_proba = clf.predict_proba(X_test_scaled)[:, 1]
score = roc_auc_score(y_test, y_proba)
print(score)

0:	learn: 0.1232060	total: 4.28ms	remaining: 4.28s
1:	learn: 0.1231925	total: 8.25ms	remaining: 4.12s
2:	learn: 0.1231860	total: 12ms	remaining: 4s
3:	learn: 0.1231499	total: 15.4ms	remaining: 3.85s
4:	learn: 0.1231206	total: 19.5ms	remaining: 3.87s
5:	learn: 0.1231002	total: 23.3ms	remaining: 3.85s
6:	learn: 0.1230581	total: 26.2ms	remaining: 3.71s
7:	learn: 0.1230018	total: 29.4ms	remaining: 3.64s
8:	learn: 0.1229841	total: 33.8ms	remaining: 3.72s
9:	learn: 0.1229753	total: 37ms	remaining: 3.66s
10:	learn: 0.1229389	total: 39.5ms	remaining: 3.55s
11:	learn: 0.1229099	total: 41.9ms	remaining: 3.45s
12:	learn: 0.1228884	total: 44.8ms	remaining: 3.4s
13:	learn: 0.1228643	total: 47.6ms	remaining: 3.35s
14:	learn: 0.1228326	total: 49.9ms	remaining: 3.28s
15:	learn: 0.1227925	total: 53.1ms	remaining: 3.27s
16:	learn: 0.1227552	total: 55.7ms	remaining: 3.22s
17:	learn: 0.1227352	total: 58.1ms	remaining: 3.17s
18:	learn: 0.1226997	total: 60.5ms	remaining: 3.12s
19:	learn: 0.1226690	total: 63

In [35]:
# Pair each feature column with the importance reported by the fitted model
# and list them from most to least important.
importance_df = (
    pd.DataFrame(
        {"Feature": train_df.columns, "Importance": clf.feature_importances_}
    )
    .sort_values(by="Importance", ascending=False)
)

importance_df

Unnamed: 0,Feature,Importance
6,cloud,23.99629
7,sunshine,13.473167
5,humidity,9.752917
9,windspeed,9.683757
0,pressure,9.496727
4,dewpoint,7.738979
3,mintemp,7.504243
8,winddirection,6.996134
1,maxtemp,6.037471
2,temparature,5.320316
