# Rain Prediction with Feature Scaling

---



# Importing the libraries

In [71]:
# !pip install xgboost

In [72]:
# !pip install catboost

In [73]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from collections import Counter
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Data preprocessing

In [13]:
dataset = pd.read_csv("../weatherAUS.csv")

In [14]:
pd.set_option("display.max_columns", None) # shows all the columns
dataset

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,WNW,20.0,24.0,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,WSW,4.0,22.0,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,WSW,19.0,26.0,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,E,11.0,9.0,45.0,16.0,1017.6,1012.8,,,18.1,26.5,No,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,NW,7.0,20.0,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145455,2017-06-21,Uluru,2.8,23.4,0.0,,,E,31.0,SE,ENE,13.0,11.0,51.0,24.0,1024.6,1020.3,,,10.1,22.4,No,No
145456,2017-06-22,Uluru,3.6,25.3,0.0,,,NNW,22.0,SE,N,13.0,9.0,56.0,21.0,1023.5,1019.1,,,10.9,24.5,No,No
145457,2017-06-23,Uluru,5.4,26.9,0.0,,,N,37.0,SE,WNW,9.0,9.0,53.0,24.0,1021.0,1016.8,,,12.5,26.1,No,No
145458,2017-06-24,Uluru,7.8,27.0,0.0,,,SE,28.0,SSE,N,13.0,7.0,51.0,24.0,1019.4,1016.5,3.0,2.0,15.1,26.0,No,No


## Counting different features

In [15]:
numerical_features = [feature for feature in dataset.columns if dataset[feature].dtypes != 'object']
continuous_features = [feature for feature in numerical_features if len(dataset[feature].unique()) >= 25]
categorical_features = [feature for feature in dataset.columns if feature not in numerical_features]

In [16]:
print("Numerical features = ", len(numerical_features))
print(numerical_features)
print("Continuous features = ", len(continuous_features))
print(continuous_features)
print("\nCategorical features = ", len(categorical_features))
print(categorical_features)

Numerical features =  16
['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine', 'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am', 'Temp3pm']
Continuous features =  14
['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine', 'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Temp9am', 'Temp3pm']

Categorical features =  7
['Date', 'Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday', 'RainTomorrow']


## Encoding categorical features

In [17]:
windgustdir = {'NNW':0, 'NW':1, 'WNW':2, 'N':3, 'W':4, 'WSW':5, 'NNE':6, 'S':7, 'SSW':8, 'SW':9, 'SSE':10,
               'NE':11, 'SE':12, 'ESE':13, 'ENE':14, 'E':15}
winddir9am = {'NNW':0, 'N':1, 'NW':2, 'NNE':3, 'WNW':4, 'W':5, 'WSW':6, 'SW':7, 'SSW':8, 'NE':9, 'S':10,
              'SSE':11, 'ENE':12, 'SE':13, 'ESE':14, 'E':15}
winddir3pm = {'NW':0, 'NNW':1, 'N':2, 'WNW':3, 'W':4, 'NNE':5, 'WSW':6, 'SSW':7, 'S':8, 'SW':9, 'SE':10,
               'NE':11, 'SSE':12, 'ENE':13, 'E':14, 'ESE':15}

encoder = OrdinalEncoder()

dataset["WindGustDir"] = encoder.fit_transform(dataset[["WindGustDir"]].replace(windgustdir))
dataset["WindDir9am"] = encoder.fit_transform(dataset[["WindDir9am"]].replace(winddir9am))
dataset["WindDir3pm"] = encoder.fit_transform(dataset[["WindDir3pm"]].replace(winddir3pm))

In [18]:
dataset["RainToday"] = pd.get_dummies(dataset["RainToday"], drop_first = True)
dataset["RainTomorrow"] = pd.get_dummies(dataset["RainTomorrow"], drop_first = True)

In [19]:
encoder = LabelEncoder()
dataset['Location']= encoder.fit_transform(dataset['Location'])

In [20]:
dataset

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,2,13.4,22.9,0.6,,,4.0,44.0,5.0,3.0,20.0,24.0,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,False,False
1,2008-12-02,2,7.4,25.1,0.0,,,2.0,44.0,0.0,6.0,4.0,22.0,44.0,25.0,1010.6,1007.8,,,17.2,24.3,False,False
2,2008-12-03,2,12.9,25.7,0.0,,,5.0,46.0,5.0,6.0,19.0,26.0,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,False,False
3,2008-12-04,2,9.2,28.0,0.0,,,11.0,24.0,13.0,14.0,11.0,9.0,45.0,16.0,1017.6,1012.8,,,18.1,26.5,False,False
4,2008-12-05,2,17.5,32.3,1.0,,,4.0,41.0,12.0,0.0,7.0,20.0,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145455,2017-06-21,41,2.8,23.4,0.0,,,15.0,31.0,13.0,13.0,13.0,11.0,51.0,24.0,1024.6,1020.3,,,10.1,22.4,False,False
145456,2017-06-22,41,3.6,25.3,0.0,,,0.0,22.0,13.0,2.0,13.0,9.0,56.0,21.0,1023.5,1019.1,,,10.9,24.5,False,False
145457,2017-06-23,41,5.4,26.9,0.0,,,3.0,37.0,13.0,3.0,9.0,9.0,53.0,24.0,1021.0,1016.8,,,12.5,26.1,False,False
145458,2017-06-24,41,7.8,27.0,0.0,,,12.0,28.0,11.0,2.0,13.0,7.0,51.0,24.0,1019.4,1016.5,3.0,2.0,15.1,26.0,False,False


## Handling missing values

In [21]:
dataset.isnull().sum()*100/len(dataset) # getting missing values in percentages

Date              0.000000
Location          0.000000
MinTemp           1.020899
MaxTemp           0.866905
Rainfall          2.241853
Evaporation      43.166506
Sunshine         48.009762
WindGustDir       7.098859
WindGustSpeed     7.055548
WindDir9am        7.263853
WindDir3pm        2.906641
WindSpeed9am      1.214767
WindSpeed3pm      2.105046
Humidity9am       1.824557
Humidity3pm       3.098446
Pressure9am      10.356799
Pressure3pm      10.331363
Cloud9am         38.421559
Cloud3pm         40.807095
Temp9am           1.214767
Temp3pm           2.481094
RainToday         0.000000
RainTomorrow      0.000000
dtype: float64

handling missing values for columns cloud9am, cloud3pm, evaporation, sunshine because they have the most missing values


In [22]:
def random_sample_imputation(df, column):
    df[column] = df[column]
    rand_sample = df[column].dropna().sample(df[column].isnull().sum(), random_state=0)
    rand_sample.index = df[df[column].isnull()].index
    df.loc[df[column].isnull(), column] = rand_sample

In [23]:
for col in ['Evaporation', 'Sunshine', 'Cloud3pm', 'Cloud9am']:
  random_sample_imputation(dataset, col)

In [24]:
dataset.isnull().sum()

Date                 0
Location             0
MinTemp           1485
MaxTemp           1261
Rainfall          3261
Evaporation          0
Sunshine             0
WindGustDir      10326
WindGustSpeed    10263
WindDir9am       10566
WindDir3pm        4228
WindSpeed9am      1767
WindSpeed3pm      3062
Humidity9am       2654
Humidity3pm       4507
Pressure9am      15065
Pressure3pm      15028
Cloud9am             0
Cloud3pm             0
Temp9am           1767
Temp3pm           3609
RainToday            0
RainTomorrow         0
dtype: int64

In [25]:
imputer = SimpleImputer(strategy='most_frequent')

dataset["WindGustDir"] = imputer.fit_transform(dataset[["WindGustDir"]])
dataset["WindDir9am"] = imputer.fit_transform(dataset[["WindDir9am"]])
dataset["WindDir3pm"] = imputer.fit_transform(dataset[["WindDir3pm"]])

# le = LabelEncoder()

# df['RainToday'] = le.fit_transform(df['RainToday'])
# df['RainTomorrow'] = le.fit_transform(df['RainTomorrow'])

In [26]:
dataset.isnull().sum()

Date                 0
Location             0
MinTemp           1485
MaxTemp           1261
Rainfall          3261
Evaporation          0
Sunshine             0
WindGustDir          0
WindGustSpeed    10263
WindDir9am           0
WindDir3pm           0
WindSpeed9am      1767
WindSpeed3pm      3062
Humidity9am       2654
Humidity3pm       4507
Pressure9am      15065
Pressure3pm      15028
Cloud9am             0
Cloud3pm             0
Temp9am           1767
Temp3pm           3609
RainToday            0
RainTomorrow         0
dtype: int64

Handling missing values of continuous features

In [27]:
for feature in continuous_features:
    if (dataset[feature].isnull().sum() * 100/len(dataset)) > 0:
        dataset[feature] = dataset[feature].fillna(dataset[feature].median())

In [28]:
dataset.isnull().sum()

Date             0
Location         0
MinTemp          0
MaxTemp          0
Rainfall         0
Evaporation      0
Sunshine         0
WindGustDir      0
WindGustSpeed    0
WindDir9am       0
WindDir3pm       0
WindSpeed9am     0
WindSpeed3pm     0
Humidity9am      0
Humidity3pm      0
Pressure9am      0
Pressure3pm      0
Cloud9am         0
Cloud3pm         0
Temp9am          0
Temp3pm          0
RainToday        0
RainTomorrow     0
dtype: int64

## Coverting date column into pandas date

In [29]:
# dataset["Date"] = pd.to_datetime(dataset["Date"], format = "%Y-%m-%dT", errors = "coerce")
dataset['Date'] = pd.to_datetime(dataset['Date'])

In [30]:
dataset["Date_month"] = dataset["Date"].dt.month
dataset["Date_day"] = dataset["Date"].dt.day

In [31]:
dataset.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow,Date_month,Date_day
0,2008-12-01,2,13.4,22.9,0.6,2.4,8.3,4.0,44.0,5.0,3.0,20.0,24.0,71.0,22.0,1007.7,1007.1,8.0,0.0,16.9,21.8,False,False,12,1
1,2008-12-02,2,7.4,25.1,0.0,3.6,10.0,2.0,44.0,0.0,6.0,4.0,22.0,44.0,25.0,1010.6,1007.8,7.0,1.0,17.2,24.3,False,False,12,2
2,2008-12-03,2,12.9,25.7,0.0,2.6,4.4,5.0,46.0,5.0,6.0,19.0,26.0,38.0,30.0,1007.6,1008.7,8.0,2.0,21.0,23.2,False,False,12,3
3,2008-12-04,2,9.2,28.0,0.0,18.4,8.9,11.0,24.0,13.0,14.0,11.0,9.0,45.0,16.0,1017.6,1012.8,0.0,5.0,18.1,26.5,False,False,12,4
4,2008-12-05,2,17.5,32.3,1.0,5.4,3.0,4.0,41.0,12.0,0.0,7.0,20.0,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,False,False,12,5


## Splitting the dataset into X and y

In [32]:
X = dataset.drop(["RainTomorrow", "Date"], axis=1)
y = dataset["RainTomorrow"]

## Applying feature scaling

In [33]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

## Splitting the dataset into training and test set

In [34]:
# stratify helps us to ensure that class proportions in 'y' are maintained in both training and test set
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size = 0.2, stratify = y, random_state = 0)

In [36]:
print(f'Shape of training and test sets:\nX_train = {X_train.shape}\nX_test = {X_test.shape}\ny_train = {y_train.shape}\ny_test = {y_test.shape}')

Shape of training and test sets:
X_train = (116368, 23)
X_test = (29092, 23)
y_train = (116368,)
y_test = (29092,)


## Applying SMOTE to address class imbalance

In [37]:
sm = SMOTE(random_state = 0)
X_train_, y_train_ = sm.fit_resample(X_train, y_train)
print("Number of classes before fit {}".format(Counter(y_train)))
print("Number of classes after fit {}".format(Counter(y_train_)))

Number of classes before fit Counter({False: 90866, True: 25502})
Number of classes after fit Counter({False: 90866, True: 90866})


# Fitting different models

## Logistic Regression

In [38]:
classifier_lr = LogisticRegression(random_state = 0)
classifier_lr.fit(X_train_, y_train_)

In [39]:
y_pred1 = classifier_lr.predict(X_test)

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred1))

accuracy1 = accuracy_score(y_test, y_pred1)
print(f'\nAccuracy = {accuracy1 * 100 :.2f}%')

print("\nClassification Report:")
print(classification_report(y_test, y_pred1))

Confusion Matrix:
[[17893  4824]
 [ 1512  4863]]

Accuracy = 78.22%

Classification Report:
              precision    recall  f1-score   support

       False       0.92      0.79      0.85     22717
        True       0.50      0.76      0.61      6375

    accuracy                           0.78     29092
   macro avg       0.71      0.78      0.73     29092
weighted avg       0.83      0.78      0.80     29092



## KNN Classifier

In [40]:
classifier_knn_5 = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
classifier_knn_5.fit(X_train_, y_train_)

In [41]:
y_pred2 = classifier_knn_5.predict(X_test)

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred2))

accuracy2 = accuracy_score(y_test, y_pred2)
print(f'\nAccuracy = {accuracy2 * 100 :.2f}%')

print("\nClassification Report:")
print(classification_report(y_test, y_pred2))

Confusion Matrix:
[[16710  6007]
 [ 1622  4753]]

Accuracy = 73.78%

Classification Report:
              precision    recall  f1-score   support

       False       0.91      0.74      0.81     22717
        True       0.44      0.75      0.55      6375

    accuracy                           0.74     29092
   macro avg       0.68      0.74      0.68     29092
weighted avg       0.81      0.74      0.76     29092



In [42]:
classifier_knn_4 = KNeighborsClassifier(n_neighbors = 4, metric = 'minkowski', p = 2)
classifier_knn_4.fit(X_train_, y_train_)

In [43]:
y_pred22 = classifier_knn_4.predict(X_test)

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred22))

accuracy22 = accuracy_score(y_test, y_pred22)
print(f'\nAccuracy = {accuracy22 * 100 :.2f}%')

print("\nClassification Report:")
print(classification_report(y_test, y_pred22))

Confusion Matrix:
[[18303  4414]
 [ 2205  4170]]

Accuracy = 77.25%

Classification Report:
              precision    recall  f1-score   support

       False       0.89      0.81      0.85     22717
        True       0.49      0.65      0.56      6375

    accuracy                           0.77     29092
   macro avg       0.69      0.73      0.70     29092
weighted avg       0.80      0.77      0.78     29092



In [44]:
classifier_knn_3 = KNeighborsClassifier(n_neighbors = 3, metric = 'minkowski', p = 2)
classifier_knn_3.fit(X_train_, y_train_)

In [45]:
y_pred222 = classifier_knn_3.predict(X_test)

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred222))

accuracy222 = accuracy_score(y_test, y_pred222)
print(f'\nAccuracy = {accuracy222 * 100 :.2f}%')

print("\nClassification Report:")
print(classification_report(y_test, y_pred222))

Confusion Matrix:
[[17272  5445]
 [ 1884  4491]]

Accuracy = 74.81%

Classification Report:
              precision    recall  f1-score   support

       False       0.90      0.76      0.82     22717
        True       0.45      0.70      0.55      6375

    accuracy                           0.75     29092
   macro avg       0.68      0.73      0.69     29092
weighted avg       0.80      0.75      0.76     29092



## SVM (non-linear)

In [46]:
classifier_svc_rbf = SVC(kernel = 'rbf', random_state = 0)
classifier_svc_rbf.fit(X_train_, y_train_)

In [47]:
y_pred4 = classifier_svc_rbf.predict(X_test)

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred4))

accuracy4 = accuracy_score(y_test, y_pred4)
print(f'\nAccuracy = {accuracy4 * 100 :.2f}%')

print("\nClassification Report:")
print(classification_report(y_test, y_pred4))

Confusion Matrix:
[[18604  4113]
 [ 1528  4847]]

Accuracy = 80.61%

Classification Report:
              precision    recall  f1-score   support

       False       0.92      0.82      0.87     22717
        True       0.54      0.76      0.63      6375

    accuracy                           0.81     29092
   macro avg       0.73      0.79      0.75     29092
weighted avg       0.84      0.81      0.82     29092



## Naive Bayes Classifier

In [48]:
classifier_gnb = GaussianNB()
classifier_gnb.fit(X_train_, y_train_)

In [49]:
y_pred5 = classifier_gnb.predict(X_test)

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred5))

accuracy5 = accuracy_score(y_test, y_pred5)
print(f'\nAccuracy = {accuracy5 * 100 :.2f}%')

print("\nClassification Report:")
print(classification_report(y_test, y_pred5))

Confusion Matrix:
[[18266  4451]
 [ 2023  4352]]

Accuracy = 77.75%

Classification Report:
              precision    recall  f1-score   support

       False       0.90      0.80      0.85     22717
        True       0.49      0.68      0.57      6375

    accuracy                           0.78     29092
   macro avg       0.70      0.74      0.71     29092
weighted avg       0.81      0.78      0.79     29092



## Random Forest Classifier

In [50]:
classifier_rf = RandomForestClassifier(n_estimators = 100, criterion = 'entropy', random_state = 0)
classifier_rf.fit(X_train_, y_train_)

In [51]:
y_pred6 = classifier_rf.predict(X_test)

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred6))

accuracy6 = accuracy_score(y_test, y_pred6)
print(f'\nAccuracy = {accuracy6 * 100 :.2f}%')

print("\nClassification Report:")
print(classification_report(y_test, y_pred6))

Confusion Matrix:
[[20654  2063]
 [ 2414  3961]]

Accuracy = 84.61%

Classification Report:
              precision    recall  f1-score   support

       False       0.90      0.91      0.90     22717
        True       0.66      0.62      0.64      6375

    accuracy                           0.85     29092
   macro avg       0.78      0.77      0.77     29092
weighted avg       0.84      0.85      0.84     29092



## XGBoost Classifier

In [52]:
xgboost = XGBClassifier()
xgboost.fit(X_train_, y_train_)

In [53]:
y_pred7 = xgboost.predict(X_test)

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred7))

accuracy7 = accuracy_score(y_test, y_pred7)
print(f'\nAccuracy = {accuracy7 * 100 :.2f}%')

print("\nClassification Report:")
print(classification_report(y_test, y_pred7))

Confusion Matrix:
[[21403  1314]
 [ 2860  3515]]

Accuracy = 85.65%

Classification Report:
              precision    recall  f1-score   support

       False       0.88      0.94      0.91     22717
        True       0.73      0.55      0.63      6375

    accuracy                           0.86     29092
   macro avg       0.81      0.75      0.77     29092
weighted avg       0.85      0.86      0.85     29092



## Catboost Classifier

In [54]:
catboost = CatBoostClassifier(iterations = 2000, eval_metric = "AUC")
catboost.fit(X_train_, y_train_)

Learning rate set to 0.050311
0:	total: 269ms	remaining: 8m 58s
1:	total: 361ms	remaining: 6m
2:	total: 471ms	remaining: 5m 13s
3:	total: 751ms	remaining: 6m 14s
4:	total: 1.01s	remaining: 6m 45s
5:	total: 1.14s	remaining: 6m 17s
6:	total: 1.2s	remaining: 5m 42s
7:	total: 1.32s	remaining: 5m 29s
8:	total: 1.39s	remaining: 5m 6s
9:	total: 1.45s	remaining: 4m 48s
10:	total: 1.51s	remaining: 4m 33s
11:	total: 1.61s	remaining: 4m 27s
12:	total: 1.68s	remaining: 4m 17s
13:	total: 1.76s	remaining: 4m 10s
14:	total: 1.84s	remaining: 4m 3s
15:	total: 1.93s	remaining: 3m 59s
16:	total: 2s	remaining: 3m 53s
17:	total: 2.07s	remaining: 3m 48s
18:	total: 2.14s	remaining: 3m 43s
19:	total: 2.21s	remaining: 3m 38s
20:	total: 2.29s	remaining: 3m 35s
21:	total: 2.37s	remaining: 3m 33s
22:	total: 2.43s	remaining: 3m 29s
23:	total: 2.49s	remaining: 3m 25s
24:	total: 2.57s	remaining: 3m 23s
25:	total: 2.67s	remaining: 3m 22s
26:	total: 2.76s	remaining: 3m 21s
27:	total: 2.87s	remaining: 3m 22s
28:	total:

<catboost.core.CatBoostClassifier at 0x224a1ab2190>

In [55]:
y_pred_proba = catboost.predict_proba(X_test)

y_pred_proba_pos = y_pred_proba[:, 1]

threshold = 0.5

y_pred88 = (y_pred_proba_pos >= threshold).astype(int)

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred88))

accuracy88 = accuracy_score(y_test, y_pred88)
print(f'\nAccuracy = {accuracy88 * 100 :.2f}%')

print("\nClassification Report:")
print(classification_report(y_test, y_pred88))

Confusion Matrix:
[[21514  1203]
 [ 2796  3579]]

Accuracy = 86.25%

Classification Report:
              precision    recall  f1-score   support

       False       0.88      0.95      0.91     22717
        True       0.75      0.56      0.64      6375

    accuracy                           0.86     29092
   macro avg       0.82      0.75      0.78     29092
weighted avg       0.86      0.86      0.86     29092

