# Predicting rain from class labels

## Imports

In [None]:
import pandas as pd
import numpy as np
import altair as alt
import math

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn import linear_model
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV

## Data setup

In [None]:
# Keep only the low 32 bits of the large seed value; for a non-negative
# integer this is the same as taking it modulo 2**32.
SEED = 2660280232880537243 & 0xFFFFFFFF

In [None]:
# Load the training and test tables from CSV. The paths are relative, so they
# resolve against the current working directory.
all_train_data = pd.read_csv(filepath_or_buffer='../data/predictions-fine-train.csv')
all_test_data = pd.read_csv(filepath_or_buffer='../data/predictions-fine-test.csv')

In [None]:
# Column holding the raw precipitation amount, and the name of the binary
# target column derived from it.
precip = 'precipitation[mm]'
label = 'rain_binary'

# Feature columns are picked by position: everything from the fourth column up
# to (but not including) the last column of the training table.
# NOTE(review): this depends on the column layout at the time it runs — confirm
# it is evaluated before any extra columns are appended to the frame.
features = all_train_data.columns[3:-1].tolist()

In [None]:
# Display the selected feature column names for inspection.
features

In [None]:
# Binary target: 1 where the precipitation column is strictly greater than 0,
# otherwise 0. Added as a new column to both tables.
all_train_data[label] = all_train_data[precip].gt(0).astype(int)
all_test_data[label] = all_test_data[precip].gt(0).astype(int)

In [None]:
# Preview the first rows of the training table (now including the label column).
all_train_data.head()

In [None]:
# Training feature matrix and target vector as NumPy arrays.
X_train, y_train = (
    all_train_data[columns].to_numpy() for columns in (features, label)
)

In [None]:
# Test feature matrix and target vector as NumPy arrays.
X_test, y_test = (
    all_test_data[columns].to_numpy() for columns in (features, label)
)

## Metrics

In [None]:
def get_metrics(y_true, y_pred, show=True):
    """Compute binary-classification metrics and optionally print them.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        show: When true, print the F1 score, the accuracy and the four
            confusion-matrix counts.

    Returns:
        A dict with the keys ``'F1'``, ``'Accuracy'`` and
        ``'Confusion Matrix'``; the last one is the list ``[tn, fp, fn, tp]``.
    """
    f1 = f1_score(y_true, y_pred, average='binary')
    acc = accuracy_score(y_true, y_pred)

    cm = confusion_matrix(y_true, y_pred)
    if cm.shape != (2, 2):
        # When only one class occurs in both inputs the matrix collapses to
        # 1x1 and cannot be unpacked into four counts. Pin the 0/1 label set
        # so the missing class contributes explicit zero counts instead.
        cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()

    if show:
        print(f'F1 Score: {f1:0.3f}')
        print(f'Accuracy: {acc:0.2%}')
        print(f'TN: {tn}, FP: {fp}, FN: {fn}, TP: {tp}')

    return {
        'F1': f1,
        'Accuracy': acc,
        'Confusion Matrix': [tn, fp, fn, tp],
    }

## Random forest

### Best parameters:

```
{'max_depth': 64, 'min_samples_split': 8, 'n_estimators': 300}
```

### Test results:
F1 Score: 0.736

Accuracy: 75.09%

TN: 4592, FP: 1140, FN: 1700, TP: 3968

In [None]:
# Hyper-parameter grid for the random forest.
param_grid = {
    'max_depth': [2, 4, 8, 16, 32, 64],
    'n_estimators': [50, 100, 200, 300, 400],
    'min_samples_split': [2, 4, 8, 16, 32],
}

# Seed the forest with the notebook-wide SEED so repeated runs of the search
# are reproducible (an unseeded forest draws different bootstrap samples and
# feature subsets on every run).
rf = RandomForestClassifier(random_state=SEED)
# 5-fold cross-validated exhaustive search over the grid.
clf_rf = GridSearchCV(rf, param_grid, cv=5, verbose=2)
clf_rf.fit(X_train, y_train)
# Last expression of the cell: displays the best parameter combination.
clf_rf.best_params_

In [None]:
# Predict test-set labels with the fitted grid search object.
y_pred_rf = clf_rf.predict(X_test)

In [None]:
# Print the random-forest test metrics; the returned dict is discarded.
_ = get_metrics(y_test, y_pred_rf)

## SVM

### Best parameters:
```
{'C': 500.0, 'gamma': 'scale', 'kernel': 'rbf'}
```

### Test results:
F1 Score: 0.713

Accuracy: 73.11%

TN: 4522, FP: 1210, FN: 1855, TP: 3813

In [None]:
# Hyper-parameter grid for the support vector classifier.
param_grid = dict(
    C=[0.1, 0.5, 1.0, 5.0, 10.0, 50.0, 100.0, 500.0],
    gamma=['scale', 'auto'],
    kernel=['rbf', 'poly', 'sigmoid'],
)

svm = SVC()
# 5-fold cross-validated exhaustive search over the grid.
clf_svm = GridSearchCV(estimator=svm, param_grid=param_grid, cv=5, verbose=2)
clf_svm.fit(X_train, y_train)

# Report the winning parameter combination.
print(clf_svm.best_params_)

In [None]:
# Predict test-set labels with the fitted SVM grid search object.
y_pred_svm = clf_svm.predict(X_test)
# Print the SVM test metrics; the returned dict is discarded.
_ = get_metrics(y_test, y_pred_svm)

## KNN Classifier

### Best parameters:
```
{'algorithm': 'auto', 'n_neighbors': 50, 'p': 1, 'weights': 'distance'}
```

### Test results:

F1 Score: 0.671

Accuracy: 68.52%

TN: 4146, FP: 1586, FN: 2003, TP: 3665


In [None]:
# Hyper-parameter grid for the k-nearest-neighbours classifier.
# NOTE(review): `algorithm` presumably only selects the neighbour-search
# strategy and should not change predictions; if so, searching over it mainly
# multiplies the run time — confirm before trimming this axis.
param_grid = dict(
    n_neighbors=[1, 5, 10, 50, 100, 500, 1000],
    weights=['uniform', 'distance'],
    algorithm=['auto', 'ball_tree', 'kd_tree', 'brute'],
    p=[1, 2],
)

knn = KNeighborsClassifier()
# 5-fold cross-validated exhaustive search over the grid.
clf_knn = GridSearchCV(estimator=knn, param_grid=param_grid, cv=5, verbose=2)
clf_knn.fit(X_train, y_train)

# Report the winning parameter combination.
print(clf_knn.best_params_)

In [None]:
# Predict test-set labels with the fitted KNN grid search object.
y_pred_knn = clf_knn.predict(X_test)
# Print the KNN test metrics; the returned dict is discarded.
_ = get_metrics(y_test, y_pred_knn)

## Neural Network

### Best parameters:
```
{
    'activation': 'relu',
    'alpha': 0.0001,
    'hidden_layer_sizes': (32, 64),
    'learning_rate_init': 0.01,
    'solver': 'adam'
}
```

### Test results:

F1 Score: 0.701

Accuracy: 73.99%

TN: 4953, FP: 779, FN: 2186, TP: 3482

In [None]:
# Hyper-parameter grid for the multi-layer perceptron (a list holding a single
# grid dict).
param_grid = [
    {
        'hidden_layer_sizes': [(32,), (16, 16), (32, 64), (16, 16, 16)],
        'activation': ['logistic', 'tanh', 'relu'],
        'solver': ['adam'],
        'alpha': [0.0001, 0.001, 0.01],
        'learning_rate_init': [0.0001, 0.001, 0.01]
    },
]
# Seed the network with the notebook-wide SEED so weight initialisation and
# batch shuffling are reproducible across runs of the search.
# NOTE(review): the name `mpl` looks like a typo for `mlp`; it is kept unchanged
# in case other cells refer to it.
mpl = MLPClassifier(random_state=SEED)
# 5-fold cross-validated exhaustive search over the grid.
clf_mlp = GridSearchCV(mpl, param_grid, cv=5, verbose=2)
clf_mlp.fit(X_train, y_train)
print(clf_mlp.best_params_)

In [None]:
# Predict test-set labels with the fitted MLP grid search object.
y_pred_mlp = clf_mlp.predict(X_test)

In [None]:
# Print the neural-network test metrics; the returned dict is discarded.
_ = get_metrics(y_test, y_pred_mlp)