# Predicting rain from class labels

## Imports

In [1]:
import pandas as pd
import numpy as np
import altair as alt
import math

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn import linear_model
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV

## Data set up

In [3]:
SEED = 2660280232880537243 % 2**32

In [4]:
all_data = pd.read_csv('../data/predictions-fine-train.csv')

In [5]:
features = list(all_data.columns[3:-1])
precip = 'precipitation[mm]'
label = 'rain_binary'

In [6]:
all_data[label] = (all_data[precip] > 0).astype(int)

In [7]:
all_data.head()

Unnamed: 0,node_timestamp,precipitation[mm],node,small-sounding-engine,medium-sounding-engine,large-sounding-engine,rock-drill,jackhammer,hoe-ram,pile-driver,...,reverse-beeper,stationary-music,mobile-music,ice-cream-truck,person-or-small-group-talking,person-or-small-group-shouting,large-crowd,amplified-speech,dog-barking-whining,rain_binary
0,1499439000.0,4.57,sonycnode-b827ebb40450.sonyc,0.050979,0.175963,0.445077,0.022027,0.01103,0.003211,0.01159,...,0.836669,0.014445,0.013124,0.005841,0.264278,0.378792,0.010296,0.003326,0.039958,1
1,1491510000.0,1.52,sonycnode-b827eb2a1bce.sonyc,0.076949,0.202095,0.179304,0.012556,0.007496,0.003484,0.006339,...,0.012783,0.013436,0.01079,0.004742,0.853698,0.079865,0.024335,0.006472,0.013903,1
2,1502420000.0,0.0,sonycnode-b827eb5895e9.sonyc,0.055956,0.170309,0.31506,0.01377,0.007515,0.003281,0.007144,...,0.018277,0.010777,0.012352,0.005267,0.089876,0.021077,0.01318,0.00377,0.013475,0
3,1492887000.0,0.25,sonycnode-b827eb132382.sonyc,0.045899,0.237363,0.300532,0.014068,0.01097,0.006509,0.012917,...,0.027464,0.016198,0.014329,0.005145,0.692143,0.06068,0.021127,0.005001,0.015498,1
4,1490255000.0,0.0,sonycnode-b827eb0fedda.sonyc,0.186144,0.262893,0.460561,0.042873,0.027672,0.013615,0.01605,...,0.011741,0.006938,0.007526,0.004744,0.034499,0.007029,0.007325,0.002706,0.009156,0


In [8]:
X = all_data[features].to_numpy()
y = all_data[label].to_numpy()

In [9]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=SEED)

## Metrics

In [10]:
def get_metrics(y_true, y_pred, show=True):
    f1 = f1_score(y_val, y_pred, average='binary')
    acc = accuracy_score(y_val, y_pred)
    
    if show:
        print(f'F1 Score: {f1:0.3f}\nAccuracy: {acc:0.2%}')
    
    return {
        'F1': f1,
        'accuracy': acc
    }

## Random forest

In [16]:
# Results:
# {'max_depth': 32, 'min_samples_split': 4, 'n_estimators': 400}
#
# F1 Score: 0.728
# Accuracy: 74.19%
#
# param_grid = {
#     'max_depth': [2, 4, 8, 16, 32, 64],
#     'n_estimators': [50, 100, 200, 300, 400],
#     'min_samples_split': [2, 4, 8, 16, 32],
# }

# rf = RandomForestClassifier()
# clf_rf = GridSearchCV(rf, param_grid, cv=5)
# clf_rf.fit(X_train, y_train)
# clf_rf.best_params_

In [17]:
clf_rf = RandomForestClassifier(max_depth=32, min_samples_split=4, n_estimators=400)
clf_rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=32, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=4,
                       min_weight_fraction_leaf=0.0, n_estimators=400,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [18]:
y_pred = clf_rf.predict(X_val)

In [19]:
_ = get_metrics(y_val, y_pred)

F1 Score: 0.727
Accuracy: 74.19%


## Logistic Regression

In [20]:
# Results: {'C': 10.0, 'tol': 1e-07}
param_grid = {
    'tol': [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2],
    'C': [0.001, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0],
}

logistic = linear_model.LogisticRegression()
clf_logistic = GridSearchCV(logistic, param_grid, cv=5)
clf_logistic.fit(X_train, y_train)
y_pred = clf_logistic.predict(X_val)
_ = get_metrics(y_val, y_pred)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

F1 Score: 0.679
Accuracy: 67.76%


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [26]:
list(zip(features, list(clf_logistic.best_estimator_.coef_[0])))

[('small-sounding-engine', 20.340056556734417),
 ('medium-sounding-engine', -1.2915218191060425),
 ('large-sounding-engine', -0.5739550820542987),
 ('rock-drill', -8.918079423213486),
 ('jackhammer', 1.1554112737839186),
 ('hoe-ram', -10.716259645338216),
 ('pile-driver', 12.487441087903834),
 ('non-machinery-impact', -0.4176778400194621),
 ('chainsaw', -14.547638571572438),
 ('small-medium-rotating-saw', 6.982692270664287),
 ('large-rotating-saw', 4.655055517485288),
 ('car-horn', 0.03751783516675841),
 ('car-alarm', -16.15495276439222),
 ('siren', 0.8058676239346603),
 ('reverse-beeper', 0.8740975096232834),
 ('stationary-music', -12.941228731898274),
 ('mobile-music', -29.411127264938113),
 ('ice-cream-truck', -3.2858524543721632),
 ('person-or-small-group-talking', -0.6353464611431304),
 ('person-or-small-group-shouting', 2.0099833202418806),
 ('large-crowd', -0.07943676581612905),
 ('amplified-speech', -1.5747762563243524)]

## SVM

In [29]:
# Results: {'C': 10.0, 'gamma': 'scale', 'kernel': 'rbf'}
# param_grid = {
#     'C': [0.001, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0],
#     'gamma': ['scale', 'auto'],
#     'kernel': ['linear', 'rbf', 'poly', 'sigmoid']
# }
# svm = SVC()
# clf_svm = GridSearchCV(svm, param_grid, cv=5)
# clf_svm.fit(X_train, y_train)
# print(clf_svm.best_params_)

{'C': 10.0, 'gamma': 'scale', 'kernel': 'rbf'}


In [30]:
clf_svm = SVC(gamma='scale', kernel='rbf', C=10)
clf_svm.fit(X_train, y_train)

SVC(C=10, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [31]:
y_pred = clf_svm.predict(X_val)
_ = get_metrics(y_val, y_pred)

F1 Score: 0.683
Accuracy: 70.51%


## KNN Classifier

In [27]:
# Results: {'algorithm': 'auto', 'n_neighbors': 50, 'p': 1, 'weights': 'distance'}
param_grid = {
    'n_neighbors': [1, 3, 5, 10, 15, 20, 25, 50],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'p': [1, 2],
}
knn = KNeighborsClassifier()
clf_knn = GridSearchCV(knn, param_grid, cv=5)
clf_knn.fit(X_train, y_train)
print(clf_knn.best_params_)

{'algorithm': 'auto', 'n_neighbors': 50, 'p': 1, 'weights': 'distance'}


In [28]:
# clf_knn = KNeighborsClassifier(n_neighbors=50, p=1, weights='distance')
# clf_knn.fit(X_train, y_train)
y_pred = clf_knn.predict(X_val)
_ = get_metrics(y_val, y_pred)

F1 Score: 0.674
Accuracy: 68.72%


## Adaboost

In [None]:
# Result: {'algorithm': 'SAMME.R', 'learning_rate': 0.1, 'n_estimators': 500}
# param_grid = {
#     'n_estimators': [10, 25, 50, 75, 100, 250, 500],
#     'learning_rate': [0.01, 0.05, 0.1, 0.5, 1, 5, 10, 50],
#     'algorithm': ['SAMME', 'SAMME.R'],
# }
# ada = AdaBoostClassifier()
# clf_ada = GridSearchCV(ada, param_grid, cv=5)
# clf_ada.fit(X_train, y_train)
# print(clf_ada.best_params_)

In [None]:
clf_ada = AdaBoostClassifier(n_estimators=500, learning_rate=0.1)
clf_ada.fit(X_train, y_train)
y_pred = clf_ada.predict(X_val)
_ = get_metrics(y_val, y_pred)

## Neural Network

In [None]:
# Result: {
#     'activation': 'relu',
#     'alpha': 0.001,
#     'hidden_layer_sizes': (32, 64),
#     'learning_rate_init': 0.001, 'solver': 'adam'
# }
# param_grid = [
#     {
#         'hidden_layer_sizes': [(16,), (64,), (16, 16), (16, 32), (32, 64)],
#         'activation': ['logistic', 'tanh', 'relu'],
#         'solver': ['adam'],
#         'alpha': [0.00001, 0.0001, 0.001],
#         'learning_rate_init': [0.0001, 0.001, 0.01]
#     },
#     {
#         'hidden_layer_sizes': [(16,), (64,), (16, 16), (16, 32), (32, 64)],
#         'activation': ['logistic', 'tanh', 'relu'],
#         'solver': ['sgd'],
#         'learning_rate': ['constant', 'invscaling', 'adaptive'],
#         'alpha': [0.00001, 0.0001, 0.001],
#         'learning_rate_init': [0.0001, 0.001, 0.01],
#     },
#     {
#         'hidden_layer_sizes': [(16,), (64,), (16, 16), (16, 32), (32, 64)],
#         'activation': ['logistic', 'tanh', 'relu'],
#         'solver': ['lbfgs'],
#         'alpha': [0.00001, 0.0001, 0.001],
#     },
# ]
# mpl = MLPClassifier()
# clf_mlp = GridSearchCV(mpl, param_grid, cv=5)
# clf_mlp.fit(X_train, y_train)
# print(clf_mlp.best_params_)

In [None]:
clf_mlp = MLPClassifier(activation='relu',
                        alpha=0.001,
                        hidden_layer_sizes=(32, 64),
                        learning_rate_init=0.001,
                        solver='adam')

clf_mlp.fit(X_train, y_train)

In [None]:
y_pred = clf_mlp.predict(X_val)

In [None]:
_ = get_metrics(y_val, y_pred)