In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from tabulate import tabulate
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the dataset from its project-relative CSV file and preview the first rows.
csv_path = 'dataset/dataset.csv'
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Label every row with a rainfall class (0-6) derived from df['rainfall'].
rain = df['rainfall']

# One boolean mask per class, using the same band edges as the original loop:
# class 0 is exactly zero, classes 1-5 are the half-open bands [low, high)
# (class 1 excludes 0 itself).
band_masks = [
    rain == 0,
    (rain > 0) & (rain < 2.5),
    (rain >= 2.5) & (rain < 7.5),
    (rain >= 7.5) & (rain < 35.5),
    (rain >= 35.5) & (rain < 64.4),
    (rain >= 64.4) & (rain < 124.4),
]

# np.select picks the label of the first matching mask. Rows matching no mask
# (values >= 124.4, but also negative or missing values) fall through to class 6,
# exactly as the original else-branch did.
rainfall_classification = np.select(band_masks, [0, 1, 2, 3, 4, 5], default=6).tolist()

df['rainfall_classification'] = rainfall_classification
df.head(10)

In [None]:
# Model inputs: the calendar fields followed by the weather measurements.
feature_columns = [
    'date', 'month', 'year',
    'temperature', 'specific humidity', 'relative humidity',
    'surface pressure', 'wind speed', 'wind direction',
]
X = df[feature_columns]

# Target: the rainfall class label stored in the 'rainfall_classification' column.
y = df['rainfall_classification']

# Seeded 80/20 hold-out split shared at module level.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

### AdaBoost

In [None]:
def AB(X, y, random_state=42):
    """Grid-search an AdaBoost classifier and score the winner on a hold-out split.

    Parameters
    ----------
    X : feature table accepted by ``train_test_split``.
    y : target labels aligned with ``X``.
    random_state : int, default 42
        Seed for the hold-out split, the CV fold shuffling and the booster, so
        repeated calls report the same numbers.

    Returns
    -------
    list
        ``[best parameters as a newline-separated string,
        best mean cross-validated accuracy, accuracy on the 20% hold-out split]``.
    """
    # Seeded split: without a seed the hold-out set (and the reported accuracy)
    # changed on every call.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=random_state
    )

    base_estimator = DecisionTreeClassifier(max_depth=1)

    # scikit-learn 1.2 renamed AdaBoost's `base_estimator` argument to `estimator`
    # and later removed the old name; use whichever the installed version exposes.
    est_key = 'estimator' if 'estimator' in AdaBoostClassifier().get_params() else 'base_estimator'

    param_grid = {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 1],
        est_key + '__max_depth': [1, 2, 3],
        est_key + '__max_features': [None, 'sqrt', 'log2']
    }

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)
    # NOTE(review): algorithm='SAMME.R' is deprecated in recent scikit-learn
    # releases -- confirm against the installed version.
    adaboost_clf = AdaBoostClassifier(
        **{est_key: base_estimator}, algorithm='SAMME.R', random_state=random_state
    )
    grid_search = GridSearchCV(adaboost_clf, param_grid=param_grid, cv=cv, scoring='accuracy')
    grid_search.fit(X_train, y_train)

    # GridSearchCV has already refitted the best configuration on the whole training split.
    best_estimator = grid_search.best_estimator_

    # Evaluate on the rows that were held out of the search.
    predictions = best_estimator.predict(X_test)

    return [str(grid_search.best_params_).replace(',','\n'), grid_search.best_score_,accuracy_score(y_test, predictions)]

print("\nAdaBoost :\n")

# Column titles for the summary table; its single row comes from one AB run.
head = ["Parameters", "CV score", "Accuracy"]
data = [AB(X, y)]

print(tabulate(data, headers=head, tablefmt="grid"))

### Decision Tree

In [None]:
def DT(X, y, random_state=42):
    """Grid-search a decision tree and score the winner on a hold-out split.

    Parameters
    ----------
    X : feature table accepted by ``train_test_split``.
    y : target labels aligned with ``X``.
    random_state : int, default 42
        Seed for the hold-out split and for the tree, so repeated calls
        report the same numbers.

    Returns
    -------
    list
        ``[best parameters as a newline-separated string,
        best mean cross-validated accuracy, accuracy on the 20% hold-out split]``.
    """
    # Seeded split: without a seed the hold-out set changed on every call.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=random_state
    )

    parameters = {
        'criterion': ['gini', 'entropy'],
        'max_depth': [None, 1, 2, 3, 4, 5],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': [None, 'sqrt', 'log2']
    }

    dt_classifier = DecisionTreeClassifier(random_state=random_state)
    cv_method = StratifiedKFold(n_splits=10)
    grid_search = GridSearchCV(dt_classifier, parameters, cv=cv_method, scoring='accuracy')
    grid_search.fit(X_train, y_train)

    # GridSearchCV already refits the best configuration on the full training
    # split, so reuse that model instead of building and fitting a second tree.
    best_estimator = grid_search.best_estimator_

    # Evaluate on the held-out test rows.
    predictions = best_estimator.predict(X_test)

    return [str(grid_search.best_params_).replace(',','\n'), grid_search.best_score_,accuracy_score(y_test, predictions)]

print("\nDecision Tree :\n")

# Column titles for the summary table; its single row comes from one DT run.
head = ["Parameters", "CV score", "Accuracy"]
data = [DT(X, y)]

print(tabulate(data, headers=head, tablefmt="grid"))

### Extra Tree

In [None]:
def ET(X, y, random_state=42):
    """Grid-search an extra-trees classifier and score the winner on a hold-out split.

    Parameters
    ----------
    X : feature table accepted by ``train_test_split``.
    y : target labels aligned with ``X``.
    random_state : int, default 42
        Seed for the hold-out split and for the ensemble, so repeated calls
        report the same numbers.

    Returns
    -------
    list
        ``[best parameters as a newline-separated string,
        best mean cross-validated accuracy, accuracy on the 20% hold-out split]``.
    """
    # Seeded split: without a seed the hold-out set changed on every call.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=random_state
    )

    # 'auto' was dropped from max_features: for classifiers it was an alias of
    # 'sqrt' (a duplicate candidate), and newer scikit-learn rejects it outright.
    parameters = {'n_estimators': [10, 50, 100],
                  'max_features': ['sqrt', 'log2'],
                  'max_depth': [3, 5, 7, None],
                  'min_samples_split': [2, 5, 10],
                  'min_samples_leaf': [1, 2, 4],
                  'criterion': ['gini', 'entropy']}

    extra_trees = ExtraTreesClassifier(random_state=random_state)
    cv = StratifiedKFold(n_splits=5)
    grid_search = GridSearchCV(extra_trees, parameters, cv=cv, scoring='accuracy')
    grid_search.fit(X_train, y_train)

    # GridSearchCV has already refitted the best configuration on the whole training split.
    best_estimator = grid_search.best_estimator_

    # Evaluate on the held-out test rows.
    predictions = best_estimator.predict(X_test)

    return [str(grid_search.best_params_).replace(',','\n'), grid_search.best_score_,accuracy_score(y_test, predictions)]

print("\nExtra Tree :\n")

# Column titles for the summary table; its single row comes from one ET run.
head = ["Parameters", "CV score", "Accuracy"]
data = [ET(X, y)]

print(tabulate(data, headers=head, tablefmt="grid"))

### Gaussian NB

In [None]:
def GNB(X, y, random_state=42):
    """Grid-search Gaussian naive Bayes smoothing and score the winner on a hold-out split.

    Parameters
    ----------
    X : feature table accepted by ``train_test_split``.
    y : target labels aligned with ``X``.
    random_state : int, default 42
        Seed for the hold-out split and the CV fold shuffling, so repeated
        calls report the same numbers.

    Returns
    -------
    list
        ``[best parameters as a newline-separated string,
        best mean cross-validated accuracy, accuracy on the 20% hold-out split]``.
    """
    # Seeded split: without a seed the hold-out set changed on every call.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=random_state
    )

    # 100 log-spaced candidates from 1 down to 1e-9.
    parameters = {'var_smoothing': np.logspace(0,-9, num=100)}

    nb = GaussianNB()
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)
    # scoring is spelled out so the "CV score" column is explicitly accuracy,
    # matching the AdaBoost / decision-tree / extra-trees searches.
    grid_search = GridSearchCV(nb, parameters, cv=cv, scoring='accuracy', n_jobs=-1, verbose=1)
    grid_search.fit(X_train, y_train)

    # GridSearchCV has already refitted the best configuration on the whole training split.
    best_estimator = grid_search.best_estimator_

    # Evaluate on the held-out test rows.
    predictions = best_estimator.predict(X_test)

    return [str(grid_search.best_params_).replace(',','\n'), grid_search.best_score_,accuracy_score(y_test, predictions)]

print("\nGaussian NB:\n")

# Column titles for the summary table; its single row comes from one GNB run.
head = ["Parameters", "CV score", "Accuracy"]
data = [GNB(X, y)]

print(tabulate(data, headers=head, tablefmt="grid"))

### Gradient Boost

In [None]:
def GB(X, y, random_state=42):
    """Grid-search a gradient-boosting classifier and score the winner on a hold-out split.

    Parameters
    ----------
    X : feature table accepted by ``train_test_split``.
    y : target labels aligned with ``X``.
    random_state : int, default 42
        Seed for the hold-out split, the CV fold shuffling and the booster
        (whose fitting is stochastic when ``subsample`` < 1), so repeated calls
        report the same numbers.

    Returns
    -------
    list
        ``[best parameters as a newline-separated string,
        best mean cross-validated accuracy, accuracy on the 20% hold-out split]``.
    """
    # Seeded split: without a seed the hold-out set changed on every call.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=random_state
    )

    parameters = {'learning_rate': [0.01, 0.1, 1],
                  'max_depth': [3, 5, 7],
                  'n_estimators': [50, 100, 200],
                  'subsample': [0.5, 0.75, 1]}

    gb = GradientBoostingClassifier(random_state=random_state)
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)
    # scoring is spelled out so the "CV score" column is explicitly accuracy,
    # matching the AdaBoost / decision-tree / extra-trees searches.
    grid_search = GridSearchCV(gb, parameters, cv=cv, scoring='accuracy', n_jobs=-1, verbose=1)
    grid_search.fit(X_train, y_train)

    # GridSearchCV has already refitted the best configuration on the whole training split.
    best_estimator = grid_search.best_estimator_

    # Evaluate on the held-out test rows.
    predictions = best_estimator.predict(X_test)

    return [str(grid_search.best_params_).replace(',','\n'), grid_search.best_score_,accuracy_score(y_test, predictions)]

print("\nGradient Boost:\n")

# Column titles for the summary table; its single row comes from one GB run.
head = ["Parameters", "CV score", "Accuracy"]
data = [GB(X, y)]

print(tabulate(data, headers=head, tablefmt="grid"))

### Logistic Regression

In [None]:
def LR(X, y, random_state=42):
    """Grid-search a logistic regression and score the winner on a hold-out split.

    The search space is a list of sub-grids so that only penalty/solver pairs
    the solvers actually support are tried. The original flat grid crossed every
    penalty with every solver, so a large share of its candidates failed during
    fitting and were silently scored as NaN.

    Parameters
    ----------
    X : feature table accepted by ``train_test_split``.
    y : target labels aligned with ``X``.
    random_state : int, default 42
        Seed for the hold-out split, the CV fold shuffling and the model, so
        repeated calls report the same numbers.

    Returns
    -------
    list
        ``[best parameters as a newline-separated string,
        best mean cross-validated accuracy, accuracy on the 20% hold-out split]``.
    """
    # Seeded split: without a seed the hold-out set changed on every call.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=random_state
    )

    # Settings explored for every penalised variant.
    shared = {
        'C': [0.01, 0.1, 1, 10, 100],
        'max_iter': [100, 250, 500],
        'class_weight': [None, 'balanced']
    }

    parameters = [
        # l2 is supported by every solver in the original list.
        {**shared, 'penalty': ['l2'],
         'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']},
        # l1 is only supported by liblinear and saga.
        {**shared, 'penalty': ['l1'], 'solver': ['liblinear', 'saga']},
        # elasticnet needs saga plus an explicit l1_ratio; without one every
        # elasticnet candidate of the original grid failed to fit.
        {**shared, 'penalty': ['elasticnet'], 'solver': ['saga'],
         'l1_ratio': [0.25, 0.5, 0.75]},
        # No penalty: unsupported by liblinear, and C has no effect, so C is not varied.
        # NOTE(review): newer scikit-learn spells this option None instead of
        # 'none' -- confirm against the installed version.
        {'penalty': ['none'], 'solver': ['newton-cg', 'lbfgs', 'sag', 'saga'],
         'max_iter': [100, 250, 500], 'class_weight': [None, 'balanced']},
    ]

    lr = LogisticRegression(random_state=random_state)
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)
    # scoring is spelled out so the "CV score" column is explicitly accuracy,
    # matching the AdaBoost / decision-tree / extra-trees searches.
    grid_search = GridSearchCV(lr, parameters, cv=cv, scoring='accuracy', n_jobs=-1, verbose=1)
    grid_search.fit(X_train, y_train)

    # GridSearchCV has already refitted the best configuration on the whole training split.
    best_estimator = grid_search.best_estimator_

    # Evaluate on the held-out test rows.
    predictions = best_estimator.predict(X_test)

    return [str(grid_search.best_params_).replace(',','\n'), grid_search.best_score_,accuracy_score(y_test, predictions)]

print("\nLogistic Regression:\n")

# Column titles for the summary table; its single row comes from one LR run.
head = ["Parameters", "CV score", "Accuracy"]
data = [LR(X, y)]

print(tabulate(data, headers=head, tablefmt="grid"))

### Random Forest

In [None]:
def RF(X, y, random_state=42):
    """Grid-search a random forest and score the winner on a hold-out split.

    Parameters
    ----------
    X : feature table accepted by ``train_test_split``.
    y : target labels aligned with ``X``.
    random_state : int, default 42
        Seed for the hold-out split, the CV fold shuffling and the forest, so
        repeated calls report the same numbers.

    Returns
    -------
    list
        ``[best parameters as a newline-separated string,
        best mean cross-validated accuracy, accuracy on the 20% hold-out split]``.
    """
    # Seeded split: without a seed the hold-out set changed on every call.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=random_state
    )

    parameters = {
        "n_estimators": [100, 250, 500],
        "max_depth": [3, 5, 7],
        "max_features": ["sqrt", "log2", 0.5, None],
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf": [1, 2, 4]
    }

    rf = RandomForestClassifier(random_state=random_state)
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)
    # scoring is spelled out so the "CV score" column is explicitly accuracy,
    # matching the AdaBoost / decision-tree / extra-trees searches.
    grid_search = GridSearchCV(rf, parameters, cv=cv, scoring='accuracy', n_jobs=-1, verbose=1)
    grid_search.fit(X_train, y_train)

    # GridSearchCV has already refitted the best configuration on the whole training split.
    best_estimator = grid_search.best_estimator_

    # Evaluate on the held-out test rows.
    predictions = best_estimator.predict(X_test)

    return [str(grid_search.best_params_).replace(',','\n'), grid_search.best_score_,accuracy_score(y_test, predictions)]

print("\nRandom Forest:\n")

# Column titles for the summary table; its single row comes from one RF run.
head = ["Parameters", "CV score", "Accuracy"]
data = [RF(X, y)]

print(tabulate(data, headers=head, tablefmt="grid"))

### Prediction

In [None]:
def get_data(df, month, feature, year=2020):
    """Return the mean of one column over a given month of a reference year.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'year' and 'month' columns plus the ``feature`` column.
    month : value compared for equality against ``df['month']``.
    feature : str
        Name of the column to average.
    year : value compared for equality against ``df['year']``.
        Defaults to 2020, the year that used to be hard-coded here.

    Returns
    -------
    list
        One-element list holding the mean, ready to be used as a single-row
        DataFrame column. The mean is NaN when no row matches.
    """
    # One combined mask instead of two successive filtered copies.
    in_period = (df['year'] == year) & (df['month'] == month)
    return [df.loc[in_period, feature].mean()]

# Calendar date to predict for.
date = 1
month = 8
year = 2023

# The weather columns are filled with the per-month means returned by get_data
# (computed from the rows of `df` for the same month).
weather_features = [
    'temperature', 'specific humidity', 'relative humidity',
    'surface pressure', 'wind speed', 'wind direction',
]

# Single-row input frame; the key order matches the column order of X, which
# the model is fitted on. Kept under its own name so `pred` only ever refers
# to the model output.
pred_input = pd.DataFrame.from_dict({
    'date': [date],
    'month': [month],
    'year': [year],
    **{feature: get_data(df, month, feature) for feature in weather_features},
})

# NOTE(review): hyperparameters are fixed by hand -- presumably copied from a
# random-forest grid-search run; confirm they are still the best ones.
# random_state makes the fitted forest, and so the printed prediction, reproducible.
model = RandomForestClassifier(
    max_depth=7,
    max_features=None,
    min_samples_leaf=1,
    min_samples_split=5,
    n_estimators=250,
    random_state=42,
)
model.fit(X_train, y_train)
pred = model.predict(pred_input)

# Human-readable description of each rainfall class; any label not listed
# falls back to the top band, like the original else-branch.
rainfall_messages = {
    0: "Rainfall = 0 mm",
    1: "Rainfall ranges from 0 to 2.5 mm",
    2: "Rainfall ranges from 2.5 to 7.5 mm",
    3: "Rainfall ranges from 7.5 to 35.5 mm",
    4: "Rainfall ranges from 35.5 to 64.4 mm",
    5: "Rainfall ranges from 64.4 to 124.4 mm",
}
print(rainfall_messages.get(pred[0], "Rainfall ranges above 124.4 mm"))