In [1]:
# ML 
from sklearn.ensemble import RandomForestClassifier 
from sklearn.linear_model import LogisticRegression 
from sklearn import model_selection 
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

In [2]:
# Relative paths of the parquet input files, all located under assets/INPUT.
# NOTE(review): the "preciptation" spelling is part of both the constant name
# and the file name as referenced here -- confirm the on-disk name before
# renaming either.
TEMP = "assets/INPUT/temp.parquet"
PRECIPTATION = "assets/INPUT/preciptation.parquet"
CHUVOSOS = "assets/INPUT/dias_chuvosos.parquet"
VEGETACAO = "assets/INPUT/vegetacao.parquet"
TOPOGRAFIA = "assets/INPUT/topografia.parquet"

In [3]:
# ROC 
def plot_roc_curve(fper, tper):
    """Plot a ROC curve on the current pyplot figure and show it.

    Parameters
    ----------
    fper : array-like
        False positive rates, used as the x values of the curve.
    tper : array-like
        True positive rates, used as the y values of the curve.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    diagonal = [0, 1]

    # The curve itself, labelled so that it appears in the legend.
    plt.plot(fper, tper, label='ROC', color='red')
    # Dashed line from (0, 0) to (1, 1), drawn as a visual reference.
    plt.plot(diagonal, diagonal, linestyle='--', color='green')

    plt.title('Receiver Operating Characteristic Curve')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend()
    plt.show()
    
# Classifier evaluation functions (modified from the pyimpute function)
def evaluate_clf(
    clf, X, y, name, k=None, test_size=0.2, scoring="f1_weighted", feature_names=None,
    random_state=None
):
    """Fit ``clf`` on a stratified hold-out split and print evaluation metrics.

    The data is split into train and test partitions, ``clf`` is optionally
    cross-validated on the training partition, then fitted on it and scored
    on the test partition. Accuracy, the classification report, the confusion
    matrix and the ROC AUC are printed, the ROC curve is plotted and, when the
    classifier exposes ``feature_importances_``, those are printed as well.

    Parameters
    ----------
    clf : estimator
        Classifier providing ``fit``, ``predict`` and ``predict_proba``.
    X : array-like or DataFrame
        Feature matrix.
    y : array-like
        Target labels. The ROC/AUC part reads column 1 of ``predict_proba``,
        so a binary target is assumed.
    name : str
        Label printed in front of the results.
    k : int, optional
        Number of folds for cross-validation on the training partition.
        Cross-validation is skipped when ``k`` is falsy.
    test_size : float, default 0.2
        Passed to ``train_test_split``.
    scoring : str, default "f1_weighted"
        Passed to ``cross_val_score``; also used to label the printed score.
    feature_names : iterable, optional
        Names paired with ``clf.feature_importances_``. When omitted, the
        columns of ``X`` are used if it has any, otherwise positional indices.
    random_state : int, optional
        Seed passed to ``train_test_split``. ``None`` (the default) keeps the
        split unseeded, as before.

    Returns
    -------
    estimator
        The same ``clf`` object, fitted on the training partition.
    """
    print(name)
    X_train, X_test, y_train, y_true = model_selection.train_test_split(
        X, y,
        test_size=test_size,  # Test data size
        shuffle=True,  # Shuffle the data before split
        stratify=y,  # Keep the class ratio of y in both partitions
        random_state=random_state,  # Optional seed for a reproducible split
    )

    if k:  # Cross-validation
        kf = model_selection.KFold(n_splits=k)  # k-fold
        scores = model_selection.cross_val_score(clf, X_train, y_train, cv=kf, scoring=scoring)
        # Label the value with the metric that was actually computed; the
        # spread shown is two standard deviations, in percent.
        print(name + " %d-fold Cross Validation %s: %0.2f (+/- %0.2f)"
              % (k, scoring, scores.mean() * 100, scores.std() * 200))
        print()

    clf.fit(X_train, y_train)  # Training of classifiers
    y_pred = clf.predict(X_test)  # Classifier predictions

    # Classifier evaluation metrics
    print("Accuracy Score: %.2f" % metrics.accuracy_score(y_true, y_pred))
    print()

    print("Classification report")
    print(metrics.classification_report(y_true, y_pred))
    print()

    print("Confussion matrix")
    print(metrics.confusion_matrix(y_true, y_pred))
    print()

    # Scores for the class in column 1 of predict_proba. The AUC is computed
    # from these scores, not from the hard predictions, so that it matches
    # the ROC curve plotted below.
    prob = clf.predict_proba(X_test)[:, 1]

    print('AUC(ROC): %.2f' % metrics.roc_auc_score(y_true, prob))
    print()

    # ROC
    fper, tper, _ = metrics.roc_curve(y_true, prob)
    plot_roc_curve(fper, tper)

    if hasattr(clf, "feature_importances_"):
        importances = clf.feature_importances_
        if feature_names is None:
            # Fall back to the column labels of X, when it has any.
            feature_names = getattr(X, "columns", None)
        if feature_names is None:
            feature_names = range(len(importances))
        print("Feature importances")
        for f, imp in zip(feature_names, importances):
            print("%20s: %s" % (f, round(imp * 100, 1)))
        print()
    return clf

def get_X_and_y_data(path, columns, target = "Presence"):
    """Load a parquet file and split it into features and target.

    Rows containing a missing value in *any* column of the file are dropped
    before the split, including columns that are not listed in ``columns``.

    Parameters
    ----------
    path : str
        Location of the parquet file, passed to ``pd.read_parquet``.
    columns : list
        Column labels to keep as features.
    target : str, default "Presence"
        Name of the column holding the target values.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is the frame restricted to ``columns`` and
        ``y`` is the ``target`` column, both limited to the complete rows.

    Raises
    ------
    KeyError
        If ``target`` or any label in ``columns`` is not a column of the file.
    """
    df = pd.read_parquet(path).dropna()

    # pop() removes the target column from df, so it cannot end up among the
    # features.
    y = df.pop(target)
    # Selecting the columns already yields a new object, so no separate copy
    # of the whole frame is made first.
    X = df[columns]
    return X, y

## Análise de correlação

In [4]:
# Read the parquet file referenced by TEMP into a DataFrame.
temperature = pd.read_parquet(TEMP)

In [5]:
# Bare expression: the notebook renders the frame as this cell's output.
temperature

Unnamed: 0,BIO1,BIO2,BIO3,BIO4,BIO5,BIO6,BIO7
0,19.704166,10.858334,67.443062,171.404068,26.500000,10.4,16.100000
1,19.104166,10.558333,58.333332,246.728210,27.100000,9.0,18.100000
2,17.216667,11.433333,60.175434,286.382874,26.200001,7.2,19.000000
3,18.545834,13.658334,59.384056,351.706604,29.600000,6.6,23.000000
4,18.000000,13.016666,65.083328,226.966156,27.000000,7.0,20.000000
...,...,...,...,...,...,...,...
6689,13.616667,11.733333,50.574711,321.787170,24.700001,1.5,23.200001
6690,14.954166,9.558333,52.808472,313.488129,24.100000,6.0,18.100000
6691,17.020834,10.941667,54.983250,337.392792,26.799999,6.9,19.900000
6692,15.508333,8.416666,50.099205,305.709534,24.000000,7.2,16.799999


In [None]:
# Logistic regression with the iteration cap set to 1,000.
maxent = LogisticRegression(max_iter=1_000)
# NOTE(review): X and y are not assigned in any cell visible above this one --
# presumably they come from get_X_and_y_data; confirm, otherwise this cell
# raises NameError on a fresh kernel.
# evaluate_clf returns the classifier it was given, after fitting it, so the
# name is rebound to the fitted model.
maxent = evaluate_clf(maxent, X, y, "MaxEnt", k=5, test_size=0.2, scoring="f1_weighted", feature_names=X.columns)