In [1]:
import itertools
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import logging
import scikitplot as skplt

from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from matplotlib.backends.backend_pdf import PdfPages

In [2]:
def extract_cloudsql(DATABASE_CLOUDSQL:str,USER_CLOUDSQL:str,PASSWORD_CLOUDSQL:str,URL:str,PORT:str)-> pd.DataFrame:
    """Read every row of the ``iris_dataset`` table into a DataFrame.

    Opens a psycopg2 connection with the given credentials, runs a
    ``SELECT *`` against ``iris_dataset`` and closes the connection again,
    whether or not the query succeeds.

    Args:
        DATABASE_CLOUDSQL: Name of the database to connect to.
        USER_CLOUDSQL: Database user.
        PASSWORD_CLOUDSQL: Password for ``USER_CLOUDSQL``.
        URL: Host of the database server.
        PORT: Port of the database server.

    Returns:
        A DataFrame holding the query result.
    """
    # Imported locally: the notebook's import cell does not provide psycopg2,
    # so the name would otherwise be undefined when this function is called.
    import psycopg2

    sql = (
        '''
            SELECT 
                *
            FROM iris_dataset 
        '''
        )
    con_cloudsql = psycopg2.connect(
            database=DATABASE_CLOUDSQL
            ,user=USER_CLOUDSQL
            ,password=PASSWORD_CLOUDSQL
            ,host=URL
            ,port=PORT)
    try:
        return pd.read_sql_query(sql, con_cloudsql)
    finally:
        # Release the server-side connection even when the query raises.
        con_cloudsql.close()

def read_data():
    """Load scikit-learn's iris dataset as one DataFrame.

    Returns:
        A DataFrame whose columns are the dataset's feature names followed
        by a ``target`` column holding the class label of each row.
    """
    bunch = datasets.load_iris()
    column_names = bunch['feature_names'] + ['target']
    # Append the label vector as an extra column to the feature matrix.
    values = np.column_stack((bunch['data'], bunch['target']))
    return pd.DataFrame(data=values, columns=column_names)

def feature_enginering(df:pd.DataFrame,quant:list,column:str)-> pd.DataFrame:
    """Add one 0/1 indicator column per quantile threshold of ``column``.

    For every quantile level in ``quant`` the (NaN-ignoring) quantile of
    ``df[column]`` is computed and rounded to two decimals; a new column
    named ``"<column>><threshold>"`` is then added that is 1 where the value
    is strictly greater than the threshold and 0 otherwise.

    The function works on the frame passed in as ``df`` (not on any global)
    and returns a modified copy, so the caller's frame is left untouched.

    Args:
        df: Input data containing ``column``.
        quant: Quantile levels in ``[0, 1]``.
        column: Name of the numeric column to bin.

    Returns:
        A copy of ``df`` with the indicator columns appended. Thresholds that
        round to the same value share a single column.
    """
    df_out = df.copy()
    values = df_out[column]
    binning_values = [np.round(np.nanquantile(values, q), 2) for q in quant]
    for bin_value in binning_values:
        df_out[column+'>'+str(bin_value)] = (values > float(bin_value)).astype(int)
    return df_out

def split_dataframe(df:pd.DataFrame,test_size:float,target:str,random_state:int=0)-> tuple:
    """Split a frame into train/test features and labels.

    Args:
        df: Data containing both the feature columns and the ``target`` column.
        test_size: Forwarded to ``train_test_split`` as ``test_size``.
        target: Name of the label column; it is removed from the features.
        random_state: Seed forwarded to ``train_test_split``. Defaults to 0,
            the value that was previously hard-coded.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` as returned by
        ``train_test_split``.
    """
    features = df.drop([target],axis=1)
    labels = df[target]
    X_train, X_test, y_train, y_test = train_test_split(features,
                                                    labels,
                                                    test_size=test_size,
                                                    random_state=random_state)
    return X_train, X_test, y_train, y_test

def push_gcp(df:pd.DataFrame,bucket_name:str,folder:str,source_file_name:str,destination_blob_name:str):
    """Write ``df`` to a local CSV file and upload that file to a storage bucket.

    Args:
        df: Data to export.
        bucket_name: Name of the destination bucket.
        folder: Currently unused; kept so existing callers keep working.
        source_file_name: Local path the CSV is written to before upload.
        destination_blob_name: Name of the blob created in the bucket.

    Returns:
        None.
    """
    # Imported locally: the notebook's import cell does not provide `storage`,
    # so the name would otherwise be undefined when this function is called.
    # NOTE(review): assumes `storage` is the google-cloud-storage client
    # module, based on the Client/get_bucket/blob calls below - confirm.
    from google.cloud import storage

    df.to_csv(source_file_name)
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)
    blob.upload_from_filename(source_file_name)
    logging.info('{} is pushed to gcp'.format(source_file_name))

def fit_matrix(df:pd.DataFrame):
    """Fit a ``CountVectorizer`` (at most 5000 features) on the given documents.

    Args:
        df: Object whose ``tolist()`` yields the documents to learn the
            vocabulary from. Note that the code calls ``df.tolist()``, so in
            practice this is a single column (Series) rather than a whole
            DataFrame.

    Returns:
        The fitted ``CountVectorizer``.
    """
    documents = df.tolist()
    vectorizer = CountVectorizer(max_features=5000)
    # Only the fitted state is needed; the transformed matrix is discarded.
    vectorizer.fit_transform(documents)
    return vectorizer

def generate_matrix_transfrom(matrix,df:pd.DataFrame):
    """Transform documents with an already fitted vectorizer into a dense array.

    Args:
        matrix: Fitted object exposing ``transform``; its result must expose
            ``toarray()``.
        df: Object whose ``tolist()`` yields the documents to transform.

    Returns:
        Whatever ``toarray()`` returns for the transformed documents.
    """
    documents = df.tolist()
    transformed = matrix.transform(documents)
    return transformed.toarray()

def fit_model(model:str,X_train:pd.DataFrame,y_train:pd.DataFrame):
    """Build the classifier named by ``model`` and fit it on the training data.

    Args:
        model: Name of the estimator to build. Only
            ``'RandomForestClassifier'`` is supported.
        X_train: Training features.
        y_train: Training labels.

    Returns:
        The fitted classifier.

    Raises:
        ValueError: If ``model`` is not a supported estimator name.
    """
    # Fail early with a clear message instead of hitting an unbound `clf`.
    if model != 'RandomForestClassifier':
        raise ValueError(
            "Unsupported model: {!r}; expected 'RandomForestClassifier'".format(model))
    clf = RandomForestClassifier(n_estimators=50,n_jobs=-1, random_state=0,criterion='gini')
    clf.fit(X_train, y_train)
    return clf

def generate_metrics(clf,X_test:pd.DataFrame,y_test:pd.DataFrame):
    """Evaluate a fitted classifier on the test set.

    Prints the confusion matrix of the predictions to stdout.

    Args:
        clf: Fitted classifier exposing ``predict``.
        X_test: Test features.
        y_test: True labels for ``X_test``.

    Returns:
        The accuracy score of the predictions.
    """
    predictions = clf.predict(X_test)
    print(confusion_matrix(y_test, predictions))
    # The classification report is computed but never used or returned.
    classification_report(y_test, predictions)
    return accuracy_score(y_test, predictions)

def generate_report(y_true:pd.DataFrame,y_pred:pd.DataFrame,y_probas:pd.DataFrame,filename:str,classes=None):
    """Write a two-page PDF report: ROC curves and a confusion-matrix heatmap.

    Args:
        y_true: True labels.
        y_pred: Predicted labels.
        y_probas: Predicted class probabilities, passed to the ROC plot.
        filename: Path of the PDF file to create.
        classes: Optional tick labels for the confusion matrix, one per class
            in the matrix's row/column order. When omitted, the sorted unique
            labels found in ``y_true`` and ``y_pred`` are used, so the ticks
            always match the data instead of a fixed ``[1, 2, 3]``.

    Returns:
        None.

    Raises:
        ValueError: If ``classes`` is given and its length differs from the
            number of classes in the confusion matrix.
    """
    cm = confusion_matrix(y_true, y_pred)
    if classes is None:
        # Sorted union of the labels present in either vector.
        classes = np.unique(np.concatenate([np.asarray(y_true).ravel(),
                                            np.asarray(y_pred).ravel()]))
    classes = list(classes)
    if len(classes) != cm.shape[0]:
        raise ValueError(
            'classes has {} entries but the confusion matrix has {} classes'.format(
                len(classes), cm.shape[0]))

    with PdfPages(filename) as export_pdf:
        # Page 1: ROC curves (scikit-plot draws on a new current figure).
        skplt.metrics.plot_roc_curve(y_true, y_probas)
        export_pdf.savefig()
        plt.close()

        # Page 2: confusion matrix heatmap with per-cell counts.
        fig, ax = plt.subplots()
        image = ax.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
        ax.set_title('Confusion matrix')
        fig.colorbar(image, ax=ax)
        tick_marks = np.arange(len(classes))
        ax.set_xticks(tick_marks)
        ax.set_xticklabels(classes, rotation=45)
        ax.set_yticks(tick_marks)
        ax.set_yticklabels(classes)

        # Switch text colour on dark cells so the counts stay readable.
        thresh = cm.max() / 2.
        for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
            ax.text(j, i, cm[i, j],
                    horizontalalignment="center",
                    color="white" if cm[i, j] > thresh else "black")
        ax.set_ylabel('True label')
        ax.set_xlabel('Predicted label')
        # Lay out after the labels exist so they are not clipped.
        fig.tight_layout()
        export_pdf.savefig(fig)
        plt.close(fig)
    # Logged only after the PDF has been finalized by the context manager.
    logging.info('Report is generated and saved as: {}'.format(filename))



# --- Pipeline configuration -------------------------------------------------
test_size=0.25
# Quantile levels used to derive the binary "column > threshold" features.
# The level 1 is the column maximum, so its indicator can never be 1.
quant=[0.2, 0.4, 0.6, 0.8, 1]
column='sepal length (cm)'
target='target'

# --- Load, engineer features, split, train, evaluate ------------------------
df_iris=read_data()
df_iris=feature_enginering(df_iris,quant,column)
X_train, X_test, y_train, y_test=split_dataframe(df_iris,test_size,target)
clf=fit_model('RandomForestClassifier',X_train,y_train)
# Prints the confusion matrix; the returned accuracy is not captured here.
generate_metrics(clf,X_test,y_test)

# --- PDF report (ROC curves + confusion matrix) -----------------------------
y_true=y_test
y_pred=clf.predict(X_test)
y_probas=clf.predict_proba(X_test)
filename='metrics_report.pdf'
generate_report(y_true,y_pred,y_probas,filename)

[[13  0  0]
 [ 0 15  1]
 [ 0  0  9]]




In [3]:
# Preview the first ten rows, including the engineered indicator columns.
df_iris.head(10)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target,sepal length (cm)>5.0,sepal length (cm)>5.6,sepal length (cm)>6.1,sepal length (cm)>6.52,sepal length (cm)>7.9
0,5.1,3.5,1.4,0.2,0.0,1,0,0,0,0
1,4.9,3.0,1.4,0.2,0.0,0,0,0,0,0
2,4.7,3.2,1.3,0.2,0.0,0,0,0,0,0
3,4.6,3.1,1.5,0.2,0.0,0,0,0,0,0
4,5.0,3.6,1.4,0.2,0.0,0,0,0,0,0
5,5.4,3.9,1.7,0.4,0.0,1,0,0,0,0
6,4.6,3.4,1.4,0.3,0.0,0,0,0,0,0
7,5.0,3.4,1.5,0.2,0.0,0,0,0,0,0
8,4.4,2.9,1.4,0.2,0.0,0,0,0,0,0
9,4.9,3.1,1.5,0.1,0.0,0,0,0,0,0


In [4]:
# Preview the last three rows of the engineered frame.
df_iris.tail(3)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target,sepal length (cm)>5.0,sepal length (cm)>5.6,sepal length (cm)>6.1,sepal length (cm)>6.52,sepal length (cm)>7.9
147,6.5,3.0,5.2,2.0,2.0,1,1,1,0,0
148,6.2,3.4,5.4,2.3,2.0,1,1,1,0,0
149,5.9,3.0,5.1,1.8,2.0,1,1,0,0,0
