In [1]:
import pandas as pd
import numpy as np
import joblib
#import seaborn as sns

# Preprocessing

from sklearn.preprocessing import RobustScaler, StandardScaler, OneHotEncoder, OrdinalEncoder, LabelEncoder, MinMaxScaler
from sklearn.preprocessing import FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_selector
from sklearn.model_selection import train_test_split
#from sklearn import set_config ; set_config(display = 'diagram')
from imblearn.over_sampling import SMOTE


# Model
from imblearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestClassifier

from ml_api.data import storage_upload, get_local_data, drop_features, split_data, get_data_from_gcp



In [3]:
class Trainer(object):
    def __init__(self, X, y):
        self.pipeline = None
        self.X = X
        self.y = y

    def set_pipeline(self):
        # Defining columns to Imput and Scale
        impute_col = ['avg_payment_span_0_3m','num_active_div_by_paid_inv_0_12m','num_arch_written_off_12_24m', \
                'num_arch_written_off_0_12m','account_days_in_dc_12_24m','account_days_in_rem_12_24m','account_days_in_term_12_24m', \
               'sum_capital_paid_account_12_24m','sum_capital_paid_account_0_12m','recovery_debt']
        scale_col = ['sum_paid_inv_0_12m', 'time_hours', 'max_paid_inv_0_12m' ]

        # Imputing, scaling and Encoding data. Dropping the remaining columns ('merchant_category'  and 'name_in_email')


        preproc = make_column_transformer(
            (OneHotEncoder(handle_unknown='ignore'),
                ['merchant_group']
            ),
            (OrdinalEncoder(),
                ['has_paid']
             ),
            (SimpleImputer(strategy="median"),
                impute_col
             ),
            (RobustScaler(),
                scale_col
             ),
            (MinMaxScaler(),
                make_column_selector(dtype_include=['int64'])
                ),
                remainder='drop'
            )


        self.pipeline = make_pipeline (preproc ,SMOTE(), RandomForestClassifier(n_estimators=100))
        self.pipeline.fit(self.X, self.y)



    def run(self):
        self.set_pipeline()
        self.pipeline.fit(self.X, self.y)
        
        return self.pipeline

    def predict(self, X_pred):
        """Trains the model with RandomForestClassifier using the preprocessing pipeline.

        """
        self.set_pipeline()
        y_pred = self.pipeline.predict(X_pred)

        return y_pred

    def save_model(self, model):
        """Saves the model and the uploads it to the cloud.

        Args:
            model (joblib): Trained model.
        """
        joblib.dump(model, 'model.joblib')
        print("model.joblib saved locally")
        storage_upload(rm=False)



if __name__ == "__main__":
    #df = get_local_data() # gets data locally


    df = get_data_from_gcp() # gets data from the cloud
    df = drop_features(df) # drop column highly correlated and with null values
    train_df,predict_df = split_data(df) #split data frame in two : one to train the model, the other to make the prediction

    trainer = Trainer(train_df.drop('default', axis = 1), train_df['default'])
    model = trainer.run()
    y_pred = trainer.predict(predict_df.drop('default', axis = 1))
    print(y_pred)
    trainer.save_model(model)


  return df.drop(null_list + corr_list , 1 )


[0. 0. 0. ... 0. 0. 0.]
model.joblib saved locally
=> model.joblib uploaded to bucket ml_api_lau inside models/ml_training/v1/model.joblib


In [4]:
model = trainer.run()
model

In [5]:
import pandas as pd
import os
import joblib
from google.cloud import storage

LOCAL_PATH='raw_data/dataset.csv'
GCP_PATH = 'data/dataset.csv'
BUCKET_NAME='ml_api_lau'

# model folder name (will contain the folders for all trained model versions)
MODEL_NAME = 'ml_training'

# model version folder name (where the trained model.joblib file will be stored)
MODEL_VERSION = 'v1'

def get_model_from_gcp():
    """Function to get the trained model from the cloud

    Returns:
        joblib: Trained model
    """
    client = storage.Client().bucket(BUCKET_NAME)

    local_model_name = 'model.joblib'
    model_storage_location = f"models/{MODEL_NAME}/{MODEL_VERSION}/{local_model_name}"
    blob = client.blob(model_storage_location)
    blob.download_to_filename('model.joblib')
    return joblib.load('model.joblib')

In [213]:
model3 = get_model_from_gcp()

In [214]:
model3

In [10]:
y_pred = model3.predict_proba(predict_df.drop('default', axis = 1))

In [14]:
y_pred[:,0]

array([1.  , 1.  , 0.99, ..., 0.78, 1.  , 1.  ])

In [None]:
results = pd.concat([pd.Series(predict_df.index), pd.Series(y_pred[:,0], name="pd")], axis=1).set_index('uuid')

In [None]:
uuid_list = ['6f6e6c6a-2081-4e6b-8eb3-4fd89b54b2d7']

In [None]:
[{'uuid' : id , 'pd' : results.loc[id]['pd']} for id in uuid_list ] 

[{'uuid': '6f6e6c6a-2081-4e6b-8eb3-4fd89b54b2d7', 'pd': 1.0}]

In [None]:
{"prediction": [{'uuid' : id , 'pd' : results.loc[id]['pd']} for id in uuid_list ] }

{'prediction': [{'uuid': '6f6e6c6a-2081-4e6b-8eb3-4fd89b54b2d7', 'pd': 1.0}]}

In [None]:
predict(uuid_list)

  return df.drop(null_list + corr_list , 1 )


{'prediction': [{'uuid': '6f6e6c6a-2081-4e6b-8eb3-4fd89b54b2d7', 'pd': 1.0}]}

In [None]:
results

Unnamed: 0_level_0,pd
uuid,Unnamed: 1_level_1
6f6e6c6a-2081-4e6b-8eb3-4fd89b54b2d7,1.00
f6f6d9f3-ef2b-4329-a388-c6a687f27e70,1.00
e9c39869-1bc5-4375-b627-a2df70b445ea,0.99
6beb88a3-9641-4381-beb6-c9a208664dd0,1.00
bb89b735-72fe-42a4-ba06-d63be0f4ca36,0.98
...,...
5c03bc63-ea65-4ffd-aa7b-95ea9a46db34,1.00
f8db22f4-9819-420c-abbc-9ddf1843176e,1.00
b22e21ea-b1b2-4df3-b236-0ff6d5fdc0d8,0.78
bafcab15-9898-479c-b729-c9dda7edb78f,1.00


In [None]:
dic = {"6f6e6c6a-2081-4e6b-8eb3-4fd89b54b2d7":8.0,"f6f6d9f3-ef2b-4329-a388-c6a687f27e70":1.0}

In [None]:
dic["6f6e6c6a-2081-4e6b-8eb3-4fd89b54b2d7"]

8.0

In [None]:
model3.predict_proba(predict_df.drop('default',1).loc[["b22e21ea-b1b2-4df3-b236-0ff6d5fdc0d8"]])[0][0]

  model3.predict_proba(predict_df.drop('default',1).loc[["b22e21ea-b1b2-4df3-b236-0ff6d5fdc0d8"]])[0][0]


0.78

In [None]:
uuid_list = ['6f6e6c6a-2081-4e6b-8eb3-4fd89b54b2d78', "b22e21ea-b1b2-4df3-b236-0ff6d5fdc0dy8"]

In [None]:
'6f6e6c6a-2081-4e6b-8eb3-4fd89b54b2d78' in predict_df.index

False

In [None]:
{'prediction' : [{'uuid' : id, 'pd' : model3.predict_proba(predict_df.drop('default',1).loc[[id]])[0][0]} for id in uuid_list if id in predict_df.index ]}

{'prediction': []}

In [None]:
list('6f6e6c6a-2081-4e6b-8eb3-4fd89b54b2d7'.split())

['6f6e6c6a-2081-4e6b-8eb3-4fd89b54b2d7']

In [229]:
url = 'http://localhost:8000/predict'

In [230]:
import requests
params = {
            'uuid_list': 'bb89b735-72fe-42a4-ba06-d63be0f4ca36'
        }
response = requests.get(url, params)

In [231]:
print (response.url)

http://localhost:8000/predict?uuid_list=bb89b735-72fe-42a4-ba06-d63be0f4ca36


In [232]:
response

<Response [500]>

In [210]:
uuid_list = response.json()

JSONDecodeError: [Errno Expecting value] Internal Server Error: 0

In [206]:
import os
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]="/Users/laurabonnet/Documents/GITHUBK/main-cyclist-337816-8df14917206d.json"

In [189]:
uuid_list

{'greeting': 'this is not a landing page'}

In [None]:
results = {'prediction' : [{'uuid' : id, 'pd' : model.predict_proba(predict_df.drop('default',1).loc[[id]])[0][0]} for id in uuid_list if id in predict_df.index  ]}


  results = {'prediction' : [{'uuid' : id, 'pd' : model.predict_proba(predict_df.drop('default',1).loc[[id]])[0][0]} for id in uuid_list if id in predict_df.index  ]}
  results = {'prediction' : [{'uuid' : id, 'pd' : model.predict_proba(predict_df.drop('default',1).loc[[id]])[0][0]} for id in uuid_list if id in predict_df.index  ]}


In [None]:
results

{'prediction': [{'uuid': 'ac88f18c-96a6-49bc-9e9d-a780225914af', 'pd': 1.0},
  {'uuid': 'bb89b735-72fe-42a4-ba06-d63be0f4ca36', 'pd': 0.98}]}

Unnamed: 0_level_0,default,account_days_in_dc_12_24m,account_days_in_rem_12_24m,account_days_in_term_12_24m,age,avg_payment_span_0_3m,merchant_category,merchant_group,has_paid,max_paid_inv_0_12m,...,num_arch_written_off_12_24m,num_unpaid_bills,status_2nd_last_archived_0_24m,status_3rd_last_archived_0_24m,status_max_archived_0_6_months,recovery_debt,sum_capital_paid_account_0_12m,sum_capital_paid_account_12_24m,sum_paid_inv_0_12m,time_hours
uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6f6e6c6a-2081-4e6b-8eb3-4fd89b54b2d7,,0.0,0.0,0.0,20,5.25,Youthful Shoes & Clothing,Clothing & Shoes,True,7225.0,...,0.0,1,1,1,1,0,8815,0,27157,19.895556
f6f6d9f3-ef2b-4329-a388-c6a687f27e70,,0.0,0.0,0.0,64,,Personal care & Body improvement,Health & Beauty,False,0.0,...,,0,0,0,0,0,0,0,0,0.236667
e9c39869-1bc5-4375-b627-a2df70b445ea,,0.0,77.0,0.0,28,,Diversified entertainment,Entertainment,True,91980.0,...,0.0,0,1,2,1,0,36163,39846,93760,20.332778
6beb88a3-9641-4381-beb6-c9a208664dd0,,0.0,0.0,0.0,31,,Concept stores & Miscellaneous,"Leisure, Sport & Hobby",True,1790.0,...,0.0,0,0,0,2,0,62585,0,1790,6.201111
bb89b735-72fe-42a4-ba06-d63be0f4ca36,,0.0,0.0,0.0,30,,Diversified electronics,Electronics,True,0.0,...,,0,0,0,0,0,14295,0,0,8.451111
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5c03bc63-ea65-4ffd-aa7b-95ea9a46db34,,0.0,0.0,0.0,33,,Electronic equipment & Related accessories,Electronics,True,35195.0,...,0.0,0,1,1,1,0,0,0,60127,10.765556
f8db22f4-9819-420c-abbc-9ddf1843176e,,0.0,0.0,0.0,44,,Body & Hair Care,Health & Beauty,True,4740.0,...,0.0,1,1,1,0,0,7948,0,4740,21.708333
b22e21ea-b1b2-4df3-b236-0ff6d5fdc0d8,,0.0,20.0,0.0,24,,Jewelry & Watches,Jewelry & Accessories,True,1200.0,...,,18,0,0,0,0,17447,19627,3100,2.185278
bafcab15-9898-479c-b729-c9dda7edb78f,,0.0,0.0,0.0,31,,Decoration & Art,Home & Garden,True,15000.0,...,0.0,1,1,1,1,0,18339,56180,34785,9.725278


In [211]:
def implicit():
    from google.cloud import storage

    # If you don't specify credentials when constructing the client, the
    # client library will look for credentials in the environment.
    storage_client = storage.Client()

    # Make an authenticated API request
    buckets = list(storage_client.list_buckets())
    print(buckets)

In [212]:
implicit()

[<Bucket: eu.artifacts.main-cyclist-337816.appspot.com>, <Bucket: ml_api_lau>, <Bucket: wagon-data-804-bonnet>]


In [215]:
import json

from google.cloud import storage
from google.oauth2 import service_account

project_id = 'main-cyclist-337816'

with open("/Users/laurabonnet/Documents/GITHUBK/main-cyclist-337816-8df14917206d.json") as source:
    info = json.load(source)

storage_credentials = service_account.Credentials.from_service_account_info(info)

storage_client = storage.Client(project=project_id, credentials=storage_credentials)
buckets = list(storage_client.list_buckets())
print(buckets)


[<Bucket: eu.artifacts.main-cyclist-337816.appspot.com>, <Bucket: ml_api_lau>, <Bucket: wagon-data-804-bonnet>]


In [227]:
with open("/Users/laurabonnet/Documents/GITHUBK/main-cyclist-337816-8df14917206d.json") as source:
    info = json.load(source)

In [228]:
info

{'type': 'service_account',
 'project_id': 'main-cyclist-337816',
 'private_key_id': '8df14917206db5381d61a9378d3fc81abc0b5b2b',
 'private_key': '-----BEGIN PRIVATE KEY-----\nMIIEvAIBADANBgkqhkiG9w0BAQEFAASCBKYwggSiAgEAAoIBAQCtvzcXY6jFnA/0\nZ7nPS8OP1JNnx0yqNg82DLMOXCXuKphUNfhFR5sxElEp5ZOr39zaGGR3MPmlJ8r3\nqfc+1Tp4ZV+KaO8mRS46ys1WpBh9xUzR6LDYa5UaJ6nuW+6X1sMGAnwBDSLvXBWY\nMihcICPRdYUbWiCQZ5tWKl6zfOQvyHC6RlA1RZ0Yx0YiBPWUhpxAl23K+NOSs866\nxe2wSRh6Cn6UHTrEpzSfQoDoyZ/xEWcKvNo8rnpYrFyIB/L9uy/GydvC5cSzaL9n\nqXRZqfo5wVgC+yI7/rRfp5GWrrWEBhnI/t88Dmp1ECpB7fKsKEXbVV2MibVoeEdf\nRz3dqXVJAgMBAAECggEAE1nq9aKuqzW2zDKaThWC5jvXoBUhO4ryVkODVNvcCKL7\nXNO2/2ZaiVRgMFGmyRuRkkPpuqG8iFG+iCeIx1mB3eNyiOyniCRn4fnw9U+o/6cQ\nJ4fqXGxdCrD0uvzHvecYSQzRSV+wbqbbq9wYRSx6OMvFQ7bdzh+P/ECTVBvaoVyb\ned4X51uBucL86unK1tk57ueorjJ4G+irQD5bUDw1imceLDMfMchNPGQxe4AisPkh\niUPL6gv+0Z+YLrpd38x45SxwPvdXUFFoFvYTTlYxL8KYjEfzApsUdQs3+sitLDFq\nQuabAdpR9ZEMhdNxGWCR1cyM6DhVsffPWhnhf9f7tQKBgQDaNkQkJejymlKdrqUW\nk6It0WILT27ktKhgztFi60lp3qEUE0vlD