In [17]:
import json
import math
import os
import pickle
import shutil

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pandas.plotting import scatter_matrix

from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.experimental import enable_iterative_imputer  # must be imported before IterativeImputer
from sklearn.feature_selection import SelectKBest, f_regression, chi2, f_classif
from sklearn.impute import IterativeImputer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, balanced_accuracy_score, r2_score, mean_squared_log_error
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

In [18]:
def class_feature_selection_transformation(X, y, target, ordinal_feature, route, path):
    """Impute, encode and optionally down-select the features of ``X`` for a classifier.

    Columns are split by dtype into a numerical group (imputed with
    ``IterativeImputer``) and a categorical group (cast to ``str``, passed through
    ``SimpleImputer`` and ordinal-encoded). A group with more than 10 columns is
    halved with ``SelectKBest`` (``f_classif`` for numerical, ``chi2`` for
    categorical columns). Numerical columns listed in ``ordinal_feature`` are
    expanded into dummy variables.

    Parameters
    ----------
    X : pandas.DataFrame
        Input features. May still contain the ``target`` column, which is removed
        before anything is fitted or counted. The caller's frame is not modified.
    y : array-like or None
        Class labels; only used to fit the selectors when ``route == '/train'``.
    target : str or None
        Name of the target column, or None when ``X`` has no target column.
    ordinal_feature : list of str or None
        Numerical columns to expand with ``pd.get_dummies(..., drop_first = True)``.
    route : str
        ``'/train'`` fits the selectors and pickles
        ``[num_selector, cat_selector, ordinal_feature, target]`` to
        ``path + '/fs_values.pkl'``. ``'/predict'`` loads the selectors from that
        file. Any other value skips selection and persistence.
    path : str
        Directory that holds ``fs_values.pkl``.

    Returns
    -------
    pandas.DataFrame
        Encoded categorical columns followed by the numerical columns, with string
        column labels.

    Notes
    -----
    The imputers, the ordinal encoder and the dummy expansion are fitted on the
    frame passed in on every call (including ``'/predict'``); only the
    ``SelectKBest`` objects are persisted.
    """
    selector_file = path + '/fs_values.pkl'
    # A feature group is only down-selected when it has more columns than this.
    selection_threshold = 10

    def is_numeric(dtype):
        # Booleans are kept in the categorical group (they are not int/float columns).
        return pd.api.types.is_numeric_dtype(dtype) and not pd.api.types.is_bool_dtype(dtype)

    def fit_selector(score_func, frame):
        # Keep half of the columns; `k` is passed by keyword because it is
        # keyword-only in current scikit-learn releases.
        return SelectKBest(score_func, k = round(frame.shape[1] / 2)).fit(frame, y)

    def load_selector(position):
        # NOTE: unpickling can execute arbitrary code; only use trusted model directories.
        with open(selector_file, 'rb') as file:
            return pickle.load(file)[position]

    def apply_selector(selector, frame):
        # Keep the original column labels so later steps can still address columns by name.
        selected = selector.transform(frame)
        return pd.DataFrame(selected, columns = frame.columns[selector.get_support()])

    # Work on a copy with a clean 0..n-1 index so the caller's frame is never mutated.
    X = X.reset_index(drop = True)

    # Remove the target up front: it must neither leak into the imputer nor count
    # towards the selection threshold, otherwise '/train' (target present) and
    # '/predict' (target absent) could take different branches.
    if target is not None and target in X.columns:
        X = X.drop(target, axis = 1)

    # split features by dtype
    num_features = []
    cat_features = []
    for feature, dtype in X.dtypes.items():
        if is_numeric(dtype):
            num_features.append(feature)
        else:
            cat_features.append(feature)

    X_num = pd.DataFrame()
    X_cat = pd.DataFrame()

    # impute using only numerical features
    if num_features:
        imp = IterativeImputer(max_iter = 10, random_state = 42)
        X[num_features] = imp.fit_transform(X[num_features])
        X_num = X[num_features]

    # impute using only categorical features
    if cat_features:
        # astype(str) turns missing values into literal strings, so they end up
        # as a category of their own rather than being replaced.
        imp = SimpleImputer(missing_values = np.nan, strategy = 'most_frequent')
        X[cat_features] = imp.fit_transform(X[cat_features].astype(str))
        X_cat = X[cat_features]

    # feature selection on num_features
    selected_num_features = None
    if len(num_features) > selection_threshold:
        if route == '/train':
            selected_num_features = fit_selector(f_classif, X_num)
        elif route == '/predict':
            selected_num_features = load_selector(0)
    if selected_num_features is not None:
        X_num = apply_selector(selected_num_features, X_num)

    # encode ordinal features (dummy variables)
    if ordinal_feature:
        # Ordinal columns removed by the selector above cannot be expanded any more.
        dropped_by_selection = set(num_features) - set(X_num.columns)
        for feature in ordinal_feature:
            if feature in dropped_by_selection:
                continue
            X_num = pd.get_dummies(X_num, columns = [feature], drop_first = True)

    # encode categorical features and run feature selection on them
    selected_cat_features = None
    if cat_features:
        X_cat_enc = pd.DataFrame(OrdinalEncoder().fit_transform(X_cat), columns = X_cat.columns)
        if len(cat_features) > selection_threshold:
            if route == '/train':
                selected_cat_features = fit_selector(chi2, X_cat_enc)
            elif route == '/predict':
                selected_cat_features = load_selector(1)
        if selected_cat_features is not None:
            X_cat_enc = apply_selector(selected_cat_features, X_cat_enc)

    # concatenate categorical and numerical features
    if num_features and cat_features:
        X = pd.concat([X_cat_enc, X_num], axis = 1, sort = False)
    elif cat_features:
        X = X_cat_enc
    elif num_features:
        X = X_num

    # scikit-learn rejects frames whose column labels mix strings and non-strings.
    X.columns = [str(column) for column in X.columns]

    if route == '/train':
        # serialize feature selection values
        fs_values = [selected_num_features, selected_cat_features, ordinal_feature, target]
        with open(selector_file, 'wb') as file:
            pickle.dump(fs_values, file)

    return X

In [19]:
def predict_randomforestclass(X, y, target, random_state = 42):
    """Fit a random forest classifier and score it on a held-out split.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; a column named ``target`` is dropped if present.
    y : array-like
        Class labels aligned with the rows of ``X``.
    target : str
        Name of the target column.
    random_state : int, default 42
        Seed for the forest, so the fitted model and its score are reproducible.

    Returns
    -------
    tuple
        ``(fitted RandomForestClassifier, accuracy on the held-out split)``.
    """
    if target in X:
        X = X.drop(target, axis = 1)
    # The split seed stays fixed at 42 so this model is scored on the same rows
    # as the other candidate models.
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

    rf = RandomForestClassifier(n_estimators = 100, max_depth = None, random_state = random_state)
    rf.fit(X_train, y_train)

    # quantifying the quality of prediction
    acc_score = accuracy_score(y_test, rf.predict(X_test))

    return (rf, acc_score)

In [20]:
def predict_logisticregress(X, y, target):
    """Fit a standardised logistic regression and score it on a held-out split.

    The scaler and the classifier are combined in one pipeline, so the returned
    (and later pickled) model applies the same standardisation to whatever it is
    asked to predict on. The scaler is fitted on the training part of the split only.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; a column named ``target`` is dropped if present.
    y : array-like
        Class labels aligned with the rows of ``X``.
    target : str
        Name of the target column.

    Returns
    -------
    tuple
        ``(fitted pipeline of StandardScaler + LogisticRegression, accuracy on
        the held-out split)``.
    """
    if target in X:
        X = X.drop(target, axis = 1)
    # Fit on a plain float array, as the previous scale()-based code did, so the
    # model does not depend on the column labels of X.
    features = np.asarray(X, dtype = float)

    X_train, X_test, y_train, y_test = train_test_split(features, y, random_state = 42)

    log_reg = make_pipeline(preprocessing.StandardScaler(), LogisticRegression(random_state = 42))
    log_reg.fit(X_train, y_train)

    # quantifying the quality of prediction
    acc_score = accuracy_score(y_test, log_reg.predict(X_test))

    return (log_reg, acc_score)

In [21]:
def train(data_path = "~/Desktop/Projects/api/data/titanic_train.csv", target = 'Survived',
          ordinal_feature = None, path = 'model_name'):
    """Train both candidate classifiers and persist the better one.

    Parameters
    ----------
    data_path : str
        CSV file with the training data; its first column is used as the index.
    target : str
        Name of the target column in the CSV.
    ordinal_feature : list of str or None
        Columns to expand into dummy variables; None means ``['Pclass', 'SibSp']``.
    path : str
        Directory to create for the model artefacts (``fs_values.pkl``,
        ``model.pkl`` and ``model_stats.pkl``).

    Returns
    -------
    str
        ``'{"Message": "Successful"}'`` on success, or
        ``'{"Error": "Model already exists"}'`` when ``path`` already exists.

    Raises
    ------
    Exception
        Any error raised while training is re-raised after the freshly created
        ``path`` directory has been removed, so a failed run does not block the
        next attempt with "Model already exists".
    """
    def predict_class(feature_engineering, y, target, path):
        """Fit both models, pickle the more accurate one and describe it."""
        rf, rf_acc = predict_randomforestclass(feature_engineering, y, target)
        log_reg, log_acc = predict_logisticregress(feature_engineering, y, target)

        # Ties go to the random forest.
        if rf_acc >= log_acc:
            best_model, model, best_score = rf, 'RandomForestClassifier', rf_acc
        else:
            best_model, model, best_score = log_reg, 'LogisticRegression', log_acc

        # save best model on disk
        with open(path + '/model.pkl', 'wb') as file:
            pickle.dump(best_model, file)
        return (model, 'accuracy score', best_score)

    if ordinal_feature is None:
        ordinal_feature = ['Pclass', 'SibSp']
    route = '/train'

    train_data = pd.read_csv(os.path.expanduser(data_path), index_col = [0])

    if os.path.exists(path):
        return json.dumps({'Error': "Model already exists"})

    os.mkdir(path)
    try:
        # select features and target variable (the target column is removed downstream)
        X = train_data[list(train_data)]
        y = train_data[target]

        feature_engineering = class_feature_selection_transformation(X, y, target, ordinal_feature, route, path)
        model, scoring_param, best_score = predict_class(feature_engineering, y, target, path)

        model_stats = [model, scoring_param, best_score, path]
        with open(path + '/model_stats.pkl', 'wb') as file:
            pickle.dump(model_stats, file)
    except Exception:
        # Do not leave a half-written model directory behind.
        shutil.rmtree(path, ignore_errors = True)
        raise

    return json.dumps({'Message': 'Successful'})

In [22]:
# Run the training pipeline; the cell shows the JSON status string that train() returns.
train()

'{"Message": "Successful"}'

In [23]:
def predict(data_path = "~/Desktop/Projects/api/data/titanic_test.csv", path = 'model_name'):
    """Score a CSV file with the persisted model and return the result as JSON.

    Parameters
    ----------
    data_path : str
        CSV file with the rows to score; its first column is used as the index.
    path : str
        Model directory containing ``fs_values.pkl`` and ``model.pkl``.

    Returns
    -------
    str
        Pretty-printed JSON list with one record per input row, holding the first
        data column and the predicted target; or
        ``'{"Message": "Model is not available yet"}'`` when the model files are
        missing.

    Raises
    ------
    ValueError
        If ``fs_values.pkl`` holds neither 4 (classifier) nor 5 (regressor) entries.
    """
    route = '/predict'
    directory_fs = path + '/fs_values.pkl'
    directory_model = path + '/model.pkl'

    test_data = pd.read_csv(os.path.expanduser(data_path), index_col = [0])

    if not (os.path.exists(directory_fs) and os.path.exists(directory_model)):
        return json.dumps({'Message': 'Model is not available yet'})

    # NOTE: unpickling can execute arbitrary code; only use trusted model directories.
    with open(directory_fs, 'rb') as file:
        fs_values = pickle.load(file)
    if len(fs_values) == 5:
        ordinal_feature, target = fs_values[3:] # regressor
    elif len(fs_values) == 4:
        ordinal_feature, target = fs_values[2:] # classifier
    else:
        raise ValueError('Unexpected content in ' + directory_fs + ': expected 4 or 5 entries, got '
                         + str(len(fs_values)))
    with open(directory_model, 'rb') as file:
        model = pickle.load(file)

    feature_engineering = class_feature_selection_transformation(test_data, None, None, ordinal_feature, route, path)
    y_predict = model.predict(feature_engineering)

    # Because the CSV's first column became the index, iloc[:, [0]] is the first
    # remaining data column.
    # NOTE(review): presumably an identifier column was intended here - confirm.
    df_1stcolumn = test_data.iloc[:, [0]].reset_index(drop = True)
    df_prediction = pd.DataFrame({target: y_predict})
    output = df_1stcolumn.merge(df_prediction, left_index = True, right_index = True)

    # Round-trip through to_json so the values are plain JSON types before pretty-printing.
    parsed = json.loads(output.to_json(orient = 'records'))
    return json.dumps(parsed, indent = 4)

In [None]:
# Score the test CSV with the persisted model; the cell shows the JSON string that predict() returns.
predict()

In [25]:
def status():
    """Describe the persisted model as a JSON string.

    Reads ``model_name/model_stats.pkl`` and reports the model type, its score
    (keyed by the name of the scoring metric) and the model directory. When the
    stats file does not exist, a JSON message saying so is returned instead.
    """
    model_name = 'model_name'
    stats_file = model_name + '/model_stats.pkl'

    # Guard clause: nothing has been trained yet.
    if not os.path.exists(stats_file):
        return json.dumps({'Message': 'Model is not available yet'})

    with open(stats_file, 'rb') as handle:
        algorithm, metric_name, metric_value, model_dir = pickle.load(handle)

    report = {'model': algorithm, metric_name: metric_value, 'model name': model_dir}
    return json.dumps(report, sort_keys = True, indent = 4, separators = (',', ': '))

In [27]:
# Show the stored model statistics; the cell shows the JSON string that status() returns.
status()

'{\n    "accuracy score": 0.8385650224215246,\n    "model": "RandomForestClassifier",\n    "model name": "model_name"\n}'