In [1]:
import sklearn
import xgboost
import pkgutil
import os
import pickle
import pandas as pd
import importlib, inspect
import warnings
warnings.filterwarnings("ignore")
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

save_dir = "./models/sklearnpipeline/{0}".format(sklearn.__version__)
os.mkdir(save_dir)

FileExistsError: [Errno 17] File exists: './models/sklearnpipeline/1.2.2'

# Generate Pipeline Sample for Rocket

## Binary Classification

In [2]:
url = 'https://raw.githubusercontent.com/propublica/compas-analysis/master/compas-scores-two-years.csv'
df = pd.read_csv(url)
df.head(1)
X = df.drop("two_year_recid", axis=1) 
y = df[["two_year_recid"]] # two_year_recid is the ground truth
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) # split and combine later so we can use sklearn's module
print("Training Set Samples: {0}".format(len(X_train)))
print("Test Set Samples: {0}".format(len(X_test)))

Training Set Samples: 5771
Test Set Samples: 1443


In [3]:
from sklearn.preprocessing import LabelEncoder

class featureEngineeringStage():
    def __init__(self, columns, selection):
        self.columns = columns
        self.selection = selection
    
    def transform(self, X, y=None):
        """Transform columns of X using LabelEncoder.
        """
        output = X.copy()
        for col in self.columns:
            output[col] = LabelEncoder().fit_transform(output[col])
        return output[self.selection]
    
    def fit(self, X, y=None):
        return self

In [5]:
# these things are input by data scientist during TRAINING stage.
selection = ["age", "sex", "race", "priors_count", "c_charge_degree"]
transform = ["race", "sex", "age", "c_charge_degree"]
pipe = Pipeline([
    ('feature_engineering', featureEngineeringStage(columns=transform, selection=selection)),
    ('model', LogisticRegression())
])

# Train our pipeline
pipe.fit(X, y)

# dump the test dataset for the pipeline
pickle.dump(X_test, open("./data/pickle_pandas_tabular_compas_pipeline_testing.sav".format(save_dir), "wb+"))

# dump the pipeline for this model prediction
pickle.dump(pipe, open("{0}/binary_classification_compas/binary_classification_compas_sklearn.pipeline.Pipeline.sav".format(save_dir), "wb+"))


# dump the ytest for this model prediction
pickle.dump(y_test, open("./data/pickle_pandas_tabular_compas_pipeline_ytest.sav".format(save_dir), "wb+"))

In [7]:
data = pickle.load(open("./data/pickle_pandas_tabular_compas_pipeline_testing.sav".format(save_dir), "rb"))
pipe = pickle.load(open("{0}/binary_classification_compas/binary_classification_compas_sklearn.pipeline.Pipeline.sav".format(save_dir), "rb"))
pipe.predict(X_test)

array([0, 0, 0, ..., 1, 0, 1])

## Multiclass Classification

In [18]:
training_data = pd.read_csv('./data/raw/train_loan.csv')
training_data_X = training_data.drop("Interest_Rate", axis=1)
training_data_X

Unnamed: 0,Loan_ID,Loan_Amount_Requested,Length_Employed,Home_Owner,Annual_Income,Income_Verified,Purpose_Of_Loan,Debt_To_Income,Inquiries_Last_6Mo,Months_Since_Deliquency,Number_Open_Accounts,Total_Accounts,Gender
0,10000001,7000,< 1 year,Rent,68000.0,not verified,car,18.37,0,,9,14,Female
1,10000002,30000,4 years,Mortgage,,VERIFIED - income,debt_consolidation,14.93,0,17.0,12,24,Female
2,10000003,24725,7 years,Mortgage,75566.4,VERIFIED - income source,debt_consolidation,15.88,0,,12,16,Male
3,10000004,16000,< 1 year,,56160.0,VERIFIED - income source,debt_consolidation,14.34,3,,16,22,Male
4,10000005,17000,8 years,Own,96000.0,VERIFIED - income source,debt_consolidation,22.17,1,,19,30,Female
...,...,...,...,...,...,...,...,...,...,...,...,...,...
164304,10164305,20725,10+ years,Rent,,VERIFIED - income source,credit_card,13.56,0,,13,16,Male
164305,10164306,6000,8 years,Own,,VERIFIED - income source,small_business,9.12,0,36.0,13,30,Male
164306,10164307,20250,10+ years,Mortgage,,VERIFIED - income,credit_card,18.17,1,,17,30,Male
164307,10164308,10000,4 years,Rent,,not verified,major_purchase,18.75,1,48.0,16,62,Female


In [19]:
from sklearn.preprocessing import  LabelEncoder

class dataProcessingStage():
    def __init__(self):
        return None
    
    def labelencoding(self, df, columnname):
        le = LabelEncoder()
        df[columnname] = le.fit_transform(df[columnname])
        return df
    
    def transform(self, X):
        """Convert columns into dataframe for model input
        """
        df = X.copy()
        #df.drop(self._ground_truth_label, axis=1)
        df = df.drop("Loan_ID", axis=1)

        df["Loan_Amount_Requested"]=df["Loan_Amount_Requested"].str.replace(',','')
        df["Loan_Amount_Requested"] = df["Loan_Amount_Requested"].astype(int)

        df = self.labelencoding(df, "Length_Employed")
        df = self.labelencoding(df ,"Home_Owner")
        df = self.labelencoding(df, "Income_Verified")
        df = self.labelencoding(df, "Purpose_Of_Loan")
        df = self.labelencoding(df, "Gender")

        df = df.fillna(0)
        return df
    
    def fit(self, X, y=None):
        return self

def labelencoding2(df, columnname):
    le = LabelEncoder()
    df[columnname] = le.fit_transform(df[columnname])
    return df

In [20]:
y = training_data[["Interest_Rate"]]
y = labelencoding2(y, "Interest_Rate")

pipe = Pipeline([
    ('feature_engineering', dataProcessingStage()),
    ('model', LogisticRegression())
])

pipe.fit(training_data_X, y)

In [22]:
test_data = pd.read_csv('./data/raw/test_loan.csv')
#Merging test data provided with synthetic ground truth
final_test_data = pd.concat((test_data,training_data["Interest_Rate"][:109541]), axis = 1)
final_test_data
preds = pipe.predict(final_test_data.drop("Interest_Rate",axis=1))
print(preds)

[1 2 1 ... 1 1 1]


In [23]:
pickle.dump(final_test_data.drop("Interest_Rate", axis=1), open("./data/pickle_pandas_tabular_loan_pipeline_testing.sav", "wb+"))
pickle.dump(final_test_data[["Interest_Rate"]], open("./data/pickle_pandas_tabular_loan_pipeline_testing_ytest.sav", "wb+"))

pickle.dump(pipe, open("{0}/multiclass_classification_loan/multiclass_classification_loan_sklearn.pipeline.Pipeline.sav".format(save_dir), "wb+"))

## Regression

In [9]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

df = pd.read_csv("./data/raw/insurance.csv")
df.head(1)

X = df.drop("expenses", axis=1) 
y = df[["expenses"]] # two_year_recid is the ground truth
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) # split and combine later so we can use sklearn's module
print("Training Set Samples: {0}".format(len(X_train)))
print("Test Set Samples: {0}".format(len(X_test)))
df.isnull().values.any()

Training Set Samples: 1070
Test Set Samples: 268


False

In [25]:
df

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,female,27.9,0,yes,southwest,16884.92
1,18,male,33.8,1,no,southeast,1725.55
2,28,male,33.0,3,no,southeast,4449.46
3,33,male,22.7,0,no,northwest,21984.47
4,32,male,28.9,0,no,northwest,3866.86
...,...,...,...,...,...,...,...
1333,50,male,31.0,3,no,northwest,10600.55
1334,18,female,31.9,0,no,northeast,2205.98
1335,18,female,36.9,0,no,southeast,1629.83
1336,21,female,25.8,0,no,southwest,2007.95


In [26]:
y_test

Unnamed: 0,expenses
714,2457.50
1263,7337.75
604,17468.98
334,19144.58
1330,12629.17
...,...
1202,2055.32
253,4260.74
411,19594.81
1265,26926.51


In [11]:
class featureEngineeringStageRegression():
    def __init__(self, columns, selection):
        self.columns = columns
        self.selection = selection
    
    def transform(self, X, y=None):
        """Transform columns of X using LabelEncoder.
        """
        output = X.copy()
        for col in self.columns:
            output[col] = LabelEncoder().fit_transform(output[col])
        return output[self.selection]
    
    def fit(self, X, y=None):
        return self

In [12]:
# these things are input by data scientist during TRAINING stage.
selection = ["age", "sex", "bmi", "children", "smoker", "region"]
transform = ["sex", "smoker", "region"]
pipe = Pipeline([
    ('feature_engineering', featureEngineeringStageRegression(columns=transform, selection=selection)),
    ('model', LinearRegression())
])

# Train our pipeline
pipe.fit(X, y)
predictions = pipe.predict(X_test)

In [13]:
def _compute_error_by_group(group, ground_truth, pred_name):
        """
        This method computes error rates based on each groups.

        Args:
            group (pd.DataFrame):
            ground_truth (str): Name of ground truth header
            pred_name (str):

        Returns:
            pd.Series:
        """

        mae = mean_absolute_error(group[ground_truth], group[pred_name])
        return pd.Series(dict(mae=mae))

X_test["pred"] = predictions
X_test["charges"] = y_test

X_test.groupby("sex").apply(
    _compute_error_by_group,
    ground_truth="charges",
    pred_name="pred").reset_index()

Unnamed: 0,sex,mae
0,female,3451.077043
1,male,4214.880276


In [14]:
# dump the test dataset for the pipeline
pickle.dump(X_test, open("./data/pickle_pandas_tabular_insurance_pipeline_testing.sav".format(save_dir), "wb+"))

# dump the pipeline for this model prediction
pickle.dump(pipe, open("{0}/regression_insurance/regression_insurance_sklearn.pipeline.Pipeline.sav".format(save_dir), "wb+"))

# dump the ground truth for this model
y_test["charges"] = y_test["expenses"] 
y_test = y_test.drop("expenses", axis=1)

# dump the pipeline for this model prediction
pickle.dump(y_test, open("./data/pickle_pandas_tabular_insurance_pipeline_ytest.sav".format(save_dir), "wb+"))

In [15]:
data2 = pickle.load(open("./data/pickle_pandas_tabular_insurance_pipeline_ytest.sav".format(save_dir), "rb"))
data2

Unnamed: 0,charges
528,8342.91
1070,39871.70
55,47496.49
426,6555.07
59,5989.52
...,...
285,7742.11
858,18218.16
702,9504.31
497,8027.97


In [29]:
data1 = pickle.load(open("./data/pickle_pandas_tabular_insurance_pipeline_testing.sav".format(save_dir), "rb"))
data1

Unnamed: 0,age,sex,bmi,children,smoker,region
445,45,female,33.1,0,no,southwest
901,60,male,40.9,0,yes,southeast
408,38,male,21.1,3,no,southeast
928,62,female,39.2,0,no,southeast
290,28,female,33.4,0,no,southwest
...,...,...,...,...,...,...
217,27,male,23.1,0,no,southeast
546,28,male,35.4,0,no,northeast
104,34,female,27.5,1,no,southwest
1275,57,male,23.7,0,no,southwest
