In [1]:
import jarvis

# Select 'git' as the jarvis ground client and give jarvis a notebook filename.
# NOTE(review): presumably the filename is this notebook's own name and both
# calls must run before any jarvis.func / Action / Artifact is declared --
# confirm against the jarvis documentation.
jarvis.groundClient('git')
jarvis.jarvisFile('JarvisParameterTuning.ipynb')

In [2]:
import sklearn.linear_model as linear_model
import sklearn
import seaborn as sns
import pandas as pd
import numpy as np

# Data Loading

Here I am using a built-in dataset to keep the example quick.  In practice I would probably download the data from an external source.

In [3]:
@jarvis.func
def crawl():
    """Load seaborn's 'titanic' example dataset and return it unchanged."""
    passengers = sns.load_dataset('titanic')
    return passengers

# Wrap crawl in an Action that takes no inputs, then declare the artifact
# 'titanic.pkl' against that action.
# NOTE(review): presumably the artifact holds crawl's return value -- confirm.
doCrawl = jarvis.Action(crawl)
titanic_data = jarvis.Artifact('titanic.pkl', doCrawl) 

In [4]:
# Preview the raw data: the callback is handed the artifact's value and returns
# its first rows, which become this cell's output.
titanic_data.peek(lambda x: x.head())

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


# Data Processing

I need to extract some binary features

In [5]:
@jarvis.func
def featurize(df):
    """Return ``df`` with pandas' default indicator (dummy) encoding applied.

    ``pd.get_dummies`` is called with its defaults, so which columns get
    expanded is decided by pandas; no column is excluded here.
    """
    encoded = pd.get_dummies(df)
    return encoded

# Action running featurize with the raw titanic artifact as its only input;
# 'ft_titanic.pkl' is the artifact declared against that action.
doFeaturize = jarvis.Action(featurize, [titanic_data])
ft_titanic_data = jarvis.Artifact('ft_titanic.pkl', doFeaturize)

In [6]:
# Preview the featurized data: the callback returns the first rows of the
# artifact's value.
ft_titanic_data.peek(lambda x: x.head())

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,adult_male,alone,sex_female,sex_male,...,deck_C,deck_D,deck_E,deck_F,deck_G,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton,alive_no,alive_yes
0,0,3,22.0,1,0,7.25,True,False,0,1,...,0,0,0,0,0,0,0,1,1,0
1,1,1,38.0,1,0,71.2833,False,False,1,0,...,1,0,0,0,0,1,0,0,0,1
2,1,3,26.0,0,0,7.925,False,True,1,0,...,0,0,0,0,0,0,0,1,0,1
3,1,1,35.0,1,0,53.1,False,False,1,0,...,1,0,0,0,0,0,0,1,0,1
4,0,3,35.0,0,0,8.05,True,True,0,1,...,0,0,0,0,0,0,0,1,1,0


# Make the training matrices

In [7]:
@jarvis.func
def separateLabels(df):
    """Split a featurized frame into a feature matrix and a label vector.

    Rows containing any missing value are dropped first. The label is the
    'survived' column; every other column is cast to float and kept as a
    feature.

    NOTE: only 'survived' is removed from the features, so any other column
    that encodes the outcome stays in X.

    Returns:
        (X, Y): the float feature array and the array of 'survived' values.
    """
    complete_rows = df.dropna()
    labels = complete_rows['survived'].values
    feature_frame = complete_rows.drop('survived', axis=1)
    return feature_frame.values.astype('float'), labels

# One action, two artifacts: both are declared against doSepLabels.
# NOTE(review): presumably the artifacts are matched, in declaration order, to
# the (X, Y) tuple separateLabels returns -- confirm against jarvis.
doSepLabels = jarvis.Action(separateLabels, [ft_titanic_data])
X_ft_titanic_data = jarvis.Artifact('x_ft_titanic.pkl', doSepLabels)
Y_ft_titanic_data = jarvis.Artifact('y_ft_titanic.pkl', doSepLabels)

# Train Test Split

In [8]:
@jarvis.func
def trainTestSplit(X, Y, test_size, random_state):
    """Split features and labels into train and test parts.

    Thin wrapper around scikit-learn's ``train_test_split``; ``test_size`` and
    ``random_state`` are forwarded unchanged.

    Returns:
        A 4-tuple ``(X_train, X_test, Y_train, Y_test)``.
    """
    # Kept as a function-local import, as in the original.
    # NOTE(review): presumably so the function is self-contained when jarvis
    # runs it -- confirm.
    from sklearn.model_selection import train_test_split
    pieces = train_test_split(X, Y, test_size=test_size, random_state=random_state)
    train_x, test_x, train_y, test_y = pieces
    return train_x, test_x, train_y, test_y

# Inputs are listed in trainTestSplit's parameter order, so the two literals
# line up with test_size (0.1) and random_state (42).
# NOTE(review): the four artifacts are presumably matched, in declaration
# order, to the returned (X_tr, X_te, Y_tr, Y_te) tuple. As declared here the
# order is X_tr, X_te, Y_tr, Y_te -- confirm against jarvis.
doTrTeSplit = jarvis.Action(trainTestSplit, [X_ft_titanic_data, Y_ft_titanic_data, jarvis.Literal(0.1), jarvis.Literal(42)])
X_tr = jarvis.Artifact('tr_x_ft_titanic.pkl', doTrTeSplit)
X_te = jarvis.Artifact('te_x_ft_titanic.pkl', doTrTeSplit)
Y_tr = jarvis.Artifact('tr_y_ft_titanic.pkl', doTrTeSplit)
Y_te = jarvis.Artifact('te_y_ft_titanic.pkl', doTrTeSplit)

# Model Development

First cut at model development

In [9]:
@jarvis.func
def trainModel(X_tr, Y_tr, n_estimators, min_samples_split, random_state=42):
    """Fit a random-forest classifier on the training split.

    Args:
        X_tr: Training feature matrix.
        Y_tr: Training labels aligned with ``X_tr``.
        n_estimators: Number of trees; forwarded to RandomForestClassifier.
        min_samples_split: Minimum samples needed to split a node; forwarded
            to RandomForestClassifier.
        random_state: Seed forwarded to RandomForestClassifier so that
            repeated runs build the same forest and report the same
            accuracies. Defaults to 42, the seed this notebook already uses
            for the train/test split. Pass None for an unseeded forest.

    Returns:
        The fitted RandomForestClassifier.
    """
    # Kept as a function-local import, as in the original.
    # NOTE(review): presumably so the function is self-contained when jarvis
    # runs it -- confirm.
    from sklearn.ensemble import RandomForestClassifier
    model = RandomForestClassifier(
        n_estimators=n_estimators,
        min_samples_split=min_samples_split,
        random_state=random_state,
    )
    model.fit(X_tr, Y_tr)
    return model

# Inputs follow trainModel's parameter order, so the literals line up with
# n_estimators (10) and min_samples_split (2). 'model.pkl' is the artifact
# declared against the training action.
doTrainModel = jarvis.Action(trainModel, [X_tr, Y_tr, jarvis.Literal(10), jarvis.Literal(2)])
model = jarvis.Artifact('model.pkl', doTrainModel)

In [10]:
@jarvis.func
def scoreModel(model, X_tr, X_te, Y_tr, Y_te):
    """Build a two-line accuracy report for a fitted model.

    Calls ``model.score`` on the training split and then on the test split.

    Returns:
        A 1-tuple holding the report text: the train line and the test line
        joined by a newline.
    """
    report_lines = [
        "Train Accuracy: {}".format(model.score(X_tr, Y_tr)),
        "Test Accuracy: {}".format(model.score(X_te, Y_te)),
    ]
    return ('\n'.join(report_lines), )

# Inputs follow scoreModel's parameter order: model, X_tr, X_te, Y_tr, Y_te.
# 'output.txt' is the artifact declared against the scoring action.
doScoreModel = jarvis.Action(scoreModel, [model, X_tr, X_te, Y_tr, Y_te])
output = jarvis.Artifact('output.txt', doScoreModel)

In [11]:
# Print the report: the callback joins the pieces of the artifact's value into
# one string and prints it.
output.peek(lambda x: print(''.join(x)))

Train Accuracy: 1.0
Test Accuracy: 1.0



**Error!!!** 

The accuracy is too high!  We must have a feature that contains the label

In [12]:
# List the column names of the featurized data after rows with missing values
# are dropped, to look for a feature that duplicates the label.
ft_titanic_data.peek(lambda x: x.dropna().columns)

Index(['survived', 'pclass', 'age', 'sibsp', 'parch', 'fare', 'adult_male',
       'alone', 'sex_female', 'sex_male', 'embarked_C', 'embarked_Q',
       'embarked_S', 'class_First', 'class_Second', 'class_Third', 'who_child',
       'who_man', 'who_woman', 'deck_A', 'deck_B', 'deck_C', 'deck_D',
       'deck_E', 'deck_F', 'deck_G', 'embark_town_Cherbourg',
       'embark_town_Queenstown', 'embark_town_Southampton', 'alive_no',
       'alive_yes'],
      dtype='object')

Notice that the **alive_no** and **alive_yes** columns appear to contain the same data as **survived**.  These columns need to be dropped.

# Re-make the training matrices

In [13]:
@jarvis.func
def separateLabels(df):
    """Split a featurized frame into features and labels, without label leaks.

    Rows containing any missing value are dropped first. The label is the
    'survived' column. The features are every remaining column except
    'alive_no' and 'alive_yes', cast to float.

    Returns:
        (X, Y): the float feature array and the array of 'survived' values.
    """
    complete_rows = df.dropna()
    # 'survived' is the label itself; the two 'alive_*' columns are dropped
    # with it so they cannot act as features.
    excluded = ['survived', 'alive_no', 'alive_yes']
    feature_frame = complete_rows.drop(excluded, axis=1)
    labels = complete_rows['survived'].values
    return feature_frame.values.astype('float'), labels

# Re-declare the action and both artifacts under the same names and filenames
# as before, now bound to the redefined separateLabels.
# NOTE(review): presumably the artifacts are matched, in declaration order, to
# the returned (X, Y) tuple -- confirm against jarvis.
doSepLabels = jarvis.Action(separateLabels, [ft_titanic_data])
X_ft_titanic_data = jarvis.Artifact('x_ft_titanic.pkl', doSepLabels)
Y_ft_titanic_data = jarvis.Artifact('y_ft_titanic.pkl', doSepLabels)

# Train Test Split (Again)

In [14]:
# Rebuild the split step so that it takes the re-declared X/Y artifacts as
# inputs; the literals are again test_size (0.1) and random_state (42).
doTrTeSplit = jarvis.Action(trainTestSplit, [X_ft_titanic_data, Y_ft_titanic_data, jarvis.Literal(0.1), jarvis.Literal(42)])
X_tr = jarvis.Artifact('tr_x_ft_titanic.pkl', doTrTeSplit)
X_te = jarvis.Artifact('te_x_ft_titanic.pkl', doTrTeSplit)
Y_tr = jarvis.Artifact('tr_y_ft_titanic.pkl', doTrTeSplit)
Y_te = jarvis.Artifact('te_y_ft_titanic.pkl', doTrTeSplit)

# Model Development (Again)

Second cut at model development, now that the label-leaking columns have been removed

In [15]:
# Rebuild the training step on the re-declared train artifacts; the literals
# are again n_estimators (10) and min_samples_split (2).
doTrainModel = jarvis.Action(trainModel, [X_tr, Y_tr, jarvis.Literal(10), jarvis.Literal(2)])
model = jarvis.Artifact('model.pkl', doTrainModel)

In [16]:
# Rebuild the scoring step on the re-declared model and split artifacts.
doScoreModel = jarvis.Action(scoreModel, [model, X_tr, X_te, Y_tr, Y_te])
output = jarvis.Artifact('output.txt', doScoreModel)

In [17]:
# Print the new report: the callback joins the pieces of the artifact's value
# into one string and prints it.
output.peek(lambda x: print(''.join(x)))

Train Accuracy: 0.9610591900311527
Test Accuracy: 0.75



In [18]:
# NOTE(review): pull() presumably materializes 'output.txt' by running the
# pipeline behind it -- its exact effects are not visible here; confirm against
# the jarvis documentation.
output.pull()

# Model selection through search

**To be continued after Aggregation is implemented ...**