In [87]:
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator
import pandas as pd
import numpy as np
import joblib
import boto3

This is your AWS profile name set locally:

In [125]:
profile_name = "mfa"

In [132]:
boto_session = boto3.session.Session(profile_name=profile_name)

In [174]:
churn = pd.read_csv("./churn.txt")
pd.set_option("display.max_columns", 500)
churn

Unnamed: 0,State,Account Length,Area Code,Phone,Int'l Plan,VMail Plan,VMail Message,Day Mins,Day Calls,Day Charge,Eve Mins,Eve Calls,Eve Charge,Night Mins,Night Calls,Night Charge,Intl Mins,Intl Calls,Intl Charge,CustServ Calls,Churn?
0,PA,163,806,403-2562,no,yes,300,8.162204,3,7.579174,3.933035,4,6.508639,4.065759,100,5.111624,4.928160,6,5.673203,3,True.
1,SC,15,836,158-8416,yes,no,0,10.018993,4,4.226289,2.325005,0,9.972592,7.141040,200,6.436188,3.221748,6,2.559749,8,False.
2,MO,131,777,896-6253,no,yes,300,4.708490,3,4.768160,4.537466,3,4.566715,5.363235,100,5.142451,7.139023,2,6.254157,4,False.
3,WY,75,878,817-5729,yes,yes,700,1.268734,3,2.567642,2.528748,5,2.333624,3.773586,450,3.814413,2.245779,6,1.080692,6,False.
4,WY,146,878,450-4942,yes,no,0,2.696177,3,5.908916,6.015337,3,3.670408,3.751673,250,2.796812,6.905545,4,7.134343,6,True.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,NH,4,787,151-3162,yes,yes,800,10.862632,5,7.250969,6.936164,1,8.026482,4.921314,350,6.748489,4.872570,8,2.122530,9,False.
4996,SD,140,836,351-5993,no,no,0,1.581127,8,3.758307,7.377591,7,1.328827,0.939932,300,4.522661,6.938571,2,4.600473,4,False.
4997,SC,32,836,370-3127,no,yes,700,0.163836,5,4.243980,5.841852,3,2.340554,0.939469,450,5.157898,4.388328,7,1.060340,6,False.
4998,MA,142,776,604-2108,yes,yes,600,2.034454,5,3.014859,4.140554,3,3.470372,6.076043,150,4.362780,7.173376,3,4.871900,7,True.


In [178]:
churn["Area Code"].unique()

array([806, 836, 777, 878, 866, 737, 766, 657, 676, 827, 716, 786, 778,
       788, 787, 736, 686, 876, 858, 877, 776, 797, 707, 677, 727, 758,
       659, 678, 848, 658, 798, 847, 868], dtype=object)

In [175]:
churn = churn.drop("Phone", axis=1)
churn["Area Code"] = churn["Area Code"].astype(object)

In [176]:
churn = churn.drop(["Day Charge", "Eve Charge", "Night Charge", "Intl Charge"], axis=1)

In [177]:
train_data, validation_data, test_data = np.split(
    churn.sample(frac=1, random_state=1729),
    [int(0.7 * len(churn)), int(0.9 * len(churn))],
)

In [150]:
X_train, y_train = train_data.iloc[:, 0:-1], train_data.iloc[:, -1]
X_train.shape, y_train.shape

((3500, 15), (3500,))

In [151]:
class PandasGetDummies(BaseEstimator):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        assert isinstance(X, (pd.Series, pd.DataFrame))
        return pd.get_dummies(X)

In [152]:
clf = RandomForestClassifier(
    max_depth=5,
    random_state=0
)

In [153]:
pipe = Pipeline(
    [('OneHotEncoder', PandasGetDummies()), ("rf", clf)]
)

In [154]:
pipe.fit(X_train, y_train)

Pipeline(steps=[('OneHotEncoder', PandasGetDummies()),
                ('rf', RandomForestClassifier(max_depth=5, random_state=0))])

In [171]:
pipe.predict_proba(validation_data.iloc[:, 0:-1])

array([[0.53713397, 0.46286603],
       [0.34031858, 0.65968142],
       [0.2363877 , 0.7636123 ],
       ...,
       [0.3429229 , 0.6570771 ],
       [0.35934799, 0.64065201],
       [0.77737358, 0.22262642]])

In [44]:
joblib.dump(clf, "sklearn_model.joblib")

['sklearn_model.joblib']

Test the model loading it:

In [46]:
clf2 = joblib.load("sklearn_model.joblib")

In [64]:
clf2.predict_proba(test_data.iloc[3, 1:].values.reshape(1, -1))[0][0]

0.729533309936247