### Elliptic Envelope (Outlier detection)

In [1]:
import numpy as np
from sklearn.covariance import EllipticEnvelope

In [2]:
# Covariance used to generate the synthetic 2-D Gaussian sample.
true_cov = np.array([[.8, .3], [.3, .4]])

# Dedicated, seeded generator so the 500 points are identical on every run.
rng = np.random.RandomState(0)
X = rng.multivariate_normal(mean=[0, 0], cov=true_cov, size=500)

In [3]:
# Fit the envelope on the synthetic sample, then label two probe points
# (the recorded output is 1 for [0, 0] and -1 for [3, 3]).
cov = EllipticEnvelope(random_state=0)
cov.fit(X)
cov.predict([[0, 0], [3, 3]])

array([ 1, -1])

In [4]:
# Covariance matrix estimated by the fitted envelope (compare with true_cov).
cov.covariance_

array([[0.74118335, 0.25357049],
       [0.25357049, 0.30531502]])

In [6]:
# Precision matrix of the fitted estimator; the displayed values are the
# inverse of the covariance_ matrix shown in the previous cell.
cov.get_precision()

array([[ 1.88470228, -1.56528451],
       [-1.56528451,  4.57530697]])

### Regression Evaluation metrics

In [7]:
import numpy as np

def get_simple_regression_samples(n, b0=-0.3, b1=0.5, error=0.2, seed=None):
    """
    Draw a noisy sample from the line ``t = b0 + b1 * x``.

    Parameters
    ----------
    n : int
        Number of samples to draw.
    b0, b1 : float
        Intercept and slope of the underlying line.
    error : float
        Standard deviation of the Gaussian noise added to the targets.
    seed : int or None
        When not None, NumPy's global RNG is seeded with this value first,
        so repeated calls with the same seed return the same sample.

    Returns
    -------
    tuple
        ``(X, t)`` where ``X`` has shape ``(n, 1)`` with values drawn
        uniformly from [-1, 1) and ``t`` has shape ``(n,)``.
    """
    # `is not None` rather than truthiness, so that seed=0 is honoured too.
    if seed is not None:
        np.random.seed(seed)

    trueX = np.random.uniform(-1, 1, n)
    trueT = b0 + (b1*trueX)
    return np.array([trueX]).T, trueT + np.random.normal(0, error, n)

In [15]:
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression

X, y_true = get_simple_regression_samples(20, -0.3, 0.5, seed=42)
reg = LinearRegression()
reg.fit(X, y_true)

# Score of the fit on the training data, kept in a name instead of being
# discarded as a bare mid-cell expression.
r2 = reg.score(X, y_true)

y_pred = reg.predict(X)

# scikit-learn metrics take (y_true, y_pred) in that order.
print(np.sqrt(mean_squared_error(y_true, y_pred)))
# The residual standard deviation matches the RMSE here because least-squares
# residuals with an intercept average to zero.
print(round(np.std(y_pred-y_true), 3))


0.17611465861417525
0.176


### Classifier Evaluation metrics

In [1]:
from sklearn.metrics import classification_report

# True and predicted class indices for twelve customers.
y_true = [0,1,0,2,1,0,0,2,1,1,0,1]
y_pred = [0,0,0,2,1,0,2,1,2,1,0,1]

# Display names, matched by position to class indices 0, 1 and 2.
target_names = ['retained customer', 'unretained customer', 'on hold customer']

report = classification_report(y_true, y_pred, target_names=target_names)
print(report)

                     precision    recall  f1-score   support

  retained customer       0.80      0.80      0.80         5
unretained customer       0.75      0.60      0.67         5
   on hold customer       0.33      0.50      0.40         2

           accuracy                           0.67        12
          macro avg       0.63      0.63      0.62        12
       weighted avg       0.70      0.67      0.68        12



In [3]:
from sklearn.metrics import confusion_matrix

# Rows are the true classes and columns the predicted classes.
cm = confusion_matrix(y_true, y_pred)
print(cm)

[[4 0 1]
 [1 3 1]
 [0 1 1]]


### Watson Developer Cloud Python SDK

In [4]:
%pip install --upgrade ibm-watson

Collecting ibm-watson
  Downloading ibm-watson-4.7.1.tar.gz (385 kB)
[K     |████████████████████████████████| 385 kB 1.0 MB/s eta 0:00:01
Collecting websocket-client==0.48.0
  Downloading websocket_client-0.48.0-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 4.2 MB/s eta 0:00:01
[?25hCollecting ibm_cloud_sdk_core==1.7.3
  Downloading ibm-cloud-sdk-core-1.7.3.tar.gz (27 kB)
Collecting PyJWT>=1.7.1
  Downloading PyJWT-1.7.1-py2.py3-none-any.whl (18 kB)
Building wheels for collected packages: ibm-watson, ibm-cloud-sdk-core
  Building wheel for ibm-watson (setup.py) ... [?25ldone
[?25h  Created wheel for ibm-watson: filename=ibm_watson-4.7.1-py3-none-any.whl size=379130 sha256=29d672d76e234e9a0f07314e6cd8fe311818c60bc6f09ab23a8f90273e03f492
  Stored in directory: /Users/kenrysanchez/Library/Caches/pip/wheels/d1/4f/81/0101a1b7978cb74cc02ea6eed0c34f2349cfa07c0e82f4f151
  Building wheel for ibm-cloud-sdk-core (setup.py) ... [?25ldone
[?25h  Created whee

### Bootstrapping Samples Random Forest

In [3]:
import numpy as np
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

np.random.seed(0)
iris = load_iris()
X, y = iris.data, iris.target

# NOTE(review): `indices` is never read below. The shuffle is kept because it
# advances NumPy's global RNG, which presumably feeds the unseeded split and
# bagging draws that follow -- confirm before removing it.
indices = np.arange(y.shape[0])
np.random.shuffle(indices)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y
)

# Compare three base learners, each wrapped in a bagging ensemble that is
# trained on half of the samples and half of the features per estimator.
bm_name = ['KNN','DT ','SVM']
base_models = [KNeighborsClassifier(), DecisionTreeClassifier(), SVC(kernel='rbf')]
for label, basemodel in zip(bm_name, base_models):
    clf = BaggingClassifier(
        basemodel,
        n_estimators=10,
        max_samples=0.5,
        max_features=0.5,
    )
    pipe = Pipeline(steps=[
        ('scaler', StandardScaler()),
        ('bagged_clf', clf),
    ])
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)

    weighted_f1 = f1_score(y_test, y_pred, average='weighted')
    print(label, "f1_score", round(weighted_f1, 3))


KNN f1_score 0.933
DT  f1_score 0.967
SVM f1_score 0.967


### Neural Network

In [4]:
import sys
import os
import re
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline

def load_data(data_dir=None):
    """
    Load the AAVAIL customer table and split off the target.

    Parameters
    ----------
    data_dir : str or None
        Directory containing ``aavail-target.csv``. Defaults to ``../data``
        relative to the current working directory, as before.

    Returns
    -------
    tuple
        ``(y, df)`` where ``y`` is a float array that is 1 where
        ``is_subscriber`` equals 0 and 0 elsewhere, and ``df`` is the
        remaining frame without the ``is_subscriber``, ``customer_id`` and
        ``customer_name`` columns.

    Raises
    ------
    FileNotFoundError
        If the CSV file is not present in ``data_dir``.
    """

    if data_dir is None:
        data_dir = os.path.join("..", "data")
    df = pd.read_csv(os.path.join(data_dir, "aavail-target.csv"))

    ## pull out the target and remove unneeded columns
    _y = df.pop('is_subscriber')
    y = np.zeros(_y.size)
    ## the label is inverted: the positive class is "not a subscriber"
    y[(_y == 0).values] = 1
    df = df.drop(columns=['customer_id', 'customer_name'])
    return(y, df)


## variables
rs = 42  # seed reused for the train/test split and the network

## preprocessing pipeline
numeric_features = ['age', 'num_streams']
categorical_features = ['country', 'subscriber_type']

## numeric columns: mean-impute, then standardise
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

## categorical columns: fill gaps with the literal 'missing', then one-hot encode
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

## route each column group through its own transformer
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

if __name__ == "__main__":

    ## load the data
    y,df = load_data()
    X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.2, stratify=y, random_state=rs)

    pipe  = Pipeline(steps=[('preprocessor', preprocessor),
                            ('nn', MLPClassifier(alpha=1e-5,random_state=rs))])

    ## hyper-parameters searched for the 'nn' step of the pipeline
    param_grid = {
    'nn__activation': ['logistic', 'tanh', 'relu'],
    'nn__solver': ['lbfgs', 'sgd','adam'],
    'nn__hidden_layer_sizes': [(10,10), (50,50), (100,100), (50,50,50), (100,100,100)]
    }

    ## `iid` is not passed: it was deprecated in scikit-learn 0.22 and removed
    ## in 0.24, where passing it raises a TypeError
    grid = GridSearchCV(pipe, param_grid=param_grid, cv=5, n_jobs=-1)
    grid.fit(X_train, y_train)
    y_pred = grid.predict(X_test)

    ## extract feature names
    ## (fitting `pipe` fits `preprocessor` in place, which exposes transformers_)
    pipe.fit(X_train, y_train)
    onehot = preprocessor.transformers_[1][1].named_steps['onehot']
    ## get_feature_names() was removed in scikit-learn 1.2; prefer its
    ## replacement and fall back on releases that predate it
    if hasattr(onehot, "get_feature_names_out"):
        onehot_names = onehot.get_feature_names_out()
    else:
        onehot_names = onehot.get_feature_names()
    feature_names = np.hstack([numeric_features, onehot_names])
    ## raw string so that \d reaches the regex engine as a digit class
    feature_names = [re.sub(r"x\d_","",fn) for fn in feature_names]
    target_names = ['subscriber','churned']
    print(feature_names)
    print(classification_report(y_test, y_pred, target_names=target_names))

FileNotFoundError: [Errno 2] No such file or directory: '../data/aavail-target.csv'

In [None]:
import joblib
import time

## train the grid search once and cache it on disk; later runs reload it
saved_model = 'my-saved-model.joblib'
if not os.path.exists(saved_model):
    time_start = time.time()
    ## `iid` is not passed: it was deprecated in scikit-learn 0.22 and removed
    ## in 0.24, where passing it raises a TypeError
    grid = GridSearchCV(pipe, param_grid=param_grid, cv=5, n_jobs=-1)
    grid.fit(X_train, y_train)
    joblib.dump(grid, saved_model)
    print("train time", time.strftime('%H:%M:%S', time.gmtime(time.time()-time_start)))
else:
    print("loading {} from file".format(saved_model))
    ## joblib files are pickles: only load a file this notebook wrote itself
    grid = joblib.load(saved_model)

## print best parameters
print(grid.best_params_)

### IBM Watson Python SDK

In [5]:
import sys
import os
import json
from ibm_watson import VisualRecognitionV3
from ibm_watson.visual_recognition_v4 import FileWithMetadata, TrainingDataObject, Location, AnalyzeEnums
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator

### import API key
apikey_dir = os.path.join(os.path.expanduser("~"),".ibm")

## fail before touching sys.path, and name the actual directory in the message
if not os.path.exists(apikey_dir):
    raise Exception("please store your API key in a file within '{}' before proceeding".format(apikey_dir))

## guard against duplicate entries when the cell is re-run
if apikey_dir not in sys.path:
    sys.path.append(apikey_dir)

## ibmauth is a local module expected to live in apikey_dir
from ibmauth import VR_KEY, VR_URL, VR_VERSION

def connect_watson_vr():
    """
    Build a Visual Recognition V3 client for the configured service.

    The client is authenticated with VR_KEY, created for VR_VERSION and
    pointed at VR_URL. Prints a confirmation line and returns the client.

    NOTE(review): in the recorded run the confirmation is printed and the
    later classify call then times out, so no network round trip appears to
    happen here -- confirm.
    """

    vr_client = VisualRecognitionV3(version=VR_VERSION,
                                    authenticator=IAMAuthenticator(VR_KEY))
    vr_client.set_service_url(VR_URL)

    print("\nConnection established.\n")
    return vr_client

if __name__ == "__main__":

    service = connect_watson_vr()

    ## classify an image from a URL
    image_url = "https://watson-developer-cloud.github.io/doc-tutorial-downloads/visual-recognition/fruitbowl.jpg"

    ## request first, then unwrap the payload from the response object
    classify_response = service.classify(
        url=image_url,
        threshold='0.1',
        classifier_ids=['food'],
    )
    fruitbowl_results = classify_response.get_result()
    print(json.dumps(fruitbowl_results, indent=2))


Connection established.



ConnectTimeout: HTTPSConnectionPool(host='iam.cloud.ibm.com', port=443): Max retries exceeded with url: /identity/token (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x116e8f2e0>, 'Connection to iam.cloud.ibm.com timed out. (connect timeout=60)'))