# Import Python Libraries

In [14]:
import pandas as pd
from sklearn.compose import ColumnTransformer, make_column_selector, make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
import numpy as np
from joblib import dump
from sklearn.linear_model import LogisticRegression
from sklearn import set_config
import pandas as pd
from dotenv import dotenv_values

In [15]:
# Lift pandas' display truncation for both columns and rows so frames render in full.
for option_name in ('display.max_columns', 'display.max_rows'):
    pd.set_option(option_name, None)

# Download and Enable the Db2 Magic Commands Extension for Jupyter Notebook

In [2]:
!wget https://raw.githubusercontent.com/IBM/db2-jupyter/master/db2.ipynb
%run db2.ipynb

--2024-05-13 13:53:48--  https://raw.githubusercontent.com/IBM/db2-jupyter/master/db2.ipynb
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 153545 (150K) [text/plain]
Saving to: ‘db2.ipynb’


2024-05-13 13:53:48 (16.2 MB/s) - ‘db2.ipynb’ saved [153545/153545]

         Install pydatagrid if you want to enable scrolling of result sets.
Db2 Extensions Loaded.


## Load the Db2 connection config from the `db2con.env` file

In [4]:
# Read the connection settings from the local `db2con.env` file instead of hard-coding them
# in the notebook. NOTE(review): the keys the file must contain are dictated by the Db2 magic
# and are not visible here — confirm against its documentation.
db2creds = dotenv_values('db2con.env')
# Open the Db2 connection using the settings held in the `db2creds` variable, which the magic
# refers to by name.
%sql CONNECT CREDENTIALS db2creds

Connection successful. db2ml @ localhost 


# Load ML dataset from Db2

In [12]:
query = %sql SELECT * FROM FLIGHTS_TRAIN
df = pd.DataFrame(query)

cols_show = ['MONTH','DAYOFWEEK', 'UNIQUECARRIER', 'ORIGIN', 'DEST', 'DEPDELAY', 'FLIGHTSTATUS']

print('shape of the dataset: ', df.shape)

print('sample rows from the dataset:')
df[cols_show].sample(5)

shape of the dataset:  (160000, 19)
sample rows from the dataset:


Unnamed: 0,MONTH,DAYOFWEEK,UNIQUECARRIER,ORIGIN,DEST,DEPDELAY,FLIGHTSTATUS
113095,3,7,MQ,DFW,SGF,29.0,1
48823,1,6,DL,VPS,ATL,11.0,1
4380,6,5,DL,CVG,SLC,4.0,1
116504,10,1,DL,TPA,CVG,-5.0,0
56054,5,7,XE,LFT,IAH,-7.0,0


# Build and Train a Classification Model using Scikit-Learn

In [19]:
# Seed used for the train/test split so that the split, and therefore the reported accuracy,
# is reproducible across kernel restarts. Matches the seed given to the classifier below.
RANDOM_STATE = 1

# Randomly split the dataset into 2 datasets: train and test sets. The test set has 20% of the
# original samples. The remaining 80% of the samples remain with the train set.
df_test = df.sample(frac=0.20, random_state=RANDOM_STATE)
df_train = df.drop(df_test.index)

# select the subset of columns as the input features for the ML model
input_cols = ['YEAR', 'QUARTER', 'MONTH',
              'DAYOFMONTH', 'DAYOFWEEK', 'UNIQUECARRIER',
              'ORIGIN', 'DEST', 'CRSDEPTIME',
              'DEPDELAY', 'DEPDEL15', 'TAXIOUT', 'WHEELSOFF',
              'CRSARRTIME', 'CRSELAPSEDTIME', 'AIRTIME', 'DISTANCEGROUP']

# select the class label, the target column, for the classification model
target = ['FLIGHTSTATUS']

X = df_train[input_cols]
# `target` is a list, so df_train[target] is a one-column DataFrame. scikit-learn expects a
# 1-D label vector (and warns on a column vector), so squeeze the single column to a Series.
y = df_train[target].astype('int').squeeze(axis=1)

# define the strategy to fill in missing values in the numeric columns (constant 0),
# followed by scaling each feature by its maximum absolute value
num_pipeline = make_pipeline(SimpleImputer(strategy='constant', fill_value=0),
                             MaxAbsScaler())

# define the strategy to fill in missing values in the categorical columns (most frequent value),
# followed by one-hot encoding; categories unseen during training are ignored at predict time
cat_pipeline = make_pipeline(SimpleImputer(strategy='most_frequent'),
                             OneHotEncoder(handle_unknown='ignore'))

# combine the previous 2 pipelines into a data preprocessing pipeline; columns are routed
# to a pipeline by dtype (numeric vs. object)
preprocessing = make_column_transformer(
    (num_pipeline, make_column_selector(dtype_include=np.number)),
    (cat_pipeline, make_column_selector(dtype_include='object'))
)

# create a final pipeline by chaining data preprocessing and a learning algorithm, `LogisticRegression`
pipe_lr = make_pipeline(preprocessing,
                        LogisticRegression(random_state=1,
                                           solver='lbfgs'))

# train the model using the training set features and class labels
pipe_lr.fit(X, y)

# evaluate the trained model using the held-out test set
X_test = df_test[input_cols]
y_test = df_test[target].astype('int').squeeze(axis=1)

predictions = pipe_lr.predict(X_test)

# compute and print the model accuracy (fraction of correct predictions, as a percentage);
# reuses `predictions` rather than running a second inference pass through score()
accuracy = (predictions == y_test.to_numpy()).mean()
print('Accuracy: ', accuracy * 100)

# retrain the model using the complete dataset (train + test) before it is exported
pipe_lr.fit(df[input_cols], df[target].astype('int').squeeze(axis=1))

Accuracy:  87.171875


# Serialize and export the trained model pipeline

In [20]:
# Serialize the fitted pipeline to a joblib file in the current working directory.
dump(value=pipe_lr, filename='myudf_lr.joblib')

['myudf_lr.joblib']

# Create the UDF on Db2

In [21]:
%%sql
CREATE OR REPLACE FUNCTION MYUDF_LR(
    INTEGER,
    INTEGER, INTEGER, INTEGER, INTEGER, INTEGER,
    VARCHAR(50), VARCHAR(50), VARCHAR(50),
    INTEGER,
    REAL, REAL,
    INTEGER, INTEGER, INTEGER, INTEGER, INTEGER, INTEGER
)
RETURNS TABLE (
    "YEAR" INTEGER, "QUARTER" INTEGER, "MONTH" INTEGER,
    "DAYOFMONTH" INTEGER, "DAYOFWEEK" INTEGER,
    "UNIQUECARRIER" VARCHAR(50), "ORIGIN" VARCHAR(50), "DEST" VARCHAR(50),
    "CRSDEPTIME" INTEGER,
    "DEPDELAY" REAL, "DEPDEL15" REAL,
    "TAXIOUT" INTEGER, "WHEELSOFF" INTEGER, "CRSARRTIME" INTEGER,
    "CRSELAPSEDTIME" INTEGER, "AIRTIME" INTEGER, "DISTANCEGROUP" INTEGER,
    "FLIGHTSTATUS_PREDICTION" INTEGER
)
LANGUAGE PYTHON
PARAMETER STYLE NPSGENERIC
FENCED
NOT THREADSAFE
NO FINAL CALL
DISALLOW PARALLEL
NO DBINFO
DETERMINISTIC
NO EXTERNAL ACTION
CALLED ON NULL INPUT
NO SQL
EXTERNAL NAME '/home/shaikhq/db2-pythonudf-tutorial/myudf_lr.py'

Command completed.


# Generate Predictions using this UDF

In [22]:
%%sql
SELECT scored.*
FROM FLIGHTS_TEST AS src,
     TABLE(
         MYUDF_LR(
             (SELECT COUNT(*) FROM FLIGHTS_TEST),
             src."YEAR", src."QUARTER", src."MONTH", src."DAYOFMONTH", src."DAYOFWEEK",
             src."UNIQUECARRIER", src."ORIGIN", src."DEST",
             src."CRSDEPTIME", src."DEPDELAY", src."DEPDEL15",
             src."TAXIOUT", src."WHEELSOFF", src."CRSARRTIME",
             src."CRSELAPSEDTIME", src."AIRTIME", src."DISTANCEGROUP"
         )
     ) AS scored

Unnamed: 0,YEAR,QUARTER,MONTH,DAYOFMONTH,DAYOFWEEK,UNIQUECARRIER,ORIGIN,DEST,CRSDEPTIME,DEPDELAY,DEPDEL15,TAXIOUT,WHEELSOFF,CRSARRTIME,CRSELAPSEDTIME,AIRTIME,DISTANCEGROUP,FLIGHTSTATUS_PREDICTION
0,2011,4,12,4,7,DL,ATL,BNA,825,-5.0,0.0,26,846,838,73,39,1,0
1,2013,4,10,9,3,9E,ATW,MSP,1730,-15.0,0.0,18,1733,1835,65,49,1,0
2,2014,1,2,1,6,EV,ATL,MGM,1135,-3.0,0.0,8,1140,1128,53,40,1,0
3,2013,3,8,5,1,EV,SAT,DTW,1642,-1.0,0.0,17,1658,2054,192,153,5,0
4,2016,4,12,21,3,OO,MSP,DAY,1010,-5.0,0.0,30,1035,1312,122,71,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39995,2013,2,6,24,1,OO,SGU,DEN,1331,-6.0,0.0,8,1333,1509,98,75,3,0
39996,2017,4,12,10,7,UA,BOS,IAD,1941,8.0,0.0,22,2011,2120,99,73,2,1
39997,2011,2,6,13,1,MQ,JFK,RDU,1225,-8.0,0.0,21,1238,1400,95,77,2,1
39998,2017,4,12,20,3,WN,MDW,PHL,1415,21.0,1.0,10,1446,1705,110,88,3,1
