In [None]:
# importing some basic libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import cv2

import os
import gc # garbage collector to free GPU memory
from tqdm import tqdm # visualization of progress in loops


In [None]:
# ::::::::::::::::::::::::::::::::::::::::::::::::::::
#   adding some commands for multi-GPU utilization
# ::::::::::::::::::::::::::::::::::::::::::::::::::::

from dask_cuda import LocalCUDACluster
from dask.distributed import Client, wait

# Boundary conditions: restrict the cluster to the two Tesla cards.
n_workers = 2 # restricting the usage of the GPU to avoid utilizing the Quadro P4000
# NOTE(review): n_workers only fixes the number of workers; presumably the two Teslas
# are the first two visible CUDA devices (otherwise set CUDA_VISIBLE_DEVICES) - confirm.
n_streams = 8 # stream count handed to the cuML random forests further down (has been 8)

# Start the local CUDA cluster; n_workers caps it at two single-threaded workers
# instead of the default of one worker per visible GPU.
cluster = LocalCUDACluster(n_workers= n_workers, threads_per_worker=1)
c = Client(cluster)

# Query the client for all connected workers (the keys of has_what() are the worker addresses)
workers = c.has_what().keys()

print("which worker will be used: ", workers)


In [None]:
# loading kaggle API to download the data set needed
import kaggle

dataset_name= "micropcb-images.zip" # name of the zip-file, which is getting downloaded from kaggle
dataset_base_folder= "/media/igor/FEM-storage-Linux/Phython_project_datasets"
dataset_destination= dataset_base_folder+ "/PCB_defects_dataset/"
notebook_location= "/media/igor/Linux_drive-A/Eigene_Dateien/Python_project/pythonProject-007_GPGPUs"

# check_zip  == 1 only if the zip-file really exists in the notebook directory
# check_data == 1 only if the extracted dataset folder really exists
# Both flags are derived from the file system. The download / unzip commands below are
# disabled (commented out), therefore they must not flip a flag to 1: the next cell
# uses check_data to decide which dataset folder is read.

print("1.1 looping through the notebook directory to check if the dataset has been downloaded already")
# os.path.isfile is False for a missing folder as well, so an unmounted drive no longer crashes the cell
check_zip= 1 if os.path.isfile(os.path.join(notebook_location, dataset_name)) else 0
if check_zip==1:
    print("*** dataset exists ***")
else:
    print("\n".join(("dataset is not yet found in notebook directory", "checking dataset destination for extracted files...")))

print("2.1 checking if dataset has been extracted already to file destination")
check_data= 1 if os.path.isdir(dataset_destination) else 0
if check_data==1:
    print("*** files are already extracted from zip-file ***")
else:
    print("\n".join(("dataset is not yet found in file destination", "files are now either extracted or downloaded first and then extracted to its destination")))


# downloading the dataset from kaggle
if check_zip==0 and check_data==0:
    print("2.2 start downloading the dataset from kaggle")
    # !kaggle datasets download -d dataset_lacation
    # when the command above is re-enabled, re-run the zip check instead of forcing check_zip=1

# generating a folder on the local machine and unzipping the files
if check_data==0:
    print("2.3 generating the file destination folder: ", dataset_destination)
    # os.makedirs(dataset_destination, exist_ok=True)

# unzipping the files
if check_zip==1 and check_data==0:
    print("2.4 extracting the dataset to its file destination")
    # !unzip micropcb-images.zip -d /media/igor/FEM-storage-Linux/Phython_project_datasets/PCB_defects_dataset
    # when the command above is re-enabled, re-run the folder check instead of forcing check_data=1

# remove zip-file after extracting the data
if check_zip==1 and check_data==1:
    print("2.5 removing the zip-file from notebook directory to free disk space")
    #os.remove(dataset_name) # the zip-file needs to be in the same folder as the notebook


In [None]:
# defining some basic settings
# Pick the dataset root: the extracted copy if the previous cell found it,
# otherwise the fallback copy inside the project folder.
if check_data!=1:
    print("dataset location is on HDD")
    directory_settings= "/media/igor/Linux_drive-A/Eigene_Dateien/Python_project/pythonProject-007_GPGPUs/data-sets/PCB_dataset_defects/"
    # alternative location used previously:
    # directory_settings= "/home/igor/Eigene_Dateien/Python_project/pythonProject-007_GPGPUs/data-sets/PCB_dataset_defects/"
else:
    print("dataset location is on SSD")
    directory_settings= dataset_destination

# the image folders are nested one level deep inside the dataset root
directory_train= f"{directory_settings}train_coded/train_coded"
directory_test=  f"{directory_settings}test_coded/test_coded"



In [None]:
#some helper-functions

# ::::::::::::::::::::::::::::::::::::::::::::::::::::
# reading a folder and returning its file names
# ::::::::::::::::::::::::::::::::::::::::::::::::::::
def file_names(location):
    """Return the names of all entries located directly inside ``location``.

    Thin wrapper around ``os.listdir``: the result is a list of bare names
    (no directory prefix) in whatever order the operating system reports them.
    """
    entries = os.listdir(location)
    return entries

# ::::::::::::::::::::::::::::::::::::::::::::::::::::
# generating a test and training DataFrame
# ::::::::::::::::::::::::::::::::::::::::::::::::::::
def df_gen(_type, directory=None):
    """Build one DataFrame from every ``<_type>_*.csv`` file of the dataset folder.

    The first matching csv is loaded unchanged. Every further matching csv is
    appended column-wise after its "Image" column has been dropped, so the
    result carries exactly one "Image" column.

    Parameters
    ----------
    _type : str
        Part of the file name in front of the first "_" (e.g. "train" or "test").
    directory : str, optional
        Folder to scan. Defaults to the notebook-wide ``directory_settings``.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    FileNotFoundError
        If the folder holds no matching csv file (previously this surfaced as
        an unrelated UnboundLocalError on the return statement).
    """
    if directory is None:
        directory= directory_settings

    df= None
    # NOTE(review): os.listdir order is OS dependent, so the column order of the result
    # is too; callers that rename columns by position rely on that order - confirm.
    for name in os.listdir(directory):
        print("currently checking: ", name)
        if not (name.endswith(".csv") and name.split("_")[0]==_type):
            print("jumping to next file...")
            continue

        # os.path.join works with and without a trailing separator on `directory`
        csv_path= os.path.join(directory, name)
        if df is None:
            print("generating DataFrame....")
            df= pd.read_csv(csv_path)
        else:
            print(f"...adding {name} to my DataFrame ")
            df_temp= pd.read_csv(csv_path).drop("Image", axis=1)
            df= pd.concat([df, df_temp], axis=1)

    if df is None:
        raise FileNotFoundError(f"no '{_type}_*.csv' file found in {directory}")
    return df

# ::::::::::::::::::::::::::::::::::::::::::::::::::::
# getting some additional info from the file-name codings
# ::::::::::::::::::::::::::::::::::::::::::::::::::::
def dummy_gen(df, array, char0, char1, char2, char3):
    """Decode the leading characters of every image name into descriptive columns.

    For each position ``i`` the column ``array[i]`` is filled with
    ``char<i>[name[i]]``, where ``name`` is the entry of ``df["Image"]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an "Image" column; it is modified in place.
    array : sequence of str
        Names of the new columns, at most four.
    char0, char1, char2, char3 : dict
        Lookup tables for the 1st .. 4th character of the image name.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Raises
    ------
    ValueError
        If more column names than lookup tables are supplied (checked before
        the frame is touched).
    KeyError
        If an image name contains a character missing from its lookup table.
    """
    # explicit tuple instead of looking the parameters up by name through vars()
    lookups= (char0, char1, char2, char3)
    names= list(array)
    if len(names) > len(lookups):
        raise ValueError(f"at most {len(lookups)} column names are supported, got {len(names)}")

    for position, (name, lookup) in enumerate(zip(names, lookups)):
        # default arguments freeze the current position / table for this lambda
        df[name]= df["Image"].apply(lambda x, p=position, table=lookup: table[x[p]])

    return df

# ::::::::::::::::::::::::::::::::::::::::::::::::::::
# ::::::::::::::: garbage collector ::::::::::::::::::
# ::::::::::::::::::::::::::::::::::::::::::::::::::::
# idea taken from https://github.com/rapidsai/cuml/issues/5769
#      thanks to: immanuelazn
class GarbageCollector(): # BaseEstimator, TransformerMixin
    """Pass-through pipeline step that triggers Python's garbage collector.

    Both ``fit`` and ``transform`` run ``gc.collect()`` and leave the data
    untouched. Placed in front of a GPU estimator, it is meant to release
    memory still held by unreachable objects between the fits of a search
    (see the issue linked above).
    """

    def __init__(self):
        pass

    @staticmethod
    def _collect_and_return(passthrough):
        # run one full collection, then hand the given object back unchanged
        gc.collect()
        return passthrough

    def fit(self, X, y = None):
        """Collect garbage; nothing is learned from ``X`` / ``y``. Returns ``self``."""
        return self._collect_and_return(self)

    def transform(self, X):
        """Collect garbage and return ``X`` itself (no copy, no modification)."""
        return self._collect_and_return(X)


# ::::::::::::::::::::::::::::::::::::::::::::::::::::
# :::::::::::::: background detection ::::::::::::::::
# ::::::::::::::::::::::::::::::::::::::::::::::::::::
def background(img, threshold, pixel): # common values threshold= 160, pixel=35
    """Convert a BGR image into a binary mask by adaptive Gaussian thresholding.

    Parameters
    ----------
    img : array
        Colour image in BGR channel order (it is converted with COLOR_BGR2GRAY).
    threshold : int
        Handed to ``cv2.adaptiveThreshold`` as its ``maxValue`` argument, i.e. the
        value written to pixels that pass the local test - not a cut-off level.
    pixel : int
        Handed to ``cv2.adaptiveThreshold`` as its ``blockSize`` argument (side
        length of the neighbourhood; OpenCV expects an odd number > 1).

    Returns
    -------
    array
        Result of ``cv2.adaptiveThreshold`` on the gray-scale version of ``img``.
    """
    gray_scale= cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    # the trailing 5 is the constant C that OpenCV subtracts from the weighted local mean
    binary_mask= cv2.adaptiveThreshold(
        gray_scale,
        threshold,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY,
        pixel,
        5,
    )
    return binary_mask



In [None]:
# loading my definitions into a DataFrame

# collect every train_*.csv / test_*.csv of the dataset folder into one frame each
df_train= df_gen("train")
df_test=  df_gen("test")

# The concatenated csv files contain two identical column names; replace all headers by unique ones.
# NOTE(review): the names are assigned by position, so they rely on the order in which
# df_gen concatenated the csv files - confirm that this order is stable on this machine.
df_train.columns= ['Image', 'Angle', 'PCB_Left', 'PCBTop', 'PCB_Width', 'PCB_Height',
                   'Ratio_of_T-2-B_Edge_Length', 'Img_Width','Img_Height']
df_test.columns= ['Image', 'Angle', 'PCB_Left', 'PCBTop', 'PCB_Width', 'PCB_Height',
                  'Ratio_of_T-2-B_Edge_Length', 'Img_Width','Img_Height']

# reverse engineering the file-name-coded specs
# origin is the read-me text file
# 1st character of the file name: board model
_1st_char= {"A": "Raspberry Pi A+", "B": "Arduino Mega 2560 (Blue)", "C":"Arduino Mega 2560 (Black)",
                "D": "Arduino Mega 2560 (Black and Yellow)", "E": "Arduino Due", "F": "Beaglebone Black",
                "G": "Arduino Uno (Green)", "H": "Raspberry Pi 3 B+", "I": "Raspberry Pi 1 B+",
                "J": "Arduino Uno Camera Shield", "K": "Arduino Uno (Black)", "L": "Arduino Uno WiFi Shield",
                "M": "Arduino Leonardo"}
# integer class id per board letter; used as target label when the feature matrices are built
char_1= {"A": 0, "B": 1, "C":2, "D": 3, "E": 4, "F": 5,"G": 6, "H": 7, "I": 8,"J": 9, "K": 10, "L": 11,"M": 12}
# 2nd character: rotation of the board
_2nd_char= {"A": "Wide left rotation", "B": "Shallow left rotation", "C": "Neutral rotation",
                "D": "Shallow right rotation", "E": "Wide right rotation"}

# 3rd character: capture position relative to the camera
_3rd_char= {"A": "12 inches left of the camera position", "B": "6 inches left of the camera position",
                "C": "centered horizontally relative to the camera", "D": "6 inches right of the camera position",
                "E": "12 inches right of the camera position"}
# NOTE(review): the 4th character (y-coordinate) reuses the left/right descriptions of the
# 3rd one - verify against the read-me whether separate wording is needed.
_4th_char= _3rd_char
# 5th character: train/test membership; defined for reference only, it is not passed to dummy_gen below
_5th_char= {"1": "Images numbered 1-4 are train images", "2": "Images numbered 1-4 are train images",
                "3": "Images numbered 1-4 are train images", "4": "Images numbered 1-4 are train images",
                "5": "Images numbered 5 are test images"}
# names of the columns created by dummy_gen, one per decoded character
# NOTE(review): "totation_type" is presumably a typo for "rotation_type"; left unchanged
# because it becomes a column name that later cells may depend on.
char_def= ["model", "totation_type", "x-coord._capture", "y-coord._capture"]


print("including the specs from the file-name-coding into my DataFrame")
df_train= dummy_gen(df_train, char_def, _1st_char, _2nd_char, _3rd_char, _4th_char)
df_test=  dummy_gen(df_test, char_def, _1st_char, _2nd_char, _3rd_char, _4th_char)

print("::::::::::::::::::::::::::::::::")
print(":::::::::  finished  :::::::::::")
print("::::::::::::::::::::::::::::::::")

In [None]:
# get some dummy values from Pandas
# One-hot encode all non-numeric columns (first level dropped). The image name is
# excluded from the encoding and re-attached as the last column afterwards.
df_train_dummy= pd.get_dummies(df_train.drop(columns="Image"), drop_first=True).assign(Image=df_train["Image"])
df_test_dummy=  pd.get_dummies(df_test.drop(columns="Image"), drop_first=True).assign(Image=df_test["Image"])

# not the last expression of the cell, so this preview is evaluated but not displayed
df_train_dummy.head()

print("\n".join((
    "::::::::::::::::::::::::::::::::",
    ":::::::::  finished  :::::::::::",
    "::::::::::::::::::::::::::::::::",
)))

In [None]:
# :::::::::::::::::::::::::::::::::::::::::::::::::::::::
# ::::::::::::::: dictionary generation :::::::::::::::::
# :::::::::::::::::::::::::::::::::::::::::::::::::::::::

# Each folder is listed exactly once and the paths are built in one pass. The previous
# version called os.listdir again for every single file and grew the array with
# np.append, which is quadratic in the number of images.
print("generating file locations for my train dataset")
train_listing= file_names(directory_train)
file_names_train= np.array([directory_train+ "/"+ img_name
                            for img_name in tqdm(train_listing, total=len(train_listing))])


print("generating file locations for my test dataset")
test_listing= file_names(directory_test)
file_names_test= np.array([directory_test+ "/"+ img_name
                           for img_name in tqdm(test_listing, total=len(test_listing))])


In [None]:
# :::::::::::::::::::::::::::::::::::::::::::::::::::::::
# :::::::::::::::: some exploratory DA ::::::::::::::::::
# :::::::::::::::::::::::::::::::::::::::::::::::::::::::

def _picture_stats(image_paths):
    """Return a float array of shape (n_readable_images, 3): rows, columns, rows*columns."""
    rows= []
    for img_name in tqdm(image_paths, total=len(image_paths)):
        image_data= cv2.imread(img_name)
        if image_data is None: # cv2.imread returns None instead of raising for unreadable files
            print("skipping unreadable image: ", img_name)
            continue
        # shape[0] is the number of pixel rows (height), shape[1] the number of columns (width)
        X, y = image_data.shape[0], image_data.shape[1]
        rows.append([X, y, X* y])
    # built once from a list; np.append inside the loop copied the whole array per image
    return np.array(rows, dtype=float).reshape(-1, 3)


print("analizing image dimensions and total pixel count for train DS")
# train data set (DS)
picture_stats_train= _picture_stats(file_names_train)
df_stats_train= pd.DataFrame(picture_stats_train, columns=["pixel_X", "pixel_y", "total_pixel"])

print("analizing image dimensions and total pixel count for test DS")
# test data set (DS)
picture_stats_test= _picture_stats(file_names_test)
df_stats_test= pd.DataFrame(picture_stats_test, columns=["pixel_X", "pixel_y", "total_pixel"])

# visualization of the obtained information about the pictures in my data set
fig, axn= plt.subplots(1,2, figsize=(10,5))
sns.pointplot(data=df_stats_train, x= 1, y="pixel_X", errorbar= "sd", label= "pixel count X-axis (train DS)", ax= axn[0])
sns.pointplot(data=df_stats_train, x= 2, y="pixel_y", errorbar= "sd", label= "pixel count y-axis (train DS)", ax= axn[0])
sns.histplot(data=df_stats_train, x="total_pixel", label= "picture resolution (train DS)", ax= axn[1])

sns.pointplot(data=df_stats_test, x= 1.2, y="pixel_X", errorbar= "sd", marker="D", label= "pixel count X-axis (test DS)", ax= axn[0])
sns.pointplot(data=df_stats_test, x= 2.2, y="pixel_y", errorbar= "sd", marker="D", label= "pixel count y-axis (test DS)", ax= axn[0])
sns.histplot(data=df_stats_test, x="total_pixel", label= "picture resolution (test DS)", ax= axn[1])

for ax in axn:
    ax.legend()
    ax.grid(True)
fig.tight_layout()
plt.show()


In [None]:
# ::::::::::::::::::::::::::::::::::::::::::::::::::::
# :::: visualization of my transformed test image ::::
# ::::::::::::::::::::::::::::::::::::::::::::::::::::

img_size= 64 # 64
pixel= 3       # blockSize of the adaptive threshold (odd number)
threshold= 165 # maxValue written by the adaptive threshold

image_loc= file_names_train[1]
print("file name to load: \n", image_loc)

img= cv2.imread(image_loc)
if img is None: # cv2.imread signals failure by returning None
    raise FileNotFoundError(f"could not read image: {image_loc}")
image_red= cv2.resize(img, (img_size, img_size))

imArray= cv2.cvtColor(image_red, cv2.COLOR_BGR2GRAY)
# adaptive background definition of the Gray-scale image
imArray= cv2.adaptiveThreshold(imArray, threshold, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY,pixel,5)
tmp1= cv2.createBackgroundSubtractorMOG2().apply(imArray)
tmp2= imArray
tmp3= cv2.createBackgroundSubtractorKNN().apply(image_red) # computed for comparison, currently not plotted

fig, axn= plt.subplots(2,2, figsize=(10,10))
axn[0,0].imshow(tmp1, cmap= "gray")
axn[0,0].set_title("MOG2 on thresholded image")
axn[0,1].imshow(tmp2, cmap= "gray")
axn[0,1].set_title("adaptive threshold")
# OpenCV stores colour images as BGR while matplotlib expects RGB; without the
# conversion red and blue are swapped (cmap is ignored for 3-channel images anyway)
axn[1,0].imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
axn[1,0].set_title("original image")
axn[1,1].imshow(cv2.cvtColor(image_red, cv2.COLOR_BGR2RGB))
axn[1,1].set_title(f"resized to {img_size}x{img_size}")

plt.show()

In [None]:
# :::::::::::::::::::::::::::::::::::::::::::
# this is with background subtraction methode
# :::::::::::::::::::::::::::::::::::::::::::

import cupy as cp
import cudf

# **************************
verbose_01= 1 # set to 1 to append the thresholded contrast picture to the raw pixels
# **************************

# staging the resized image together with its thresholded contrast image for model-training
versions = 13 # dataset comprises of 13 different PCBs
lim_train= 500* versions -1  # index of the last train image used; not the full size yet: len(file_names(directory_train))
lim_test = 125* versions -1  # index of the last test image used; not the full size yet: len(file_names(directory_test))
if verbose_01==1:
    img_size= 64 # 76
    bit= 4       # 3 colour channels + 1 contrast channel per pixel
else:
    img_size=76 # 76
    bit=3        # 3 colour channels per pixel


def _image_features(image_paths, last_index):
    """Load the images up to position ``last_index`` and return ``(X, y)``.

    X is a float32 array with one flattened image per row (img_size**2 * bit
    values), y an int32 array with the class id of each image.

    Fixes compared with the former inline loops:
    * the None check happens before cv2.resize (resize raises on None, so the
      old check could never trigger),
    * the label is taken from the file that was actually loaded instead of from
      row ``num`` of the csv-based frame, whose order is not tied to the order
      of the directory listing,
    * the stop condition is evaluated for skipped files as well.
    """
    features, labels= [], []
    pbar = tqdm(image_paths, total=min(len(image_paths), last_index + 1))
    for num, image_path in enumerate(pbar):
        raw= cv2.imread(image_path)
        if raw is not None:
            img= cv2.resize(raw, (img_size,img_size))
            flat_img= img.reshape(img_size*img_size*3,1)
            if verbose_01==1:
                imArray= cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
                contrast= cv2.adaptiveThreshold(imArray, 165, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY,3,5)
                flat_img= np.vstack((flat_img, contrast.reshape(img_size*img_size,1)))
            features.append(flat_img)
            # assumes the first character of the file name is the board code, exactly like the
            # "Image" entries decoded via char_1 before - confirm against the read-me
            labels.append(char_1[os.path.basename(image_path)[0]])
        if num==last_index: break # needs to be removed for full dataset

    X= np.array(features).reshape(len(features), int(img_size**2* bit)).astype(np.float32) # with astype(int8) memory gets saved
    y= np.array(labels).astype(np.int32)
    return X, y


X_train, y_train= _image_features(file_names_train, lim_train)
print("the shape of the data in my train dataset is: ", X_train.shape)

X_test, y_test= _image_features(file_names_test, lim_test)
print("the shape of the data in my test dataset is: ", X_test.shape)

print("::::::::::::::::::::::::::::::::")
print(":::::::::  finished  :::::::::::")
print("::::::::::::::::::::::::::::::::")

In [None]:
# ::::::::::::::::::::::::::::::::::::::::::::::::::::
# ::: definition of the distribution to the workers:::
# :::::::::::::: sharing the DataFrame :::::::::::::::
# ::::::::::::::::::::::::::::::::::::::::::::::::::::

import dask_cudf
from cuml.dask.common import utils as dask_utils

n_partitions = n_workers # one partition per GPU worker

# ::::::::::::::::::::::::::::::::::::::::::::::::::::
# :::::::::::::::::::: taken from ::::::::::::::::::::
# https://github.com/rapidsai/cuml/blob/main/notebooks/random_forest_mnmg_demo.ipynb
# ::::::::::::::::::::::::::::::::::::::::::::::::::::
def distribute(X, y):
    """Move features and labels to the GPU and persist them across the Dask workers.

    ``X`` becomes a cudf DataFrame and ``y`` a cudf Series; both are split into
    ``n_partitions`` dask_cudf partitions and persisted on ``workers`` through the
    client ``c``. Returns the persisted pair ``(X_dask, y_dask)``.
    """
    print("loading the numpy arrays into Cuda DataFrames")
    # host data -> cudf objects
    features_gpu= cudf.DataFrame(X)
    labels_gpu= cudf.Series(y)

    print(f"the shape of my X_cudf is {features_gpu.shape} and from my y_cudf is {labels_gpu.shape}")
    print("transforming cudf to dask_cudf and sharing it on the GPU")
    # partition both objects for parallel processing
    partitioned= [dask_cudf.from_cudf(gpu_object, npartitions= n_partitions)
                  for gpu_object in (features_gpu, labels_gpu)]

    # persist to cache the data in active memory of the workers
    X_dask, y_dask = dask_utils.persist_across_workers(c, partitioned, workers=workers)
    return X_dask, y_dask

X_train_dask, y_train_dask= distribute(X_train, y_train)
X_test_dask, y_test_dask= distribute(X_test, y_test)

# ::::::::::::::::::::::::::::::::::::::::::::::::::::
# ::::::::::::::::::::::::::::::::::::::::::::::::::::

In [None]:
from cuml.metrics import accuracy_score
from cuml.dask.ensemble import RandomForestClassifier as cumlDaskRF


print("definition of the model")
cuml_model = cumlDaskRF(n_estimators=300, n_streams=1, random_state= 1984) # further options: max_depth, n_bins

print("fitting the model to my data-set")
cuml_model.fit(X_train_dask, y_train_dask)
wait(cuml_model.rfs) # Allow asynchronous training tasks to finish

# testing the model
print("validating the model....")
cuml_y_pred = cuml_model.predict(X_test_dask).compute().to_numpy()

# Due to randomness in the algorithm, you may see slight variation in accuracies
print(f"CuML accuracy:  {accuracy_score(y_test, cuml_y_pred):.3f}")


# ::::::::::::::::::::::::::::::::
# hand-made parameter search over the number of trees
# ::::::::::::::::::::::::::::::::

print("::::::::::::::::::::::::::::::::")
params= [100, 200, 300, 500, 750, 1200] # n_estimators for RandomForestClassifier
res_array= [] # collects [n_estimators, accuracy] for every candidate

def cuml_model_est(i):
    """Return a fresh multi-GPU random forest with ``i`` trees (fixed seed, one stream)."""
    return cumlDaskRF(n_estimators= i, n_streams= 1, random_state= 1984)

for i, n_trees in enumerate(params):

    print(f"1.{i}. fitting the model to my data-set with ", n_trees)
    cuml_model= cuml_model_est(n_trees)
    cuml_model.fit(X_train_dask, y_train_dask)
    wait(cuml_model.rfs) # Allow asynchronous training tasks to finish

    # testing the model
    print(f"2.{i}. validating the model....")
    cuml_y_pred = cuml_model.predict(X_test_dask).compute().to_numpy()

    # Due to randomness in the algorithm, you may see slight variation in accuracies
    score= accuracy_score(y_test, cuml_y_pred)
    print(f"3.{i}. cuML accuracy for n_estimators= {n_trees}:  {score:.3f}")
    res_array.append([n_trees, score])

print("4. all results retrieced by the search:")
res_array
# ::::::::::::::::::::::::::::::::
# ::::::::::::::::::::::::::::::::


In [None]:
# ::::::::::::::::::::::::::::::::::::::::::::::::::
# ::::::: trying XGBoost for classification ::::::::
# ::::::::::::::::::::::::::::::::::::::::::::::::::
#   this attempt needs two Nvidia Volta Series GPU
# ::::::::::::::::::::::::::::::::::::::::::::::::::

# attempts combining cuml.dask with dask_ml did not work out well.
# neither make_pipeline nor GridSearchCV did work for two GPUs at the same time

print("generating cudf based on my existing DataFrames")
# No explicit GPU copies are created here: the numpy arrays are handed to XGBoost directly.
# X_train_cudf, X_test_cudf= cudf.DataFrame(X_train).to_cupy(), cudf.DataFrame(X_test).to_cupy()
# y_train_cudf, y_test_cudf= cudf.Series(y_train).to_cupy(), cudf.Series(y_test).to_cupy()


import xgboost as xgb

# Use "hist" for constructing the trees, with early stopping enabled.
# The GPU is selected through device="cuda"; the separate "gpu_hist" tree method is
# deprecated in the XGBoost versions that know the `device` parameter.
print("definition of my classifier")
# clf_cpu = xgb.XGBClassifier(tree_method="hist", early_stopping_rounds=2, random_state= 1984, seed= 1984, n_jobs= 16) # this takes about 16 min with 10 cores
clf_gpu = xgb.XGBClassifier(device= "cuda", tree_method="hist", early_stopping_rounds=2, random_state= 1984, seed= 1984)

# Fit the model, test sets are used for early stopping.
# NOTE(review): stopping on the test set leaks test information into the model
# selection; a separate validation split would give an unbiased test score.
print("fitting my classifier to my data-set")
# clf_cpu.fit(X_train_cudf, y_train_cudf, eval_set=[(X_test_cudf, y_test_cudf)])
clf_gpu.fit(X_train, y_train, eval_set=[(X_test, y_test)])


# ::::::::::::::::::::::::::::::::::::::::::::::::::


In [None]:
# ::::::::::::::::::::::::::::::::::::::::::::::::::
# ::::::: trying XGBoost for classification ::::::::
# ::::::::::::::::::::::::::::::::::::::::::::::::::
#         some hyper parameter optimzation
# ::::::::::::::::::::::::::::::::::::::::::::::::::

from cuml.model_selection import GridSearchCV as cuml_Grid
# Larger search space kept for reference. Values above 1.0 for subsample /
# colsample_bytree are outside XGBoost's valid range (0, 1] and were removed.
# params = {
#     'learning_rate': [0.03, 0.01],
#     'min_child_weight': [1,3, 5,7, 10],
#     'gamma': [0, 0.5, 1, 1.5, 2, 2.5, 5],
#     'subsample': [0.6, 0.8, 1.0],
#     'colsample_bytree': [0.6, 0.8, 1.0],
#     'max_depth': [3, 4, 5, 6, 7, 8, 9 ,10, 12, 14],
#     'reg_lambda':np.array([0.4, 0.6, 0.8, 1, 1.2, 1.4])}

params= {"learning_rate": [0.03, 0.01]}

# Parameters forwarded to every fit() call of the search: the validation data used for
# early stopping. early_stopping_rounds itself is a constructor argument in the XGBoost
# versions that know `device`, so it moved to the estimator below.
# NOTE(review): using the test set here leaks test information into the search.
fit_params = {
    'eval_set':[(X_test, y_test)]}

# n_gpus, predictor and verbose are not XGBClassifier parameters any more (they were only
# reported as unused / deprecated); device="cuda" with tree_method="hist" selects the GPU.
model_gpu= xgb.XGBClassifier(device= "cuda", tree_method= "hist", early_stopping_rounds=10, random_state= 1984, seed= 1984)

# let's run the optimization
print("running an optimzation scheme based on GridSearchCV")
# plain "precision" is defined for binary targets only and fails on the 13 board classes;
# the macro average is the multiclass counterpart
random_search = cuml_Grid(model_gpu, param_grid=params, scoring="precision_macro", n_jobs=2,  verbose=10, cv=4)
random_search.fit(X_train,y_train, **fit_params)
print(" Results from Grid Search " )
print("\n The best estimator across ALL searched params:\n", random_search.best_estimator_)
print("\n The best score across ALL searched params:\n", random_search.best_score_)
print("\n The best parameters across ALL searched params:\n", random_search.best_params_)
# :::::::::::::::::::::::::::::::::::

In [None]:
# visualization of theaccurazy of the trained XGBoost-model
from sklearn.metrics import confusion_matrix as cm


# y_predict_cpu= clf_cpu.predict(X_test)
y_predict_gpu= clf_gpu.predict(X_test)
y_predict_gpu_opt= random_search.predict(X_test)

# Sorted class ids that occur in the truth or in any prediction. The former
# `pd.Series(y_test).unique` lacked the call parentheses and stored the method itself.
class_names = np.unique(np.concatenate((y_test, y_predict_gpu, y_predict_gpu_opt)))

print("definition of the image to be generated")
fig, axn = plt.subplots(1,2, figsize= (18,8))

# identical `labels` for both matrices, so rows / columns refer to the same classes in both plots
sns.heatmap(cm(y_test, y_predict_gpu_opt, labels=class_names), annot= True, fmt= "g",
            xticklabels=class_names, yticklabels=class_names, ax= axn[0]) # cmap=plt.cm.Blues,
sns.heatmap(cm(y_test, y_predict_gpu, labels=class_names), annot= True, fmt= "g",
            xticklabels=class_names, yticklabels=class_names, ax= axn[1])

for ax in axn:
    ax.set(xlabel= "prediction", ylabel= "truth")
axn[0].set_title('opt. GPU Model')
axn[1].set_title('GPU Model')
plt.tight_layout()
plt.show()
# :::::::::::::::::::::::::.

In [None]:
from cuml.metrics import accuracy_score
from cuml.dask.ensemble import RandomForestClassifier as cumlDaskRF
from cuml.ensemble import RandomForestClassifier as cumlRF

from cuml.pipeline import make_pipeline as cuml_pipe
from sklearn.model_selection import GridSearchCV as sk_Grid
from cuml.model_selection import GridSearchCV as cuml_Grid
from dask_ml.model_selection import GridSearchCV as dask_Grid
from dask_ml.model_selection import IncrementalSearchCV as dask_Inc

print("definition of the model")
# (the multi-GPU forest that used to be created first was overwritten right away and is gone)
# cuml_model= cumlRF(n_estimator= 300, n_stream= 1, random_state= 1984)

params= {"n_estimators": [100, 200, 300, 400, 600, 1000]}
# "accuracy" instead of "r2": the estimator is a classifier, r2 is a regression metric
cuml_model= dask_Grid(estimator= cumlRF(n_streams= 1, random_state= 1984), param_grid= params , cv= 5, scoring= "accuracy") # {"n_estimators": [100, 200, 300, 400, 600, 1000]}
# cuml_model= dask_Inc(estimator= cumlRF(n_streams= 1, random_state= 1984), parameters= params, scoring= "accuracy")

# :::::::::::::::::::::::::::::::::::
from dask_ml.wrappers import Incremental
# NOTE(review): Incremental trains through the wrapped estimator's partial_fit, which a
# grid search object presumably does not offer - confirm whether this wrapper can stay.
cuml_model= Incremental(cuml_model, scoring= "accuracy")



# :::::::::::::::::::::::::::::::::::
# :::::::::::::::::::::::::::::::::::
# ## with joblib
'''print("fitting the model to my data-set by means of using joblib")
from joblib import parallel_backend
gs = cuml_Grid(
    cuml_model,
    params,
    cv= 5,
    scoring='accuracy',
    )
with parallel_backend('dask'):
    gs.fit(X_train, y_train)'''
# :::::::::::::::::::::::::::::::::::
# :::::::::::::::::::::::::::::::::::


print("fitting the model to my data-set")
cuml_model.fit(X_train_dask, y_train_dask)
# wait(cuml_model.rfs) # Allow asynchronous training tasks to finish


# testing the model
print("validating the model....")
cuml_y_pred = cuml_model.predict(X_test_dask).compute().to_numpy()

# Due to randomness in the algorithm, you may see slight variation in accuracies
print(f"CuML accuracy:  {accuracy_score(y_test, cuml_y_pred):.3f}")


In [None]:
from dask_ml.linear_model import LogisticRegression as dask_log
# dask_Grid(estimator= dask_log(), param_grid= {} , cv= 5, scoring= "accuracy") #.fit(X_train_dask, y_train_dask)


# fit() has to be called on an instance: on the class itself X_train_dask is bound to
# `self`, y_train_dask to X, and the call fails because y is missing
dask_log().fit(X_train_dask, y_train_dask)


In [None]:
# %%timeit

from dask_ml.preprocessing import StandardScaler as daskStandardScaler
from cuml.preprocessing import StandardScaler as cumlStandardScaler
# single-GPU forest under its own alias; it used to be imported as `cumlDaskRF`, which
# silently replaced the multi-GPU class of that name used by the other cells
from cuml.ensemble import RandomForestClassifier as cumlRF
from cuml.dask.ensemble import RandomForestClassifier as daskRF
# the factory functions keep their own names, so the pipelines built below no longer
# overwrite the functions that created them
from cuml.pipeline import make_pipeline as cumake_pipeline

from sklearn.pipeline import make_pipeline as sk_make_pipeline
from sklearn.preprocessing import StandardScaler as sk_Scaler
from sklearn.ensemble import RandomForestClassifier as sk_RF

print("defining my pipeline...")
# NOTE(review): n_streams= 8 together with random_state - cuML documents reproducible
# results only for n_streams= 1; confirm which one is wanted here.
cuml_pipe= cumake_pipeline(GarbageCollector(), cumlStandardScaler(), cumlRF(n_estimators= 300, n_streams= 8, random_state= 1984))
sk_pipe= sk_make_pipeline(sk_Scaler(), sk_RF(n_estimators= 300, random_state= 1984))

pipe= cuml_pipe

from joblib import parallel_backend
print("parallel computing by using joblib")
# NOTE(review): `workers` is forwarded as a backend option as before; verify that the
# dask joblib backend accepts it.
with parallel_backend(backend= 'dask', workers= workers):
    pipe.fit(X_train, y_train)

# pipe.fit(X_train, y_train) # running this single command will do the same as joblib.parallel_backend

'''clf= cumlDaskRF(n_estimators= 300, random_state= 1984, n_streams= 8)
pipe= clf.fit(X_train_dask, y_train_dask) # this works with both GPUs at the same time, but without pipeline'''


# testing the model
print("validating the model....")
cuml_y_pred = pipe.predict(X_test) # .compute().to_numpy()
# cuml_y_predict= pipe.predict(X_test_dask).compute().to_numpy()
# Due to randomness in the algorithm, you may see slight variation in accuracies
print(f"CuML accuracy:  {accuracy_score(y_test, cuml_y_pred):.3f}")


# ::::::::::::::::::::::::::::

In [None]:
# ::::::::::::::::::::::::::::::::::::::::::::::::::
# definition of the classifiers to be used with CUDA
# ::::::::::::::::::::::::::::::::::::::::::::::::::
#   this attempt needs two Nvidia Volta Series GPU
# ::::::::::::::::::::::::::::::::::::::::::::::::::

from cuml.dask.naive_bayes import MultinomialNB as cumlDaskNB
from cuml.dask.ensemble import RandomForestClassifier as cumlDaskRF
from cuml.dask.linear_model import LogisticRegression as cumlDaskLR
from dask_ml.ensemble import BlockwiseVotingClassifier as daskVoting


# Candidate models with their search grids. The grid keys follow the
# "<pipeline step name>__<parameter>" convention, where make_pipeline names each step
# after the lower-cased class name.
model_params_dask={
    "log_reg": {"model": cumlDaskLR(),
           "params": {}},
    "naive_bias": {"model": cumlDaskNB(),
                  "params": {}},
    "random_forest": {"model": cumlDaskRF(n_streams=n_streams, random_state= 1984),
                # step name fixed: "randomforestclassiffier" (double f) matched no pipeline step
                "params": {"randomforestclassifier__n_estimators": [100, 200, 300, 500, 1000]
                               }},
    "voting": {"model": daskVoting(estimator= cumlDaskLR()),
              "params": {"blockwisevotingclassifier__voting": ['hard', 'soft']}}
}


import torch
print("total available CUDA devices: ", torch.cuda.device_count())
print("active CUDA divice: ", torch.cuda.get_device_name())

print("::::::::::::::::::::::::::::::::")
print(":::::::::  finished  :::::::::::")
print("::::::::::::::::::::::::::::::::")

In [None]:
# Inspection only: shows the search grid stored for the random forest as the cell output.
model_params_dask["random_forest"]["params"] # .items()

In [None]:
# Inspection only: print the search grid of every configured model, one per line.
for model_entry in model_params_dask.values():
    search_grid= model_entry["params"]
    print(search_grid)

In [None]:
from cuml.dask.linear_model import LogisticRegression as daskLog
from dask_ml.ensemble import BlockwiseVotingClassifier as daskVoting


# Inspection only: builds a random forest with the notebook-wide n_streams and shows its
# repr as the cell output; the object is not stored anywhere.
cumlDaskRF(n_streams=n_streams, random_state= 1984)

In [None]:
# clf_dask is only created in the following cell. Guarded, so that a fresh
# "Restart & Run All" does not stop here with a NameError; once the pipeline exists,
# re-running this cell displays it.
clf_dask if "clf_dask" in globals() else None

In [None]:
from cuml.pipeline import make_pipeline as cumake_pipeline
from dask_ml.model_selection import GridSearchCV as daskGridSearchCV
from dask_ml.preprocessing import StandardScaler as daskStandardScaler
from dask_ml.linear_model import LogisticRegression as daskLog
# from cuml.dask.preprocessing import RobustScaler as daskStandardScaler

# Smoke test: cross-validate one model and report its accuracy on the test split.
test_model= "log_reg"
print("selected model= ", test_model)

print("loading the model")
cuml_model= model_params_dask[test_model]["model"]

print("definition of the classier..")
params= model_params_dask[test_model]["params"]
# Pipeline variant (garbage collection -> scaling -> model); it is built here but the
# search below currently runs on a plain logistic regression instead.
clf_dask= cumake_pipeline(GarbageCollector(), daskStandardScaler(), cuml_model)
# GridSearchCV needs an estimator *instance*: the bare class `daskLog` was passed
# before, which cannot be cloned or fitted. A classifier is evaluated here, so the
# CV metric is 'accuracy' (matching accuracy_score below) instead of the regression
# metric 'r2'.
clf_cv= daskGridSearchCV(daskLog(), param_grid= {}, cv=5, return_train_score=False, scoring='accuracy')
# alternative: search over the pipeline with the registered grid
# clf_cv= daskGridSearchCV(clf_dask, params, cv=5, return_train_score=False, scoring='accuracy')

# print("scaling the DataFrame")
# X_train_scale= daskStandardScaler().fit_transform(X_train_dask)


print("fitting the model...")
clf_cv.fit(X_train_dask, y_train_dask) # .astype(np.float32) )
# cuml_model.fit(X_train_dask, y_train_dask)

# print(clf_cv.best_score_, " and ", clf_cv.best_params_)

# SVC= clf_dask.named_steps["truncatedsvd"] # for pipeline


# testing the model
print("testing the models accuracy...")
# NOTE(review): `.compute().to_numpy()` assumes predict() returns a Dask collection
# whose computed result offers `to_numpy()` - confirm for the estimator in use.
cuml_y_pred = clf_cv.predict(X_test_dask).compute().to_numpy()
# Due to randomness in the algorithm, you may see slight variation in accuracies
print("CuML accuracy:     ", accuracy_score(y_test, cuml_y_pred))

# for pipeline
# cuml_y_pred = SVC.score(X_test_dask, y_test_dask).compute().to_numpy()

# Due to randomness in the algorithm, you may see slight variation in accuracies
# print("CuML accuracy:     ", cuml_y_pred)

In [None]:


# Disabled experiment kept as a bare string literal (never evaluated as code):
# XGBoost-only search space.
'''model_params_dask={"XGBoost":{"model": daskXGB(n_streams=n_streams, random_state= 1984, device= "cuda"),
                              "params": {"n_estimators": [80, 100, 120, 150, 200]}}
                  }'''



# Disabled experiment kept as a bare string literal (never evaluated as code):
# naive Bayes / XGBoost / linear / Poisson search space.
'''model_params_dask={
    "naive_bias": {"model": daskNB(),
                  "params": {}},
    "XGBoost": {"model": daskXGB(n_streams=n_streams, random_state= 1984, deveice= "cuda"),
                "params": {"xgbclassifier__n_estimators": [80, 100, 120, 150, 200]
                               }},
    "lin_reg": {"model": daskLin(max_iter=5000, random_state= 1984),
               "params": {"linearregression__algorithm": ["eig", "SVD"]
                         }},
    "elastic": {"model": daskPR(max_iter=5000, random_state= 1984),
                "params": {"poissonregression__C": [0, 0.3, 0.5, 0.7, 1]
                          }}
}'''

from cuml.dask.ensemble import RandomForestClassifier as daskRF

# Active search space: a single random-forest entry stored under the key "random".
# This assignment replaces any previously defined `model_params_dask`.
model_params_dask= dict(
    random= dict(
        model= daskRF(random_state= 1984, n_streams= n_streams),
        params= {"randomforestclassifier__n_estimators": [80, 100, 120, 150, 200]},
    )
)



In [None]:
# ::::::::::::::::::::::::::::::::::::::::::::::::::
# definition of the classifiers to be used with CUDA
# ::::::::::::::::::::::::::::::::::::::::::::::::::
#   this attempt needs a Nvidia Volta Series GPU
# ::::::::::::::::::::::::::::::::::::::::::::::::::
# ::::::::::::::::::::::::::::::::::::::::::::::::::
#              refined hyper-parameters
# ::::::::::::::::::::::::::::::::::::::::::::::::::


# Refined search space: model name -> {"model": estimator, "params": grid}.
# Fix: the "naive_bias" entry was closed with a stray "}" right after its model,
# which left "params" outside the entry and made the whole cell a SyntaxError.
model_params_dask={
    # NOTE(review): the grid keys address a "truncatedsvd" step while the model is an
    # SVC - confirm which estimator this grid is meant for.
    "svm": {"model": daskSVC(random_state= 1984),
           "params": {"truncatedsvd__n_components": [1,3,5,7,10],
                      "truncatedsvd__algorithm": ['tsqr', 'randomized']
                     }},
    "naive_bias": {"model": daskNB(),
                  "params": {}},
    "XGBoost": {"model": daskXGB(n_streams=n_streams, random_state= 1984),
                "params": {"xgbclassifier__n_estimators": [80, 100, 120, 150, 200]
                               }}
}



In [None]:
# Scratch cell: disabled inspection helpers for the training data; only `pipe_dask`
# is evaluated.
# X_train_dask.shape

# X_train_dask.shape

# X_train_dask.head(5) #.to_dask_array()

# NOTE(review): in the visible part of the notebook `pipe_dask` is only assigned inside
# the grid-search loop of a later cell - presumably NameError on a fresh kernel.
pipe_dask

# cp.asarray(y_train)

In [None]:
# ::::::::::::::::::::::::::::::::::::::::::::::::::
# ::::::::: looping through the classifiers ::::::::
# :::::::::    will run on multiple GPUs    ::::::::
# ::::::::::::::::::::::::::::::::::::::::::::::::::


# generating a pipeline to identify the best performing classification
from timeit import default_timer as timer
from cuml.pipeline import make_pipeline as cumake_pipeline
from dask_ml.model_selection import GridSearchCV as daskGridSearchCV
from dask_ml.model_selection import RandomizedSearchCV as daskRandomizedCV
# from dask_ml.preprocessing import StandardScaler as daskStandardScaler
from dask_ml.preprocessing import MinMaxScaler as daskStandardScaler


# Grid-search every entry of `model_params_dask`, collecting the best CV score,
# the best parameters and the refitted best estimator per model.
print("starting the timer for measuring the compute time")

scores_dask= []
best_estimators_dask= {}

# transforming my training numpy array into a Dask DataFrame
# X_train_dask= dask.dataframe.from_array(X_train)
# dask_y_train= dask.dataframe.from_array(y_train)

# use "model_params" for CPU utilization
start= timer()  # single start point (the timer was previously started twice)
for algo, mp in model_params_dask.items():
    print(f"currently building a model based on {mp['model']} with the following paramters: \n {mp['params']}")
    # pipe_dask= cumake_pipeline(GarbageCollector(),daskStandardScaler(), mp["model"])
    # pipe_dask= cumake_pipeline(GarbageCollector(), mp["model"])
    pipe_dask= mp["model"]

    # The grids are keyed "<lower-cased class name>__<parameter>" for use inside a
    # pipeline. The estimator is searched directly here, so that prefix has to be
    # removed; keys without the prefix are passed through unchanged.
    step_prefix= type(pipe_dask).__name__.lower() + "__"
    param_grid= {
        (key[len(step_prefix):] if key.startswith(step_prefix) else key): values
        for key, values in mp["params"].items()
    }

    # Classifiers are compared, so the CV metric is 'accuracy' instead of the regression metric 'r2'.
    clf= daskGridSearchCV(pipe_dask, param_grid, cv=5, return_train_score=False, scoring='accuracy', n_jobs= 4) # cv= 5, n_jobs=2
    # clf= daskGridSearchCV(daskRF(), param_grid= {"n_estimators": [50, 60, 80, 100, 120]}, cv=3, return_train_score=False, scoring='r2')
    # clf= daskXGB(n_estimators=200, n_streams=n_streams)
    clf.fit(X_train_dask , y_train_dask) # clf.fit(X_train_dask, y_train_dask.to_dask_array())

    # `rfs` was read from the search object itself, which is not the fitted model.
    # Wait on the refitted best estimator instead, and only when it exposes `rfs`
    # (an estimator without that attribute would otherwise raise AttributeError).
    pending_training= getattr(clf.best_estimator_, "rfs", None)
    if pending_training is not None:
        wait(pending_training) # Allow asynchronous training tasks to finish

    scores_dask.append({"model": mp["model"],
        "best_score": clf.best_score_,
        "best_params": clf.best_params_})
    best_estimators_dask[algo]= clf.best_estimator_
end= timer()

print(":::::::::::::::::::::::::::::::")
print(f"time it took to find the best params by using the GPU: {(end-start)/60:.3f} minutes")
print(":::::::::::::::::::::::::::::::")

# one row per model; shown as the cell output
df_scores_dask= pd.DataFrame(scores_dask, columns=["model", "best_score", "best_params"])
df_scores_dask



In [None]:
import optuna

# introductory example taken from the Optuna website
def objective(trial):
    """Toy objective for an Optuna study: squared distance of ``x`` from 2.

    Parameters
    ----------
    trial : object providing ``suggest_float(name, low, high)``
        Used to draw the candidate value ``x`` from the range [-10, 10].

    Returns
    -------
    float
        ``(x - 2) ** 2``; it is zero when ``x`` equals 2.
    """
    candidate = trial.suggest_float('x', -10, 10)
    distance_from_optimum = candidate - 2
    return distance_from_optimum ** 2

# Create a study with Optuna's default settings and evaluate `objective` for 100 trials.
study = optuna.create_study()
study.optimize(func=objective, n_trials=100)

# best parameter set found by the study (shown as the cell output)
study.best_params





In [None]:



# XGBoost classifier configured for CUDA with early stopping after 2 rounds.
# NOTE(review): the original comment mentions "hist" while the call passes
# tree_method="gpu_hist" - confirm which one is intended for the installed version.
# NOTE(review): early stopping is configured but fit() below receives no eval_set -
# confirm that the estimator accepts this.
clf = daskXGB(
    random_state= 1984,
    early_stopping_rounds=2,
    tree_method="gpu_hist",
    workers= workers,
    device= "cuda",
)
# Train on the Dask training collections; passing an evaluation set is left disabled.
clf.fit(X_train_dask, y_train_dask) # , eval_set=[(X_test, y_test)]



In [None]:
%%time

import xgboost as xgb

# CPU baseline: XGBoost with the "hist" tree method and early stopping enabled.
# (author's note: this takes about 16 min with 10 cores)
clf = xgb.XGBClassifier(
    n_jobs= 16,
    random_state= 1984,
    early_stopping_rounds=2,
    tree_method="hist",
)
# The test split is handed over as eval_set, i.e. it is what early stopping monitors.
clf.fit(X_train, y_train, eval_set=[(X_test, y_test)])



In [None]:
# Show the XGBoost parameters of the estimator currently bound to `clf`.
# NOTE(review): `clf` is reassigned by several cells of this notebook, so the output
# depends on which of them ran last.
clf.get_xgb_params()

In [None]:
%%time
# ::::::::::::::::::::::::::::::::::::::::::::::::::
# :::::::: quick testing if single RFC works :::::::
# :::::::::    will run on multiple GPUs    ::::::::
# ::::::::::::::::::::::::::::::::::::::::::::::::::
from cuml.metrics import accuracy_score


# NOTE(review): the banner above talks about a random-forest classifier (RFC), but the
# estimator created here is `daskXGB` - confirm which one this test is meant for.
cuml_model = daskXGB(n_estimators=200, n_streams=1)
cuml_model.fit(X_train_dask.to_dask_array(), y_train_dask)


# Wait for asynchronous training tasks only when the estimator exposes `rfs`;
# an estimator without that attribute would otherwise raise AttributeError here.
training_tasks = getattr(cuml_model, "rfs", None)
if training_tasks is not None:
    wait(training_tasks) # Allow asynchronous training tasks to finish

# materialise the predictions for the test split on the host
cuml_y_pred = cuml_model.predict(X_test_dask).compute().to_numpy()

# Due to randomness in the algorithm, you may see slight variation in accuracies
print(f"CuML accuracy:  {accuracy_score(y_test, cuml_y_pred):.3f}")
# ::::::::::::::::::::::::::::::::::::::::::::::::::
# ::::::::::::::::::::::::::::::::::::::::::::::::::

In [None]:
# ::::::::::::::::::::::::::::::::
# ::::: results based on GPU :::::
# ::::::::::::::::::::::::::::::::
# Score the best estimator of every model on the test split.
# Fix: the messages claimed the score was obtained on the "train DS", although
# `X_test_cu` / `y_test_cu` are what is passed to score().
for model_label, model_key in (("SVC", "svm"), ("GNB", "naive_bias"), ("XGB", "XGBoost")):
    print(f"score achived by applying {model_label} to the test DS: {best_estimators_cu[model_key].score(X_test_cu, y_test_cu):.3f}")



In [None]:
# ::::::::::::::::::::::::::::::::::::::::::::::::::
# :::::::::: generation of confusion matrix ::::::::
# ::::::::::::::::::::::::::::::::::::::::::::::::::


from sklearn.metrics import confusion_matrix as cm
import seaborn as sns

# NOTE(review): these keys must exist in `best_estimators_cu`; the score cell of this
# notebook uses different keys ('svm', 'naive_bias', 'XGBoost') - confirm.
_class=["svm", "svm_lin", "rand_forest", "log_reg"]
# best estimator of every classifier, in the same order as `_class`
# (iterating over `_class` replaces the hard-coded count of 4)
clf= [best_estimators_cu[clf_name] for clf_name in _class]

# predictions of every classifier for the test split
y_predict= [estimator.predict(X_test_cu) for estimator in clf]


# one confusion matrix per classifier; `.get()` is called on every prediction
# (presumably to copy a GPU array to the host - TODO confirm)
cm_PCB= [cm(y_test, prediction.get()) for prediction in y_predict]

fig, axm= plt.subplots(2,2, figsize= (10,10), sharex=True, sharey=True)

# One panel per classifier. Each panel is titled with the classifier name so the
# matrices can be told apart, and counts are annotated as integers (fmt="d").
for ax, clf_name, clf_cm in zip(axm.flat, _class, cm_PCB):
    sns.heatmap(clf_cm, annot=True, fmt="d", ax= ax)
    # set common labels
    ax.set(title= clf_name, xlabel= "prediction", ylabel= "truth")
    # Hide x labels and tick labels for top plots and y ticks for right plots.
    ax.label_outer()

fig.tight_layout()
plt.show()


In [None]:
# ::::::::::::::::::::::::::::::::::::::::::::::::::
# ::::::::::: exporting my optimized models ::::::::
# ::::::::::::::::::::::::::::::::::::::::::::::::::

import pickle

# Write every optimized model to "<directory_settings><name>_dask_cv5_4bit_model.pkl".
# A plain loop with a `with` block replaces the side-effect list comprehension, so
# each file handle is closed as soon as its model has been dumped.
for model_name, model in zip(_class, clf):
    model_path= directory_settings+ model_name+"_dask_cv5_4bit_model.pkl"
    with open(model_path, "wb") as model_file:
        pickle.dump(model, model_file)


In [None]:
from numba import cuda

# Close the CUDA context(s) through Numba.
# NOTE(review): presumably meant to release GPU memory at the end of the session;
# GPU work in cells executed afterwards may fail - confirm against the Numba docs.
cuda.close()

In [None]:
#   !!! skip it for the time being !!!
''' !!! skip it for the time being !!! '''

# ::::::::::::::::::::::::::::::::::::::::::::::::::
# :::::::::: definition of the classifiers :::::::::
# ::::::::::::::::::::::::::::::::::::::::::::::::::

from timeit import default_timer as timer
from sklearn.model_selection import GridSearchCV as GridSearchCV
from sklearn.preprocessing import StandardScaler as StandardScaler
# ::::::::::::::::::::::::::::::::::::::::::::::::::
from joblib import parallel_backend
# ::::::::::::::::::::::::::::::::::::::::::::::::::
# Containers for the CPU run: one score record per model and the best estimator per model name.
scores= list()
best_estimators= dict()

# use "model_params" for CPU utilization
start= timer()
with parallel_backend('threading', n_jobs=2):
    for algo, mp in model_params.items():
        print(f"currently building a model based on {mp['model']} with the following paramters: \n {mp['params']}")
        # scale the features, then fit the model; 5-fold grid search over the model's grid
        pipe= make_pipeline(StandardScaler(), mp["model"])
        clf= GridSearchCV(estimator= pipe, param_grid= mp["params"], cv=5, return_train_score=False)
        clf.fit(X_train, y_train)
        scores.append(dict(model= mp["model"],
                           best_score= clf.best_score_,
                           best_params= clf.best_params_))
        best_estimators[algo]= clf.best_estimator_
end= timer()

# elapsed wall-clock time of the whole search, framed by a banner
print(
    ":::::::::::::::::::::::::::::::",
    f"time it took to find the best params by using the CPU: {(end-start):.3f}",
    ":::::::::::::::::::::::::::::::",
    sep="\n",
)

df_scores= pd.DataFrame(scores, columns=["model", "best_score", "best_params"])
df