<a href="https://colab.research.google.com/github/MapleWolfe/Milestone_2/blob/Jai/ML_part_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Unsupervised and Supervised Learning techniques

## installs, imports, pre-sets

In [1]:
#!git clone https://github.com/rapidsai/rapidsai-csp-utils.git
#!python rapidsai-csp-utils/colab/pip-install.py

In [2]:
#general usage imports
from google.colab import drive
import tensorflow as tf
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
import gc

#clustering import
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.datasets import make_blobs
from sklearn.model_selection import ParameterGrid

# GPU Imports below
#import cudf
#import cupy as cp


## Functions to load csv chunks

In [3]:
# let's mount the drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
#remember to add .csv at the end of file name
def read_csv_in_chunks(file_name,number_images):

  #number of rows per image:
  pixels_count = 64*64

  #upto 200 images at a time
  size = number_images*pixels_count

  #file string and location
  file_string = '/content/drive/MyDrive/' + file_name

  return pd.read_csv(file_string, chunksize=size)

def read_full_csv(file_name):
  #file string and location
  file_string = '/content/drive/MyDrive/' + file_name

  return pd.read_csv(file_string)

## Functions to clean CSV chunks

In [5]:
#this is procedure that cleans the data.
# cleaner_1 drops all negative "firemask" values and converts all values above 0 to 1
def cleaner_1(df_chunk):
  df_chunk['PrevFireMask'] = df_chunk['PrevFireMask'].astype(float)
  df_chunk['FireMask'] = df_chunk['FireMask'].astype(float)

  original_previous_day_fire = df_chunk['PrevFireMask']
  original_next_day_fire = df_chunk['FireMask']

  #general cleaning for classifier and regressor
  drop_neg_df = df_chunk[df_chunk['FireMask'] >=0]

  #only regressor selection
  regressor_target = drop_neg_df['FireMask']

  #cleaning specifically for the classifier
  classifier_target = np.where(regressor_target > 0, 1, 0)
  output_chunk = df_chunk.drop(labels=['PrevFireMask','FireMask','image_id'], axis=1)

  return output_chunk,regressor_target,classifier_target, original_previous_day_fire, original_next_day_fire,

# cleaner_2 drops all values where "Firemask" is not 0 or 1 (only classifer)
def cleaner_2(df_chunk):
  df_chunk['PrevFireMask'] = df_chunk['PrevFireMask'].astype(float)
  df_chunk['FireMask'] = df_chunk['FireMask'].astype(float)

  original_previous_day_fire = df_chunk['PrevFireMask']
  original_next_day_fire = df_chunk['FireMask']

  #general cleaning for classifier
  binary_value_df = df_chunk[(df_chunk['FireMask'] ==0)|(df_chunk['FireMask'] ==1)]

  #cleaning specifically for the classifier
  classifier_target = binary_value_df['FireMask']
  output_chunk = binary_value_df.drop(labels=['PrevFireMask','FireMask','image_id'], axis=1)
  return output_chunk,classifier_target, original_previous_day_fire, original_next_day_fire,

# cleaner_3 drops all values where "Firemask" and 'PrevFireMask' is not 0 or 1 (only classifier)
def cleaner_3(df_chunk):
  df_chunk['PrevFireMask'] = df_chunk['PrevFireMask'].astype(float)
  df_chunk['FireMask'] = df_chunk['FireMask'].astype(float)

  original_previous_day_fire = df_chunk['PrevFireMask']
  original_next_day_fire = df_chunk['FireMask']

  #general cleaning for classifier
  binary_value_df = df_chunk[(df_chunk['FireMask'] ==0)|(df_chunk['FireMask'] ==1)]
  fully_filtered_df = binary_value_df[(binary_value_df['PrevFireMask'] ==0)|(binary_value_df['PrevFireMask'] ==1)]

  #cleaning specifically for the classifier
  classifier_target = fully_filtered_df['FireMask']
  output_chunk = fully_filtered_df.drop(labels=['PrevFireMask','FireMask','image_id'], axis=1)
  return output_chunk,classifier_target, original_previous_day_fire, original_next_day_fire,

## Unsupervised Learning

#### cluster evaluation functions

In [6]:
def cluster_evaluation(eval_df, cluster_model):
    print('evaluation start')
    inertia = cluster_model.inertia_
    silhouette = silhouette_score(eval_df, cluster_model.labels_)
    calinski = calinski_harabasz_score(eval_df, cluster_model.labels_)
    davies_bouldin = davies_bouldin_score(eval_df, cluster_model.labels_)
    print('evaluation complete')
    return inertia, silhouette, calinski, davies_bouldin

#### Kmeans Clustering

In [7]:
# lets build a function for our kmeans cluster
def search_params_kmeans(cleaner_func=1,file_name='train.csv',number_images=200,cluster_list=[2,8,32,64],initialisation_list = ['k-means++', 'random']):
  k_means_param_grid = {'n_clusters': cluster_list, 'init': initialisation_list}

  for params in ParameterGrid(k_means_param_grid):
    print('initializing kmeans for param: ', params)
    csv_chunks_generator = read_csv_in_chunks(file_name,number_images)
    K_means_model = MiniBatchKMeans(**params)
    counter = 0
    for a_chunk in csv_chunks_generator:
      if cleaner_func == 1:
        features_df,_,_,_ = cleaner_1(a_chunk)
      if cleaner_func == 2:
        features_df,_,_,_ = cleaner_2(a_chunk)
      if cleaner_func == 3:
        features_df,_,_,_ = cleaner_3(a_chunk)

      K_means_model.partial_fit(features_df)
      print('iteration completed: ', counter)
      counter+=1
    yield K_means_model, params

In [8]:
%%time
# a dict to store model performance & eval csv file
kmeans_main_dict ={}
evaluation_df = read_full_csv('eval.csv')
# cleaner func is the method of data cleaning we are interested in from 'Functions to clean CSV chunks' section
for cleaning_function_number in [1,2,3]:
  if cleaning_function_number == 1:
    cleaned_eval,_,_,_ = cleaner_1(evaluation_df)
  if cleaning_function_number == 2:
    cleaned_eval,_,_,_ = cleaner_2(evaluation_df)
  if cleaning_function_number == 3:
    cleaned_eval,_,_,_ = cleaner_3(evaluation_df)

# initiating model building
  model_builders = search_params_kmeans(cleaner_func=cleaning_function_number)
  param_perform_list =[]

#this where a lot of time will go, it will iterate over each model across grid search
  for a_kmean_model, kmean_params in model_builders:
    inertia, silhouette, calinski, davies_bouldin = cluster_evaluation(cleaned_eval, a_kmean_model)
    param_perform_list.append([kmean_params,inertia, silhouette, calinski, davies_bouldin])

  kmeans_main_dict[cleaning_function_number] = param_perform_list

ValueError: ignored

#### Hdbscan clustering

#### PCA

#### Auto Encoder

## Supervised learning (Classifier)

#### logistic classifier

#### Naive Bayes classifier

#### XGB Classifier

In [24]:
for a_chunk in read_csv_in_chunks('train.csv',1000):

0  :  454
1  :  1456
2  :  2003
3  :  2504
4  :  3507
5  :  4510
6  :  4824
7  :  5265
8  :  5676
9  :  5905
10  :  6323
11  :  7326
12  :  7782
13  :  8785
14  :  9262
