<a href="https://colab.research.google.com/github/MapleWolfe/Milestone_2/blob/main/DBscan_attempt01.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Installs, imports, and presets

In [None]:
!git clone https://github.com/rapidsai/rapidsai-csp-utils.git
!python rapidsai-csp-utils/colab/pip-install.py
!pip install google-cloud-storage

In [7]:
#google import options
#from google.colab import drive
from google.cloud import storage

#general usage imports
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
import gc
import os
import multiprocessing
import pickle
import json
import joblib

#model operations imports
from sklearn.cluster import MiniBatchKMeans
from sklearn.model_selection import ParameterGrid
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

#sklearn classifiers
from sklearn.linear_model import SGDClassifier
#from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC

#GPU imports
import cudf
from cuml.naive_bayes import GaussianNB
from cuml.naive_bayes import ComplementNB
from cuml import LogisticRegression
from cuml.ensemble import RandomForestClassifier
from cuml.dask.cluster import DBSCAN

import cupy
import xgboost as xgb



## GCP setup

In [None]:
# Point the Google client library at the service-account key file. setdefault
# keeps a GOOGLE_APPLICATION_CREDENTIALS value that is already set in the
# environment instead of overwriting it with the hard-coded Colab path.
os.environ.setdefault('GOOGLE_APPLICATION_CREDENTIALS', '/content/organic-reef-390716-609989a4c6da.json')
client = storage.Client()
bucket = client.get_bucket('fire_train_eval_test_bucket')

# Download every split into /content/, the directory the CSV reader functions
# load from, rather than relying on the current working directory.
for split_file in ('test.csv', 'eval.csv', 'train.csv'):
  blob = bucket.blob(split_file)
  blob.download_to_filename('/content/' + split_file)

## loading unsupervised models

In [3]:
#standard_scalar_model
def _unpickle(model_path):
  """Return the object stored in the pickle file at model_path."""
  # NOTE: pickle.load executes arbitrary code from the file; only use it on
  # model files you produced yourself.
  with open(model_path, 'rb') as handle:
    return pickle.load(handle)

# standard scaler model
loaded_scalar_model = _unpickle('/content/standard_scalar_model')

# chosen PCA model
loaded_pca_model = _unpickle('/content/pca_model_8')

# chosen k-means model
loaded_kmean_model = _unpickle('/content/kmean_model_1')

## Functions to load and clean csv chunks

In [4]:
#remember to add .csv at the end of file name
def read_csv_in_chunks(file_name, number_images, base_dir='/content/', pixels_per_image=64*64):
  """Open a CSV file for lazy, chunked reading sized in whole images.

  Args:
    file_name: name of the CSV file, including the '.csv' extension.
    number_images: how many images each chunk should contain.
    base_dir: directory that holds the file; defaults to the Colab
      '/content/' directory used throughout this notebook.
    pixels_per_image: number of CSV rows that make up one image
      (64*64 by default).

  Returns:
    The chunk iterator produced by pd.read_csv(..., chunksize=...); each
    item is a DataFrame of at most number_images * pixels_per_image rows.
  """
  # one chunk = number_images whole images
  rows_per_chunk = number_images * pixels_per_image

  file_string = os.path.join(base_dir, file_name)
  return pd.read_csv(file_string, chunksize=rows_per_chunk)

def read_full_csv(file_name, base_dir='/content/'):
  """Load an entire CSV file into a single DataFrame.

  Args:
    file_name: name of the CSV file, including the '.csv' extension.
    base_dir: directory that holds the file; defaults to the Colab
      '/content/' directory used throughout this notebook.

  Returns:
    The DataFrame returned by pd.read_csv for the whole file.
  """
  file_string = os.path.join(base_dir, file_name)
  return pd.read_csv(file_string)

# This is the procedure that cleans the data.
# cleaner_1 drops all rows with a negative "FireMask" value and converts all remaining target values above 0 to 1
def cleaner_1(df_chunk):
  """Split a raw chunk into model features and targets.

  Rows whose 'FireMask' is negative are removed. The remaining 'FireMask'
  values form the regressor target; the classifier target is 1 where that
  value is above 0 and 0 otherwise.

  Args:
    df_chunk: DataFrame containing 'PrevFireMask', 'FireMask' and every
      feature column listed in col_list.

  Returns:
    A 5-tuple:
      output_chunk: feature columns (in col_list order) of the kept rows.
      regressor_target: 'FireMask' Series of the kept rows.
      classifier_target: numpy array of 0/1 labels for the kept rows.
      original_previous_day_fire: unfiltered 'PrevFireMask' column.
      original_next_day_fire: unfiltered 'FireMask' column.
  """
  col_list = [
      'NDVI_scaled_smoothened_values', 'NDVI_local_gradient', 'NDVI_local_mean',
      'tmmn_scaled_smoothened_values', 'tmmn_local_gradient', 'tmmn_local_mean',
      'elevation_scaled_smoothened_values', 'elevation_local_gradient', 'elevation_local_mean',
      'fire_at_similar_altitude',
      'population_scaled_smoothened_values', 'population_local_gradient', 'population_local_mean',
      'vs_scaled_smoothened_values', 'vs_local_gradient', 'vs_local_mean',
      'pdsi_scaled_smoothened_values', 'pdsi_local_gradient', 'pdsi_local_mean',
      'pr_scaled_smoothened_values', 'pr_local_gradient', 'pr_local_mean',
      'tmmx_scaled_smoothened_values', 'tmmx_local_gradient', 'tmmx_local_mean',
      'sph_scaled_smoothened_values', 'sph_local_gradient', 'sph_local_mean',
      'th_scaled_smoothened_values', 'th_local_gradient', 'th_local_mean',
      'distance_from_fire',
      'erc_scaled_smoothened_values', 'erc_local_gradient', 'erc_local_mean',
  ]

  # The unfiltered masks are returned as-is, including negative values.
  original_previous_day_fire = df_chunk['PrevFireMask']
  original_next_day_fire = df_chunk['FireMask']

  # general cleaning for classifier and regressor: keep rows with FireMask >= 0
  drop_neg_df = df_chunk[original_next_day_fire >= 0]

  # regressor target keeps the raw non-negative values
  regressor_target = drop_neg_df['FireMask']

  # classifier target is binary: any value above 0 becomes 1
  classifier_target = np.where(regressor_target > 0, 1, 0)

  # Selecting col_list already excludes 'PrevFireMask', 'FireMask' and
  # 'image_id', so no separate drop (and no extra full copy) is needed.
  output_chunk = drop_neg_df[col_list]
  return output_chunk,regressor_target,classifier_target, original_previous_day_fire, original_next_day_fire


## setting up data

In [5]:
def _prepare_split(split_name):
  """Load one data split and run it through cleaning, scaling and PCA.

  Reads '<split_name>.csv' with read_full_csv, cleans it with cleaner_1,
  then applies loaded_scalar_model.transform followed by
  loaded_pca_model.transform. Each intermediate is deleted and garbage
  collected as soon as the next stage has consumed it.

  Args:
    split_name: 'train', 'eval' or 'test' (file name without '.csv').

  Returns:
    A 5-tuple: (PCA-transformed features, regressor target,
    classifier target, original previous-day fire mask,
    original next-day fire mask).
  """
  split_df = read_full_csv(split_name + '.csv')
  print(split_name + ' loaded')
  cleaned_df, regressor_target, classifier_target, previous_day_fire, next_day_fire = cleaner_1(split_df)
  del split_df
  gc.collect()

  print('initializing ' + split_name + ' data scaling')
  data_scaled = loaded_scalar_model.transform(cleaned_df)
  del cleaned_df
  gc.collect()

  print('initializing ' + split_name + ' pca')
  data_pca = loaded_pca_model.transform(data_scaled)
  del data_scaled
  gc.collect()

  return data_pca, regressor_target, classifier_target, previous_day_fire, next_day_fire


#train
(train_data_pca, train_regressor_target, train_classifier_target,
 train_original_previous_day_fire, train_original_next_day_fire) = _prepare_split('train')

#evaluation
(eval_data_pca, eval_regressor_target, eval_classifier_target,
 eval_original_previous_day_fire, eval_original_next_day_fire) = _prepare_split('eval')

#test
(test_data_pca, test_regressor_target, test_classifier_target,
 test_original_previous_day_fire, test_original_next_day_fire) = _prepare_split('test')


train loaded
initializing train data scaling
initializing train pca
eval loaded
initializing eval data scaling
initializing eval pca
test loaded
initializing test data scaling
initializing test pca


0

## DBSCAN

In [8]:
 # Instantiates the DBSCAN class imported from cuml.dask.cluster in the import
 # cell; the estimator is neither assigned to a variable nor fitted here.
 # The recorded output of this cell is a MemoryError.
 # NOTE(review): 2048000 equals 500 * 64*64, so min_samples presumably means
 # "500 images' worth of pixels" — TODO confirm this is the intended value.
 # NOTE(review): presumably the Dask variant expects a running Dask client,
 # which this notebook never creates — TODO confirm before re-running.
 DBSCAN(min_samples = 2048000)

MemoryError: ignored

## HDBSCAN

## Agglomerative clustering