In [1]:
import time
import datetime as dt
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.cluster import KMeans
from sklearn import datasets

import warnings
warnings.filterwarnings("ignore")

## Working with __IRIS__ dataset

In [2]:
def save_clusters_iris(num_clusters):
    iris = datasets.load_iris()
#     display(iris)
    X = iris.data
    y = iris.target
#     print(X)
#     print(y)
    # KMeans with 3 clusters
    clf =  KMeans(n_clusters=num_clusters)
    clf.fit(X,y)

    #Coordinates of cluster centers with shape [n_clusters, n_features]
    clf.cluster_centers_
    #Labels of each point
    clf.labels_

    # Nice Pythonic way to get the indices of the points for each corresponding cluster
    mydict = {i: np.where(clf.labels_ == i)[0] for i in range(clf.n_clusters)}
    return mydict

In [3]:
save_clusters_iris(4)

{0: array([100, 102, 103, 104, 105, 107, 108, 109, 112, 115, 116, 117, 118,
        120, 122, 124, 125, 128, 129, 130, 131, 132, 135, 136, 137, 139,
        140, 141, 143, 144, 145, 148]),
 1: array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
        34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]),
 2: array([ 50,  51,  52,  54,  56,  58,  63,  65,  68,  70,  72,  73,  74,
         75,  76,  77,  78,  83,  85,  86,  87,  91,  97, 101, 110, 111,
        113, 114, 119, 121, 123, 126, 127, 133, 134, 138, 142, 146, 147,
        149]),
 3: array([ 53,  55,  57,  59,  60,  61,  62,  64,  66,  67,  69,  71,  79,
         80,  81,  82,  84,  88,  89,  90,  92,  93,  94,  95,  96,  98,
         99, 106])}

## Working with __Earthquake__ dataset

In [4]:
main_df = pd.read_csv('eq_database_place.csv')
dummy_eq = main_df.copy()

In [5]:
dummy_df = dummy_eq[dummy_eq['Place'].str.contains('IN')]
dummy_df.head()

Unnamed: 0,Date,Time,Latitude,Longitude,Type,Depth,Depth Error,Depth Seismic Stations,Magnitude,Magnitude Type,...,Azimuthal Gap,Horizontal Distance,Horizontal Error,Root Mean Square,ID,Source,Location Source,Magnitude Source,Status,Place
58,02/18/1965,04:26:37,25.011,94.186,Earthquake,55.0,,,5.6,MW,...,,,,,ISCGEM860307,ISCGEM,ISCGEM,ISCGEM,Automatic,"State of Manipur, IN"
402,04/04/1966,06:42:13,12.01,92.483,Earthquake,10.0,,,5.8,MW,...,,,,,ISCGEM848084,ISCGEM,ISCGEM,ISCGEM,Automatic,Union Territory of Andaman and Nicobar Islands...
468,06/27/1966,10:59:19,29.706,80.935,Earthquake,25.0,,,6.3,MW,...,,,,,ISCGEM846871,ISCGEM,ISCGEM,ISCGEM,Automatic,"Dharchula, IN"
494,08/15/1966,02:15:32,28.565,78.961,Earthquake,27.0,,,5.6,MW,...,,,,,ISCGEM844491,ISCGEM,ISCGEM,ISCGEM,Automatic,"Shahabad, IN"
528,09/26/1966,05:10:57,27.419,92.576,Earthquake,14.4,,,5.9,MW,...,,,,,ISCGEM843878,ISCGEM,ISCGEM,ISCGEM,Automatic,"Bomdila, IN"


## NaN removing function

In [6]:
def nan_helper(y):
    """
    Helper to handle indices and logical indices of NaNs.
    Input:
        - y, 1d numpy array with possible NaNs
    Output:
        - nans, logical indices of NaNs
        - index, a function, with signature indices= index(logical_indices),
          to convert logical indices of NaNs to 'equivalent' indices
    Example:
        >>> # linear interpolation of NaNs
        >>> nans, x= nan_helper(y)
        >>> y[nans]= np.interp(x(nans), x(~nans), y[~nans])
    """
    
    return np.isnan(y), lambda z: z.nonzero()[0]

## String label encoding

In [7]:
def label_integer_encoder(my_df, series_name):
    """
    This function is encoding values of a series
    Parameter
    ---------
    * `my_df`: Pandas dataframe
    * `series_name`: Pandas series name to encode
    Returns : a encoded array
    """
    arr_name = np.array(list(my_df[str(series_name)]))
    label_arr_encoder = LabelEncoder()
    integer_arr_encoded = label_arr_encoder.fit_transform(arr_name)
    
    return integer_arr_encoded

## Interpolation function

In [8]:
def get_interpolation(my_df, nan_series):
    arr_series = np.array(my_df[str(nan_series)])
    nans, x = nan_helper(arr_series)
    arr_series[nans] = np.interp(x(nans), x(~nans), arr_series[~nans])
    return arr_series.round(2)

## Removing NaN values from the series

In [9]:
dummy_df['Depth Error'] = get_interpolation(dummy_df, 'Depth Error')
dummy_df['Depth Seismic Stations'] = get_interpolation(dummy_df, 'Depth Seismic Stations')
dummy_df['Magnitude Error'] = get_interpolation(dummy_df, 'Magnitude Error')
dummy_df['Magnitude Seismic Stations'] = get_interpolation(dummy_df, 'Magnitude Seismic Stations')
dummy_df['Azimuthal Gap'] = get_interpolation(dummy_df, 'Azimuthal Gap')
dummy_df['Horizontal Distance'] = get_interpolation(dummy_df, 'Horizontal Distance')
dummy_df['Horizontal Error'] = get_interpolation(dummy_df, 'Horizontal Error')
dummy_df['Root Mean Square'] = get_interpolation(dummy_df, 'Root Mean Square')

## Actual encoding of strings

In [10]:
dummy_df['Type'] = label_integer_encoder(dummy_df, 'Type')
dummy_df['Magnitude Type'] = label_integer_encoder(dummy_df, 'Magnitude Type')
dummy_df['Place'] = label_integer_encoder(dummy_df, 'Place')
dummy_df['Status'] = label_integer_encoder(dummy_df, 'Status')

## Droping unwanted

In [11]:
dummy_df = dummy_df.drop(['ID', 'Source', 'Location Source', 'Magnitude Source'], axis=1)

## Time object numerical values

In [12]:
timestamp = []
for d, t in zip(dummy_df['Date'], dummy_df['Time']):
    try:
        ts = dt.datetime.strptime(d + ' ' + t, '%m/%d/%Y %H:%M:%S')
        timestamp.append(time.mktime(ts.timetuple())) # inverse funtion of localtime
    except ValueError as e:
        timestamp.append('ValueError')

time_s = pd.Series(timestamp)
dummy_df['TimeStamp'] = time_s.values
dummy_df = dummy_df.drop(['Date', 'Time'], axis=1)

In [13]:
dummy_df.head()

Unnamed: 0,Latitude,Longitude,Type,Depth,Depth Error,Depth Seismic Stations,Magnitude,Magnitude Type,Magnitude Error,Magnitude Seismic Stations,Azimuthal Gap,Horizontal Distance,Horizontal Error,Root Mean Square,Status,Place,TimeStamp
58,25.011,94.186,0,55.0,3.8,68.0,5.6,2,0.05,61.0,37.6,2.55,7.3,1.0,0,30,-153623003.0
402,12.01,92.483,0,10.0,3.8,68.0,5.8,2,0.05,61.0,37.6,2.55,7.3,1.0,0,35,-118190867.0
468,29.706,80.935,0,25.0,3.8,68.0,6.3,2,0.05,61.0,37.6,2.55,7.3,1.0,0,9,-110917841.0
494,28.565,78.961,0,27.0,3.8,68.0,5.6,2,0.05,61.0,37.6,2.55,7.3,1.0,0,26,-106715668.0
528,27.419,92.576,0,14.4,3.8,68.0,5.9,2,0.05,61.0,37.6,2.55,7.3,1.0,0,7,-103076343.0


In [14]:
def get_data_clusters(my_df, num_clusters):
    clf_km = KMeans(n_clusters=num_clusters).fit(my_df)
    # clf_km.fit(X, y)
    # clf_km.cluster_centers_
    # clf_km.labels_
    unique_clusters = {i: np.where(clf_km.labels_ == i)[0] for i in range(clf_km.n_clusters)}
    return unique_clusters

In [15]:
quake_zones = get_data_clusters(dummy_df, 8)
print(len(quake_zones))

8


In [16]:
q_vals = dummy_df.values
q_z = {}
for i, j in quake_zones.items():
    z_lls = []
    for v in list(j):
        s_ll = (q_vals[v][0], q_vals[v][1], q_vals[v][6], q_vals[v][3])
        z_lls.append(s_ll)
    q_z[i] = z_lls

In [17]:
q_z

{0: [(10.652999999999999, 92.361, 6.2, 24.0),
  (12.99, 92.525, 5.5, 30.0),
  (10.413, 92.869, 5.9, 34.0),
  (11.005, 91.824, 6.6, 17.0),
  (10.894, 91.777, 5.9, 17.0),
  (10.847000000000001, 91.712, 6.1, 15.0),
  (11.055, 91.814, 6.2, 20.0),
  (10.947000000000001, 91.711, 5.7, 13.0),
  (10.833, 91.711, 5.5, 26.7),
  (34.203, 73.9, 5.5, 12.0),
  (11.132, 93.471, 6.5, 112.0)],
 1: [(25.976999999999997, 95.34, 7.0, 76.1),
  (26.469, 93.12700000000001, 5.5, 35.0),
  (25.206, 94.708, 5.7, 62.3),
  (24.281, 93.54799999999999, 5.9, 30.0),
  (10.686, 92.583, 5.7, 46.0),
  (11.43, 92.346, 6.0, 25.0),
  (11.446, 92.35700000000001, 6.1, 33.0)],
 2: [(24.256999999999998, 93.545, 6.0, 33.0),
  (24.641, 92.891, 6.0, 22.6),
  (32.128, 76.374, 5.5, 33.0),
  (31.049, 77.997, 5.5, 33.0),
  (25.271, 94.20200000000001, 6.3, 49.5),
  (25.149, 95.12700000000001, 7.3, 90.5),
  (25.17, 94.652, 5.6, 66.5),
  (24.471999999999998, 92.505, 5.5, 33.0),
  (30.78, 78.774, 6.8, 10.3),
  (24.015, 93.986, 5.5, 72.1)],