In [58]:
from pyspark import SparkContext
import datetime
from datetime import timedelta
from math import sin, cos, sqrt, atan2, radians

In [59]:
sc = SparkContext.getOrCreate()

# Insert The Data

In [60]:
dep_rdd = sc.textFile('datasets/DepremVerileri-2019-Nisan-1.txt')
dep_rdd.take(5)

['No    \tDeprem Kodu\tOlus tarihi\tOlus zamani\tEnlem\tBoylam\tDer(km)\txM\tMD\tML\tMw\tMs\tMb\tTip\tYer',
 '000001\t20190430145223\t2019.04.30\t14:52:23.77\t39.0710\t26.4953\t010.3\t3.6\t0.0\t3.6\t3.4\t0.0\t0.0\tKe\tMIDILLI ADASI (EGE DENIZI)',
 '000002\t20190429183950\t2019.04.29\t18:39:50.17\t39.3883\t26.3330\t010.2\t3.9\t0.0\t3.9\t3.6\t0.0\t0.0\tKe\tEDREMIT KORFEZI (EGE DENIZI)',
 '000003\t20190429180243\t2019.04.29\t18:02:43.27\t39.3995\t26.3192\t011.6\t4.5\t0.0\t4.5\t4.3\t0.0\t0.0\tKe\tEDREMIT KORFEZI (EGE DENIZI)',
 '000004\t20190429093510\t2019.04.29\t09:35:10.15\t35.7588\t27.1600\t005.8\t3.8\t0.0\t3.6\t3.8\t0.0\t0.0\tKe\tAKDENIZ']

## Removing the Header

In [61]:
dep_rdd = dep_rdd.filter(lambda line : "Deprem Kodu" not in line)
dep_rdd.take(1)

['000001\t20190430145223\t2019.04.30\t14:52:23.77\t39.0710\t26.4953\t010.3\t3.6\t0.0\t3.6\t3.4\t0.0\t0.0\tKe\tMIDILLI ADASI (EGE DENIZI)']

# Parsing the Data

In [62]:
def splitAndSelect(line):
    arr = line.split("\t")
    #date_time_str = arr[2] + ' ' + arr[3]
    #date_time_obj = datetime.datetime.strptime(date_time_str, '%Y.%m.%d %H:%M:%S.%f')
    tarih = arr[2]
    zaman = arr[3]
    lng = arr[4]
    lat = arr[5]
    buyukluk = float(arr[7])
    yer = arr[14]
    return (yer, buyukluk, lng, lat, tarih, zaman)

In [63]:
dep_rdd = dep_rdd.map(lambda line : splitAndSelect(line))
dep_rdd.take(5)

[('MIDILLI ADASI (EGE DENIZI)',
  3.6,
  '39.0710',
  '26.4953',
  '2019.04.30',
  '14:52:23.77'),
 ('EDREMIT KORFEZI (EGE DENIZI)',
  3.9,
  '39.3883',
  '26.3330',
  '2019.04.29',
  '18:39:50.17'),
 ('EDREMIT KORFEZI (EGE DENIZI)',
  4.5,
  '39.3995',
  '26.3192',
  '2019.04.29',
  '18:02:43.27'),
 ('AKDENIZ', 3.8, '35.7588', '27.1600', '2019.04.29', '09:35:10.15'),
 ('AKDENIZ', 5.0, '35.3045', '27.7108', '2019.04.29', '04:43:15.42')]

## Sorting by Earthquake Intensity

In [64]:
dep_rdd = dep_rdd.sortBy(lambda x:x[1], ascending=False)
dep_rdd.take(5)

[('KURUTILEK- (ERZINCAN) [North East  3.0 km]',
  7.9,
  '39.8000',
  '39.5100',
  '1939.12.26',
  '23:57:20.90'),
 ('ONIKI ADALAR (AKDENIZ)',
  7.7,
  '36.5400',
  '27.3300',
  '1926.06.26',
  '19:46:38.80'),
 ('TÜRKIYE-IRAN SINIR BÖLGESI',
  7.6,
  '37.9800',
  '44.4800',
  '1930.05.06',
  '22:34:31.70'),
 ('YENIYAKA-CALDIRAN (VAN) [South East  1.9 km]',
  7.5,
  '39.0500',
  '44.0400',
  '1976.11.24',
  '12:22:16.00'),
 ('BASISKELE (KOCAELI) [North East  2.0 km]',
  7.4,
  '40.7600',
  '29.9700',
  '1999.08.17',
  '00:01:37.60')]

## Turning Dates to Tuple

In [65]:
def date_maker(arr):
    date = arr[4].split('.')
    time = arr[5].split(':')
    
    year = int(date[0])
    mounth = int(date[1])
    day = int(date[2])
    
    hour = int(time[0])
    minu = int(time[1])
    sec = int(round(float(time[2])))
    
    if mounth>12:
        mounth=12
    elif mounth<1:
        mounth =1
    
    if   day>31:
         day=31
    elif day<1:
         day =1
            
    if   hour>23:
         hour=23
    elif hour<0:
         hour =0
    
    if   minu>59:
         minu=59
    elif minu<0:
         minu =0
            
    if   sec>59:
         sec=59
    elif sec<0:
         sec=0
    
    return (arr[0], arr[1], arr[2], arr[3], (year, mounth, day), (hour, minu, sec))

In [66]:
dep_rdd_dt_1 = dep_rdd.map(lambda x: date_maker(x))
dep_rdd_dt_1.take(5)

[('KURUTILEK- (ERZINCAN) [North East  3.0 km]',
  7.9,
  '39.8000',
  '39.5100',
  (1939, 12, 26),
  (23, 57, 21)),
 ('ONIKI ADALAR (AKDENIZ)',
  7.7,
  '36.5400',
  '27.3300',
  (1926, 6, 26),
  (19, 46, 39)),
 ('TÜRKIYE-IRAN SINIR BÖLGESI',
  7.6,
  '37.9800',
  '44.4800',
  (1930, 5, 6),
  (22, 34, 32)),
 ('YENIYAKA-CALDIRAN (VAN) [South East  1.9 km]',
  7.5,
  '39.0500',
  '44.0400',
  (1976, 11, 24),
  (12, 22, 16)),
 ('BASISKELE (KOCAELI) [North East  2.0 km]',
  7.4,
  '40.7600',
  '29.9700',
  (1999, 8, 17),
  (0, 1, 38))]

## Filter Earthquakes Between 1990 and 2018

In [67]:
def between_90_18(deprem):
    y1 = str(deprem[4][0])
    m1 = str(deprem[4][1])
    d1 = str(deprem[4][2])
    h1 = str(deprem[5][0])
    mi1 =str(deprem[5][1])
    s1 = str(deprem[5][2])
    
    datetime_str_cur = y1 + '-' + m1  + '-' + d1 + ' ' + h1 + ':' + mi1 + ':' + s1
    datetime_obj_cur = datetime.datetime.strptime(datetime_str_cur, '%Y-%m-%d %H:%M:%S')
    
    datetime_str_90 = '1990-1-1 00:00:00'
    datetime_obj_90 =datetime.datetime.strptime(datetime_str_90, '%Y-%m-%d %H:%M:%S')
    
    datetime_str_18 = '2018-12-31 23:59:59'
    datetime_obj_18 =datetime.datetime.strptime(datetime_str_18, '%Y-%m-%d %H:%M:%S')
    
    if datetime_obj_cur >= datetime_obj_90 and datetime_obj_cur <= datetime_obj_18:
        return True
    else:
        return False
    

### Test the between_90_18 function

In [68]:
between_90_18(('BASISKELE (KOCAELI) [North East  2.0 km]',
  7.4,
  '40.7600',
  '29.9700',
  (1989, 8, 17),
  (0, 1, 38)))

False

In [69]:
dep_rdd_dt_1 = dep_rdd_dt_1.filter(lambda x : between_90_18(x) == True)
dep_rdd_dt_1.take(5)

[('BASISKELE (KOCAELI) [North East  2.0 km]',
  7.4,
  '40.7600',
  '29.9700',
  (1999, 8, 17),
  (0, 1, 38)),
 ('YEMLICE- (VAN) [North West  1.5 km]',
  7.2,
  '38.7212',
  '43.4110',
  (2011, 10, 23),
  (10, 41, 21)),
 ('UGUR- (DUZCE) [North East  0.3 km]',
  7.2,
  '40.7400',
  '31.2100',
  (1999, 11, 12),
  (16, 57, 21)),
 ('GUNEBAKAN- (ERZINCAN) [South West  1.7 km]',
  6.8,
  '39.7200',
  '39.6300',
  (1992, 3, 13),
  (17, 18, 39)),
 ('GOKOVA KORFEZI (AKDENIZ)',
  6.6,
  '36.9693',
  '27.4057',
  (2017, 7, 20),
  (22, 31, 10))]

## Geting the List of Top10 Earthquakes

In [70]:
top10 = dep_rdd_dt_1.take(10)
top10

[('BASISKELE (KOCAELI) [North East  2.0 km]',
  7.4,
  '40.7600',
  '29.9700',
  (1999, 8, 17),
  (0, 1, 38)),
 ('YEMLICE- (VAN) [North West  1.5 km]',
  7.2,
  '38.7212',
  '43.4110',
  (2011, 10, 23),
  (10, 41, 21)),
 ('UGUR- (DUZCE) [North East  0.3 km]',
  7.2,
  '40.7400',
  '31.2100',
  (1999, 11, 12),
  (16, 57, 21)),
 ('GUNEBAKAN- (ERZINCAN) [South West  1.7 km]',
  6.8,
  '39.7200',
  '39.6300',
  (1992, 3, 13),
  (17, 18, 39)),
 ('GOKOVA KORFEZI (AKDENIZ)',
  6.6,
  '36.9693',
  '27.4057',
  (2017, 7, 20),
  (22, 31, 10)),
 ('AKDENIZ', 6.4, '35.7948', '27.8798', (2008, 7, 15), (3, 26, 34)),
 ('KURTULUS- (BINGOL) [South West  4.3 km]',
  6.4,
  '39.0100',
  '40.4600',
  (2003, 5, 1),
  (0, 27, 4)),
 ('EGE DENIZI', 6.3, '38.8468', '26.3252', (2017, 6, 12), (12, 28, 38)),
 ('HASANBEYLI-SARICAM (ADANA) [South 10.1 km]',
  6.3,
  '36.9600',
  '35.5200',
  (1998, 6, 27),
  (13, 55, 52)),
 ('AKDENIZ', 6.2, '35.5138', '26.5798', (2011, 4, 1), (13, 29, 9))]

# Making Cartesian RDD


In [71]:
dep_rdd_dt_2 = dep_rdd_dt_1
dep_car_rdd = dep_rdd_dt_1.cartesian(dep_rdd_dt_2)
dep_car_rdd2 = dep_car_rdd.filter(lambda x: x[0][0] != x[1][0] and x[0][1] != x[1][1] and x[0][2] != x[1][2] \
                   and x[0][3] != x[1][3] and x[0][4] != x[1][4] and x[0][5] != x[1][5])
dep_car_rdd2.take(5)

[(('BASISKELE (KOCAELI) [North East  2.0 km]',
   7.4,
   '40.7600',
   '29.9700',
   (1999, 8, 17),
   (0, 1, 38)),
  ('YEMLICE- (VAN) [North West  1.5 km]',
   7.2,
   '38.7212',
   '43.4110',
   (2011, 10, 23),
   (10, 41, 21))),
 (('BASISKELE (KOCAELI) [North East  2.0 km]',
   7.4,
   '40.7600',
   '29.9700',
   (1999, 8, 17),
   (0, 1, 38)),
  ('UGUR- (DUZCE) [North East  0.3 km]',
   7.2,
   '40.7400',
   '31.2100',
   (1999, 11, 12),
   (16, 57, 21))),
 (('BASISKELE (KOCAELI) [North East  2.0 km]',
   7.4,
   '40.7600',
   '29.9700',
   (1999, 8, 17),
   (0, 1, 38)),
  ('GUNEBAKAN- (ERZINCAN) [South West  1.7 km]',
   6.8,
   '39.7200',
   '39.6300',
   (1992, 3, 13),
   (17, 18, 39))),
 (('BASISKELE (KOCAELI) [North East  2.0 km]',
   7.4,
   '40.7600',
   '29.9700',
   (1999, 8, 17),
   (0, 1, 38)),
  ('GOKOVA KORFEZI (AKDENIZ)',
   6.6,
   '36.9693',
   '27.4057',
   (2017, 7, 20),
   (22, 31, 10))),
 (('BASISKELE (KOCAELI) [North East  2.0 km]',
   7.4,
   '40.7600',
   '29

## Distance Calculater

In [72]:
def cal_dist(loc_1, loc_2):
    R = 6373.0

    lat1 = radians(float(loc_1[0]))
    lon1 = radians(float(loc_1[1]))
    lat2 = radians(float(loc_2[0]))
    lon2 = radians(float(loc_2[1]))
    
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
    c = 2 * atan2(sqrt(a), sqrt(1 - a))
    distance = R * c
    return distance

## Date Converter and Calculate Wheather it is Foreshock or	Aftershock

In [73]:
def last_24H(dep_date1, dep_date2, loc1=None, loc2=None):
    y1 = str(dep_date1[0][0])
    m1 = str(dep_date1[0][1])
    d1 = str(dep_date1[0][2])
    h1 = str(dep_date1[1][0])
    mi1 = str(dep_date1[1][1])
    s1 = str(dep_date1[1][2])
    
    y2 = str(dep_date2[0][0])
    m2 = str(dep_date2[0][1])
    d2 = str(dep_date2[0][2])
    h2 = str(dep_date2[1][0])
    mi2 = str(dep_date2[1][1])
    s2 = str(dep_date2[1][2])
    
    #date_time_str = '2018-06-29 08:15:27.243860'
    date_time_str1 = y1 + '-' + m1  + '-' + d1 + ' ' + h1 + ':' + mi1 + ':' + s1
    date_time_obj1 = datetime.datetime.strptime(date_time_str1, '%Y-%m-%d %H:%M:%S')
    
    date_time_str2 = y2 + '-' + m2  + '-' + d2 + ' ' + h2 + ':' + mi2 + ':' + s2
    date_time_obj2 = datetime.datetime.strptime(date_time_str2, '%Y-%m-%d %H:%M:%S')
    
    if loc1 is not None and loc2 is not None:
        distance = cal_dist(loc1, loc2)
    
    
    if date_time_obj1 - timedelta(hours=24) <= date_time_obj2 <= date_time_obj1 + timedelta(hours=24):
        return True
    else:
        return False

### Testing the Function

In [74]:
first = ((1939, 12, 26), (7, 0, 0))
sec = ((1939, 12, 26), (23, 58, 0))
loc1 = ('40.6000', '27.2000')
loc2 = ('40.6000', '27.2100',)
last_24H(first, sec, loc1, loc2)

True

In [75]:
dep_car_rdd2.take(1)

[(('BASISKELE (KOCAELI) [North East  2.0 km]',
   7.4,
   '40.7600',
   '29.9700',
   (1999, 8, 17),
   (0, 1, 38)),
  ('YEMLICE- (VAN) [North West  1.5 km]',
   7.2,
   '38.7212',
   '43.4110',
   (2011, 10, 23),
   (10, 41, 21)))]

In [76]:
x = (('KURUTILEK- (ERZINCAN) [North East  3.0 km]',
   7.9,
   '39.8000',
   '39.5100',
   (1939, 12, 26),
   (23, 57, 21)),
  ('TURNACAYIRI-CAYIRLI (ERZINCAN) [South West  4.5 km]',
   5.9,
   '39.8200',
   '39.7100',
   (1939, 11, 21),
   (8, 49, 3)))

print(last_24H((x[0][4],x[0][5]), (x[1][4],x[1][5]), (x[0][2],x[0][3]), (x[1][2],x[1][3])))
print((x[0][4],x[0][5]))
print((x[1][4],x[1][5]))
print((x[0][2],x[0][3]))
print((x[1][2],x[1][3]))

False
((1939, 12, 26), (23, 57, 21))
((1939, 11, 21), (8, 49, 3))
('39.8000', '39.5100')
('39.8200', '39.7100')


## Filtering Cartesians that's distance less than 20 km

In [77]:
t_rdd = dep_car_rdd2.filter(lambda x: cal_dist((x[0][2],x[0][3]), (x[1][2],x[1][3]))<=20)
t_rdd.take(1)

[(('BASISKELE (KOCAELI) [North East  2.0 km]',
   7.4,
   '40.7600',
   '29.9700',
   (1999, 8, 17),
   (0, 1, 38)),
  ('BAYRAKTAR-IZMIT (KOCAELI) [South East  2.0 km]',
   5.8,
   '40.7700',
   '30.1000',
   (1999, 9, 13),
   (11, 55, 29)))]

## Filtering Aftershocks and Foreshocks

In [78]:
t_rdd = t_rdd.filter(lambda x: last_24H((x[0][4],x[0][5]), (x[1][4],x[1][5])) == True)
t_rdd.take(5)

[(('YEMLICE- (VAN) [North West  1.5 km]',
   7.2,
   '38.7212',
   '43.4110',
   (2011, 10, 23),
   (10, 41, 21)),
  ('KOZLUCA- (VAN) [North West  1.8 km]',
   4.9,
   '38.6693',
   '43.5113',
   (2011, 10, 24),
   (8, 49, 21))),
 (('YEMLICE- (VAN) [North West  1.5 km]',
   7.2,
   '38.7212',
   '43.4110',
   (2011, 10, 23),
   (10, 41, 21)),
  ('BARDAKCI- (VAN) [North East  1.8 km]',
   4.8,
   '38.5747',
   '43.2832',
   (2011, 10, 24),
   (8, 28, 29))),
 (('YEMLICE- (VAN) [North West  1.5 km]',
   7.2,
   '38.7212',
   '43.4110',
   (2011, 10, 23),
   (10, 41, 21)),
  ('DIBEKDUZU- (VAN) [North West  1.7 km]',
   4.7,
   '38.6512',
   '43.2258',
   (2011, 10, 24),
   (4, 43, 1))),
 (('UGUR- (DUZCE) [North East  0.3 km]',
   7.2,
   '40.7400',
   '31.2100',
   (1999, 11, 12),
   (16, 57, 21)),
  ('BAKACAK-GOLYAKA (DUZCE) [South East  2.4 km]',
   4.7,
   '40.7200',
   '31.0000',
   (1999, 11, 13),
   (0, 54, 0))),
 (('YEMLICE- (VAN) [North West  1.5 km]',
   7.2,
   '38.7212',
   '43.

# Geting The Top 10 Earthquake Aftershocks and Foreshocks

In [79]:
def top_10 (yer, top_ten):
    if yer in top_ten:
        return True
    else:
        return False

### Testing the Top_10 Function

In [80]:
x = (('TÜRKIYE-IRAN SINIR BÖLGESI',
   7.6,
   '37.9800',
   '44.4800',
   (1930, 5, 6),
   (22, 34, 32)),
  ('KIZILCA-BASKALE (VAN) [South East  20.7 km]',
   5.2,
   '38.0000',
   '44.5000',
   (1930, 5, 7),
   (13, 48, 0)))

print(x[0])

('TÜRKIYE-IRAN SINIR BÖLGESI', 7.6, '37.9800', '44.4800', (1930, 5, 6), (22, 34, 32))


In [81]:
if x[0] in top10:
    print(True)

In [82]:
t_rdd_top10 = t_rdd.filter(lambda x: top_10(x[0], top10) == True)
t_rdd_top10.take(5)

[(('YEMLICE- (VAN) [North West  1.5 km]',
   7.2,
   '38.7212',
   '43.4110',
   (2011, 10, 23),
   (10, 41, 21)),
  ('KOZLUCA- (VAN) [North West  1.8 km]',
   4.9,
   '38.6693',
   '43.5113',
   (2011, 10, 24),
   (8, 49, 21))),
 (('YEMLICE- (VAN) [North West  1.5 km]',
   7.2,
   '38.7212',
   '43.4110',
   (2011, 10, 23),
   (10, 41, 21)),
  ('BARDAKCI- (VAN) [North East  1.8 km]',
   4.8,
   '38.5747',
   '43.2832',
   (2011, 10, 24),
   (8, 28, 29))),
 (('YEMLICE- (VAN) [North West  1.5 km]',
   7.2,
   '38.7212',
   '43.4110',
   (2011, 10, 23),
   (10, 41, 21)),
  ('DIBEKDUZU- (VAN) [North West  1.7 km]',
   4.7,
   '38.6512',
   '43.2258',
   (2011, 10, 24),
   (4, 43, 1))),
 (('UGUR- (DUZCE) [North East  0.3 km]',
   7.2,
   '40.7400',
   '31.2100',
   (1999, 11, 12),
   (16, 57, 21)),
  ('BAKACAK-GOLYAKA (DUZCE) [South East  2.4 km]',
   4.7,
   '40.7200',
   '31.0000',
   (1999, 11, 13),
   (0, 54, 0))),
 (('YEMLICE- (VAN) [North West  1.5 km]',
   7.2,
   '38.7212',
   '43.

### As it can be seen on the Top_10 function test of the function is working but the codes below should be worked on a cluster in order to see sorted whole data.

In [None]:
t_rdd_top10 = t_rdd_top10.sortBy(lambda x: x[1], ascending=False)
t_rdd_top10.take(5)

In [None]:
results = t_rdd_top10.collect()

In [None]:
t_rdd_top10.take(28)