### Predicting number of dengi cases 
 - changed the number of iterations

In [2]:
from pyspark import keyword_only
from pyspark.ml import *
from pyspark.ml import Pipeline
from pyspark.ml.feature import *
from pyspark.ml.classification import *
from pyspark.ml.clustering import *
from pyspark.ml.evaluation import *
from pyspark.ml.tuning import *
from pyspark.ml.param.shared import *
from pyspark.ml.param import *
from pyspark.sql.types import *
from pyspark.sql.functions import *
from HTMLParser import HTMLParser
from math import sqrt
from math import isnan
from datetime import datetime
import numpy
import re
import random

In [3]:
dengue_features_train_schema = StructType([
  StructField('city', StringType(), True),
  StructField('year', FloatType(), True),
  StructField('weekofyear', FloatType(), True),
  StructField('week_start_date', DateType(), True),
  StructField('ndvi_ne', FloatType(), True),
  StructField('ndvi_nw', FloatType(), True),
  StructField('ndvi_se', FloatType(), True),
  StructField('ndvi_sw', FloatType(), True),
  StructField('precipitation_amt_mm', FloatType(), True),
  StructField('reanalysis_air_temp_k', FloatType(), True),
  StructField('reanalysis_avg_temp_k', FloatType(), True),
  StructField('reanalysis_dew_point_temp_k', FloatType(), True),
  StructField('reanalysis_max_air_temp_k', FloatType(), True),
  StructField('reanalysis_min_air_temp_k', FloatType(), True),
  StructField('reanalysis_precip_amt_kg_per_m2', FloatType(), True),
  StructField('reanalysis_relative_humidity_percent', FloatType(), True),
  StructField('reanalysis_sat_precip_amt_mm', FloatType(), True),
  StructField('reanalysis_specific_humidity_g_per_kg', FloatType(), True),
  StructField('reanalysis_tdtr_k', FloatType(), True),
  StructField('station_avg_temp_c', FloatType(), True),
  StructField('station_diur_temp_rng_c', FloatType(), True),
  StructField('station_max_temp_c', FloatType(), True),
  StructField('station_min_temp_c', FloatType(), True),
  StructField('station_precip_mm', FloatType(), True)
])

dengue_labels_train_schema = StructType([
  StructField('city', StringType(), True),
  StructField('year', FloatType(), True),
  StructField('weekofyear', FloatType(), True),
  StructField('total_cases', FloatType(), True)
])



In [4]:
train = spark.read.schema(dengue_features_train_schema).csv("s3a://data/dengai/dengue_features_train.csv" , header = True)
label_dataset = spark.read.schema(dengue_labels_train_schema).csv('s3a://data/dengai/dengue_labels_train.csv', header = True)

In [5]:
train_labels = train.join(label_dataset , ['city' , 'year' , 'weekofyear'])

In [6]:
train_labels.select('city').distinct().show()

In [7]:
def numerical_cities(city):
  if city == 'iq':
    return '1'
  else:
    return '2'
  
transform_city = udf ( numerical_cities , StringType())

In [8]:
train_labels_transform_city = train_labels.withColumn('num_city', transform_city('city'))
train_labels_transform_city = train_labels_transform_city.drop('city')
display(train_labels_transform_city)

year,weekofyear,week_start_date,ndvi_ne,ndvi_nw,ndvi_se,ndvi_sw,precipitation_amt_mm,reanalysis_air_temp_k,reanalysis_avg_temp_k,reanalysis_dew_point_temp_k,reanalysis_max_air_temp_k,reanalysis_min_air_temp_k,reanalysis_precip_amt_kg_per_m2,reanalysis_relative_humidity_percent,reanalysis_sat_precip_amt_mm,reanalysis_specific_humidity_g_per_kg,reanalysis_tdtr_k,station_avg_temp_c,station_diur_temp_rng_c,station_max_temp_c,station_min_temp_c,station_precip_mm,total_cases,num_city
1990.0,18.0,1990-04-30,0.1226,0.103725,0.1984833,0.1776167,12.42,297.57285,297.74286,292.41428,299.8,295.9,32.0,73.365715,12.42,14.012857,2.6285715,25.442858,6.9,29.4,20.0,16.0,4.0,2
1990.0,19.0,1990-05-07,0.1699,0.142175,0.1623571,0.1554857,22.82,298.21143,298.44287,293.95142,300.9,296.4,17.94,77.36857,22.82,15.372857,2.3714285,26.714285,6.3714285,31.7,22.2,8.6,5.0,2
1990.0,20.0,1990-05-14,0.03225,0.1729667,0.1572,0.1708429,34.54,298.78143,298.87857,295.4343,300.5,297.3,26.1,82.05286,34.54,16.848572,2.3,26.714285,6.4857144,32.2,22.8,41.4,4.0,2
1990.0,21.0,1990-05-21,0.1286333,0.2450667,0.2275571,0.2358857,15.36,298.98715,299.22858,295.31,301.4,297.0,13.9,80.33714,15.36,16.672857,2.4285715,27.471428,6.7714286,33.3,23.3,4.0,3.0,2
1990.0,22.0,1990-05-28,0.1962,0.2622,0.2512,0.24734,7.52,299.5186,299.66428,295.82144,301.9,297.5,12.2,80.46,7.52,17.21,3.0142858,28.942858,9.3714285,35.0,23.9,5.8,6.0,2
1990.0,23.0,1990-06-04,,0.17485,0.2543143,0.1817429,9.58,299.63,299.76428,295.85144,302.4,298.1,26.49,79.891426,9.58,17.212856,2.1,28.114286,6.9428573,34.4,23.9,39.1,2.0,2
1990.0,24.0,1990-06-11,0.1129,0.0928,0.2050714,0.2102714,3.48,299.20715,299.22144,295.86572,301.3,297.7,38.6,82.0,3.48,17.234285,2.0428572,27.414286,6.7714286,32.2,23.3,29.7,4.0,2
1990.0,25.0,1990-06-18,0.0725,0.0725,0.1514714,0.1330286,151.12,299.59143,299.52856,296.53143,300.6,298.4,30.0,83.37572,151.12,17.977142,1.5714285,28.37143,7.6857142,33.9,22.8,21.1,5.0,2
1990.0,26.0,1990-06-25,0.10245,0.146175,0.1255714,0.1236,19.32,299.57858,299.55713,296.37857,302.1,297.7,37.51,82.76857,19.32,17.79,1.8857143,28.328571,7.385714,33.9,22.8,21.1,10.0,2
1990.0,27.0,1990-07-02,,0.12155,0.1606833,0.2025667,14.41,300.1543,300.27856,296.65143,302.3,298.7,28.4,81.281425,14.41,18.071428,2.0142858,28.328571,6.5142856,33.9,24.4,1.1,6.0,2


In [9]:
train_labels_transform_city_year = train_labels_transform_city.withColumn('year', substring(col('week_start_date'),0,4))
train_labels_transform_city_year_month = train_labels_transform_city_year.withColumn('month', substring(col('week_start_date'), 6,2))
train_labels_transform_city_year_month_day = train_labels_transform_city_year_month.withColumn('day', substring(col('week_start_date'), 9,2))

train_clean = train_labels_transform_city_year_month_day.drop('week_start_date')
display(train_clean)

year,weekofyear,ndvi_ne,ndvi_nw,ndvi_se,ndvi_sw,precipitation_amt_mm,reanalysis_air_temp_k,reanalysis_avg_temp_k,reanalysis_dew_point_temp_k,reanalysis_max_air_temp_k,reanalysis_min_air_temp_k,reanalysis_precip_amt_kg_per_m2,reanalysis_relative_humidity_percent,reanalysis_sat_precip_amt_mm,reanalysis_specific_humidity_g_per_kg,reanalysis_tdtr_k,station_avg_temp_c,station_diur_temp_rng_c,station_max_temp_c,station_min_temp_c,station_precip_mm,total_cases,num_city,month,day
1990,18.0,0.1226,0.103725,0.1984833,0.1776167,12.42,297.57285,297.74286,292.41428,299.8,295.9,32.0,73.365715,12.42,14.012857,2.6285715,25.442858,6.9,29.4,20.0,16.0,4.0,2,4,30
1990,19.0,0.1699,0.142175,0.1623571,0.1554857,22.82,298.21143,298.44287,293.95142,300.9,296.4,17.94,77.36857,22.82,15.372857,2.3714285,26.714285,6.3714285,31.7,22.2,8.6,5.0,2,5,7
1990,20.0,0.03225,0.1729667,0.1572,0.1708429,34.54,298.78143,298.87857,295.4343,300.5,297.3,26.1,82.05286,34.54,16.848572,2.3,26.714285,6.4857144,32.2,22.8,41.4,4.0,2,5,14
1990,21.0,0.1286333,0.2450667,0.2275571,0.2358857,15.36,298.98715,299.22858,295.31,301.4,297.0,13.9,80.33714,15.36,16.672857,2.4285715,27.471428,6.7714286,33.3,23.3,4.0,3.0,2,5,21
1990,22.0,0.1962,0.2622,0.2512,0.24734,7.52,299.5186,299.66428,295.82144,301.9,297.5,12.2,80.46,7.52,17.21,3.0142858,28.942858,9.3714285,35.0,23.9,5.8,6.0,2,5,28
1990,23.0,,0.17485,0.2543143,0.1817429,9.58,299.63,299.76428,295.85144,302.4,298.1,26.49,79.891426,9.58,17.212856,2.1,28.114286,6.9428573,34.4,23.9,39.1,2.0,2,6,4
1990,24.0,0.1129,0.0928,0.2050714,0.2102714,3.48,299.20715,299.22144,295.86572,301.3,297.7,38.6,82.0,3.48,17.234285,2.0428572,27.414286,6.7714286,32.2,23.3,29.7,4.0,2,6,11
1990,25.0,0.0725,0.0725,0.1514714,0.1330286,151.12,299.59143,299.52856,296.53143,300.6,298.4,30.0,83.37572,151.12,17.977142,1.5714285,28.37143,7.6857142,33.9,22.8,21.1,5.0,2,6,18
1990,26.0,0.10245,0.146175,0.1255714,0.1236,19.32,299.57858,299.55713,296.37857,302.1,297.7,37.51,82.76857,19.32,17.79,1.8857143,28.328571,7.385714,33.9,22.8,21.1,10.0,2,6,25
1990,27.0,,0.12155,0.1606833,0.2025667,14.41,300.1543,300.27856,296.65143,302.3,298.7,28.4,81.281425,14.41,18.071428,2.0142858,28.328571,6.5142856,33.9,24.4,1.1,6.0,2,7,2


In [10]:
def fill_with_mean(df, exclude=set()): 
    stats = df.agg(*(
        avg(c).alias(c) for c in df.columns if c not in exclude
    ))
    return df.na.fill(stats.first().asDict())

train_clean_nonull = fill_with_mean(train_clean, ["year", "month", "day" , "num_city"])

In [11]:
describe_mainTrain_df = train_clean_nonull.describe()

## Normalization 
  - x = value 
  - dl = min of attribute 
  - dh = max of attribute 
  - nl = min of expected range 
  - nh = max of expected range

In [13]:
mainTrain = train_clean_nonull

In [14]:
# call function
#normalize columns
def normalizing_column_1(c , dL, dH):
  nL = 0
  nH = 1
  numi = (float(c) - dL) * (nH-nL)
  denom = dH - dL
  div = float(numi) / float(denom)
  normalized = float(div + nL)
  return normalized

normalizing_column = udf(normalizing_column_1, DoubleType())


names = mainTrain.schema.names
for colname in names:
  dL = float(describe_mainTrain_df.collect()[3][colname])
  dH = float(describe_mainTrain_df.collect()[4][colname])
  mainTrain = mainTrain.withColumn('normalized_' + str(colname), 
                           normalizing_column(colname, lit(dL) , lit(dH))
                          )                                                                   
    

In [15]:
normalized_mainTrain = spark.read.parquet("s3a://dengi-ghazalg/normalized_mainTrain2_24NOV2017")
display(normalized_mainTrain)

year,weekofyear,ndvi_ne,ndvi_nw,ndvi_se,ndvi_sw,precipitation_amt_mm,reanalysis_air_temp_k,reanalysis_avg_temp_k,reanalysis_dew_point_temp_k,reanalysis_max_air_temp_k,reanalysis_min_air_temp_k,reanalysis_precip_amt_kg_per_m2,reanalysis_relative_humidity_percent,reanalysis_sat_precip_amt_mm,reanalysis_specific_humidity_g_per_kg,reanalysis_tdtr_k,station_avg_temp_c,station_diur_temp_rng_c,station_max_temp_c,station_min_temp_c,station_precip_mm,total_cases,num_city,month,day,normalized_year,normalized_weekofyear,normalized_ndvi_ne,normalized_ndvi_nw,normalized_ndvi_se,normalized_ndvi_sw,normalized_precipitation_amt_mm,normalized_reanalysis_air_temp_k,normalized_reanalysis_avg_temp_k,normalized_reanalysis_dew_point_temp_k,normalized_reanalysis_max_air_temp_k,normalized_reanalysis_min_air_temp_k,normalized_reanalysis_precip_amt_kg_per_m2,normalized_reanalysis_relative_humidity_percent,normalized_reanalysis_sat_precip_amt_mm,normalized_reanalysis_specific_humidity_g_per_kg,normalized_reanalysis_tdtr_k,normalized_station_avg_temp_c,normalized_station_diur_temp_rng_c,normalized_station_max_temp_c,normalized_station_min_temp_c,normalized_station_precip_mm,normalized_total_cases,normalized_num_city,normalized_month,normalized_day
1990,18.0,0.1226,0.103725,0.1984833,0.1776167,12.42,297.57285,297.74286,292.41428,299.8,295.9,32.0,73.365715,12.42,14.012857,2.6285715,25.442858,6.9,29.4,20.0,16.0,4.0,2,4,30,0.0,0.3269230769230769,0.5782264281858047,0.6148351641188807,0.3864178911251776,0.3955441968273029,0.0317972352183664,0.3882904510641286,0.3546679617256897,0.3146791099330512,0.1234560366030086,0.6923072228064922,0.0560911481156879,0.3816139426123314,0.0317972352183664,0.2626593365771838,0.0866601810214195,0.4300912491818692,0.2103928988598668,0.1741935237761467,0.4862385321100917,0.0294496594883121,0.0086767895878524,1.0,0.2727272727272727,1.0
1990,19.0,0.1699,0.142175,0.1623571,0.1554857,22.82,298.21143,298.44287,293.95142,300.9,296.4,17.94,77.36857,22.82,15.372857,2.3714285,26.714285,6.3714285,31.7,22.2,8.6,5.0,2,5,7,0.0,0.3461538461538461,0.6299426279277379,0.6570633848142966,0.3211902103920671,0.3592325546592883,0.0584229382868003,0.4727107308343155,0.4417806383941173,0.4892122895176079,0.1913576479311337,0.7307687612680306,0.0314461008484796,0.4796681648243855,0.0584229382868003,0.4181640394153106,0.0691333936086511,0.5653494571117644,0.1634980788845767,0.3225806943831905,0.6880734644898581,0.0158291926771023,0.0108459869848156,1.0,0.3636363636363636,0.2068965517241379
1990,20.0,0.03225,0.1729667,0.1572,0.1708429,34.54,298.78143,298.87857,295.4343,300.5,297.3,26.1,82.05286,34.54,16.848572,2.3,26.714285,6.4857144,32.2,22.8,41.4,4.0,2,5,14,0.0,0.3653846153846153,0.4794408421947168,0.6908807738412004,0.3118787810459555,0.3844300206096303,0.0884280617397013,0.5480656644327648,0.4960010449158342,0.6575845369156741,0.166666666666666,0.7999990609975979,0.0457493433505166,0.5944148651753445,0.0884280617397013,0.5868997977001934,0.0642648469663939,0.5653494571117644,0.1736375165704279,0.3548387588993195,0.7431191960606005,0.0762009967345461,0.0086767895878524,1.0,0.3636363636363636,0.4482758620689655
1990,21.0,0.1286333,0.2450667,0.2275571,0.2358857,15.36,298.98715,299.22858,295.31,301.4,297.0,13.9,80.33714,15.36,16.672857,2.4285715,27.471428,6.7714286,33.3,23.3,4.0,3.0,2,5,21,0.0,0.3846153846153846,0.5848230408777849,0.7700655447460686,0.4389120937559028,0.4911496031117487,0.039324115864509,0.5752617029479777,0.539557383250048,0.6434712203827292,0.2222218454619979,0.7769230769230787,0.0243645917940933,0.5523866077458162,0.039324115864509,0.566808299990818,0.0730282406727781,0.6458965869660073,0.1989860473276385,0.425806402391003,0.7889907556936281,0.007362414872078,0.0065075921908893,1.0,0.3636363636363636,0.6896551724137931
1990,22.0,0.1962,0.2622,0.2512,0.24734,7.52,299.5186,299.66428,295.82144,301.9,297.5,12.2,80.46,7.52,17.21,3.0142858,28.942858,9.3714285,35.0,23.9,5.8,6.0,2,5,28,0.0,0.4038461538461538,0.6586981429069864,0.78888241171328,0.4816005451010787,0.5099433532951738,0.0192524321068267,0.6455171271734512,0.5937777897717649,0.7015426841135475,0.2530860429928621,0.8153846153846172,0.0213847498847767,0.5553961017443918,0.0192524321068267,0.6282260184214411,0.1129503458900368,0.8024316747137841,0.4296577787501234,0.5354838709677419,0.8440366622504837,0.0106755019155804,0.0130151843817787,1.0,0.3636363636363636,0.9310344827586208
1990,23.0,0.14229354,0.17485,0.2543143,0.1817429,9.58,299.63,299.76428,295.85144,302.4,298.1,26.49,79.891426,9.58,17.212856,2.1,28.114286,6.9428573,34.4,23.9,39.1,2.0,2,6,4,0.0,0.4230769230769231,0.5997586751989131,0.6929491307919272,0.4872235950492959,0.4023143086345339,0.024526369492335,0.6602468017942853,0.6062230004022661,0.7049488687813317,0.2839502405237263,0.8615389310396652,0.0464329531483228,0.5414682913519275,0.024526369492335,0.6285527165334481,0.0506329066177525,0.7142857896520737,0.2141951827039429,0.4967742919921874,0.8440366622504837,0.0719676025660244,0.0043383947939262,1.0,0.4545454545454545,0.1034482758620689
1990,24.0,0.1129,0.0928,0.2050714,0.2102714,3.48,299.20715,299.22144,295.86572,301.3,297.7,38.6,82.0,3.48,17.234285,2.0428572,27.414286,6.7714286,32.2,23.3,29.7,4.0,2,6,11,0.0,0.4423076923076923,0.5676207809236062,0.6028366366743757,0.3983130424811993,0.4491227108180292,0.008909370248524,0.6043458509462227,0.5386687109010894,0.706570531471589,0.2160486291956013,0.8307701697716363,0.0676599447399142,0.5931200893930596,0.008909370248524,0.6310029523735006,0.0467380758041609,0.639817623381919,0.1989860473276385,0.3548387588993195,0.7889907556936281,0.0546659318294486,0.0086767895878524,1.0,0.4545454545454545,0.3448275862068966
1990,25.0,0.0725,0.0725,0.1514714,0.1330286,151.12,299.59143,299.52856,296.53143,300.6,298.4,30.0,83.37572,151.12,17.977142,1.5714285,28.37143,7.6857142,33.9,22.8,21.1,5.0,2,6,18,0.0,0.4615384615384615,0.5234488097827334,0.5805418939216598,0.3015355256126848,0.3223858412686476,0.3868919485847094,0.6551472924213096,0.5768892173964707,0.7821580313119177,0.1728398829330626,0.8846149151141844,0.0525854513584574,0.6268197688835376,0.3868919485847094,0.7159424986895723,0.0146056484646209,0.7416414301446144,0.2801013795946686,0.4645162274760584,0.7431191960606005,0.0388367391523462,0.0108459869848156,1.0,0.4545454545454545,0.5862068965517241
1990,26.0,0.10245,0.146175,0.1255714,0.1236,19.32,299.57858,299.55713,296.37857,302.1,297.7,37.51,82.76857,19.32,17.79,1.8857143,28.328571,7.385714,33.9,22.8,21.1,10.0,2,6,25,0.0,0.4807692307692308,0.5561951118240219,0.6614564299147696,0.2547717499976289,0.3069157794417148,0.0494623648100978,0.6534488007709751,0.580443906792305,0.7648013893984582,0.2654324755256553,0.8307701697716363,0.0657493397397602,0.6119470412919925,0.0494623648100978,0.6945444266216964,0.0360272666909809,0.7370820552744763,0.2534853926861359,0.4645162274760584,0.7431191960606005,0.0388367391523462,0.0216919739696312,1.0,0.4545454545454545,0.8275862068965517
1990,27.0,0.14229354,0.12155,0.1606833,0.2025667,14.41,300.1543,300.27856,296.65143,302.3,298.7,28.4,81.281425,14.41,18.071428,2.0142858,28.328571,6.5142856,33.9,24.4,1.1,6.0,2,7,2,0.0,0.5,0.5997586751989131,0.6344117045900741,0.3181680746226654,0.4364811428059818,0.0368919606948594,0.7295581712782429,0.6702226005076084,0.7957827699830543,0.2777770242573297,0.9076932466947132,0.0497808932840145,0.5755178294256095,0.0368919606948594,0.7267233182962757,0.0447906603973651,0.7370820552744763,0.176172344263182,0.4645162274760584,0.8899082218835113,0.0020246641337048,0.0130151843817787,1.0,0.5454545454545454,0.0344827586206896


In [16]:
normalized_mainTest = spark.read.parquet("s3a://dengi-ghazalg/normalized_mainTest2_24NOV2017")
display(normalized_mainTest)

year,weekofyear,ndvi_ne,ndvi_nw,ndvi_se,ndvi_sw,precipitation_amt_mm,reanalysis_air_temp_k,reanalysis_avg_temp_k,reanalysis_dew_point_temp_k,reanalysis_max_air_temp_k,reanalysis_min_air_temp_k,reanalysis_precip_amt_kg_per_m2,reanalysis_relative_humidity_percent,reanalysis_sat_precip_amt_mm,reanalysis_specific_humidity_g_per_kg,reanalysis_tdtr_k,station_avg_temp_c,station_diur_temp_rng_c,station_max_temp_c,station_min_temp_c,station_precip_mm,num_city,month,day,normalized_year,normalized_weekofyear,normalized_ndvi_ne,normalized_ndvi_nw,normalized_ndvi_se,normalized_ndvi_sw,normalized_precipitation_amt_mm,normalized_reanalysis_air_temp_k,normalized_reanalysis_avg_temp_k,normalized_reanalysis_dew_point_temp_k,normalized_reanalysis_max_air_temp_k,normalized_reanalysis_min_air_temp_k,normalized_reanalysis_precip_amt_kg_per_m2,normalized_reanalysis_relative_humidity_percent,normalized_reanalysis_sat_precip_amt_mm,normalized_reanalysis_specific_humidity_g_per_kg,normalized_reanalysis_tdtr_k,normalized_station_avg_temp_c,normalized_station_diur_temp_rng_c,normalized_station_max_temp_c,normalized_station_min_temp_c,normalized_station_precip_mm,normalized_num_city,normalized_month,normalized_day
2008,18.0,-0.0189,-0.0189,0.1027286,0.0912,78.6,298.49286,298.55,294.52713,301.1,296.4,25.37,78.781425,78.6,15.918571,3.1285715,26.528572,7.0571427,33.3,21.7,75.2,2,4,29,0.0,0.3269230769230769,0.4611952691855928,0.2240938666834042,0.2160235685065268,0.1947188545459153,0.4641549455186081,0.5335788450319379,0.4095298563139293,0.5316390914979416,0.1823903209758258,0.7555551034432879,0.0841738581261891,0.419244598805857,0.4641549455186081,0.4788589493950825,0.1263736344790024,0.3878504693220208,0.2821798710733968,0.5446427890232632,0.6000000610351564,0.3547169667369914,1.0,0.2727272727272727,0.9655172413793104
2008,19.0,-0.018,-0.0124,0.08204286,0.07231429,12.56,298.4757,298.55713,294.39572,300.8,296.7,21.83,78.23,12.56,15.791429,2.5714285,26.071428,5.5571427,30.0,22.2,34.3,2,5,6,0.0,0.3461538461538461,0.4621290732256281,0.2316449809989106,0.1697304757546108,0.1599842369290857,0.0741703107335343,0.5312553184778627,0.4104122530539261,0.5128010533045176,0.1635212448408022,0.7777786820023157,0.0724286659711548,0.4025666066678319,0.0741703107335343,0.4608536835786359,0.0835164817249014,0.3130839783854102,0.1417586091599399,0.25,0.6400000610351563,0.1617924492314176,1.0,0.3636363636363636,0.1724137931034483
2008,20.0,-0.0015,0.12680304,0.1510833,0.09152857,3.66,299.45572,299.35715,295.30856,302.2,296.4,4.12,78.27,3.66,16.674286,4.428571,27.928572,7.785714,32.8,22.8,3.0,2,5,13,0.0,0.3653846153846153,0.4792488067928631,0.3933585507719047,0.3242376805765426,0.1953231619665499,0.0216133228169994,0.6640229487821583,0.5092670844172361,0.6436611086788094,0.2515730947818398,0.7555551034432879,0.0136695417569976,0.4037762202987401,0.0216133228169994,0.5858789897444093,0.2263736147788254,0.6168223214006482,0.3503844827273982,0.4999999318804061,0.6879999389648438,0.0141509433962264,1.0,0.3636363636363636,0.4137931034482758
2008,21.0,0.12604976,-0.01986667,0.1243286,0.1256857,0.0,299.69,299.72858,294.40286,303.0,296.9,2.2,73.01572,0.0,15.775714,4.3428574,28.057142,6.2714286,33.3,24.4,0.3,2,5,20,0.0,0.3846153846153846,0.6115892892755594,0.2229708764515124,0.2643627064010389,0.2581450018701131,0.0,0.6957626522619352,0.5551630276757846,0.5138247646229712,0.3018867924528302,0.7925921404803249,0.0072992702312001,0.2448583260109492,0.0,0.4586282635700594,0.2197802405210704,0.6378502117565679,0.2086258894916956,0.5446427890232632,0.8159999694824219,0.0014150943958534,1.0,0.3636363636363636,0.6551724137931034
2008,22.0,0.0568,0.03983333,0.06226667,0.07591429,0.76,299.78,299.67142,294.76,302.3,297.3,4.36,74.08428,0.76,16.137142,3.5428572,27.614286,7.0857143,33.3,23.3,84.1,2,5,27,0.0,0.4038461538461538,0.5397385353491895,0.2923249647048991,0.1254728902874951,0.166605363385591,0.0044880122266638,0.7079549650875001,0.5481000828295713,0.5650234550497343,0.2578608674823114,0.822221317997686,0.014465826587639,0.2771775315682526,0.0044880122266638,0.509811708281211,0.1582417628905776,0.5654205343533093,0.2848545783566051,0.5446427890232632,0.7279999389648438,0.3966981060100051,1.0,0.3636363636363636,0.896551724137931
2008,23.0,-0.044,-0.03046667,0.132,0.08352857,71.17,299.7686,299.72858,295.3143,301.9,297.6,22.55,76.557144,71.17,16.667143,2.857143,28.0,5.1714287,32.8,25.0,27.7,2,6,3,0.0,0.4230769230769231,0.4351525215173465,0.2106567493373299,0.2815307108711904,0.1806095506638973,0.420278718371001,0.706408703644041,0.5551630276757846,0.6444835776013277,0.2327040186468162,0.8444448965567138,0.0748175157168564,0.3519702822151218,0.420278718371001,0.5848674351950564,0.1054945120700619,0.6285045173705455,0.1056503063495706,0.4999999318804061,0.8640000000000001,0.1306603809572615,1.0,0.4545454545454545,0.0689655172413793
2008,24.0,-0.0443,-0.024925,0.1322714,0.1591571,48.99,300.06287,300.00714,295.65,302.4,297.5,13.1,76.844284,48.99,17.01,3.1571429,27.4,6.042857,31.1,23.3,91.7,2,6,10,0.0,0.4423076923076923,0.4348412522155978,0.2170945642391379,0.2821380716434518,0.3197056206696448,0.2892996437844974,0.7462766156741945,0.5895840423881361,0.6926067592380357,0.2641505595273193,0.8370370370370379,0.0434638367002977,0.3606549481084572,0.2892996437844974,0.6334209731319023,0.1285714320115511,0.5303736344941403,0.1872283651421799,0.3482143197740828,0.7279999389648438,0.4325471554162367,1.0,0.4545454545454545,0.3103448275862069
2008,25.0,0.12604976,0.08215,0.1443714,0.1167286,30.81,300.48428,300.57858,295.99713,303.5,297.5,7.2,76.87,30.81,17.42,3.9,28.757143,6.9857144,34.4,24.4,0.3,2,6,17,0.0,0.4615384615384615,0.6115892892755594,0.3414846623531061,0.3092169647181897,0.241671074568319,0.1819416526865618,0.8033683918044849,0.6601946362190718,0.742370503885086,0.3333333333333333,0.8370370370370379,0.0238885196060555,0.3614328205902946,0.1819416526865618,0.6914828537246419,0.1857142962370554,0.7523362937706201,0.2754931698234514,0.6428572790963311,0.8159999694824219,0.0014150943958534,1.0,0.4545454545454545,0.5517241379310345
2008,26.0,0.0108,0.0499,0.1005714,0.1173286,8.02,300.60144,300.62143,296.2686,302.5,298.5,17.1,77.39571,8.02,17.678572,2.7857144,28.657143,6.242857,32.8,23.9,28.7,2,6,24,0.0,0.4807692307692308,0.4920107909111245,0.3040195156276558,0.2111959290776588,0.2427746002121086,0.0473603428473111,0.8192403107388165,0.6654890166590522,0.7812846584904057,0.2704402515723272,0.911111111111112,0.0567352368330117,0.3773331710007042,0.0473603428473111,0.7281003180871469,0.1000000090687445,0.735981094632973,0.2059511822084873,0.4999999318804061,0.7759999694824219,0.135377362089337,1.0,0.4545454545454545,0.7931034482758621
2008,27.0,0.07266667,0.10666,0.1554286,0.1649,17.52,300.49713,300.52856,296.41144,302.3,298.7,11.9,78.53429,17.52,17.80857,2.2285714,28.457144,4.6285715,31.1,25.0,2.9,2,7,1,0.0,0.5,0.556201148995425,0.3699581795770217,0.3339621224629764,0.330267982570441,0.1034604963845734,0.8051089695255981,0.6540140881128553,0.8017632596941722,0.2578608674823114,0.9259268301504638,0.0394824141291648,0.4117697772976517,0.1034604963845734,0.7465100706693228,0.0571428563146435,0.703271008307156,0.054831180439631,0.3482143197740828,0.8640000000000001,0.0136792457328652,1.0,0.5454545454545454,0.0


In [17]:
normalized_mainTest.select('normalized_num_city').groupby('normalized_num_city').count().show()

## Tensor Flow

In [19]:
import pandas as pd
import numpy as np

In [20]:
normalized_mainTrain_pd = normalized_mainTrain.toPandas()
normalized_mainTest_pd = normalized_mainTest.toPandas()

In [21]:
normalized_mainTest_pd.year = normalized_mainTest_pd.year.astype(float)

In [22]:
non_feature_columns = ['year' , 'weekofyear']

In [23]:
feature_columns = [
  'normalized_year', 
  'normalized_weekofyear', 
  'normalized_ndvi_ne', 
  'normalized_ndvi_nw',
  'normalized_ndvi_se', 
  'normalized_ndvi_sw', 
  'normalized_precipitation_amt_mm',
  'normalized_reanalysis_air_temp_k', 
  'normalized_reanalysis_avg_temp_k',
  'normalized_reanalysis_dew_point_temp_k', 
  'normalized_reanalysis_max_air_temp_k',
  'normalized_reanalysis_min_air_temp_k',
  'normalized_reanalysis_precip_amt_kg_per_m2',
  'normalized_reanalysis_relative_humidity_percent',
  'normalized_reanalysis_sat_precip_amt_mm',
  'normalized_reanalysis_specific_humidity_g_per_kg', 
  'normalized_reanalysis_tdtr_k',
  'normalized_station_avg_temp_c', 
  'normalized_station_diur_temp_rng_c',
  'normalized_station_max_temp_c', 
  'normalized_station_min_temp_c',
  'normalized_station_precip_mm', 
  'normalized_num_city',
  'normalized_month',
  'normalized_day'
]

label_columns = ['total_cases']

In [24]:
import tensorflow as tf

tf.reset_default_graph()

feature_columns_tf = [
  tf.feature_column.numeric_column("normalized_year"), 
  tf.feature_column.numeric_column("normalized_weekofyear"), 
  tf.feature_column.numeric_column("normalized_ndvi_ne"), 
  tf.feature_column.numeric_column("normalized_ndvi_nw"),
  tf.feature_column.numeric_column("normalized_ndvi_se"), 
  tf.feature_column.numeric_column("normalized_ndvi_sw"), 
  tf.feature_column.numeric_column("normalized_precipitation_amt_mm"),
  tf.feature_column.numeric_column("normalized_reanalysis_air_temp_k"), 
  tf.feature_column.numeric_column("normalized_reanalysis_avg_temp_k"),
  tf.feature_column.numeric_column("normalized_reanalysis_dew_point_temp_k"), 
  tf.feature_column.numeric_column("normalized_reanalysis_max_air_temp_k"),
  tf.feature_column.numeric_column("normalized_reanalysis_min_air_temp_k"),
  tf.feature_column.numeric_column("normalized_reanalysis_precip_amt_kg_per_m2"),
  tf.feature_column.numeric_column("normalized_reanalysis_relative_humidity_percent"),
  tf.feature_column.numeric_column("normalized_reanalysis_sat_precip_amt_mm"),
  tf.feature_column.numeric_column("normalized_reanalysis_specific_humidity_g_per_kg"), 
  tf.feature_column.numeric_column("normalized_reanalysis_tdtr_k"),
  tf.feature_column.numeric_column("normalized_station_avg_temp_c"), 
  tf.feature_column.numeric_column("normalized_station_diur_temp_rng_c"),
  tf.feature_column.numeric_column("normalized_station_max_temp_c"), 
  tf.feature_column.numeric_column("normalized_station_min_temp_c"),
  tf.feature_column.numeric_column("normalized_station_precip_mm"), 
  tf.feature_column.numeric_column("normalized_num_city"),
  tf.feature_column.numeric_column("normalized_month"),
  tf.feature_column.numeric_column("normalized_day")
]

# Define the train inputs
train_input_fn = tf.estimator.inputs.pandas_input_fn(
    x = normalized_mainTrain_pd[feature_columns],
    y = normalized_mainTrain_pd[label_columns],
    num_epochs=None,
    shuffle=True)




# Define train inputs for evaluation
train_input_fn_eval = tf.estimator.inputs.pandas_input_fn(
    x = normalized_mainTrain_pd[feature_columns],
    y = normalized_mainTrain_pd[label_columns],
    num_epochs = 1,
    shuffle = False)


# Define test inputs for evaluation
test_input_fn_eval = tf.estimator.inputs.pandas_input_fn(
    x = normalized_mainTest_pd[feature_columns],
    y = None,
    num_epochs=1,
    shuffle=False)



classifier = tf.estimator.DNNRegressor(
  feature_columns = feature_columns_tf,
  hidden_units = [200, 300, 200, 300, 200, 300],
  optimizer = tf.train.ProximalAdagradOptimizer(
    learning_rate=0.01,
    l1_regularization_strength=0.001
  ),
  activation_fn = tf.nn.relu,
  model_dir= "/tmp/tf_dengai_mainTrain_ghazall_24Nov_v3"
)

In [25]:
# Train model.
classifier.train(input_fn = train_input_fn, steps=50000)

In [26]:
# Evaluate accuracy.
average_loss_train = classifier.evaluate(input_fn = train_input_fn_eval)["average_loss"]
print("Train Average Loss: {0:f}\n".format(average_loss_train))

In [27]:
# Evaluate accuracy.
average_loss_test = classifier.predict(input_fn = test_input_fn_eval)
#print("Test Average Loss: {0:f}\n".format(average_loss_test))

In [28]:
predictions_list = [prediction['predictions'][0] for prediction in average_loss_test]
len(predictions_list)

In [29]:
print predictions_list

In [30]:
list_to_df = pd.DataFrame({'predictions':predictions_list})
print (list_to_df)

In [31]:
cons = normalized_mainTest_pd[feature_columns].join(list_to_df)

In [32]:
spark.createDataFrame(
    cons
  ).count()

In [33]:
import math

In [34]:
def roundup(value):
  return math.ceil(value)

roundup = udf(roundup,FloatType())

def str_city(city):
  if(city == 1):
    return 'sj'
  elif(city == 2):
    return 'iq'

str_city = udf(str_city,StringType())

In [35]:
normalized_mainTest_pd.year = normalized_mainTest_pd.year.astype(float)

In [36]:
normalized_mainTest_pd_selected = normalized_mainTest_pd[['year', 'weekofyear']]


In [37]:
normalized_mainTest_pd_selected

In [38]:
result = pd.concat([cons, normalized_mainTest_pd_selected], axis=1)

In [39]:
pd_to_spark = spark.createDataFrame(result)
display(pd_to_spark)

normalized_year,normalized_weekofyear,normalized_ndvi_ne,normalized_ndvi_nw,normalized_ndvi_se,normalized_ndvi_sw,normalized_precipitation_amt_mm,normalized_reanalysis_air_temp_k,normalized_reanalysis_avg_temp_k,normalized_reanalysis_dew_point_temp_k,normalized_reanalysis_max_air_temp_k,normalized_reanalysis_min_air_temp_k,normalized_reanalysis_precip_amt_kg_per_m2,normalized_reanalysis_relative_humidity_percent,normalized_reanalysis_sat_precip_amt_mm,normalized_reanalysis_specific_humidity_g_per_kg,normalized_reanalysis_tdtr_k,normalized_station_avg_temp_c,normalized_station_diur_temp_rng_c,normalized_station_max_temp_c,normalized_station_min_temp_c,normalized_station_precip_mm,normalized_num_city,normalized_month,normalized_day,predictions,year,weekofyear
0.0,0.3269230769230769,0.4611952691855928,0.2240938666834042,0.2160235685065268,0.1947188545459153,0.4641549455186081,0.5335788450319379,0.4095298563139293,0.5316390914979416,0.1823903209758258,0.7555551034432879,0.0841738581261891,0.419244598805857,0.4641549455186081,0.4788589493950825,0.1263736344790024,0.3878504693220208,0.2821798710733968,0.5446427890232632,0.6000000610351564,0.3547169667369914,1.0,0.2727272727272727,0.9655172413793104,6.585016250610352,2008.0,18.0
0.0,0.3461538461538461,0.4621290732256281,0.2316449809989106,0.1697304757546108,0.1599842369290857,0.0741703107335343,0.5312553184778627,0.4104122530539261,0.5128010533045176,0.1635212448408022,0.7777786820023157,0.0724286659711548,0.4025666066678319,0.0741703107335343,0.4608536835786359,0.0835164817249014,0.3130839783854102,0.1417586091599399,0.25,0.6400000610351563,0.1617924492314176,1.0,0.3636363636363636,0.1724137931034483,35.38242721557617,2008.0,19.0
0.0,0.3653846153846153,0.4792488067928631,0.3933585507719047,0.3242376805765426,0.1953231619665499,0.0216133228169994,0.6640229487821583,0.5092670844172361,0.6436611086788094,0.2515730947818398,0.7555551034432879,0.0136695417569976,0.4037762202987401,0.0216133228169994,0.5858789897444093,0.2263736147788254,0.6168223214006482,0.3503844827273982,0.4999999318804061,0.6879999389648438,0.0141509433962264,1.0,0.3636363636363636,0.4137931034482758,28.50784683227539,2008.0,20.0
0.0,0.3846153846153846,0.6115892892755594,0.2229708764515124,0.2643627064010389,0.2581450018701131,0.0,0.6957626522619352,0.5551630276757846,0.5138247646229712,0.3018867924528302,0.7925921404803249,0.0072992702312001,0.2448583260109492,0.0,0.4586282635700594,0.2197802405210704,0.6378502117565679,0.2086258894916956,0.5446427890232632,0.8159999694824219,0.0014150943958534,1.0,0.3636363636363636,0.6551724137931034,29.41876602172852,2008.0,21.0
0.0,0.4038461538461538,0.5397385353491895,0.2923249647048991,0.1254728902874951,0.166605363385591,0.0044880122266638,0.7079549650875001,0.5481000828295713,0.5650234550497343,0.2578608674823114,0.822221317997686,0.014465826587639,0.2771775315682526,0.0044880122266638,0.509811708281211,0.1582417628905776,0.5654205343533093,0.2848545783566051,0.5446427890232632,0.7279999389648438,0.3966981060100051,1.0,0.3636363636363636,0.896551724137931,6.366959095001221,2008.0,22.0
0.0,0.4230769230769231,0.4351525215173465,0.2106567493373299,0.2815307108711904,0.1806095506638973,0.420278718371001,0.706408703644041,0.5551630276757846,0.6444835776013277,0.2327040186468162,0.8444448965567138,0.0748175157168564,0.3519702822151218,0.420278718371001,0.5848674351950564,0.1054945120700619,0.6285045173705455,0.1056503063495706,0.4999999318804061,0.8640000000000001,0.1306603809572615,1.0,0.4545454545454545,0.0689655172413793,25.28208541870117,2008.0,23.0
0.0,0.4423076923076923,0.4348412522155978,0.2170945642391379,0.2821380716434518,0.3197056206696448,0.2892996437844974,0.7462766156741945,0.5895840423881361,0.6926067592380357,0.2641505595273193,0.8370370370370379,0.0434638367002977,0.3606549481084572,0.2892996437844974,0.6334209731319023,0.1285714320115511,0.5303736344941403,0.1872283651421799,0.3482143197740828,0.7279999389648438,0.4325471554162367,1.0,0.4545454545454545,0.3103448275862069,2.430901288986206,2008.0,24.0
0.0,0.4615384615384615,0.6115892892755594,0.3414846623531061,0.3092169647181897,0.241671074568319,0.1819416526865618,0.8033683918044849,0.6601946362190718,0.742370503885086,0.3333333333333333,0.8370370370370379,0.0238885196060555,0.3614328205902946,0.1819416526865618,0.6914828537246419,0.1857142962370554,0.7523362937706201,0.2754931698234514,0.6428572790963311,0.8159999694824219,0.0014150943958534,1.0,0.4545454545454545,0.5517241379310345,26.46112632751465,2008.0,25.0
0.0,0.4807692307692308,0.4920107909111245,0.3040195156276558,0.2111959290776588,0.2427746002121086,0.0473603428473111,0.8192403107388165,0.6654890166590522,0.7812846584904057,0.2704402515723272,0.911111111111112,0.0567352368330117,0.3773331710007042,0.0473603428473111,0.7281003180871469,0.1000000090687445,0.735981094632973,0.2059511822084873,0.4999999318804061,0.7759999694824219,0.135377362089337,1.0,0.4545454545454545,0.7931034482758621,12.803142547607422,2008.0,26.0
0.0,0.5,0.556201148995425,0.3699581795770217,0.3339621224629764,0.330267982570441,0.1034604963845734,0.8051089695255981,0.6540140881128553,0.8017632596941722,0.2578608674823114,0.9259268301504638,0.0394824141291648,0.4117697772976517,0.1034604963845734,0.7465100706693228,0.0571428563146435,0.703271008307156,0.054831180439631,0.3482143197740828,0.8640000000000001,0.0136792457328652,1.0,0.5454545454545454,0.0,22.84199333190918,2008.0,27.0


In [40]:
new_df = pd_to_spark.select(
   col('normalized_num_city').alias('city'),  
   col('year'),
   col('weekofyear'),
   col('predictions').alias("total_cases")
)

display(new_df)

city,year,weekofyear,total_cases
1.0,2008.0,18.0,6.585016250610352
1.0,2008.0,19.0,35.38242721557617
1.0,2008.0,20.0,28.50784683227539
1.0,2008.0,21.0,29.41876602172852
1.0,2008.0,22.0,6.366959095001221
1.0,2008.0,23.0,25.28208541870117
1.0,2008.0,24.0,2.430901288986206
1.0,2008.0,25.0,26.46112632751465
1.0,2008.0,26.0,12.803142547607422
1.0,2008.0,27.0,22.84199333190918


In [41]:
new_df.select('city').distinct().show()

In [42]:
def str_city(city):
  if(city == 1):
    return 'sj'
  elif(city == 0):
    return 'iq'

str_city = udf(str_city,StringType())

In [43]:
final_df3 = new_df.withColumn("city2", str_city('city')).drop('city')

In [44]:
display(final_df3)

year,weekofyear,total_cases,city2
2008.0,18.0,6.585016250610352,sj
2008.0,19.0,35.38242721557617,sj
2008.0,20.0,28.50784683227539,sj
2008.0,21.0,29.41876602172852,sj
2008.0,22.0,6.366959095001221,sj
2008.0,23.0,25.28208541870117,sj
2008.0,24.0,2.430901288986206,sj
2008.0,25.0,26.46112632751465,sj
2008.0,26.0,12.803142547607422,sj
2008.0,27.0,22.84199333190918,sj


In [45]:
submission_df12 = final_df3.select(
  col('city2').alias('city'),  
  col('year'),
  col('weekofyear'),
  col('total_cases')
)

In [46]:
display(submission_df12)

city,year,weekofyear,total_cases
sj,2008.0,18.0,6.585016250610352
sj,2008.0,19.0,35.38242721557617
sj,2008.0,20.0,28.50784683227539
sj,2008.0,21.0,29.41876602172852
sj,2008.0,22.0,6.366959095001221
sj,2008.0,23.0,25.28208541870117
sj,2008.0,24.0,2.430901288986206
sj,2008.0,25.0,26.46112632751465
sj,2008.0,26.0,12.803142547607422
sj,2008.0,27.0,22.84199333190918


In [47]:
submission_df12.printSchema()

In [48]:
 import math

In [49]:
def roundup(value):
  return math.ceil(value)

roundup = udf(roundup,FloatType())
submission_df12 = submission_df12.withColumn('total_cases2', roundup('total_cases'))


In [50]:
display(submission_df12)

city,year,weekofyear,total_cases,total_cases2
sj,2008.0,18.0,6.585016250610352,7.0
sj,2008.0,19.0,35.38242721557617,36.0
sj,2008.0,20.0,28.50784683227539,29.0
sj,2008.0,21.0,29.41876602172852,30.0
sj,2008.0,22.0,6.366959095001221,7.0
sj,2008.0,23.0,25.28208541870117,26.0
sj,2008.0,24.0,2.430901288986206,3.0
sj,2008.0,25.0,26.46112632751465,27.0
sj,2008.0,26.0,12.803142547607422,13.0
sj,2008.0,27.0,22.84199333190918,23.0


In [51]:
submission_df6 = submission_df12.drop('total_cases')
display(submission_df6)

city,year,weekofyear,total_cases2
sj,2008.0,18.0,7.0
sj,2008.0,19.0,36.0
sj,2008.0,20.0,29.0
sj,2008.0,21.0,30.0
sj,2008.0,22.0,7.0
sj,2008.0,23.0,26.0
sj,2008.0,24.0,3.0
sj,2008.0,25.0,27.0
sj,2008.0,26.0,13.0
sj,2008.0,27.0,23.0


In [52]:
submission_df7 = submission_df6.select(
  col('city'), 
  col('year'), 
  col('weekofyear'), 
  col('total_cases2').alias('total_cases')
)

In [53]:
display(submission_df7)

city,year,weekofyear,total_cases
sj,2008.0,18.0,7.0
sj,2008.0,19.0,36.0
sj,2008.0,20.0,29.0
sj,2008.0,21.0,30.0
sj,2008.0,22.0,7.0
sj,2008.0,23.0,26.0
sj,2008.0,24.0,3.0
sj,2008.0,25.0,27.0
sj,2008.0,26.0,13.0
sj,2008.0,27.0,23.0


In [54]:
submission_df7.printSchema()