In [2]:
import pyspark
import pyspark.sql  as pyspark_sql
import pyspark.sql.types as pyspark_types
import pyspark.sql.functions  as pyspark_functions
from pyspark import SparkContext, SparkConf
from pandas import isnull
from numpy import count_nonzero
from pyspark.sql.functions import col, count, isnan, when, coalesce, lag, lead, sum
from pyspark.sql.window import Window

import warnings
warnings.filterwarnings("ignore")

In [3]:
# create the session
conf = SparkConf().set("spark.ui.port", "4050")

# create the context
sc = pyspark.SparkContext(conf=conf)
spark = pyspark_sql.SparkSession.builder.getOrCreate()

In [4]:
# Load the dataset
df = spark.read.csv("Dataset/col_mat_nuw_output.csv", header=True, inferSchema=True)

In [5]:
df.printSchema()

root
 |-- 0.00019698343957810148: double (nullable = true)
 |-- Colombo Proper: string (nullable = true)
 |-- 2019-01-01: date (nullable = true)
 |-- 2019-01-02: date (nullable = true)



In [8]:
df.show()

+----------------------+--------------+----------+----------+
|0.00019698343957810148|Colombo Proper|2019-01-01|2019-01-02|
+----------------------+--------------+----------+----------+
|  2.625522171968594...|Colombo Proper|2019-01-02|2019-01-03|
|  9.852118897938794E-5|Colombo Proper|2019-01-03|2019-01-04|
|  2.099320518114242E-4|Colombo Proper|2019-01-04|2019-01-05|
|  1.785337298892930...|Colombo Proper|2019-01-05|2019-01-06|
|  1.082296700235670...|Colombo Proper|2019-01-06|2019-01-07|
|  3.926829280477309...|Colombo Proper|2019-01-07|2019-01-08|
|  9.153156350685351E-5|Colombo Proper|2019-01-08|2019-01-09|
|  1.205978992853015...|Colombo Proper|2019-01-09|2019-01-10|
|  1.297723562983258...|Colombo Proper|2019-01-10|2019-01-11|
|  2.239188166801278...|Colombo Proper|2019-01-11|2019-01-12|
|  1.569418094178759...|Colombo Proper|2019-01-12|2019-01-13|
|                  NULL|Colombo Proper|2019-01-13|2019-01-14|
|  1.336291906862603...|Colombo Proper|2019-01-14|2019-01-15|
|  6.374

In [9]:
column_name = ['HCHO reading', 'Location', 'Current Date', 'Next Date']

# Rename columns using withColumnRenamed()
for i, new_name in enumerate(column_name):
    df = df.withColumnRenamed(df.columns[i], new_name)

# Display the DataFrame
df.show()

+--------------------+--------------+------------+----------+
|        HCHO reading|      Location|Current Date| Next Date|
+--------------------+--------------+------------+----------+
|2.625522171968594...|Colombo Proper|  2019-01-02|2019-01-03|
|9.852118897938794E-5|Colombo Proper|  2019-01-03|2019-01-04|
|2.099320518114242E-4|Colombo Proper|  2019-01-04|2019-01-05|
|1.785337298892930...|Colombo Proper|  2019-01-05|2019-01-06|
|1.082296700235670...|Colombo Proper|  2019-01-06|2019-01-07|
|3.926829280477309...|Colombo Proper|  2019-01-07|2019-01-08|
|9.153156350685351E-5|Colombo Proper|  2019-01-08|2019-01-09|
|1.205978992853015...|Colombo Proper|  2019-01-09|2019-01-10|
|1.297723562983258...|Colombo Proper|  2019-01-10|2019-01-11|
|2.239188166801278...|Colombo Proper|  2019-01-11|2019-01-12|
|1.569418094178759...|Colombo Proper|  2019-01-12|2019-01-13|
|                NULL|Colombo Proper|  2019-01-13|2019-01-14|
|1.336291906862603...|Colombo Proper|  2019-01-14|2019-01-15|
|6.37441

In [10]:
df1 = spark.read.csv("Dataset/kan_output.csv", header=True, inferSchema=True)
df1.printSchema()

root
 |-- 0.00017607134598773356: double (nullable = true)
 |-- Kandy Proper: string (nullable = true)
 |-- 2019-01-01: date (nullable = true)
 |-- 2019-01-02: date (nullable = true)



In [11]:
df1.show()

+----------------------+------------+----------+----------+
|0.00017607134598773356|Kandy Proper|2019-01-01|2019-01-02|
+----------------------+------------+----------+----------+
|  9.220391253917748E-5|Kandy Proper|2019-01-02|2019-01-03|
|                  NULL|Kandy Proper|2019-01-03|2019-01-04|
|  1.908681983853839...|Kandy Proper|2019-01-04|2019-01-05|
|  1.219517840206744...|Kandy Proper|2019-01-05|2019-01-06|
|  -6.51408612938880...|Kandy Proper|2019-01-06|2019-01-07|
|    1.6323820639265E-4|Kandy Proper|2019-01-07|2019-01-08|
|  -6.73520553391426...|Kandy Proper|2019-01-08|2019-01-09|
|  1.279693658243135...|Kandy Proper|2019-01-09|2019-01-10|
|  4.546048424126012E-5|Kandy Proper|2019-01-10|2019-01-11|
|  3.600074175192105E-5|Kandy Proper|2019-01-11|2019-01-12|
|  1.286629698010177E-4|Kandy Proper|2019-01-12|2019-01-13|
|                  NULL|Kandy Proper|2019-01-13|2019-01-14|
|                  NULL|Kandy Proper|2019-01-14|2019-01-15|
|   9.63639634671553E-5|Kandy Proper|201