# Milestone 1 - Group 18

In [1]:
import pandas as pd

import findspark
findspark.init()
from pyspark.sql import functions as sf
from pyspark.sql.functions import when, col, trim, to_timestamp
from pyspark.sql.types import IntegerType, StringType
from pyspark.sql.functions import lead, lag
from pyspark.sql.window import Window
import pyspark.sql.functions as func
from pyspark.sql.functions import udf
import matplotlib.pyplot as plt
import matplotlib as mp
import numpy as np
import pyspark
from pyspark.sql import SparkSession

# Obtain the notebook's Spark session: getOrCreate() returns the already
# active session when there is one and otherwise builds a new one.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

# Loading the Data

In [2]:
# Read the stock CSV with a header row and inferred column types, then rename
# its four columns positionally to Dummy / Date / Price / Volume.
raw_df = (
    spark.read
    .csv("MS1.csv", header=True, inferSchema=True)
    .toDF("Dummy", "Date", "Price", "Volume")
)

# Report the size of the raw data and the types Spark inferred.
print('Number of rows in raw data:', raw_df.count())

raw_df.printSchema()

Number of rows in raw data: 22928521
root
 |-- Dummy: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- Price: double (nullable = true)
 |-- Volume: string (nullable = true)



# View the Data

In [3]:
# Preview the first rows of the raw stock data (show() prints 20 rows by default).
raw_df.show()

+--------------------+----------+-----+------+
|               Dummy|      Date|Price|Volume|
+--------------------+----------+-----+------+
|32843.Nordamerika...|01/04/2016|18.52| 51616|
|32843.Nordamerika...|01/05/2016|19.15| 54898|
|32843.Nordamerika...|01/06/2016|19.71| 41555|
|32843.Nordamerika...|01/07/2016|19.17| 44430|
|32843.Nordamerika...|01/08/2016|18.94| 72673|
|32843.Nordamerika...|01/11/2016| 19.1| 45426|
|32843.Nordamerika...|01/12/2016|19.39| 61457|
|32843.Nordamerika...|01/13/2016|19.27| 61805|
|32843.Nordamerika...|01/14/2016|19.17| 35597|
|32843.Nordamerika...|01/15/2016|18.81| 69227|
|32843.Nordamerika...|01/18/2016|18.81| 69227|
|32843.Nordamerika...|01/19/2016|18.82| 23700|
|32843.Nordamerika...|01/20/2016|17.97| 41439|
|32843.Nordamerika...|01/21/2016|17.82| 35240|
|32843.Nordamerika...|01/22/2016|18.62| 79139|
|32843.Nordamerika...|01/25/2016|18.09| 89251|
|32843.Nordamerika...|01/26/2016|17.62| 75300|
|32843.Nordamerika...|01/27/2016|17.58| 43142|
|32843.Nordam

# Data Pre-processing

In [4]:
from pyspark.sql.functions import split
import pyspark.sql.functions as f
# Split "Dummy" on literal dots once and reuse the resulting array column:
# element 0 becomes "Number", element 1 becomes "Name".
dummy_parts = split(col("Dummy"), "[.]")
raw_df = (
    raw_df
    .withColumn("Number", dummy_parts.getItem(0))
    .withColumn("Name", dummy_parts.getItem(1))
)

In [5]:
# "Dummy" has been split into "Number" and "Name", so the combined column
# is no longer needed.
raw_df = raw_df.drop("Dummy")


In [6]:
# Inspect the frame after "Dummy" was split and dropped: print up to 100 rows.
raw_df.show(100)
# Last expression of the cell, so the total row count is displayed as its output.
raw_df.count()

+----------+-------+------+------+--------------------+
|      Date|  Price|Volume|Number|                Name|
+----------+-------+------+------+--------------------+
|01/04/2016|  18.52| 51616| 32843|Nordamerika_USA-N...|
|01/05/2016|  19.15| 54898| 32843|Nordamerika_USA-N...|
|01/06/2016|  19.71| 41555| 32843|Nordamerika_USA-N...|
|01/07/2016|  19.17| 44430| 32843|Nordamerika_USA-N...|
|01/08/2016|  18.94| 72673| 32843|Nordamerika_USA-N...|
|01/11/2016|   19.1| 45426| 32843|Nordamerika_USA-N...|
|01/12/2016|  19.39| 61457| 32843|Nordamerika_USA-N...|
|01/13/2016|  19.27| 61805| 32843|Nordamerika_USA-N...|
|01/14/2016|  19.17| 35597| 32843|Nordamerika_USA-N...|
|01/15/2016|  18.81| 69227| 32843|Nordamerika_USA-N...|
|01/18/2016|  18.81| 69227| 32843|Nordamerika_USA-N...|
|01/19/2016|  18.82| 23700| 32843|Nordamerika_USA-N...|
|01/20/2016|  17.97| 41439| 32843|Nordamerika_USA-N...|
|01/21/2016|  17.82| 35240| 32843|Nordamerika_USA-N...|
|01/22/2016|  18.62| 79139| 32843|Nordamerika_US

22928521

In [7]:
# Bare expression: displays the DataFrame's repr, i.e. its column names and types.
raw_df

DataFrame[Date: string, Price: double, Volume: string, Number: string, Name: string]

In [8]:
# Remove every row that contains a null in any column
# (dropna is the DataFrame-level alias of na.drop).
raw_df = raw_df.dropna(how="any")

# Row count after the removal, shown as the cell output.
raw_df.count()

22928520

# Loading Time Series Data

In [9]:
# Read the parking CSV using its header row for column names. No schema
# inference is requested, so every column is loaded as a string.
time_df = spark.read.csv("parking.csv", header=True)

# Report the size, a 10-row preview and the schema of the loaded data.
print('Number of rows in raw data:', time_df.count())
time_df.show(10)
time_df.printSchema()

Number of rows in raw data: 10626899
+--------------+--------+------------------+----------+----------+--------------+-----------------+------------+--------------+------------+------------+------------+-----------------------+------------------+------------------+---------------+-----------+--------------+------------+--------------+-------------------+----------------+---------------------------------+------------+-------------+-------------------+-------------------+-----------+------------+--------------------+--------------------------+--------------------+------------------+-------------+---------------------+------------+------------+--------------+-------------------+---------------------+---------------------------------+-----------------+------------------------+--------+---------+---------------+------------------+------------+----+----+----+
|Summons Number|Plate ID|Registration State|Plate Type|Issue Date|Violation Code|Vehicle Body Type|Vehicle Make|Issuing Agency|Street 

In [10]:
# Drop the last three columns (BIN, BBL, NTA) in a single call.
# A loop variable named `col` must not be used here: it would rebind the
# notebook-level `col` imported from pyspark.sql.functions to a plain string,
# so any later (or re-run) cell that calls col(...) would fail with
# "'str' object is not callable".
col_drop = ["BIN", "BBL", "NTA"]
time_df = time_df.drop(*col_drop)

# Print the new schema and the row count (dropping columns leaves the count unchanged).
time_df.printSchema()
time_df.count()

root
 |-- Summons Number: string (nullable = true)
 |-- Plate ID: string (nullable = true)
 |-- Registration State: string (nullable = true)
 |-- Plate Type: string (nullable = true)
 |-- Issue Date: string (nullable = true)
 |-- Violation Code: string (nullable = true)
 |-- Vehicle Body Type: string (nullable = true)
 |-- Vehicle Make: string (nullable = true)
 |-- Issuing Agency: string (nullable = true)
 |-- Street Code1: string (nullable = true)
 |-- Street Code2: string (nullable = true)
 |-- Street Code3: string (nullable = true)
 |-- Vehicle Expiration Date: string (nullable = true)
 |-- Violation Location: string (nullable = true)
 |-- Violation Precinct: string (nullable = true)
 |-- Issuer Precinct: string (nullable = true)
 |-- Issuer Code: string (nullable = true)
 |-- Issuer Command: string (nullable = true)
 |-- Issuer Squad: string (nullable = true)
 |-- Violation Time: string (nullable = true)
 |-- Time First Observed: string (nullable = true)
 |-- Violation County: str

10626899

In [11]:
# Rename "Issue Date" to "Date" so the parking data exposes the same
# column name as the stock data.
time_df = time_df.withColumnRenamed(existing="Issue Date", new="Date")

# Confirm the rename in the schema.
time_df.printSchema()

root
 |-- Summons Number: string (nullable = true)
 |-- Plate ID: string (nullable = true)
 |-- Registration State: string (nullable = true)
 |-- Plate Type: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- Violation Code: string (nullable = true)
 |-- Vehicle Body Type: string (nullable = true)
 |-- Vehicle Make: string (nullable = true)
 |-- Issuing Agency: string (nullable = true)
 |-- Street Code1: string (nullable = true)
 |-- Street Code2: string (nullable = true)
 |-- Street Code3: string (nullable = true)
 |-- Vehicle Expiration Date: string (nullable = true)
 |-- Violation Location: string (nullable = true)
 |-- Violation Precinct: string (nullable = true)
 |-- Issuer Precinct: string (nullable = true)
 |-- Issuer Code: string (nullable = true)
 |-- Issuer Command: string (nullable = true)
 |-- Issuer Squad: string (nullable = true)
 |-- Violation Time: string (nullable = true)
 |-- Time First Observed: string (nullable = true)
 |-- Violation County: string (n

# Merging the time series dataset with the stocks dataset

In [12]:
# Inner-join the stock rows with the parking rows on the shared "Date"
# column (both sides hold it as a string), then remove exact duplicate rows.
# NOTE(review): every stock row pairs with every parking row of the same
# date, so the result can be far larger than either input - confirm intended.
df = (
    raw_df
    .join(time_df, on="Date", how="inner")
    .distinct()
)

# Preview the first 10 joined rows.
df.show(10)

+----------+--------+-------+------+--------------------+--------------+--------+------------------+----------+--------------+-----------------+------------+--------------+------------+------------+------------+-----------------------+------------------+------------------+---------------+-----------+--------------+------------+--------------+-------------------+----------------+---------------------------------+------------+-------------+-------------------+-------------------+-----------+------------+--------------------+--------------------------+--------------------+------------------+-------------+---------------------+------------+------------+--------------+-------------------+---------------------+---------------------------------+-----------------+------------------------+--------+---------+---------------+------------------+------------+
|      Date|   Price| Volume|Number|                Name|Summons Number|Plate ID|Registration State|Plate Type|Violation Code|Vehicle Body Ty

In [13]:
# Print the column names and types of the joined DataFrame.
df.printSchema()

root
 |-- Date: string (nullable = true)
 |-- Price: double (nullable = true)
 |-- Volume: string (nullable = true)
 |-- Number: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Summons Number: string (nullable = true)
 |-- Plate ID: string (nullable = true)
 |-- Registration State: string (nullable = true)
 |-- Plate Type: string (nullable = true)
 |-- Violation Code: string (nullable = true)
 |-- Vehicle Body Type: string (nullable = true)
 |-- Vehicle Make: string (nullable = true)
 |-- Issuing Agency: string (nullable = true)
 |-- Street Code1: string (nullable = true)
 |-- Street Code2: string (nullable = true)
 |-- Street Code3: string (nullable = true)
 |-- Vehicle Expiration Date: string (nullable = true)
 |-- Violation Location: string (nullable = true)
 |-- Violation Precinct: string (nullable = true)
 |-- Issuer Precinct: string (nullable = true)
 |-- Issuer Code: string (nullable = true)
 |-- Issuer Command: string (nullable = true)
 |-- Issuer Squad: strin

In [None]:
# Show the joined rows that have no null in any column, without truncating
# cell values. The filtered result is only displayed; `df` is not reassigned.
df.na.drop(how="any").show(truncate=False)

In [None]:
# Preview the first 10 rows of the joined DataFrame.
df.show(10)

In [None]:
# Total number of rows in the joined DataFrame (triggers a full evaluation of the join).
df.count()