In [1]:
import pyspark
from pyspark.sql import SparkSession
import numpy as np
import pandas as pd
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import (VectorAssembler, VectorIndexer)
from pyspark.sql.types import IntegerType

In [2]:
spark = SparkSession.builder.appName("spark").config('spark.driver.memory', '32g').config("hive.server2.thrift.port", 10000).config("spark.sql.hive.thriftServer.singleSession", True).enableHiveSupport().getOrCreate()

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/04/10 21:18:19 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
df1 = spark.read.format("csv").option("header", "true").option('inferSchema', "true").load("/Users/minoseah629/Repos/AIT614Project/Data_Preparation/Python/raw_data/raw.csv") 
df1.printSchema()



root
 |-- ID: integer (nullable = true)
 |-- SPEED: double (nullable = true)
 |-- TRAVEL_TIME: integer (nullable = true)
 |-- STATUS: integer (nullable = true)
 |-- DATA_AS_OF: string (nullable = true)
 |-- LINK_ID: integer (nullable = true)
 |-- LINK_POINTS: string (nullable = true)
 |-- ENCODED_POLY_LINE: string (nullable = true)
 |-- ENCODED_POLY_LINE_LVLS: string (nullable = true)
 |-- OWNER: string (nullable = true)
 |-- TRANSCOM_ID: integer (nullable = true)
 |-- BOROUGH: string (nullable = true)
 |-- LINK_NAME: string (nullable = true)



                                                                                

In [4]:
from pyspark.sql.functions import to_timestamp
df1 = df1.withColumn("DATA_AS_OF", to_timestamp("DATA_AS_OF", 'MM/dd/yyyy hh:mm:ss a'))
df1 = df1.withColumnRenamed('DATA_AS_OF','timedate')
df1.createOrReplaceTempView('dataset')

In [5]:
df2 = spark.sql('select SPEED, travel_time, timedate,LINK_ID,hour(timedate) `hour`,minute(timedate) `minute`,weekday(timedate) `weekday`,year(timedate) `year`,month(timedate) `month`,day(timedate) `day`,dayofyear(timedate) as `day_of_year` from dataset')
df2.show()

+-----+-----------+-------------------+-------+----+------+-------+----+-----+---+-----------+
|SPEED|travel_time|           timedate|LINK_ID|hour|minute|weekday|year|month|day|day_of_year|
+-----+-----------+-------------------+-------+----+------+-------+----+-----+---+-----------+
|44.11|         48|2021-09-22 17:08:11|4616204|  17|     8|      2|2021|    9| 22|        265|
|11.18|        515|2021-09-22 17:08:11|4616206|  17|     8|      2|2021|    9| 22|        265|
| 5.59|        917|2021-09-22 17:08:11|4616210|  17|     8|      2|2021|    9| 22|        265|
|24.23|        117|2021-09-22 17:08:11|4616211|  17|     8|      2|2021|    9| 22|        265|
|53.43|        140|2021-09-22 17:08:11|4616215|  17|     8|      2|2021|    9| 22|        265|
|  0.0|          0|2021-09-22 17:08:11|4616216|  17|     8|      2|2021|    9| 22|        265|
|  0.0|          0|2021-09-22 17:08:11|4616217|  17|     8|      2|2021|    9| 22|        265|
|50.33|         42|2021-09-22 17:08:11|4616218|  1

In [6]:
assembler = VectorAssembler(inputCols=['SPEED','hour','minute','weekday','year','month','day','day_of_year'], outputCol='features')

In [7]:
df_out = assembler.transform(df2)

df_out.show(5)

+-----+-----------+-------------------+-------+----+------+-------+----+-----+---+-----------+--------------------+
|SPEED|travel_time|           timedate|LINK_ID|hour|minute|weekday|year|month|day|day_of_year|            features|
+-----+-----------+-------------------+-------+----+------+-------+----+-----+---+-----------+--------------------+
|44.11|         48|2021-09-22 17:08:11|4616204|  17|     8|      2|2021|    9| 22|        265|[44.11,17.0,8.0,2...|
|11.18|        515|2021-09-22 17:08:11|4616206|  17|     8|      2|2021|    9| 22|        265|[11.18,17.0,8.0,2...|
| 5.59|        917|2021-09-22 17:08:11|4616210|  17|     8|      2|2021|    9| 22|        265|[5.59,17.0,8.0,2....|
|24.23|        117|2021-09-22 17:08:11|4616211|  17|     8|      2|2021|    9| 22|        265|[24.23,17.0,8.0,2...|
|53.43|        140|2021-09-22 17:08:11|4616215|  17|     8|      2|2021|    9| 22|        265|[53.43,17.0,8.0,2...|
+-----+-----------+-------------------+-------+----+------+-------+----+

In [12]:
from pyspark.sql.functions import col
clean_df = df_out.select(['features', 'travel_time'])
clean_df = df_out.select(['features', col('travel_time').alias('label')])
clean_df.show()

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[44.11,17.0,8.0,2...|   48|
|[11.18,17.0,8.0,2...|  515|
|[5.59,17.0,8.0,2....|  917|
|[24.23,17.0,8.0,2...|  117|
|[53.43,17.0,8.0,2...|  140|
|[0.0,17.0,8.0,2.0...|    0|
|[0.0,17.0,8.0,2.0...|    0|
|[50.33,17.0,8.0,2...|   42|
|[37.28,17.0,8.0,2...|   57|
|[0.0,17.0,8.0,2.0...|    0|
|[16.77,17.0,8.0,2...|  328|
|[16.77,17.0,8.0,2...|  318|
|[49.7,17.0,8.0,2....|   93|
|[49.7,17.0,8.0,2....|  150|
|[18.01,17.0,8.0,2...|  249|
|[31.68,17.0,8.0,2...|  174|
|[26.09,17.0,8.0,2...|   96|
|[14.29,17.0,8.0,2...|  607|
|[47.84,17.0,8.0,2...|   93|
|[47.84,17.0,8.0,2...|  156|
+--------------------+-----+
only showing top 20 rows



In [13]:
train, test = clean_df.randomSplit([0.7,0.3], seed= 41)

In [14]:
lr_model = LinearRegression(featuresCol='features',labelCol='label')

In [15]:
fit_model = lr_model.fit(train)

22/04/10 21:32:20 WARN Instrumentation: [c46dc89e] regParam is zero, which might cause numerical instability and overfitting.
22/04/10 21:32:28 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
22/04/10 21:32:28 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS
22/04/10 21:34:06 WARN InstanceBuilder$NativeLAPACK: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK
                                                                                

In [16]:
test_results = fit_model.evaluate(test)

                                                                                

In [17]:
test_results.r2

0.01809471959488307

In [18]:
assembler1 = VectorAssembler(inputCols=['travel_time','hour','minute','weekday','year','month','day','day_of_year'], outputCol='features')
df_out1 = assembler1.transform(df2)

df_out1.show(5)
from pyspark.sql.functions import col
clean_df1 = df_out1.select(['features', 'speed'])
clean_df1 = df_out1.select(['features', col('speed').alias('label')])
clean_df1.show()
train1, test1 = clean_df1.randomSplit([0.7,0.3], seed= 41)
lr_model1 = LinearRegression(featuresCol='features',labelCol='label')
fit_model1 = lr_model1.fit(train1)
test_results1 = fit_model1.evaluate(test1)
test_results1.r2

+-----+-----------+-------------------+-------+----+------+-------+----+-----+---+-----------+--------------------+
|SPEED|travel_time|           timedate|LINK_ID|hour|minute|weekday|year|month|day|day_of_year|            features|
+-----+-----------+-------------------+-------+----+------+-------+----+-----+---+-----------+--------------------+
|44.11|         48|2021-09-22 17:08:11|4616204|  17|     8|      2|2021|    9| 22|        265|[48.0,17.0,8.0,2....|
|11.18|        515|2021-09-22 17:08:11|4616206|  17|     8|      2|2021|    9| 22|        265|[515.0,17.0,8.0,2...|
| 5.59|        917|2021-09-22 17:08:11|4616210|  17|     8|      2|2021|    9| 22|        265|[917.0,17.0,8.0,2...|
|24.23|        117|2021-09-22 17:08:11|4616211|  17|     8|      2|2021|    9| 22|        265|[117.0,17.0,8.0,2...|
|53.43|        140|2021-09-22 17:08:11|4616215|  17|     8|      2|2021|    9| 22|        265|[140.0,17.0,8.0,2...|
+-----+-----------+-------------------+-------+----+------+-------+----+

22/04/10 21:42:40 WARN Instrumentation: [65979abd] regParam is zero, which might cause numerical instability and overfitting.
                                                                                

0.03312473743579136

In [1]:
spark.stop()

NameError: name 'spark' is not defined