### Objective

To import the Amazon dataset into the spark environment

Do minimum neccary Data analysis of the data

Create features, labels dataset using the pipeline, including the transformer, estimator and finally evaluator

Write the final model and the prediction to the database

In [1]:
#Starting with import of pyspark and related modules

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

import warnings
warnings.filterwarnings("ignore")

from pyspark.ml import *

In [2]:
mllibPath = "mllib/"
externalData = "externalData/"

In [3]:
import shutil

shutil.unpack_archive(externalData+"amazon-business-research-analyst-dataset.zip",extract_dir=externalData)

In [3]:
from pyspark.sql import SparkSession
    
sparkSQL = SparkSession.builder.appName('amazonRA') \
        .config('spark.jars',"/usr/share/java/postgresql-42.2.26.jar") \
        .getOrCreate()

22/11/29 05:17:15 WARN Utils: Your hostname, codeStation resolves to a loopback address: 127.0.1.1; using 172.17.0.1 instead (on interface docker0)
22/11/29 05:17:15 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
22/11/29 05:17:17 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [4]:
sparkReader = sparkSQL.read

In [5]:
sparkContext = sparkSQL.sparkContext

In [6]:
%%sh
cd externalData/
ls

aidData.csv
amazon-business-research-analyst-dataset.zip
athletes.csv
bls-industry-unemployment.csv
bls-metro-unemployment.csv
bseScripts.json
cleaned_test.csv
Common_FMCG_Labelled_Quarter_Results.csv
companyManagement.csv
companyMarketData.csv
Electricity generation.csv
encoded_cleaned_test.csv
googleMerchPurchases.csv
Laser_Report_2020.xlsx
nyc-collisions-2019-reduced.csv
protests.csv
purchase_data.csv
Quora_answers.csv
selected-indicators-from-world-bank-20002019.zip
skillshare-top-1000-course.zip
stocks.csv
top-pypi-packages-30-days.json
unemployment.csv
updated.csv
us-congress-members.csv
vizheads.csv


In [17]:
cleaned_RA = sparkReader.csv(externalData+"cleaned_test.csv",
                                  inferSchema=True,
                                  header=True,
                                 sep=',')

In [18]:
cleaned_RA.printSchema()

root
 |-- ID: string (nullable = true)
 |-- Delivery_person_ID: string (nullable = true)
 |-- Delivery_person_Age: double (nullable = true)
 |-- Delivery_person_Ratings: double (nullable = true)
 |-- Restaurant_latitude: double (nullable = true)
 |-- Restaurant_longitude: double (nullable = true)
 |-- Delivery_location_latitude: double (nullable = true)
 |-- Delivery_location_longitude: double (nullable = true)
 |-- Order_Date: string (nullable = true)
 |-- Time_Orderd: string (nullable = true)
 |-- Time_Order_picked: string (nullable = true)
 |-- Weather: string (nullable = true)
 |-- Road_traffic_density: string (nullable = true)
 |-- Vehicle_condition: integer (nullable = true)
 |-- Type_of_order: string (nullable = true)
 |-- Type_of_vehicle: string (nullable = true)
 |-- multiple_deliveries: double (nullable = true)
 |-- Festival: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Name:: string (nullable = true)



In [26]:
deliveryPersonFact = cleaned_RA.select("Delivery_person_ID","Delivery_person_Age",
                 "Delivery_person_Ratings","Type_of_Vehicle")

In [68]:
ordersFact = cleaned_RA.select("Restaurant_latitude","Restaurant_longitude",
                              "Delivery_location_latitude","Delivery_location_longitude",
                              "Order_Date","Time_Orderd","Time_Order_picked","Type_of_order",
                              "multiple_deliveries","Road_traffic_density","Weather")

In [61]:
ordersFact.select("Type_of_order","Road_traffic_density","Weather") \
            .show()

+-------------+--------------------+-------+
|Type_of_order|Road_traffic_density|Weather|
+-------------+--------------------+-------+
|       Drinks|                 NaN|    NaN|
|        Snack|                 Jam|  Windy|
|       Drinks|                 Jam| Stormy|
|         Meal|              Medium|    Fog|
|       Drinks|              Medium|  Sunny|
|       Drinks|                 Low|    Fog|
|       Buffet|                 Low|  Windy|
|         Meal|              Medium|  Windy|
|        Snack|                 Jam| Cloudy|
|         Meal|                 Jam|    Fog|
|        Snack|                High| Stormy|
|       Drinks|                 Low|  Windy|
|        Snack|                 Low|  Sunny|
|       Drinks|                 Low|  Windy|
|         Meal|                 Jam|    Fog|
|        Snack|                 Jam| Cloudy|
|       Buffet|                 Jam|  Windy|
|        Snack|                 Jam|  Sunny|
|        Snack|              Medium| Cloudy|
|       Dr

### Objective

To learn about the various transformers available in pyspark and implement a couple in Amazon analysis

Learn about the multiple ML algorithms, and use atleast Regression, Classification and Clustering on Amazon RA data

In [27]:
deliveryPersonFact.show(2)

+------------------+-------------------+-----------------------+----------------+
|Delivery_person_ID|Delivery_person_Age|Delivery_person_Ratings| Type_of_Vehicle|
+------------------+-------------------+-----------------------+----------------+
|   COIMBRES13DEL01|                NaN|                    NaN|electric_scooter|
|    BANGRES15DEL01|               28.0|                    4.6|      motorcycle|
+------------------+-------------------+-----------------------+----------------+
only showing top 2 rows



In [35]:
#lets first use the imputer to update the Null values.
from pyspark.ml.feature import Imputer

deliveryImputer = Imputer(inputCols=["Delivery_person_Age",
                                     "Delivery_person_Ratings"],
                         outputCols=["DeliveryPersonAge",
                                    "DeliveryPersonRatings"],
                         strategy="mean")

deliveryModel = deliveryImputer.fit(deliveryPersonFact)

In [45]:
deliveryPersonNullRemoved = deliveryModel.transform(deliveryPersonFact). \
                drop("Delivery_person_Age","Delivery_person_Ratings")

In [46]:
deliveryPersonNullRemoved = deliveryPersonNullRemoved.select("Delivery_person_ID","Type_of_Vehicle",
                                round("DeliveryPersonAge",1).alias("DeliveryPersonAge"),
                                round("DeliveryPersonRatings",1).alias("DeliveryPersonRatings"))

In [47]:
deliveryPersonNullRemoved.show(5)

+------------------+----------------+-----------------+---------------------+
|Delivery_person_ID| Type_of_Vehicle|DeliveryPersonAge|DeliveryPersonRatings|
+------------------+----------------+-----------------+---------------------+
|   COIMBRES13DEL01|electric_scooter|             29.5|                  4.6|
|    BANGRES15DEL01|      motorcycle|             28.0|                  4.6|
|     JAPRES09DEL03|      motorcycle|             23.0|                  4.5|
|     JAPRES07DEL03|         scooter|             21.0|                  4.8|
|    CHENRES19DEL01|         scooter|             31.0|                  4.6|
+------------------+----------------+-----------------+---------------------+
only showing top 5 rows



In [51]:
#Next we will work on encoding the type of vehicles
deliveryPersonNullRemoved.select("Type_of_vehicle").distinct().show()

+----------------+
| Type_of_vehicle|
+----------------+
|      motorcycle|
|         scooter|
|electric_scooter|
|         bicycle|
+----------------+



In [56]:
#There are three modules to use, OneHotEncoder, VectorIndexer, StringIndexer
from pyspark.ml.feature import OneHotEncoder, VectorIndexer, StringIndexer
stringIndexModel = StringIndexer(inputCol="Type_of_Vehicle",outputCol="EncodedCategories")
delivery_stringIndexed = stringIndexModel.fit(deliveryPersonNullRemoved).transform(deliveryPersonNullRemoved)
delivery_stringIndexed.show(2)

                                                                                

+------------------+----------------+-----------------+---------------------+-----------------+
|Delivery_person_ID| Type_of_Vehicle|DeliveryPersonAge|DeliveryPersonRatings|EncodedCategories|
+------------------+----------------+-----------------+---------------------+-----------------+
|   COIMBRES13DEL01|electric_scooter|             29.5|                  4.6|              2.0|
|    BANGRES15DEL01|      motorcycle|             28.0|                  4.6|              0.0|
+------------------+----------------+-----------------+---------------------+-----------------+
only showing top 2 rows



In [57]:
delivery_stringIndexed.select("EncodedCategories").distinct().show()

+-----------------+
|EncodedCategories|
+-----------------+
|              0.0|
|              1.0|
|              3.0|
|              2.0|
+-----------------+



In [71]:
stringIndexOrder = StringIndexer(inputCols=["Weather","Road_traffic_density",
                                            "Type_of_order","multiple_deliveries"]
                                 ,outputCols=["Weather_idx","Road_traffic_density_idx",
                                            "Type_of_order_idx","multiple_deliveries_idx"])
order_stringIndexed = stringIndexOrder.fit(ordersFact).transform(ordersFact)

In [74]:
order_stringIndexed = order_stringIndexed.drop("Weather","Road_traffic_density",
                                            "Type_of_order","multiple_deliveries")

In [86]:
help(cast)

Help on function cast in module typing:

cast(typ, val)
    Cast a value to a type.
    
    This returns the value unchanged.  To the type checker this
    signals that the return value has the designated type, but at
    runtime we intentionally don't check anything (we want this
    to be as fast as possible).



In [89]:
order_stringIndexed.select("Time_Order_picked","Time_orderd",
                           col("Time_Order_picked").cast("timestamp") - col("Time_orderd").cast("timestamp")).show(5)

+-----------------+-----------+-----------------------------------------------------------------------+
|Time_Order_picked|Time_orderd|(CAST(Time_Order_picked AS TIMESTAMP) - CAST(Time_orderd AS TIMESTAMP))|
+-----------------+-----------+-----------------------------------------------------------------------+
|            15:05|        NaN|                                                                   null|
|            20:35|      20:30|                                                   INTERVAL '0 00:05...|
|            19:45|      19:35|                                                   INTERVAL '0 00:10...|
|            17:20|      17:15|                                                   INTERVAL '0 00:05...|
|            18:40|      18:25|                                                   INTERVAL '0 00:15...|
+-----------------+-----------+-----------------------------------------------------------------------+
only showing top 5 rows



In [81]:
order_stringIndexed. \
        select(date_format("Time_orderd",'HH:MM').alias("timeOrderFormated"),
              date_format("Time_Order_picked",'HH:MM').alias("OrderPickedFormated")).show()

+-----------------+-------------------+
|timeOrderFormated|OrderPickedFormated|
+-----------------+-------------------+
|             null|              15:11|
|            20:11|              20:11|
|            19:11|              19:11|
|            17:11|              17:11|
|            18:11|              18:11|
|            09:11|              09:11|
|             null|              10:11|
|             null|              18:11|
|            21:11|              21:11|
|            20:11|              20:11|
|            14:11|              14:11|
|            23:11|              23:11|
|            22:11|              22:11|
|            23:11|              23:11|
|            21:11|              21:11|
|            20:11|              20:11|
|            20:11|              20:11|
|            21:11|              22:11|
|            18:11|              18:11|
|            21:11|              21:11|
+-----------------+-------------------+
only showing top 20 rows



In [9]:
def writingCSVFiletoDatabase(session, csvFile,dbName,dbTableName):
    
    fileSparkDF = session.read.csv(csvFile,inferSchema=True,header=True)
    try:
        fileSparkDF.write \
                    .format('jdbc') \
                    .option("url", f"jdbc:postgresql://localhost:5432/{dbName}") \
                    .option('dbtable', dbTableName) \
                    .option('user','postgres') \
                    .option('password', 1234) \
                    .option('driver','org.postgresql.Driver') \
                    .save(mode='overwrite')
        print('Write Complete')
    except Exception as e:
        print(f'Write errored out due to {e}')
    

In [10]:
def writingSparkDFtoDatabase(session,sparkDF,dbName,dbTableName):
    
    try:
        sparkDF.write \
                    .format('jdbc') \
                    .option("url", f"jdbc:postgresql://localhost:5432/{dbName}") \
                    .option('dbtable', dbTableName) \
                    .option('user','postgres') \
                    .option('password', 1234) \
                    .option('driver','org.postgresql.Driver') \
                    .save(mode='overwrite')
        print('Write Complete')
    except Exception as e:
        print(f'Write errored out due to {e}')
    

                                                                                

Write Complete
