# Calculating the distance between the Customer's city and the Seller's city

In [1]:
from pyspark.sql import SparkSession, functions as F
import math

In [2]:
# Reuse the already-active SparkSession if there is one; otherwise create a
# new session with the default builder configuration.
spark = SparkSession.builder.getOrCreate()

In [3]:
# Load the four Olist tables used below. Each CSV is read with '"' as both the
# quote and the escape character, a header row, multiLine=True (so quoted
# fields may span several lines) and schema inference.
orders_items_df = (
    spark.read
    .options(quote='"', escape='"')
    .csv('./dataset/olist_order_items_dataset.csv', header=True, multiLine=True, inferSchema=True)
)

orders_df = (
    spark.read
    .options(quote='"', escape='"')
    .csv('./dataset/olist_orders_dataset.csv', header=True, multiLine=True, inferSchema=True)
)

customers_df = (
    spark.read
    .options(quote='"', escape='"')
    .csv('./dataset/olist_customers_dataset.csv', header=True, multiLine=True, inferSchema=True)
)

sellers_df = (
    spark.read
    .options(quote='"', escape='"')
    .csv('./dataset/olist_sellers_dataset.csv', header=True, multiLine=True, inferSchema=True)
)



In [4]:
# Load the geolocation table with the same CSV settings as the other Olist
# files: '"' for quote and escape, header row, multi-line fields, inferred schema.
geo_df = (
    spark.read
    .options(quote='"', escape='"')
    .csv('./dataset/olist_geolocation_dataset.csv', header=True, multiLine=True, inferSchema=True)
)

# Joining and grouping data

In [5]:
# Keep only orders whose status is 'delivered' and attach the customer record
# to each of them (join on customer_id; no `how` is given, so this is Spark's
# default inner join).
data_df = orders_df.filter(F.col('order_status') == 'delivered').join(customers_df, 'customer_id')

# One row per order item: attach the delivered order/customer data and the
# seller record, then keep only the location prefixes and the freight value.
data_df = orders_items_df.join(data_df, 'order_id') \
                         .join(sellers_df, 'seller_id') \
                         .select('customer_state', 'customer_city', 'customer_zip_code_prefix', 'seller_zip_code_prefix', 'freight_value')

# Collapse the geolocation table to a single (lat, lng) pair per zip code prefix.
# NOTE(review): min(lat) and min(lng) are computed independently, so the pair
# is not necessarily an actual point from the dataset — confirm this is the
# intended representative coordinate (an average may be a better centroid).
geo_df = geo_df.groupBy('geolocation_zip_code_prefix').agg(F.min('geolocation_lat').alias('geolocation_lat'), F.min('geolocation_lng').alias('geolocation_lng'))

# Look up coordinates twice against the same geo_df: first for the customer's
# zip prefix, then for the seller's. Both joins are inner joins, so rows whose
# prefix has no geolocation entry are dropped. Inside this expression `data_df`
# still refers to the frame built above, because the name is only re-bound
# once the whole chain has been evaluated.
data_df = data_df.join(geo_df, data_df.customer_zip_code_prefix == geo_df.geolocation_zip_code_prefix) \
                 .select(F.col('geolocation_lat').alias('customer_lat'), F.col('geolocation_lng').alias('customer_lng'), 'seller_zip_code_prefix', 'freight_value') \
                 .join(geo_df, data_df.seller_zip_code_prefix == geo_df.geolocation_zip_code_prefix) \
                 .select('customer_lat', 'customer_lng', F.col('geolocation_lat').alias('seller_lat'), F.col('geolocation_lng').alias('seller_lng'),'freight_value')
# Number of order items that have coordinates for both customer and seller.
data_df.count()


109661

In [6]:
# Preview the customer/seller coordinates next to the freight value.
data_df.show()

+-------------------+-------------------+-------------------+-------------------+-------------+
|       customer_lat|       customer_lng|         seller_lat|         seller_lng|freight_value|
+-------------------+-------------------+-------------------+-------------------+-------------+
| -23.50648246805157|-47.422068081741564|-23.545262137111173| -46.66134804356862|        14.43|
| -23.82558722913311| -46.56982049999999| -23.51441473688614| -46.59097058895492|         9.34|
|-21.213665497085813| -47.81670447259758| -23.51441473688614| -46.59097058895492|        11.74|
|-21.445954952757404| -50.12641249999996| -23.51441473688614| -46.59097058895492|         3.07|
|-21.445954952757404| -50.12641249999996| -23.51441473688614| -46.59097058895492|         3.06|
|-23.635655999999997|   -46.751535578894| -23.51441473688614| -46.59097058895492|         9.34|
| -23.49878075214959|-46.632511331380975| -23.51441473688614| -46.59097058895492|         9.34|
|-22.970853233039268|-43.671131559512865

# Calculating distance

In [7]:
def d(c_lat, c_lng, s_lat, s_lng, radius=6371):
    """Great-circle distance between a customer and a seller (haversine formula).

    Args:
        c_lat, c_lng: Customer latitude and longitude in decimal degrees.
        s_lat, s_lng: Seller latitude and longitude in decimal degrees.
        radius: Radius of the sphere. The default, 6371, is the mean Earth
            radius in kilometres, so the result is in kilometres.

    Returns:
        The distance as a float in the unit of ``radius``, or ``None`` when
        any coordinate is ``None`` (Spark passes SQL NULL to a Python UDF as
        ``None``; returning ``None`` yields NULL instead of failing the job).
    """
    if any(value is None for value in (c_lat, c_lng, s_lat, s_lng)):
        return None

    dlat = math.radians(s_lat - c_lat)
    dlon = math.radians(s_lng - c_lng)

    # Haversine of the central angle between the two points.
    a = math.sin(dlat / 2) * math.sin(dlat / 2) + math.cos(math.radians(c_lat)) \
        * math.cos(math.radians(s_lat)) * math.sin(dlon / 2) * math.sin(dlon / 2)
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make sqrt(1 - a) raise ValueError.
    a = min(1.0, max(0.0, a))

    central_angle = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    return radius * central_angle

# Declare the UDF's return type explicitly. Without it F.udf falls back to
# StringType, which would store the numeric distance as text and force a cast
# before any numeric statistic can be computed on it.
distance = F.udf(d, 'double')

# Add the customer-to-seller distance as a new column (replaces an existing
# `distance` column if the cell is re-run).
data_df = data_df.withColumn('distance', distance('customer_lat', 'customer_lng', 'seller_lat', 'seller_lng'))

data_df.show()

+-------------------+-------------------+-------------------+-------------------+-------------+------------------+
|       customer_lat|       customer_lng|         seller_lat|         seller_lng|freight_value|          distance|
+-------------------+-------------------+-------------------+-------------------+-------------+------------------+
| -23.50648246805157|-47.422068081741564|-23.545262137111173| -46.66134804356862|        14.43| 77.67691920579136|
| -23.82558722913311| -46.56982049999999| -23.51441473688614| -46.59097058895492|         9.34|34.667779619845135|
|-21.213665497085813| -47.81670447259758| -23.51441473688614| -46.59097058895492|        11.74| 285.1904555969043|
|-21.445954952757404| -50.12641249999996| -23.51441473688614| -46.59097058895492|         3.07| 429.9117657164478|
|-21.445954952757404| -50.12641249999996| -23.51441473688614| -46.59097058895492|         3.06| 429.9117657164478|
|-23.635655999999997|   -46.751535578894| -23.51441473688614| -46.59097058895492

In [8]:
# Make sure `distance` is a double so that numeric statistics (the correlation
# computed below) can be applied to it; this is a no-op if it already is one.
data_df = data_df.withColumn('distance', F.col('distance').cast('double'))

In [9]:
# Check the column types of data_df, in particular that `distance` is numeric.
data_df.printSchema()

root
 |-- customer_lat: double (nullable = true)
 |-- customer_lng: double (nullable = true)
 |-- seller_lat: double (nullable = true)
 |-- seller_lng: double (nullable = true)
 |-- freight_value: double (nullable = true)
 |-- distance: double (nullable = true)



In [10]:
# Correlation between shipping distance and freight value
# (DataFrame.stat.corr computes the Pearson correlation coefficient).
data_df.stat.corr('distance','freight_value')

0.3894926307458334