In [1]:
import pandas as pd
import numpy as np
import os
from pathlib import Path 
import missingno as msno
from tabulate import tabulate
from statistics import median, mean, quantiles
import pprint
import matplotlib.pyplot as plt 
import matplotlib.cm as cm
%matplotlib inline
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from datetime import datetime, timedelta
from sklearn.cluster import KMeans
from geopy.geocoders import Nominatim
import geopy.distance
import folium
import geopy.distance
plt.style.use('ggplot')
#pd.set_option('max_columns', 200)

In [2]:
# Build a summary table describing every column of a DataFrame.
def tstats(data):
    """Print per-column summary statistics of ``data`` as a psql-style table.

    For every column the table lists its name, its dtype, the number of
    non-null values, the percentage of non-null values (rounded to one
    decimal) and the number of distinct values reported by ``nunique()``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to describe.

    Returns
    -------
    None
        The table is printed; nothing is returned.
    """
    # These labels are part of the printed output and are kept verbatim.
    headers = ['nom colonne', 'dtype', 'valeur non null', "% de non null", 'nb_unique']
    n_rows = len(data)

    rows = []
    for col in data.columns:
        non_null = data[col].count()
        # A frame with zero rows has no meaningful ratio: report NaN
        # instead of dividing by zero.
        non_null_pct = (non_null / n_rows) * 100 if n_rows else float('nan')
        rows.append([
            col,
            str(data[col].dtype),
            non_null,
            round(non_null_pct, 1),
            data[col].nunique(),
        ])

    # Passing the labels to the constructor (rather than assigning
    # ``.columns`` afterwards) avoids a length-mismatch ValueError when
    # ``data`` has no columns at all.
    df_stats = pd.DataFrame(rows, columns=headers)

    print(tabulate(df_stats, headers='keys', tablefmt='psql'))
    
    
def diff_in_hours(date1, date2):
    """Return ``date1 - date2`` expressed as a number of hours.

    The difference is divided by a one-hour ``pd.Timedelta``, so the result
    is fractional for partial hours and negative when ``date1`` precedes
    ``date2``.
    """
    one_hour = pd.Timedelta('1 hour')
    elapsed = date1 - date2
    return elapsed / one_hour


def filter_geo_brazil_lat(latitudes, lat_min=-33.75, lat_max=5.5):
    """Replace latitudes outside ``[lat_min, lat_max]`` with the overall mean.

    Parameters
    ----------
    latitudes : sized iterable of float
        Latitudes to check (a list or a pandas Series both work).
    lat_min, lat_max : float, optional
        Inclusive bounds of the accepted range. The defaults are the bounds
        this function has always used.

    Returns
    -------
    list of float
        Same length and order as ``latitudes``; every value outside the
        range is replaced by the replacement value described below.

    Notes
    -----
    The replacement value is the mean of *all* inputs, out-of-range values
    included. A NaN input fails the range check and is therefore replaced,
    but it also turns the mean itself into NaN.
    """
    n_values = len(latitudes)
    # Nothing to filter; also avoids a ZeroDivisionError when computing the mean.
    if n_values == 0:
        return []

    default_lat = sum(latitudes) / n_values
    return [lat if lat_min <= lat <= lat_max else default_lat for lat in latitudes]


def filter_geo_brazil_lng(longitudes, lng_min=-74.5, lng_max=-34.5):
    """Replace longitudes outside ``[lng_min, lng_max]`` with the overall mean.

    Parameters
    ----------
    longitudes : sized iterable of float
        Longitudes to check (a list or a pandas Series both work).
    lng_min, lng_max : float, optional
        Inclusive bounds of the accepted range. The defaults are the bounds
        this function has always used.

    Returns
    -------
    list of float
        Same length and order as ``longitudes``; every value outside the
        range is replaced by the replacement value described below.

    Notes
    -----
    The replacement value is the mean of *all* inputs, out-of-range values
    included. A NaN input fails the range check and is therefore replaced,
    but it also turns the mean itself into NaN.
    """
    n_values = len(longitudes)
    # Nothing to filter; also avoids a ZeroDivisionError when computing the mean.
    if n_values == 0:
        return []

    default_lng = sum(longitudes) / n_values
    return [lon if lng_min <= lon <= lng_max else default_lng for lon in longitudes]

In [22]:
# The CSV files are read from a "data" folder that sits next to the current
# working directory, i.e. <cwd>/../data.
path = Path(os.getcwd())
p_parent = path.parent
data_dir = p_parent / 'data'

# Paths are assembled with pathlib so the notebook also runs where the
# separator is not a backslash, and so no string relies on unrecognised
# escapes such as "\d" or "\o". They are converted back to str so the
# p_* variables keep the type they had before.
p_customer = str(data_dir / 'olist_customers_dataset.csv')
p_geo = str(data_dir / 'olist_geolocation_dataset.csv')
p_order_item = str(data_dir / 'olist_order_items_dataset.csv')
p_order_payment = str(data_dir / 'olist_order_payments_dataset.csv')
p_order_review = str(data_dir / 'olist_order_reviews_dataset.csv')
p_orders = str(data_dir / 'olist_orders_dataset.csv')
p_products = str(data_dir / 'olist_products_dataset.csv')
p_sellers = str(data_dir / 'olist_sellers_dataset.csv')
p_category = str(data_dir / 'product_category_name_translation_expanded.csv')

# One DataFrame per CSV file.
customer = pd.read_csv(p_customer)
geo = pd.read_csv(p_geo)
order_item = pd.read_csv(p_order_item)
order_payment = pd.read_csv(p_order_payment)
order_review = pd.read_csv(p_order_review)
orders = pd.read_csv(p_orders)
products = pd.read_csv(p_products)
sellers = pd.read_csv(p_sellers)
category = pd.read_csv(p_category)


# Combine every table into one wide frame with successive left joins,
# starting from `orders` so that every order row is kept.
df = (
    orders
    .merge(customer, on="customer_id", how="left")
    .merge(order_item, on="order_id", how="left")
    .merge(order_payment, on="order_id", how="left")
    .merge(order_review, on="order_id", how="left")
    .merge(sellers, on="seller_id", how="left")
    .merge(products, on="product_id", how="left")
    .merge(category, on="product_category_name", how="left")
)

In [23]:
# Print, for each column of the merged frame, its dtype, non-null count and
# percentage, and number of distinct values.
tstats(df)

+----+-------------------------------+---------+-------------------+-----------------+-------------+
|    | nom colonne                   | dtype   |   valeur non null |   % de non null |   nb_unique |
|----+-------------------------------+---------+-------------------+-----------------+-------------|
|  0 | order_id                      | object  |            119143 |           100   |       99441 |
|  1 | customer_id                   | object  |            119143 |           100   |       99441 |
|  2 | order_status                  | object  |            119143 |           100   |           8 |
|  3 | order_purchase_timestamp      | object  |            119143 |           100   |       98875 |
|  4 | order_approved_at             | object  |            118966 |            99.9 |       90733 |
|  5 | order_delivered_carrier_date  | object  |            117057 |            98.2 |       81018 |
|  6 | order_delivered_customer_date | object  |            115722 |            97.1 |     

In [24]:
# Separate frames are used to derive one (lat, lng) point per seller and per
# unique customer before attaching them to `df`.

def _attach_brazil_coords(entities, zip_col):
    """Inner-join `entities` with `geo` on the zip prefix, then pass both coordinate columns through the Brazil filters."""
    located = entities.merge(
        geo,
        left_on=zip_col,
        right_on='geolocation_zip_code_prefix',
        how='inner',
    )
    located['geolocation_lat'] = filter_geo_brazil_lat(located['geolocation_lat'])
    located['geolocation_lng'] = filter_geo_brazil_lng(located['geolocation_lng'])
    return located


def _mean_coords(located, key, prefix):
    """Average the coordinates per `key`; output columns are `<prefix>_lat` and `<prefix>_lng`."""
    return (
        located
        .groupby([key], as_index=False)
        .aggregate({'geolocation_lat': 'mean', 'geolocation_lng': 'mean'})
        .rename(columns={
            'geolocation_lat': f'{prefix}_lat',
            'geolocation_lng': f'{prefix}_lng',
        })
    )


# Seller base: filtered coordinates, then one averaged point per seller.
df_geo_seller = _attach_brazil_coords(sellers, 'seller_zip_code_prefix')
gr_geo_seller = _mean_coords(df_geo_seller, 'seller_id', 'seller')

# Customer base: filtered coordinates, then one averaged point per unique customer.
df_geo_customer = _attach_brazil_coords(customer, 'customer_zip_code_prefix')
gr_geo_customer = _mean_coords(df_geo_customer, 'customer_unique_id', 'customer')

# Drop coordinate columns left over from a previous run of this cell first:
# merging again on top of them would produce suffixed duplicates
# (seller_lat_x / seller_lat_y, ...) instead of the expected names.
coord_cols = ['seller_lat', 'seller_lng', 'customer_lat', 'customer_lng']
df = (
    df
    .drop(columns=coord_cols, errors='ignore')
    .merge(gr_geo_seller, on='seller_id', how='left')
    .merge(gr_geo_customer, on='customer_unique_id', how='left')
)

In [26]:
# Geodesic distance in kilometres between the seller point and the customer
# point of each row. Rows missing any of the four coordinates keep NaN.
coord_cols = ['seller_lat', 'seller_lng', 'customer_lat', 'customer_lng']
has_coords = df[coord_cols].notna().all(axis=1).to_numpy()

# Start from an all-NaN array so that re-running the cell never leaves
# values from an earlier run behind.
distance_km = np.full(len(df), np.nan)

# One pass over plain tuples, then a single column assignment, instead of
# iterrows() with one df.loc write per row.
distance_km[has_coords] = [
    geopy.distance.geodesic((seller_lat, seller_lng), (customer_lat, customer_lng)).km
    for seller_lat, seller_lng, customer_lat, customer_lng
    in df.loc[has_coords, coord_cols].itertuples(index=False, name=None)
]

df['delivery_distance'] = distance_km

In [27]:
# Print, for each column of `df`, its dtype, non-null count and percentage,
# and number of distinct values.
tstats(df)

+----+-------------------------------+---------+-------------------+-----------------+-------------+
|    | nom colonne                   | dtype   |   valeur non null |   % de non null |   nb_unique |
|----+-------------------------------+---------+-------------------+-----------------+-------------|
|  0 | order_id                      | object  |            119143 |           100   |       99441 |
|  1 | customer_id                   | object  |            119143 |           100   |       99441 |
|  2 | order_status                  | object  |            119143 |           100   |           8 |
|  3 | order_purchase_timestamp      | object  |            119143 |           100   |       98875 |
|  4 | order_approved_at             | object  |            118966 |            99.9 |       90733 |
|  5 | order_delivered_carrier_date  | object  |            117057 |            98.2 |       81018 |
|  6 | order_delivered_customer_date | object  |            115722 |            97.1 |     