In [1]:
import pandas as pd
import numpy as np
import os
from pathlib import Path 
import missingno as msno
from tabulate import tabulate
from statistics import median, mean, quantiles
import pprint
import matplotlib.pyplot as plt 
import matplotlib.cm as cm
%matplotlib inline
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from datetime import datetime, timedelta
from sklearn.cluster import KMeans
from geopy.geocoders import Nominatim
import folium
plt.style.use('ggplot')
#pd.set_option('max_columns', 200)

In [2]:
def tstats(data):
    """Print a per-column summary of a DataFrame as a psql-style table.

    For every column the table lists its name, dtype, number of non-null
    values, percentage of non-null values (rounded to one decimal) and the
    result of ``Series.nunique()``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to summarise. Frames with no rows or no columns are accepted.

    Returns
    -------
    None
        The table is written to stdout through ``tabulate``.
    """
    n_rows = len(data)
    output = []
    for col in data.columns:
        nonNull = data[col].notna().sum()
        # A frame without rows has no meaningful share: report NaN rather
        # than dividing by zero.
        nonNullprop = (nonNull / n_rows) * 100 if n_rows else float('nan')
        unique = data[col].nunique()
        colType = str(data[col].dtype)
        output.append([col, colType, nonNull, round(nonNullprop, 1), unique])

    # Labels are passed to the constructor so the summary can still be built
    # when `data` has no columns (assigning `.columns` afterwards raises a
    # length-mismatch ValueError on an empty frame).
    df_stats = pd.DataFrame(
        output,
        columns=['nom colonne', 'dtype', 'valeur non null', "% de non null", 'nb_unique'],
    )

    print(tabulate(df_stats, headers='keys', tablefmt='psql'))
    
    
def diff_in_hours (date1, date2) :
    result_hour = (date1 - date2) / pd.Timedelta('1 hour')
    return result_hour


def filter_brazil_data(longitudes, latitudes):
    """Keep only the coordinate pairs that fall inside the Brazil bounding box.

    Longitudes and latitudes are paired positionally; a pair is kept when
    ``-74.5 <= lon <= -34.5`` and ``-33.75 <= lat <= 5.5`` (bounds inclusive).

    Parameters
    ----------
    longitudes, latitudes : iterable of numbers
        Coordinates, paired element by element with ``zip``.

    Returns
    -------
    tuple of (list, list)
        The retained longitudes and the retained latitudes, in input order.
    """
    west, east = -74.5, -34.5
    south, north = -33.75, 5.5

    inside = [
        (lng, ltt)
        for lng, ltt in zip(longitudes, latitudes)
        if west <= lng <= east and south <= ltt <= north
    ]
    kept_longitudes = [lng for lng, _ in inside]
    kept_latitudes = [ltt for _, ltt in inside]
    return kept_longitudes, kept_latitudes

In [43]:
# Locate the CSV files: they live in the `data` folder next to the current
# working directory. Paths are built with pathlib so they work on every OS;
# hand-written '\data\...' literals are Windows-only and rely on invalid
# escape sequences ('\d', '\o', '\p'). The p_* variables are kept as plain
# strings so their type does not change.
path = Path(os.getcwd())
p_parent = path.parent
data_dir = p_parent / 'data'
p_customer = str(data_dir / 'olist_customers_dataset.csv')
p_geo = str(data_dir / 'olist_geolocation_dataset.csv')
p_order_item = str(data_dir / 'olist_order_items_dataset.csv')
p_order_payment = str(data_dir / 'olist_order_payments_dataset.csv')
p_order_review = str(data_dir / 'olist_order_reviews_dataset.csv')
p_orders = str(data_dir / 'olist_orders_dataset.csv')
p_products = str(data_dir / 'olist_products_dataset.csv')
p_sellers = str(data_dir / 'olist_sellers_dataset.csv')
p_category = str(data_dir / 'product_category_name_translation_expanded.csv')

# Load every source table.
customer = pd.read_csv(p_customer)
geo = pd.read_csv(p_geo)
order_item = pd.read_csv(p_order_item)
order_payment = pd.read_csv(p_order_payment)
order_review = pd.read_csv(p_order_review)
orders = pd.read_csv(p_orders)
products = pd.read_csv(p_products)
sellers = pd.read_csv(p_sellers)
category = pd.read_csv(p_category)

# Build a single frame with successive left joins starting from `orders`.
# NOTE(review): the joins on order_id (items, payments, reviews) repeat an
# order's row once per matching item/payment/review, so the merged frame has
# more rows than distinct orders — confirm downstream aggregations expect it.
df = (
    orders
    .merge(customer, on="customer_id", how='left')
    .merge(order_item, on="order_id", how='left')
    .merge(order_payment, on='order_id', how='left')
    .merge(order_review, on="order_id", how="left")
    .merge(sellers, on='seller_id', how="left")
    .merge(products, on="product_id", how='left')
    .merge(category, on="product_category_name", how="left")
)

# The geolocation table is too large to be merged into the single frame, so
# `geo` stays a separate DataFrame. It could be joined on
# seller_zip_code_prefix / customer_zip_code_prefix after renaming its
# geolocation_* columns to seller_* / customer_* equivalents.

# List of the merged frame's column names.
cols = list(df.columns)

# Columns holding dates, to be parsed as datetimes.
col_date = ['order_purchase_timestamp',
            'order_approved_at',
            'order_delivered_carrier_date',
            'order_estimated_delivery_date',
            'review_creation_date',
            'review_answer_timestamp',
            'shipping_limit_date',
            'order_delivered_customer_date'
           ]

# Parse the date columns.
df[col_date] = df[col_date].apply(pd.to_datetime)

# Zip-code prefixes and the item id are identifiers, not quantities: store
# them as `object` instead of numeric dtypes.
df = df.astype({'customer_zip_code_prefix': 'object',
                'seller_zip_code_prefix': 'object',
                "order_item_id": 'object'})

# Drop rows that are exact duplicates.
df = df.drop_duplicates()

# Print the per-column summary of the merged frame.
tstats(df)

+----+-------------------------------+----------------+-------------------+-----------------+-------------+
|    | nom colonne                   | dtype          |   valeur non null |   % de non null |   nb_unique |
|----+-------------------------------+----------------+-------------------+-----------------+-------------|
|  0 | order_id                      | object         |            119143 |           100   |       99441 |
|  1 | customer_id                   | object         |            119143 |           100   |       99441 |
|  2 | order_status                  | object         |            119143 |           100   |           8 |
|  3 | order_purchase_timestamp      | datetime64[ns] |            119143 |           100   |       98875 |
|  4 | order_approved_at             | datetime64[ns] |            118966 |            99.9 |       90733 |
|  5 | order_delivered_carrier_date  | datetime64[ns] |            117057 |            98.2 |       81018 |
|  6 | order_delivered_custo

In [20]:
# Per English product category, count the non-null values of every other
# column; the category stays a regular column (as_index=False).
gr_category = (
    df
    .groupby(['product_category_name_english'], as_index=False)
    .count()
)
print(gr_category)

   product_category_name_english  order_id  customer_id  order_status  \
0     agro_industry_and_commerce       252          252           252   
1               air_conditioning       302          302           302   
2                            art       219          219           219   
3          arts_and_craftmanship        24           24            24   
4                          audio       381          381           381   
..                           ...       ...          ...           ...   
66                    stationery      2625         2625          2625   
67        tablets_printing_image        87           87            87   
68                     telephony      4726         4726          4726   
69                          toys      4281         4281          4281   
70                 watches_gifts      6213         6213          6213   

    order_purchase_timestamp  order_approved_at  order_delivered_carrier_date  \
0                        252              