In [1]:
# Geographic Visualizations

In [2]:
## 1. Import libraries

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import os
import folium
import json
import plotly.express as px


In [4]:
# This command propts matplotlib visuals to appear in the notebook 

%matplotlib inline

In [5]:
#Import the database
path = r'C:\Users\Mary\Desktop\Data Analyst CF\Part 6'

In [6]:
df = pd.read_csv(os.path.join(path, '02 Data', 'Prepared Data', 'dfcorrelations.csv'), index_col = 0)

In [7]:
df.shape

(112657, 46)

In [8]:
df.head(3)

Unnamed: 0,product_id,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm,order_id,order_item_id,...,seller_city,seller_state,product_category_name,review_category,freight_flag,pur_del,start_date,end_date,n_of_days,distance
0,1e9e8ef04dbcff4541ed26657ea517e5,40.0,287.0,1.0,225.0,16.0,10.0,14.0,e17e4f88e31525f7deef66779844ddce,1,...,mogi guaçu,SP,perfumery,Good review,Low freight,3 days 00:25:24,2018-04-24,2018-04-27,3,2399.699757
1,a035b83b3628decee6e3823924e0c10f,53.0,2235.0,3.0,1450.0,20.0,25.0,20.0,b18cb761efbe70da4838435a349abd07,1,...,jaguariaíva,PR,perfumery,Good review,Medium freight,6 days 03:55:11,2018-07-04,2018-07-10,6,2662.455144
2,091107484dd7172f5dcfed173e4a960e,50.0,260.0,2.0,183.0,16.0,8.0,13.0,a7708ffa8966514c098d15e1abfa6417,1,...,sao paulo,SP,perfumery,Good review,Low freight,2 days 03:59:04,2018-04-23,2018-04-25,2,2340.750167


In [9]:
# Import ".json" file for Brazil 

br_geo = r'C:\Users\Mary\Desktop\Data Analyst CF/brazil.json'

In [10]:
br_geo

'C:\\Users\\Mary\\Desktop\\Data Analyst CF/brazil.json'

In [11]:
# That's just in case you want to look at the JSON file contents here too:

f = open(r'C:\Users\Mary\Desktop\Data Analyst CF/brazil.json',)
  
# returns JSON object asa dictionary
data = json.load(f)
  
# Iterating through the json list
for i in data['features']:
    print(i)

{'type': 'Feature', 'id': 'ys298mq8577.1', 'geometry': {'type': 'MultiPolygon', 'coordinates': [[[[-62.86617706, -7.9750512], [-62.85292605, -7.98674752], [-62.84461867, -7.98571404], [-62.84117678, -7.99415537], [-62.83239631, -7.99355264], [-62.80956184, -8.02451752], [-62.78575951, -8.02552274], [-62.74353683, -8.0444636], [-62.71239497, -8.07975275], [-62.69183919, -8.09208541], [-62.69011607, -8.10007375], [-62.67839719, -8.1106178], [-62.68506392, -8.11560882], [-62.6725898, -8.14448237], [-62.68481212, -8.16488912], [-62.68256382, -8.17973576], [-62.67308888, -8.19190731], [-62.65945158, -8.19880087], [-62.64950786, -8.2361879], [-62.60992409, -8.25896695], [-62.59961581, -8.27140216], [-62.58494986, -8.27104084], [-62.56878214, -8.28643702], [-62.56147738, -8.2831679], [-62.55550507, -8.30591932], [-62.56072647, -8.3129968], [-62.55483665, -8.3185493], [-62.54867336, -8.34130073], [-62.55108656, -8.35735498], [-62.53910727, -8.3572698], [-62.52503159, -8.37450669], [-62.5256999

In [12]:
## 2. Data Wrangling
# Create a subset from the original table with information from the states: nº of sellers, nº of buyers, nº pedidos, avg_review_score

In [13]:
#Group by two keys and then summarize each group
states =  df.groupby('seller_state',as_index=False).agg({'seller_id':'count', 'customer_unique_id': 'count', 'order_id':'count', 'review_score':'mean', 'n_of_days': 'mean'})

In [14]:
states.head(5)

Unnamed: 0,seller_state,seller_id,customer_unique_id,order_id,review_score,n_of_days
0,AC,91,91,91,4.098901,20.615385
1,AL,437,437,437,3.810069,24.478261
2,AM,167,167,167,4.113772,26.323353
3,AP,82,82,82,4.280488,28.085366
4,BA,3823,3823,3823,3.866074,19.080042


In [15]:
#Rename the columns: 
states.rename(columns={'seller_state':'States','seller_id':'n_of_sellers', 
                       'customer_unique_id':'n_of_customers', 'order_id': 'number_of_orders', 
                       'review_score': 'avg_review_score', 'n_of_days': 'avg_days_deliver'})

Unnamed: 0,States,n_of_sellers,n_of_customers,number_of_orders,avg_review_score,avg_days_deliver
0,AC,91,91,91,4.098901,20.615385
1,AL,437,437,437,3.810069,24.478261
2,AM,167,167,167,4.113772,26.323353
3,AP,82,82,82,4.280488,28.085366
4,BA,3823,3823,3823,3.866074,19.080042
5,CE,1466,1466,1466,3.899045,20.652115
6,DF,2208,2208,2208,4.034873,12.900815
7,ES,2253,2253,2253,4.006658,15.408344
8,GO,2292,2292,2292,4.029668,15.202443
9,MA,803,803,803,3.749689,21.418431


In [16]:
states.shape

(27, 6)

In [17]:
# Create a data frame with just the states and the values for rating we want plotted

data_to_plot = states[['seller_state', 'review_score']]
data_to_plot.head()

Unnamed: 0,seller_state,review_score
0,AC,4.098901
1,AL,3.810069
2,AM,4.113772
3,AP,4.280488
4,BA,3.866074


In [18]:
states.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 27 entries, 0 to 26
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   seller_state        27 non-null     object 
 1   seller_id           27 non-null     int64  
 2   customer_unique_id  27 non-null     int64  
 3   order_id            27 non-null     int64  
 4   review_score        27 non-null     float64
 5   n_of_days           27 non-null     float64
dtypes: float64(2), int64(3), object(1)
memory usage: 1.5+ KB


In [19]:
# Change the abbreviations to names of states 
states = states.replace({ 'AC' : 'Acre', 'AL' : 'Alagoas', 'AM': 'Amazonas', 'AP': 'AmapÃ¡', 
                     'BA': 'Bahia', 'CE': 'CearÃ¡', 'DF': 'Distrito Federal', 'ES': 'EspÃ\xadrito Santo', 
                     'GO': 'GoiÃ¡s', 'MA': 'MaranhÃ£o', 'MG': 'Minas Gerais', 'MS': 'Mato Grosso do Sul', 
                     'MT': 'Mato Grosso', 'PA': 'ParÃ¡', 'PB': 'ParaÃ\xadba', 'PE': 'Pernambuco', 'PI': 'PiauÃ\xad', 
                     'PR': 'ParanÃ¡', 'RJ': 'Rio de Janeiro', 'RN': 'Rio Grande do Norte', 'RO': 'RondÃ´nia', 'RR': 
                     'Roraima', 'RS': 'Rio Grande do Sul', 'SC': 'Santa Catarina', 'SE': 'Sergipe', 'SP': 'SÃ£o Paulo',
                    'TO': 'Tocantins'})

In [20]:
states

Unnamed: 0,seller_state,seller_id,customer_unique_id,order_id,review_score,n_of_days
0,Acre,91,91,91,4.098901,20.615385
1,Alagoas,437,437,437,3.810069,24.478261
2,Amazonas,167,167,167,4.113772,26.323353
3,AmapÃ¡,82,82,82,4.280488,28.085366
4,Bahia,3823,3823,3823,3.866074,19.080042
5,CearÃ¡,1466,1466,1466,3.899045,20.652115
6,Distrito Federal,2208,2208,2208,4.034873,12.900815
7,EspÃ­rito Santo,2253,2253,2253,4.006658,15.408344
8,GoiÃ¡s,2292,2292,2292,4.029668,15.202443
9,MaranhÃ£o,803,803,803,3.749689,21.418431


In [21]:
# Rename seller_state column

states.rename(columns = {'seller_state' : 'nome'}, inplace = True)
states.rename(columns = {'seller_id' : 'n_sellers'}, inplace = True)
states.rename(columns = {'customer_unique_id' : 'n_customers'}, inplace = True)
states.rename(columns = {'order_id' : 'n_orders'}, inplace = True)
states.rename(columns = {'review_score' : 'avg_review_score'}, inplace = True)
states.rename(columns = {'n_of_days' : 'total_n_of_days'}, inplace = True)


In [22]:
### 4. Plotting a choropleth

In [23]:
# Create a data frame with just the states and the values for rating we want plotted

data_to_plot = states[['nome','avg_review_score']]
data_to_plot.head()

Unnamed: 0,nome,avg_review_score
0,Acre,4.098901
1,Alagoas,3.810069
2,Amazonas,4.113772
3,AmapÃ¡,4.280488
4,Bahia,3.866074


In [24]:
# Setup a folium map at a high-level zoom
map = folium.Map(location = [100, 0], zoom_start = 1.5)

# Choropleth maps bind Pandas Data Frames and json geometries.This allows us to quickly visualize data combinations
folium.Choropleth(
    geo_data = br_geo, 
    data = data_to_plot,
    columns = ['nome', 'avg_review_score'],
    key_on = 'feature.properties.nome', # this part is very important - check your json file to see where the KEY is located
    fill_color = 'YlGn', fill_opacity=0.6, line_opacity=0.1,
    legend_name = "rating").add_to(map)
folium.LayerControl().add_to(map)

map

In [25]:
# Accordingly to the correlation, distance/n_of_days to deliver have a weak negative correlation with the avg_review_score. 
# Seems that the region Northeast may have the smallest rating. 
# It will lead to further investigation. Where I'll separate in regions and check again.

In [26]:
states['regions'] = ['North',
'Northeast',
'North',
'North',
'Northeast',
'Northeast',
'Middle West',
'Southeast',
'Middle West',
'Northeast',
'Southeast',
'Middle West',
'Middle West',
'North',
'Northeast',
'Northeast',
'Northeast',
'South',
'Southeast',
'Northeast',
'North',
'North',
'South',
'South',
'Northeast',
'Southeast',
'North'
]

In [27]:
states

Unnamed: 0,nome,n_sellers,n_customers,n_orders,avg_review_score,total_n_of_days,regions
0,Acre,91,91,91,4.098901,20.615385,North
1,Alagoas,437,437,437,3.810069,24.478261,Northeast
2,Amazonas,167,167,167,4.113772,26.323353,North
3,AmapÃ¡,82,82,82,4.280488,28.085366,North
4,Bahia,3823,3823,3823,3.866074,19.080042,Northeast
5,CearÃ¡,1466,1466,1466,3.899045,20.652115,Northeast
6,Distrito Federal,2208,2208,2208,4.034873,12.900815,Middle West
7,EspÃ­rito Santo,2253,2253,2253,4.006658,15.408344,Southeast
8,GoiÃ¡s,2292,2292,2292,4.029668,15.202443,Middle West
9,MaranhÃ£o,803,803,803,3.749689,21.418431,Northeast


In [28]:
# Aggregate in Regions
regions =  states.groupby('regions',as_index=False).agg({'nome': 'count', 'n_sellers': 'sum', 'n_customers': 'sum', 'n_orders':'sum', 'avg_review_score':'mean', 'total_n_of_days': 'mean'})

regions

Unnamed: 0,regions,nome,n_sellers,n_customers,n_orders,avg_review_score,total_n_of_days
0,Middle West,4,6389,6389,6389,4.043183,15.329049
1,North,7,2037,2037,2037,4.067742,23.367388
2,Northeast,9,10388,10388,10388,3.918487,20.473327
3,South,3,16190,16190,16190,4.090629,13.932641
4,Southeast,4,77653,77653,77653,4.046335,12.76089


In [29]:
regions.rename(columns = {'total_n_of_days' : 'avg_days_deliver'}, inplace = True)
regions.rename(columns = {'nome' : 'n_states'}, inplace = True)


In [30]:
regions

Unnamed: 0,regions,n_states,n_sellers,n_customers,n_orders,avg_review_score,avg_days_deliver
0,Middle West,4,6389,6389,6389,4.043183,15.329049
1,North,7,2037,2037,2037,4.067742,23.367388
2,Northeast,9,10388,10388,10388,3.918487,20.473327
3,South,3,16190,16190,16190,4.090629,13.932641
4,Southeast,4,77653,77653,77653,4.046335,12.76089


In [31]:
# There is not much difference in the avg_review_score per region. Northeast is a little behind.