In [1]:
import pandas as pd
import numpy as np
import random
import os
import requests
from bs4 import BeautifulSoup
import geopandas as gpd
import matplotlib.pyplot as plt


In [2]:
# Load the raw avocado price data from the CSV file next to this notebook.
avocado_csv_path = "avocado-updated-2020.csv"
avocado = pd.read_csv(avocado_csv_path)


In [3]:
# Preview the first rows of the raw frame to check the columns that were loaded.
avocado.head()

Unnamed: 0,date,average_price,total_volume,4046,4225,4770,total_bags,small_bags,large_bags,xlarge_bags,type,year,geography
0,2015-01-04,1.22,40873.28,2819.5,28287.42,49.9,9716.46,9186.93,529.53,0.0,conventional,2015,Albany
1,2015-01-04,1.79,1373.95,57.42,153.88,0.0,1162.65,1162.65,0.0,0.0,organic,2015,Albany
2,2015-01-04,1.0,435021.49,364302.39,23821.16,82.15,46815.79,16707.15,30108.64,0.0,conventional,2015,Atlanta
3,2015-01-04,1.76,3846.69,1500.15,938.35,0.0,1408.19,1071.35,336.84,0.0,organic,2015,Atlanta
4,2015-01-04,1.08,788025.06,53987.31,552906.04,39995.03,141136.68,137146.07,3990.61,0.0,conventional,2015,Baltimore/Washington


In [4]:
# Columns not needed for the per-region analysis: the per-PLU volume columns,
# the bag-size breakdown, and the conventional/organic type flag.
columns_to_drop = [
    '4046', '4225', '4770',
    'total_bags', 'small_bags', 'large_bags', 'xlarge_bags',
    'type',
]
# drop() returns a new frame, so the raw `avocado` frame is left untouched.
avo_df = avocado.drop(columns=columns_to_drop)

In [5]:
# Preview the trimmed frame to confirm that only the retained columns remain.
avo_df.head()

Unnamed: 0,date,average_price,total_volume,year,geography
0,2015-01-04,1.22,40873.28,2015,Albany
1,2015-01-04,1.79,1373.95,2015,Albany
2,2015-01-04,1.0,435021.49,2015,Atlanta
3,2015-01-04,1.76,3846.69,2015,Atlanta
4,2015-01-04,1.08,788025.06,2015,Baltimore/Washington


In [6]:
# Display a copy of avo_df with 'geography' shown as 'region'.
# NOTE(review): the renamed frame is not assigned to anything, so avo_df itself
# keeps the 'geography' column; the following cells still reference
# avo_df.geography and group by 'geography'.
avo_df.rename(columns={'geography': 'region'})

Unnamed: 0,date,average_price,total_volume,year,region
0,2015-01-04,1.22,40873.28,2015,Albany
1,2015-01-04,1.79,1373.95,2015,Albany
2,2015-01-04,1.00,435021.49,2015,Atlanta
3,2015-01-04,1.76,3846.69,2015,Atlanta
4,2015-01-04,1.08,788025.06,2015,Baltimore/Washington
...,...,...,...,...,...
33040,2020-11-29,1.47,1583056.27,2020,Total U.S.
33041,2020-11-29,0.91,5811114.22,2020,West
33042,2020-11-29,1.48,289961.27,2020,West
33043,2020-11-29,0.67,822818.75,2020,West Tex/New Mexico


In [7]:
# Distinct geography labels, in order of first appearance in avo_df.
regions = avo_df['geography'].unique()
regions

array(['Albany', 'Atlanta', 'Baltimore/Washington', 'Boise', 'Boston',
       'Buffalo/Rochester', 'California', 'Charlotte', 'Chicago',
       'Cincinnati/Dayton', 'Columbus', 'Dallas/Ft. Worth', 'Denver',
       'Detroit', 'Grand Rapids', 'Great Lakes', 'Harrisburg/Scranton',
       'Hartford/Springfield', 'Houston', 'Indianapolis', 'Jacksonville',
       'Las Vegas', 'Los Angeles', 'Louisville', 'Miami/Ft. Lauderdale',
       'Midsouth', 'Nashville', 'New Orleans/Mobile', 'New York',
       'Northeast', 'Northern New England', 'Orlando', 'Philadelphia',
       'Phoenix/Tucson', 'Pittsburgh', 'Plains', 'Portland',
       'Raleigh/Greensboro', 'Richmond/Norfolk', 'Roanoke', 'Sacramento',
       'San Diego', 'San Francisco', 'Seattle', 'South Carolina',
       'South Central', 'Southeast', 'Spokane', 'St. Louis', 'Syracuse',
       'Tampa', 'Total U.S.', 'West', 'West Tex/New Mexico'], dtype=object)

In [8]:
# Mean of the numeric columns per geography label.
group_by_region = avo_df.groupby(by=['geography'])
# numeric_only=True is required because avo_df still carries the string 'date'
# column: pandas >= 2.0 raises TypeError when asked to average an object column,
# whereas older versions silently dropped it. Passing the flag gives the same
# result on both.
avo_df_avg = group_by_region.mean(numeric_only=True)
# The mean of 'year' is not a meaningful statistic, so it is removed.
avo_df_avg = avo_df_avg.drop(columns=['year'])

In [9]:
from geopy.geocoders import Bing
from geopy.extra.rate_limiter import RateLimiter

# Geocode every region label with the Bing geocoder.
#
# The API key is read from the environment instead of being embedded in the
# notebook: set BING_MAPS_API_KEY before running this cell. A key that has been
# committed in a notebook should be treated as leaked and revoked.
geolocator = Bing(api_key=os.environ["BING_MAPS_API_KEY"], timeout=30)

# Throttled wrapper around geolocator.geocode: at least 2 seconds between calls.
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=2)

# Call the throttled wrapper (not geolocator.geocode directly) so the delay
# configured above is actually applied to every request.
regions_dict = {region: geocode(region) for region in regions}

In [10]:
# One column per region, built from the geocoding results in regions_dict.
regions_df = pd.DataFrame(regions_dict)

# Keep only the row at position 1 (the coordinate tuples shown in the output
# below) and reshape it to long form: one row per region with its co-ordinates.
coordinate_row = regions_df.iloc[1:2, :]
regions_df_melted = coordinate_row.melt(var_name='region', value_name='co-ordinates')


  v = np.array(v, copy=False)
  subarr = np.array(values, dtype=dtype, copy=copy)


In [11]:
# Inspect the long-form region / co-ordinates table.
regions_df_melted

Unnamed: 0,region,co-ordinates
0,Albany,"(42.65172576904297, -73.75509643554688)"
1,Atlanta,"(33.7485466003418, -84.3915023803711)"
2,Baltimore/Washington,"(39.175262451171875, -76.67152404785156)"
3,Boise,"(43.61579132080078, -116.20157623291016)"
4,Boston,"(42.35899353027344, -71.05863189697266)"
5,Buffalo/Rochester,"(43.15570831298828, -77.612548828125)"
6,California,"(37.25300598144531, -119.66266632080078)"
7,Charlotte,"(35.223785400390625, -80.84114074707031)"
8,Chicago,"(41.88322830200195, -87.63240051269531)"
9,Cincinnati/Dayton,"(39.10369873046875, -84.51361083984375)"


In [12]:
# Check the column dtypes; 'co-ordinates' is an object column, which is why it
# is split into separate latitude / longitude columns in a later cell.
regions_df_melted.dtypes


region          object
co-ordinates    object
dtype: object

In [15]:
# Split each co-ordinates pair into two columns, aligned on the existing index.
coordinate_pairs = regions_df_melted['co-ordinates'].tolist()
lat_lon = pd.DataFrame(coordinate_pairs, index=regions_df_melted.index)
regions_df_melted[['latitude', 'longitude']] = lat_lon

# Build point geometries (x = longitude, y = latitude) and wrap the table in a
# GeoDataFrame.
region_points = gpd.points_from_xy(regions_df_melted.longitude, regions_df_melted.latitude)
avo_gdf = gpd.GeoDataFrame(regions_df_melted, geometry=region_points)



In [21]:
# Regions excluded from the output. These are the labels found at positions
# 25, 29, 35, 45, 46, 51 and 52 of the `regions` array, which the original cell
# dropped by index. Selecting by label keeps the result identical for the
# current data and stays correct if the row order or the set of regions changes.
# NOTE(review): these look like multi-state aggregates plus the national total;
# 'Great Lakes' is kept, exactly as before. Confirm that is intended.
excluded_regions = [
    'Midsouth', 'Northeast', 'Plains', 'South Central',
    'Southeast', 'Total U.S.', 'West',
]
avocado_data = avo_gdf[~avo_gdf['region'].isin(excluded_regions)]

In [22]:
# Inspect the filtered GeoDataFrame that is written to disk in the next cell.
avocado_data

Unnamed: 0,region,co-ordinates,latitude,longitude,geometry
0,Albany,"(42.65172576904297, -73.75509643554688)",42.651726,-73.755096,POINT (-73.75510 42.65173)
1,Atlanta,"(33.7485466003418, -84.3915023803711)",33.748547,-84.391502,POINT (-84.39150 33.74855)
2,Baltimore/Washington,"(39.175262451171875, -76.67152404785156)",39.175262,-76.671524,POINT (-76.67152 39.17526)
3,Boise,"(43.61579132080078, -116.20157623291016)",43.615791,-116.201576,POINT (-116.20158 43.61579)
4,Boston,"(42.35899353027344, -71.05863189697266)",42.358994,-71.058632,POINT (-71.05863 42.35899)
5,Buffalo/Rochester,"(43.15570831298828, -77.612548828125)",43.155708,-77.612549,POINT (-77.61255 43.15571)
6,California,"(37.25300598144531, -119.66266632080078)",37.253006,-119.662666,POINT (-119.66267 37.25301)
7,Charlotte,"(35.223785400390625, -80.84114074707031)",35.223785,-80.841141,POINT (-80.84114 35.22379)
8,Chicago,"(41.88322830200195, -87.63240051269531)",41.883228,-87.632401,POINT (-87.63240 41.88323)
9,Cincinnati/Dayton,"(39.10369873046875, -84.51361083984375)",39.103699,-84.513611,POINT (-84.51361 39.10370)


In [23]:
# Write the filtered table to CSV without the index column, encoded as UTF-8
# with a byte-order mark ("utf-8-sig").
avocado_data.to_csv(
    "avocado_data.csv",
    index=False,
    encoding="utf-8-sig",
)

In [24]:
 # Size of the exported CSV in megabytes (bytes divided by 1,000,000).
 os.path.getsize("avocado_data.csv") / 1_000_000

0.00639

#### Reference: https://towardsdatascience.com/mapping-avocado-prices-in-python-with-geopandas-geopy-and-matplotlib-c7e0ef08bc26