In [1]:
import pandas as pd
import numpy as np
import requests
import re
import nltk
from nltk.stem import WordNetLemmatizer
pd.options.display.max_rows = 999
pd.options.display.max_columns = 999
wnl = WordNetLemmatizer()
KEY = '13A3E641-0C10-317C-9C08-7EF10ACA1519'
DOMAIN = 'http://quickstats.nass.usda.gov/api'
cats = ['sector_desc','group_desc','statisticcat_desc','state_alpha']
#Methods
GET = '/api_GET/?key='+ KEY
VALUES = '/get_param_values/?key='+ KEY
COUNTS = '/get_counts/?key='+ KEY

In [304]:
def get_usda_data(category,unit):
    filename = 'survey_2006-2016_'+category+'.csv'
    f = open(filename,'wb')
    p_get = {'source_desc':'SURVEY',
         'sector_desc': 'CROPS',
         'group_desc': ["FIELD CROPS","FRUIT & TREE NUTS","VEGETABLES"],
         'statisticcat_desc':category,
         'commodity_desc':['ALMONDS', 'APPLES', 'APRICOTS', 'AVOCADOS',
           'BLUEBERRIES', 'BRAMBLEBERRIES', 'BOYSENBERRIES', 'CHERRIES',
           'GRAPEFRUIT', 'LEMONS', 'LIMES', 'ORANGES', 'TANGELOS',
           'TANGERINES', 'TEMPLES', 'CRANBERRIES', 'GRAPES', 'KIWIFRUIT',
           'MACADEMIAS', 'LOGANBERRIES', 'NECTARINES', 'OLIVES', 'PEACHES',
           'PEARS', 'PLUMS & PRUNES', 'STRAWBERRIES', 'RASPBERRIES','ASPARAGUSES', 
           'BROCCOLI', 'CARROTS','CAULIFLOWER', 'CELERY', 'CUCUMBERS', 
           'CANTALOUPES','ONIONS', 'PUMPKINS', 'SQUASH','HAYS','PEANUTS', 'RAPESEED', 
           'SOYBEANS', 'SUGARBEETS', 'SUNFLOWER','COTTON'],
         'unit_desc':unit,
         'util_practice_desc': ["UTILIZED","ALL UTILIZATION PRACTICES","UTILIZED, SHELLED"],
         'class_desc':["ALL CLASSES","TAME","WILD","ALFALFA","SWEET","TART",
                       "DRY EDIBLE","DRY","DRY, SPRING","DRY, SUMMER, NON-STORAGE",
                       "DRY, SUMMER, NON-STORAGE","DRY, SUMMER, STORAGE",
                      "COTTONSEED","PIMA","UPLAND"],
         'year':[2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016],
         'freq_desc':"ANNUAL",
         'agg_level_desc':"STATE",
         'reference_period_desc':"YEAR",
         'format':'CSV'}
    data = requests.get(DOMAIN+GET, params=p_get).content
    f.write(data) 
    f.close()
    print('Exported '+filename+'....')

def clean_data(category,unit):
    filename = 'survey_2006-2016_'+category+'.csv'
    df = pd.read_csv(filename)
    df = df[['commodity_desc','state_alpha', 'state_name','year','Value']]
    df["Value"] = df["Value"].apply(lambda x: float(str(x).replace(',','')))
    df = df.groupby(["year","state_alpha","commodity_desc"]).Value.sum().reset_index()
    #divide the value of onion by two in year 2015 due to overlapping calcuation
    df.loc[(df.year==2015) & (df.commodity_desc =="ONIONS"), 'Value'] = df.loc[(df.year==2015) & (df.commodity_desc =="ONIONS"), 'Value']/2
    #remove total US 
    df = df.drop(df[df.state_alpha == "US"].index)
    df = df.rename(columns = {'commodity_desc':'crop','Value':unit})
    df['crop'] = df['crop'].apply(lambda x: wnl.lemmatize(x.lower()))
    return df

In [226]:
get_usda_data('PRODUCTION','$')
get_usda_data('AREA HARVESTED','ACRES')

Exported survey_2006-2016_PRODUCTION.csv....
Exported survey_2006-2016_AREA HARVESTED.csv....


# Remove non-numerical values in column "Value" in file "survey_sales_data_2006-2016_all.csv" before proceeding to next execution

In [305]:
value = clean_data('PRODUCTION','value')
arces = clean_data('AREA HARVESTED','acres')

In [306]:
value.head()

Unnamed: 0,year,state_alpha,crop,value
0,2006,AL,cotton,165319000.0
1,2006,AL,peanut,68460000.0
2,2006,AL,soybean,20550000.0
3,2006,AR,cotton,650541000.0
4,2006,AR,soybean,688755000.0


In [307]:
arces.head()

Unnamed: 0,year,state_alpha,crop,acres
0,2006,AL,cotton,1120000.0
1,2006,AL,peanut,163000.0
2,2006,AL,soybean,150000.0
3,2006,AR,cotton,3480000.0
4,2006,AR,soybean,9210000.0


# Read Dependence Ratio Data

In [2]:
ratio = pd.read_excel('Dep Ratio.xlsx',sheetname = 1)
ratio = ratio[['crop', 'D=\nDependence On Insect Pollination',
       'P= Proportion Of Pollinators That Are\nHoney Beesf', 'Unnamed: 5',
       'Proportion of pollinators that are native bees (1 – P)',
       'Unnamed: 10']]
ratio = ratio.rename(columns={'D=\nDependence On Insect Pollination':'D',
              'P= Proportion Of Pollinators That Are\nHoney Beesf':'PH',
              'Proportion of pollinators that are native bees (1 – P)':'PN',
              'Unnamed: 5':'DH',
              'Unnamed: 10':'DN'})
ratio

Unnamed: 0,crop,D,PH,DH,PN,DN
0,Fruits and Nut,,,,,
1,almond,1.0,1.0,1.0,0.0,0.0
2,apple,1.0,0.9,0.9,0.1,0.1
3,apricot,0.7,0.8,0.56,0.2,0.14
4,avocado,1.0,0.9,0.9,0.1,0.1
5,blueberry,1.0,0.9,0.9,0.1,0.1
6,brambleberry,0.8,0.9,0.72,,0.0
7,Boysenberry,,,,0.1,0.0
8,cherry,0.9,0.9,0.81,0.1,0.09
9,grapefruit,0.8,0.9,0.72,0.1,0.08


# Join Two Table

In [314]:
ratio['crop'] = ratio['crop'].apply(lambda x: wnl.lemmatize(x.lower()))
value_acres = pd.merge(value, arces, how='outer', on=['crop','year','state_alpha'])
result = pd.merge(value_acres, ratio, how='left', on=['crop'])
result.head()

Unnamed: 0,year,state_alpha,crop,value,acres,D,PH,DH,PN,DN
0,2006,AL,cotton,165319000.0,1120000.0,0.2,0.8,0.16,0.2,0.04
1,2006,AL,peanut,68460000.0,163000.0,0.1,0.2,0.02,0.8,0.08
2,2006,AL,soybean,20550000.0,150000.0,0.1,0.5,0.05,0.5,0.05
3,2006,AR,cotton,650541000.0,3480000.0,0.2,0.8,0.16,0.2,0.04
4,2006,AR,soybean,688755000.0,9210000.0,0.1,0.5,0.05,0.5,0.05


In [315]:
writer = pd.ExcelWriter('2006-2016_production_value_cleaned.xlsx')
result.to_excel(writer,'Production Value')
writer.save()

# Clean dep ratio data, add crop code

In [None]:
ratio = pd.read_excel('Dep Ratio.xlsx',sheetname = 1)
ratio = ratio[['crop', 'D=\nDependence On Insect Pollination',
       'P= Proportion Of Pollinators That Are\nHoney Beesf', 'Unnamed: 5',
       'Proportion of pollinators that are native bees (1 – P)',
       'Unnamed: 10']]
ratio = ratio.rename(columns={'D=\nDependence On Insect Pollination':'D',
              'P= Proportion Of Pollinators That Are\nHoney Beesf':'PH',
              'Proportion of pollinators that are native bees (1 – P)':'PN',
              'Unnamed: 5':'DH',
              'Unnamed: 10':'DN'})
crop_ref = pd.read_excel('crop code and dependency ratio.xlsx')
crop_ref['crop'] = crop_ref['crop'].apply(lambda x: wnl.lemmatize(x.lower()))
ratio ['crop'] = ratio ['crop'].apply(lambda x: wnl.lemmatize(x.lower()))
ratio_1 = pd.merge(ratio, crop_ref, how='left', on='crop')
ratio_1.to_excel("dep ratio and crop code.xlsx")
#Hard code: 'legume seeds' to 'legume'.

In [14]:
filename = 'survey_yield_data_2006-2016_all.csv'
f = open(filename,'wb')
p_get = {'source_desc':'SURVEY',
     'sector_desc': 'CROPS',
     'group_desc': ["FIELD CROPS","FRUIT & TREE NUTS","VEGETABLES"],
     'commodity_desc':['ALMONDS', 'APPLES', 'APRICOTS', 'AVOCADOS',
       'BLUEBERRIES', 'BRAMBLEBERRIES', 'BOYSENBERRIES', 'CHERRIES',
       'GRAPEFRUIT', 'LEMONS', 'LIMES', 'ORANGES', 'TANGELOS',
       'TANGERINES', 'TEMPLES', 'CRANBERRIES', 'GRAPES', 'KIWIFRUIT',
       'MACADEMIAS', 'LOGANBERRIES', 'NECTARINES', 'OLIVES', 'PEACHES',
       'PEARS', 'PLUMS & PRUNES', 'STRAWBERRIES', 'RASPBERRIES','ASPARAGUSES', 
       'BROCCOLI', 'CARROTS','CAULIFLOWER', 'CELERY', 'CUCUMBERS', 
       'CANTALOUPES','ONIONS', 'PUMPKINS', 'SQUASH','HAYS','PEANUTS', 'RAPESEED', 
       'SOYBEANS', 'SUGARBEETS', 'SUNFLOWER','COTTON','LEGUMES'],
     'statisticcat_desc':'YIELD',
     'util_practice_desc': ["UTILIZED","ALL UTILIZATION PRACTICES","UTILIZED, SHELLED"],
     'class_desc':["ALL CLASSES","TAME","WILD","ALFALFA","SWEET","TART",
                   "DRY EDIBLE","DRY","DRY, SPRING","DRY, SUMMER, NON-STORAGE",
                   "DRY, SUMMER, NON-STORAGE","DRY, SUMMER, STORAGE",
                  "COTTONSEED","PIMA","UPLAND"],
     'year':[2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016],
     'freq_desc':"ANNUAL",
     'agg_level_desc':"STATE",
     'reference_period_desc':["YEAR","MARKETING YEAR"],
     'format':'CSV'}
data = requests.get(DOMAIN+GET, params=p_get).content
f.write(data) 
f.close()
print('Successfully export '+filename+'....')

Successfully export survey_yield_data_2006-2016_all.csv....


In [15]:
filename = 'survey_price_data_2006-2016_all.csv'
f = open(filename,'wb')
p_get = {'source_desc':'SURVEY',
     'sector_desc': 'CROPS',
     'group_desc': ["FIELD CROPS","FRUIT & TREE NUTS","VEGETABLES"],
     'commodity_desc':['ALMONDS', 'APPLES', 'APRICOTS', 'AVOCADOS',
       'BLUEBERRIES', 'BRAMBLEBERRIES', 'BOYSENBERRIES', 'CHERRIES',
       'GRAPEFRUIT', 'LEMONS', 'LIMES', 'ORANGES', 'TANGELOS',
       'TANGERINES', 'TEMPLES', 'CRANBERRIES', 'GRAPES', 'KIWIFRUIT',
       'MACADEMIAS', 'LOGANBERRIES', 'NECTARINES', 'OLIVES', 'PEACHES',
       'PEARS', 'PLUMS & PRUNES', 'STRAWBERRIES', 'RASPBERRIES','ASPARAGUSES', 
       'BROCCOLI', 'CARROTS','CAULIFLOWER', 'CELERY', 'CUCUMBERS', 
       'CANTALOUPES','ONIONS', 'PUMPKINS', 'SQUASH','HAYS','PEANUTS', 'RAPESEED', 
       'SOYBEANS', 'SUGARBEETS', 'SUNFLOWER','COTTON','LEGUMES'],
     'statisticcat_desc':'PRICE RECEIVED',
     'util_practice_desc': ["UTILIZED","ALL UTILIZATION PRACTICES","UTILIZED, SHELLED"],
     'class_desc':["ALL CLASSES","TAME","WILD","ALFALFA","SWEET","TART",
                   "DRY EDIBLE","DRY","DRY, SPRING","DRY, SUMMER, NON-STORAGE",
                   "DRY, SUMMER, NON-STORAGE","DRY, SUMMER, STORAGE",
                  "COTTONSEED","PIMA","UPLAND"],
     'year':[2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016],
     'freq_desc':"ANNUAL",
     'agg_level_desc':"STATE",
     'reference_period_desc':["YEAR","MARKETING YEAR"],
     'format':'CSV'}
data = requests.get(DOMAIN+GET, params=p_get).content
f.write(data) 
f.close()
print('Successfully export '+filename+'....')

Successfully export survey_price_data_2006-2016_all.csv....


In [16]:
p = pd.read_csv('survey_price_data_2006-2016_all.csv')
y = pd.read_csv('survey_yield_data_2006-2016_all.csv')
p.columns

Index(['source_desc', 'sector_desc', 'group_desc', 'commodity_desc',
       'class_desc', 'prodn_practice_desc', 'util_practice_desc',
       'statisticcat_desc', 'unit_desc', 'short_desc', 'domain_desc',
       'domaincat_desc', 'agg_level_desc', 'state_ansi', 'state_fips_code',
       'state_alpha', 'state_name', 'asd_code', 'asd_desc', 'county_ansi',
       'county_code', 'county_name', 'region_desc', 'zip_5', 'watershed_code',
       'watershed_desc', 'congr_district_code', 'country_code', 'country_name',
       'location_desc', 'year', 'freq_desc', 'begin_code', 'end_code',
       'reference_period_desc', 'week_ending', 'load_time', 'Value', 'CV (%)'],
      dtype='object')

In [30]:
def ref_feature(row):
    f = row['short_desc'].split("-")[:-1]
    f.append(row['state_alpha'])
    f.append(row['year'])
    string = ','.join(str(v) for v in f)
    return string
p.apply (lambda row: ref_feature(row),axis=1)
y.apply (lambda row: ref_feature(row),axis=1)
p['ref_feature'] = p.apply (lambda row: ref_feature(row),axis=1)
y['ref_feature'] = y.apply (lambda row: ref_feature(row),axis=1)
p1 = p[['commodity_desc','short_desc','state_fips_code','year', 'unit_desc','Value','ref_feature']]
p1.columns = ['crop','short_price','state_code','year', 'unit_price','price','ref_feature']
y1 = y[['commodity_desc','short_desc','state_fips_code','year', 'unit_desc','Value','ref_feature']]
y1.columns = ['crop','short_yield','state_code','year','unit_yield', 'yield','ref_feature']
r = pd.merge(y1, p1, how='outer',on='ref_feature')
r.to_csv("r.csv")
r.state_code_x.fillna(r.state_code_y, inplace=True)
del r['state_code_y']
r.crop_x.fillna(r.crop_y, inplace=True)
del r['crop_y']
r.year_x.fillna(r.year_y, inplace=True)
del r['year_y']
r['crop_x'] = r['crop_x'].apply(lambda x: wnl.lemmatize(x.lower()))
r.to_csv("survey_yield_price_2006-2016_all.csv")
p1.to_csv("price.csv")
y1.to_csv("yield.csv")

In [20]:
r1=pd.read_csv("survey_yield_price_2006-2016_all.csv")
r1["yield"] = r1["yield"].apply(lambda x: float(str(x).replace(',','')))
r1["price"] = r1["price"].apply(lambda x: float(str(x).replace(',','')))
r1.columns = ['crop','short_yield','state_code','year','unit_yield','yield',
              'ref_feature','short_price','unit_price','price']
r1.head()

Unnamed: 0,crop,short_yield,state_code,year,unit_yield,yield,ref_feature,short_price,unit_price,price
0,cotton,"COTTON - YIELD, MEASURED IN LB / ACRE",1,2016,LB / ACRE,988.0,"COTTON ,AL,2016",,,
1,cotton,"COTTON - YIELD, MEASURED IN LB / ACRE",1,2015,LB / ACRE,866.0,"COTTON ,AL,2015",,,
2,cotton,"COTTON - YIELD, MEASURED IN LB / ACRE",1,2014,LB / ACRE,901.0,"COTTON ,AL,2014",,,
3,cotton,"COTTON - YIELD, MEASURED IN LB / ACRE",1,2013,LB / ACRE,789.0,"COTTON ,AL,2013",,,
4,cotton,"COTTON - YIELD, MEASURED IN LB / ACRE",1,2012,LB / ACRE,946.0,"COTTON ,AL,2012",,,


In [21]:
grouped = r1.groupby(["year","state_code","crop"])
def wavg(group):
    d = group['price']
    w = group['yield']
    return (d*w).sum()/w.sum()
group_p = grouped.apply(wavg).reset_index()
group_p.columns = ['year','state_code','crop','weighted_price']
group_y = grouped['yield'].max().reset_index()
group_y.columns = ['year','state_code','crop','total_yield']
r2 = pd.merge(group_y,group_p, how='outer',on=['year','state_code','crop'])

  """


In [22]:
crop_ref = pd.read_excel('dep ratio and crop code.xlsx')
state_ref = pd.read_excel('state_code_ref.xlsx')
result = pd.merge(r2, crop_ref, how='left', on='crop')
result = pd.merge(result, state_ref, how='left', on='state_code')
result['state_code'] = result['state_code'].astype(int)
result["state_code_1"] = result["state_code"].map("{:02}".format)
result["crop_code_1"] = result["crop_code"].map("{:03}".format)
result['st_crop']=result["state_code_1"].map(str) + result["crop_code_1"]
table1 = result[['year','st_crop','crop','crop_code_1','state','state_code_1',
               'weighted_price','total_yield','D','PH','DH','PN','DN']]
table1.to_excel('survey_yield_price_2006-2016_all_final.xlsx')

In [23]:
g = pd.read_csv('all_states_gis.csv')
g1 = g[['ACRES', 'AREA', 'AREA_ACRES', 'COUNT', 'RasterName','VALUE']]
g1.columns = ['ACRES', 'AREA', 'AREA_ACRES', 'COUNT', 'RasterName','crop_code']
g1['RasterName'] = g1['RasterName'].str.lower()
g1.ACRES.fillna(g1.AREA, inplace=True)

state_ref = pd.read_excel('state_code_ref.xlsx')
state_ref['RasterName'] = state_ref['state'].str.lower()
g2 = pd.merge(g1, state_ref, how='inner', on='RasterName')
g2['state_code'] = g2['state_code'].map("{:02}".format)
g2["crop_code"] = g2["crop_code"].map("{:03}".format)
g2['st_crop']=g2["state_code"].map(str) + g2["crop_code"]
g3 = g2[["ACRES","COUNT","st_crop"]]
g3.to_excel('all_states_gis_cleaned.xlsx')
table2 = pd.merge(table1, g3, how='left', on='st_crop')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [24]:
rev = pd.read_excel('final table for join.xlsx')
rev1 = rev[['STID', 'Value','TRevenue','acres',
            'value/acre', 'Av_12_16', 'Dol_HBpAc', 'Dol_NPpAc']]
rev1.columns = ['state_code', 'crop_code','f_TRevenue','f_acres',
            'f_value/acre', 'f_Av_12_16', 'f_Dol_HBpAc', 'f_Dol_NPpAc']
rev1['state_code'] = rev1['state_code'].map("{:02}".format)
rev1["crop_code"] = rev1["crop_code"].map("{:03}".format)
rev1['st_crop']=rev1["state_code"].map(str) + rev1["crop_code"]
table3 = pd.merge(table2, rev1, how='left', on='st_crop')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [25]:
table3.to_excel('survey_yield_price_2006-2016_all_final_withRev.xlsx')