## <span style=color:blue>This notebook is fetching WinterWheat yields for an ML pipeline </spann>

<span style=color:blue>It pulls from USDA NASS.</span>


In [104]:
# This useful if I want to give unique names to directories or files
import datetime
def curr_timestamp():
    current_datetime = datetime.datetime.now()
    formatted_datetime = current_datetime.strftime("%Y-%m-%d_%H-%M-%S")
    return formatted_datetime

### <span style=color:blue> Accessing USDA NASS, following code from https://towardsdatascience.com/harvest-and-analyze-agricultural-data-with-the-usda-nass-api-python-and-tableau-a6af374b8138.  In first cell below we define a class for interacting with the NASS QuickStats API, and in second cell we illustrate how to invoke that class </span>

In [105]:
# from https://towardsdatascience.com/harvest-and-analyze-agricultural-data-with-the-usda-nass-api-python-and-tableau-a6af374b8138
# with edits

#   Name:           c_usda_quick_stats.py
#   Author:         Randy Runtsch
#   Date:           March 29, 2022
#   Project:        Query USDA QuickStats API
#   Author:         Randall P. Runtsch
#
#   Description:    Query the USDA QuickStats api_GET API with a specified set of 
#                   parameters. Write the retrieved data, in CSV format, to a file.
#
#   See Quick Stats (NASS) API user guide:  https://quickstats.nass.usda.gov/api
#   Request a QuickStats API key here:      https://quickstats.nass.usda.gov/api#param_define
#
#   Attribution: This product uses the NASS API but is not endorsed or certified by NASS.
#
#   Changes
#

import urllib.request
from urllib.error import HTTPError
from requests.utils import requote_uri
import requests

# Retrieve NASS API key from environment variables (you have to get your own)
import os
#my_NASS_API_key = os.getenv('NASS_API_KEY')
my_NASS_API_key = 'CECCBC6B-9398-356F-9BCC-C326CBB2DFB4'

class c_usda_quick_stats:

    def __init__(self):

        # Set the USDA QuickStats API key, API base URL, and output file path where CSV files will be written. 

        # self.api_key = 'PASTE_YOUR_API_KEY_HERE'
        self.api_key = my_NASS_API_key

        self.base_url_api_get = 'http://quickstats.nass.usda.gov/api/api_GET/?key=' \
                                + self.api_key + '&'

    def get_data(self, parameters, file_path, file_name):

        # Call the api_GET api with the specified parameters. 
        # Write the CSV data to the specified output file.

        # Create the full URL and retrieve the data from the Quick Stats server.
        
        full_url = self.base_url_api_get + parameters        
        print(full_url)

        try:
            s_result = urllib.request.urlopen(full_url)
            # print(type(s_result))
            print(s_result.status, s_result.reason)
            # print(s_result.status_code)
            s_text = s_result.read().decode('utf-8')

            # Create the output file and write the CSV data records to the file.

            s_file_name = file_path + file_name
            o_file = open(s_file_name, "w", encoding="utf8")
            o_file.write(s_text)
            o_file.close()
        except HTTPError as error:
            print(error.code, error.reason)
        except requests.exceptions.RequestException as e:
            print(f"An error occurred while fetching the data: {e}")
        except ValueError as e:
            print(f"Failed to parse the response data: {e}")
        except:
            print(f"Failed because of unknown exception; perhaps the USDA NASS site is down")


<span style=color:blue>First, a test query based on Randall Runtsch...    </span>

In [106]:
# from https://towardsdatascience.com/harvest-and-analyze-agricultural-data-with-the-usda-nass-api-python-and-tableau-a6af374b8138
# with edits

#   Date:           March 29, 2022
#   Project:        Program controller to query USDA QuickStats API
#   Author:         Randall P. Runtsch
#
#   Description:    Create an instance of the c_usda_quick_stats class. Call it with
#                   the desired search parameter and output file name.
#
#   Attribution: This product uses the NASS API but is not endorsed or certified by NASS.
#
#   Changes
#

import sys
import urllib.parse

#output_dir = '/Users/rick/AG-CODE--v03/USDA-NASS--v01/OUTPUTS/'
output_dir = '/Users/jinholee/Desktop_local/2023_Spring_FoodSecurity/HW3/output/'

# Create a string with search parameters, then create an instance of
# the c_usda_quick_stats class and use that to fetch data from QuickStats
# and write it to a file

# the QuickStats site is very senstivite to how the full URL is built up.
# For example, the following spec for the parameters works
# But if you replace the line "'&unit_desc=ACRES' + \" with
# the line "'&' + urllib.parse.quote('unit_desc-ACRES')"
# then the site responds saying that you have exceeded the 50,000 record limit for one query

parameters =    'source_desc=SURVEY' +  \
                '&' + urllib.parse.quote('sector_desc=FARMS & LANDS & ASSETS') + \
                '&' + urllib.parse.quote('commodity_desc=FARM OPERATIONS') + \
                '&' + urllib.parse.quote('statisticcat_desc=AREA OPERATED') + \
                '&unit_desc=ACRES' + \
                '&freq_desc=ANNUAL' + \
                '&reference_period_desc=YEAR' + \
                '&year__GE=1997' + \
                '&agg_level_desc=NATIONAL' + \
                '&' + urllib.parse.quote('state_name=US TOTAL') + \
                '&format=CSV'

stats = c_usda_quick_stats()

# Including curr_timestamp() into file name to keep outputs separated during development/exploration
s_json = stats.get_data(parameters, output_dir, 'national_farm_survey_acres_ge_1997_' + curr_timestamp() + '.csv')

http://quickstats.nass.usda.gov/api/api_GET/?key=CECCBC6B-9398-356F-9BCC-C326CBB2DFB4&source_desc=SURVEY&sector_desc%3DFARMS%20%26%20LANDS%20%26%20ASSETS&commodity_desc%3DFARM%20OPERATIONS&statisticcat_desc%3DAREA%20OPERATED&unit_desc=ACRES&freq_desc=ANNUAL&reference_period_desc=YEAR&year__GE=1997&agg_level_desc=NATIONAL&state_name%3DUS%20TOTAL&format=CSV
200 OK


<span style=color:blue>Now a query that fetches useful soybean yield data.  I am focused on the top Kansas states in the US, and on the years 2003 to 2022.   </span>

In [107]:
import sys
import urllib.parse

output_dir = '/Users/jinholee/Desktop_local/2023_Spring_FoodSecurity/HW3/output/USDA-NASS--v01/'

# Create a string with search parameters, then create an instance of
# the c_usda_quick_stats class and use that to fetch data from QuickStats
# and write it to a file

# It took a while to get the parameter names just right...
#   The parameters names are listed in
#      https://quickstats.nass.usda.gov/param_define
#   (some additional resources in https://quickstats.nass.usda.gov/tutorials)
#   Also, look at the column names that show up in the csv files that you get back
parameters =    'source_desc=SURVEY' +  \
                '&sector_desc=CROPS' + \
                '&' + urllib.parse.quote('group_desc=FIELD CROPS') + \
                '&commodity_desc=WHEAT' + \
                '&statisticcat_desc=YIELD' + \
                '&geographic_level=STATE' + \
                '&agg_level_desc=COUNTY' + \
                '&state_name=KANSAS' + \
                '&year__GE=2003' + \
                '&year__LE=2023' + \
                '&format=CSV'

stats = c_usda_quick_stats()

# holding this timestamp; we may used it to import the created csv file
latest_curr_timestamp = curr_timestamp()
filename = 'wheat_yield_data__' + latest_curr_timestamp + '.csv'

# Including curr_timestamp() into file name to keep outputs separated during development/exploration
stats.get_data(parameters, output_dir, 'wheat_yield_data__' + latest_curr_timestamp + '.csv')


http://quickstats.nass.usda.gov/api/api_GET/?key=CECCBC6B-9398-356F-9BCC-C326CBB2DFB4&source_desc=SURVEY&sector_desc=CROPS&group_desc%3DFIELD%20CROPS&commodity_desc=WHEAT&statisticcat_desc=YIELD&geographic_level=STATE&agg_level_desc=COUNTY&state_name=KANSAS&year__GE=2003&year__LE=2023&format=CSV
200 OK


### <span style=color:blue>After inspecting the output we see that there is double counting.  In particular, see the columns for "short_desc".  So, we will drop all records with short_desc != "SOYBEANS - YIELD, MEASURED IN BU / ACRE"</span>

In [108]:
import pandas as pd

#output_dir = '/Users/rick/AG-CODE--v03/USDA-NASS--v01/OUTPUTS/'
output_dir = '/Users/jinholee/Desktop_local/2023_Spring_FoodSecurity/HW3/output/USDA-NASS--v01/'

df = pd.read_csv(output_dir + filename)
#print(df.head())
print(len(df))

df1 = df[['short_desc']].drop_duplicates()
#print(df1.head(10))
print(len(df))

# keep only records about full yield
#df = df[df['short_desc'].isin(['WHEAT - YIELD, MEASURED IN BU / ACRE', 'WHEAT, WINTER - YIELD, MEASURED IN BU / ACRE'])]

## We decided to just select Winter Wheat. Because there are whole wheat yiled data just from 2003 ~ 2007
df = df[df['short_desc'].isin(['WHEAT, WINTER - YIELD, MEASURED IN BU / ACRE'])]
print(len(df))
# 10295

print()

# found some bad_county_names by visual inspection of the csv
bad_county_names = ['OTHER COUNTIES', 'OTHER (COMBINED) COUNTIES']
df = df[~df.county_name.isin(bad_county_names)]

print(len(df))
# 9952

print()

df2 = df[['state_name','county_name']].drop_duplicates()
print(len(df2))
# 559

# Note: using SQL I found that of the 559 state-county pairs total:
#          212 state-county pairs have data for all 20 years
#          347 state-county pairs have data for < 20 years
#
#          486 have year 2022
#          418 have year 2021
#          514 have year 2020
# I will live with that

# cleaning up a column name
df = df.rename(columns={'Value': 'yield'})

#output_dir = '/Users/rick/AG-CODE--v03/USDA-NASS--v01/OUTPUTS/'
output_file = 'repaired_yield__' + curr_timestamp() + '.csv'

df.to_csv(output_dir + output_file, index=False)

# I imported this table into postgres so that I could use SQL ...

6068
6068
1893

1817

105


### <span style=color:blue>Data cleaning and wranggling </span>
#### <span style=color:blue> Check if each county has yield data for each year    </span>

In [109]:
# Group the data by 'county' and get the minimum and maximum years for each county
county_years = df.groupby('county_name')['year'].agg(['min', 'max'])

start_year = 2003
end_year = 2022

# Iterate over each county and find missing years for each county
missing_years_by_county = {}
for county, years in county_years.iterrows():

    # Generate a list of all years between the start and end years for each county
    all_years = list(range(start_year, end_year + 1))

    # Find any missing years for each county by comparing the list of all years with the available years in the data
    missing_years_by_county[county] = list(set(all_years) - set(df[df['county_name'] == county]['year']))
    

print(missing_years_by_county)
print(len(missing_years_by_county))

{'ALLEN': [2016, 2017, 2018, 2020, 2010, 2012], 'ANDERSON': [2010, 2022], 'ATCHISON': [2016, 2012], 'BARBER': [2016, 2017, 2018, 2019, 2015], 'BARTON': [], 'BOURBON': [2019], 'BROWN': [2018], 'BUTLER': [2017], 'CHASE': [2013], 'CHAUTAUQUA': [2019, 2014], 'CHEROKEE': [2017, 2010], 'CHEYENNE': [2016], 'CLARK': [2014], 'CLAY': [2019, 2015], 'CLOUD': [2016, 2018, 2019, 2015], 'COFFEY': [2008, 2010, 2016, 2014], 'COMANCHE': [2012], 'COWLEY': [2017, 2018], 'CRAWFORD': [2018, 2013, 2014, 2015], 'DECATUR': [2018], 'DICKINSON': [2019], 'DONIPHAN': [2016, 2017, 2018, 2019, 2020, 2021, 2022, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015], 'DOUGLAS': [2016], 'EDWARDS': [2016], 'ELK': [2019, 2020, 2015], 'ELLIS': [2017, 2021], 'ELLSWORTH': [2016, 2013], 'FINNEY': [2016, 2017, 2018, 2019], 'FORD': [2019, 2021, 2022], 'FRANKLIN': [2008, 2013], 'GEARY': [2020], 'GOVE': [2017], 'GRAHAM': [2016, 2017, 2018, 2019, 2013, 2014, 2015], 'GRANT': [2008, 2018], 'GRAY': [2018, 2021], 'GREELEY': [2018], 'GREENW

#### <span style=color:blue> Count omitted yield each counties</span>

In [110]:
lengths = {key: len(value) for key, value in missing_years_by_county.items()}
print(lengths)

{'ALLEN': 6, 'ANDERSON': 2, 'ATCHISON': 2, 'BARBER': 5, 'BARTON': 0, 'BOURBON': 1, 'BROWN': 1, 'BUTLER': 1, 'CHASE': 1, 'CHAUTAUQUA': 2, 'CHEROKEE': 2, 'CHEYENNE': 1, 'CLARK': 1, 'CLAY': 2, 'CLOUD': 4, 'COFFEY': 4, 'COMANCHE': 1, 'COWLEY': 2, 'CRAWFORD': 4, 'DECATUR': 1, 'DICKINSON': 1, 'DONIPHAN': 15, 'DOUGLAS': 1, 'EDWARDS': 1, 'ELK': 3, 'ELLIS': 2, 'ELLSWORTH': 2, 'FINNEY': 4, 'FORD': 3, 'FRANKLIN': 2, 'GEARY': 1, 'GOVE': 1, 'GRAHAM': 7, 'GRANT': 2, 'GRAY': 2, 'GREELEY': 1, 'GREENWOOD': 3, 'HAMILTON': 1, 'HARPER': 1, 'HARVEY': 0, 'HASKELL': 6, 'HODGEMAN': 5, 'JACKSON': 2, 'JEFFERSON': 3, 'JEWELL': 2, 'JOHNSON': 5, 'KEARNY': 3, 'KINGMAN': 1, 'KIOWA': 4, 'LABETTE': 2, 'LANE': 3, 'LEAVENWORTH': 0, 'LINCOLN': 5, 'LINN': 4, 'LOGAN': 5, 'LYON': 2, 'MARION': 1, 'MARSHALL': 5, 'MCPHERSON': 0, 'MEADE': 6, 'MIAMI': 3, 'MITCHELL': 0, 'MONTGOMERY': 1, 'MORRIS': 0, 'MORTON': 1, 'NEMAHA': 6, 'NEOSHO': 2, 'NESS': 2, 'NORTON': 4, 'OSAGE': 0, 'OSBORNE': 3, 'OTTAWA': 0, 'PAWNEE': 0, 'PHILLIPS': 1, 'P

In [111]:
print(len(lengths))

105


### <span style=color:blue> Add missing yield data for every County in Kansas state </span>
#### <span style=color:blue> Manullay added missed yield data using 'linear interpolation' method. But 'WYANDOTTE',  'DONIPHAN' county, there are so many omitted yield data per year. Then we set the yield value as 0.
 </span>

In [112]:
# Retrieves the data set with the omitted yield value added 
archives_dir = '/Users/jinholee/Desktop_local/2023_Spring_FoodSecurity/HW3/output/ML-ARCHIVES--v01/'
output_file = '105Counties_repaired_yield.csv'

#### <span style=color:blue>Saving the csv I'm happy with in a designated place in my "archives" directory</span>

In [113]:
import shutil

output_dir = '/Users/jinholee/Desktop_local/2023_Spring_FoodSecurity/HW3/output/USDA-NASS--v01/'
archives_dir = '/Users/jinholee/Desktop_local/2023_Spring_FoodSecurity/HW3/output/ML-ARCHIVES--v01/'
src_file = output_file # from preceding cell
tgt_file = 'KENSAS_winter_wheat_yield_data_3.csv'

shutil.copyfile(output_dir + src_file, archives_dir + tgt_file)

'/Users/jinholee/Desktop_local/2023_Spring_FoodSecurity/HW3/output/ML-ARCHIVES--v01/KENSAS_winter_wheat_yield_data_3.csv'

#### <span style=color:blue>Projecting out the columns and records that I don't need for my ML learning table, and archiving that result, also. </span>

In [116]:
import pandas as pd

archives_dir = '/Users/jinholee/Desktop_local/2023_Spring_FoodSecurity/HW3/output/ML-ARCHIVES--v01/'
tgt_file = 'KENSAS_winter_wheat_yield_data_3.csv'

df = pd.read_csv(archives_dir + tgt_file)
# print(df.head())

cols_to_keep = ['year','state_name','county_name','yield']
dfml = df[cols_to_keep]

print(dfml.head())
print()
print(dfml.shape[0])
# Note: this particular df has 2050 rows

# checking there are no null values for 'yield':
print(dfml[dfml['yield'].isnull()].head())

tgt_file_01 = 'd_KENSAS_WHEAT_3'
dfml.to_csv(archives_dir + tgt_file_01, index=False)
print('\nwrote file ', archives_dir + tgt_file_01)

   year state_name county_name  yield
0  2022     KANSAS       ALLEN  53.50
1  2021     KANSAS       ALLEN  54.90
2  2020     KANSAS       ALLEN  49.70
3  2019     KANSAS       ALLEN  44.50
4  2018     KANSAS       ALLEN  43.65

2101
Empty DataFrame
Columns: [year, state_name, county_name, yield]
Index: []

wrote file  /Users/jinholee/Desktop_local/2023_Spring_FoodSecurity/HW3/output/ML-ARCHIVES--v01/d_KENSAS_WHEAT_3


In [117]:
sum_by_county = df.groupby('county_name')['yield'].sum()
sum_by_county = sum_by_county.sort_values(ascending=False)
print(sum_by_county)

county_name
BROWN        1057.5
NEMAHA       1022.6
MIAMI         962.8
RICE          959.3
CHEROKEE      951.5
              ...  
HAMILTON      632.7
COMANCHE      632.7
MORTON        618.1
DONIPHAN      245.0
WYANDOTTE     218.0
Name: yield, Length: 105, dtype: float64


In [118]:
# Check total yield volume in Kansas state
sum_by_year = df.groupby('year')['yield'].sum()
sum_by_year = sum_by_year.sort_values(ascending=False)
print(sum_by_year)

year
2003    5327.00
2016    5296.91
2021    5148.80
2017    5015.00
2019    4712.15
2020    4588.90
2012    4445.45
2018    4348.25
2022    4326.00
2010    4276.95
2013    4224.34
2009    4159.50
2005    4051.00
2015    4002.92
2004    3920.00
2008    3803.80
2011    3688.50
2006    3664.00
2014    3577.48
2007    3005.00
Name: yield, dtype: float64


In [119]:
# Checking our data after cleaning and wrangling our data 
county_years = df.groupby('county_name')['year'].agg(['min', 'max'])

start_year = 2003
end_year = 2022

missing_years_by_county = {}
for county, years in county_years.iterrows():

    # Generate a list of all years between the start and end years for each county
    all_years = list(range(start_year, end_year + 1))

    # Find any missing years for each county by comparing the list of all years with the available years in the data
    missing_years_by_county[county] = list(set(all_years) - set(df[df['county_name'] == county]['year']))
    
print(missing_years_by_county)
print(len(missing_years_by_county))

{'ALLEN': [], 'ANDERSON': [], 'ATCHISON': [], 'BARBER': [], 'BARTON': [], 'BOURBON': [], 'BROWN': [], 'BUTLER': [], 'CHASE': [], 'CHAUTAUQUA': [], 'CHEROKEE': [], 'CHEYENNE': [], 'CLARK': [], 'CLAY': [], 'CLOUD': [], 'COFFEY': [], 'COMANCHE': [], 'COWLEY': [], 'CRAWFORD': [], 'DECATUR': [], 'DICKINSON': [], 'DONIPHAN': [], 'DOUGLAS': [], 'EDWARDS': [], 'ELK': [], 'ELLIS': [], 'ELLSWORTH': [], 'FINNEY': [], 'FORD': [], 'FRANKLIN': [], 'GEARY': [], 'GOVE': [], 'GRAHAM': [], 'GRANT': [], 'GRAY': [], 'GREELEY': [], 'GREENWOOD': [], 'HAMILTON': [], 'HARPER': [], 'HARVEY': [], 'HASKELL': [], 'HODGEMAN': [], 'JACKSON': [], 'JEFFERSON': [], 'JEWELL': [], 'JOHNSON': [], 'KEARNY': [], 'KINGMAN': [], 'KIOWA': [], 'LABETTE': [], 'LANE': [], 'LEAVENWORTH': [], 'LINCOLN': [], 'LINN': [], 'LOGAN': [], 'LYON': [], 'MARION': [], 'MARSHALL': [], 'MCPHERSON': [], 'MEADE': [], 'MIAMI': [], 'MITCHELL': [], 'MONTGOMERY': [], 'MORRIS': [], 'MORTON': [], 'NEMAHA': [], 'NEOSHO': [], 'NESS': [], 'NORTON': [], '

## Add data feature - Plant area data from USDA-NASS
### <span style=color:blue>Get Planting progress values from USDA NASS </span>

In [120]:
# Setting
output_dir = '/Users/jinholee/Desktop_local/2023_Spring_FoodSecurity/HW3/output/USDA-NASS--v01/'

In [121]:
parameters =    'source_desc=SURVEY' +  \
                '&sector_desc=CROPS' + \
                '&' + urllib.parse.quote('group_desc=FIELD CROPS') + \
                '&commodity_desc=WHEAT' + \
                '&' + urllib.parse.quote('statisticcat_desc=AREA PLANTED') + \
                '&' + urllib.parse.quote('short_desc=WHEAT, WINTER - ACRES PLANTED') + \
'&geographic_level=STATE' + \
                '&domain_desc=TOTAL' + \
                '&agg_level_desc=COUNTY' + \
                '&state_name=KANSAS' + \
                '&year__GE=2003' + \
                '&year__LE=2023' + \
                '&format=CSV'

stats = c_usda_quick_stats()

# holding this timestamp; we may used it to import the created csv file
latest_curr_timestamp = curr_timestamp()
filename = 'wheat_areaPlanted_data__' + latest_curr_timestamp + '.csv'

# Including curr_timestamp() into file name to keep outputs separated during development/exploration
stats.get_data(parameters, output_dir, 'wheat_areaPlanted_data__' + latest_curr_timestamp + '.csv')


http://quickstats.nass.usda.gov/api/api_GET/?key=CECCBC6B-9398-356F-9BCC-C326CBB2DFB4&source_desc=SURVEY&sector_desc=CROPS&group_desc%3DFIELD%20CROPS&commodity_desc=WHEAT&statisticcat_desc%3DAREA%20PLANTED&short_desc%3DWHEAT%2C%20WINTER%20-%20ACRES%20PLANTED&geographic_level=STATE&domain_desc=TOTAL&agg_level_desc=COUNTY&state_name=KANSAS&year__GE=2003&year__LE=2023&format=CSV
200 OK


In [None]:
output_dir = '/Users/jinholee/Desktop_local/2023_Spring_FoodSecurity/HW3/output/USDA-NASS--v01/'

df_areaPlant = pd.read_csv(output_dir + filename)

In [None]:
bad_county_names = ['OTHER COUNTIES', 'OTHER (COMBINED) COUNTIES']
df_areaPlant = df_areaPlant[~df_areaPlant.county_name.isin(bad_county_names)]
df_areaPlant = df_areaPlant.rename(columns={'Value': 'area_planted'})

df_areaPlant = df_areaPlant[df_areaPlant['short_desc'].isin(['WHEAT, WINTER - ACRES PLANTED'])]

output_file = 'repaired_wheat_areaPlanted_data__' + curr_timestamp() + '.csv'

df_areaPlant.to_csv(output_dir + output_file, index=False)

In [None]:
print(df_areaPlant)

In [None]:
df2_areaPlant = df_areaPlant[['state_name','county_name']].drop_duplicates()
print(df2_areaPlant)
# Total number of counties : 105

In [None]:
# Group the data by 'county' and get the minimum and maximum years for each county
county_years = df_areaPlant.groupby('county_name')['year'].agg(['min', 'max'])

start_year = 2003
end_year = 2022

# Iterate over each county and find missing years for each county
missing_years_by_county = {}
for county, years in county_years.iterrows():

    # Generate a list of all years between the start and end years for each county
    all_years = list(range(start_year, end_year + 1))

    # Find any missing years for each county by comparing the list of all years with the available years in the data
    missing_years_by_county[county] = list(set(all_years) - set(df_areaPlant[df_areaPlant['county_name'] == county]['year']))
    
print(missing_years_by_county)
print(len(missing_years_by_county))

In [126]:
cols_to_keep = ['year','state_name','county_name','area_planted']
df_areaP = df_areaPlant[cols_to_keep]

print(df_areaP.head())
print()
print(df_areaP.shape[0])
# Note: this particular df has 2050 rows

# checking there are no null values for 'yield':
print(df_areaP[df_areaP['area_planted'].isnull()].head())

       year state_name county_name area_planted
10424  2022     KANSAS    CHEYENNE      103,500
10425  2021     KANSAS    CHEYENNE      103,500
10426  2020     KANSAS    CHEYENNE       95,000
10427  2019     KANSAS    CHEYENNE      113,600
10428  2018     KANSAS    CHEYENNE      116,200

1817
Empty DataFrame
Columns: [year, state_name, county_name, area_planted]
Index: []


In [127]:
archives_dir = '/Users/jinholee/Desktop_local/2023_Spring_FoodSecurity/HW3/output/ML-ARCHIVES--v01/'
tgt_file = 'KENSAS_winter_wheat_areaPlant_data.csv'
df_areaP.to_csv(archives_dir + tgt_file, index=False)

In [143]:
import np

merged_df = pd.merge(dfml, df_areaP[['year', 'county_name', 'area_planted']], on=['year', 'county_name'], how='left')
# Rename the merged 'area_planted' column to avoid duplicates
merged_df.rename(columns={'area_planted_y': 'area_planted'}, inplace=True)

# Drop the unnecessary 'area_planted_x' column
merged_df.drop(columns='area_planted_x', inplace=True)

merged_df['area_planted'] = merged_df['area_planted'].str.replace(',', '').fillna(0).astype(np.int64)

print(merged_df)


      year state_name county_name  yield  area_planted
0     2022     KANSAS       ALLEN  53.50         12100
1     2021     KANSAS       ALLEN  54.90         13800
2     2020     KANSAS       ALLEN  49.70             0
3     2019     KANSAS       ALLEN  44.50          4300
4     2018     KANSAS       ALLEN  43.65             0
...    ...        ...         ...    ...           ...
2096  2007     KANSAS   WYANDOTTE  20.00           600
2097  2006     KANSAS   WYANDOTTE  43.00           400
2098  2005     KANSAS   WYANDOTTE  43.00           400
2099  2004     KANSAS   WYANDOTTE  49.00           800
2100  2003     KANSAS   WYANDOTTE  63.00           400

[2101 rows x 5 columns]


In [144]:
archives_dir = '/Users/jinholee/Desktop_local/2023_Spring_FoodSecurity/HW3/output/ML-ARCHIVES--v01/'
tgt_file = 'KENSAS_winter_wheat_yield_plant_data.csv'

merged_df.to_csv(archives_dir + tgt_file, index=False)