# NASS QuickStats data 

<br> Glossary: [here](https://quickstats.nass.usda.gov/src/glossary.pdf)

### <B> This notebook explains step by step detailed process behind the cleaning and pre processing of the NASS QuickStatsdata.
     

# DATA CLEANSING AND PRE PROCESSING

In [3]:
#import packages

import csv
import numpy as np
import pandas as pd
from datetime import date 
from datetime import datetime
import time 
from pandas_profiling import ProfileReport

In [4]:
#get an overview of the dataset by limiting records by 20000

data = pd.read_csv("data/qs.crops_20200429.txt", nrows=20000, sep='\t')
data.head()

Unnamed: 0,SOURCE_DESC,SECTOR_DESC,GROUP_DESC,COMMODITY_DESC,CLASS_DESC,PRODN_PRACTICE_DESC,UTIL_PRACTICE_DESC,STATISTICCAT_DESC,UNIT_DESC,SHORT_DESC,...,LOCATION_DESC,YEAR,FREQ_DESC,BEGIN_CODE,END_CODE,REFERENCE_PERIOD_DESC,WEEK_ENDING,LOAD_TIME,VALUE,CV_%
0,CENSUS,CROPS,HORTICULTURE,"FLOWERING PLANTS, POTTED","INDOOR USE, HYDRANGEA",ALL PRODUCTION PRACTICES,"RETAIL, POTS",SALES,POTS,"FLOWERING PLANTS, POTTED, INDOOR USE, HYDRANGE...",...,ALABAMA,2009,ANNUAL,0,0,YEAR,,2015-01-31 00:00:00,820,
1,CENSUS,CROPS,HORTICULTURE,"FLOWERING PLANTS, POTTED","INDOOR USE, HYDRANGEA",ALL PRODUCTION PRACTICES,"RETAIL, POTS",SALES,POTS,"FLOWERING PLANTS, POTTED, INDOOR USE, HYDRANGE...",...,ARKANSAS,2014,ANNUAL,0,0,YEAR,,2015-01-31 00:00:00,(D),(D)
2,CENSUS,CROPS,HORTICULTURE,"FLOWERING PLANTS, POTTED","INDOOR USE, HYDRANGEA",ALL PRODUCTION PRACTICES,"RETAIL, POTS",SALES,POTS,"FLOWERING PLANTS, POTTED, INDOOR USE, HYDRANGE...",...,CALIFORNIA,2009,ANNUAL,0,0,YEAR,,2015-01-31 00:00:00,10922,
3,CENSUS,CROPS,HORTICULTURE,"FLOWERING PLANTS, POTTED","INDOOR USE, HYDRANGEA",ALL PRODUCTION PRACTICES,"RETAIL, POTS",SALES,POTS,"FLOWERING PLANTS, POTTED, INDOOR USE, HYDRANGE...",...,CALIFORNIA,2014,ANNUAL,0,0,YEAR,,2015-01-31 00:00:00,885,7.4
4,CENSUS,CROPS,HORTICULTURE,"FLOWERING PLANTS, POTTED","INDOOR USE, HYDRANGEA",ALL PRODUCTION PRACTICES,"RETAIL, POTS",SALES,POTS,"FLOWERING PLANTS, POTTED, INDOOR USE, HYDRANGE...",...,COLORADO,2014,ANNUAL,0,0,YEAR,,2015-01-31 00:00:00,(D),(D)


### Data Chunking

In [5]:
# step1.dividing data into chunks 
# step2. filtering out data before 1990

file_path = 'data/qs.crops_20200429.txt'
chunksize = 1000
df_chunks = []

for df in pd.read_csv(file_path, chunksize=chunksize, iterator=True, sep='\t',low_memory=False ):
    df_f = df.loc[(df['YEAR']>=1990)]
    df_chunks.append(df_f)
    #print(df_chunks)
master_df = pd.concat(df_chunks)
print(master_df.head())

  SOURCE_DESC SECTOR_DESC    GROUP_DESC            COMMODITY_DESC  \
0      CENSUS       CROPS  HORTICULTURE  FLOWERING PLANTS, POTTED   
1      CENSUS       CROPS  HORTICULTURE  FLOWERING PLANTS, POTTED   
2      CENSUS       CROPS  HORTICULTURE  FLOWERING PLANTS, POTTED   
3      CENSUS       CROPS  HORTICULTURE  FLOWERING PLANTS, POTTED   
4      CENSUS       CROPS  HORTICULTURE  FLOWERING PLANTS, POTTED   

              CLASS_DESC       PRODN_PRACTICE_DESC UTIL_PRACTICE_DESC  \
0  INDOOR USE, HYDRANGEA  ALL PRODUCTION PRACTICES       RETAIL, POTS   
1  INDOOR USE, HYDRANGEA  ALL PRODUCTION PRACTICES       RETAIL, POTS   
2  INDOOR USE, HYDRANGEA  ALL PRODUCTION PRACTICES       RETAIL, POTS   
3  INDOOR USE, HYDRANGEA  ALL PRODUCTION PRACTICES       RETAIL, POTS   
4  INDOOR USE, HYDRANGEA  ALL PRODUCTION PRACTICES       RETAIL, POTS   

  STATISTICCAT_DESC UNIT_DESC  \
0             SALES      POTS   
1             SALES      POTS   
2             SALES      POTS   
3             

In [6]:
# get an overview of data commodity to filter out crops

master_df['COMMODITY_DESC'].unique()

array(['FLOWERING PLANTS, POTTED', 'BARLEY', 'BEANS', 'CANOLA', 'COFFEE',
       'CORN', 'COTTON', 'FLAXSEED', 'HAY & HAYLAGE', 'HAY', 'HOPS',
       'LENTILS', 'MAPLE SYRUP', 'MILLET', 'MUSTARD', 'OATS', 'PEANUTS',
       'PEAS', 'POTATOES', 'RAPESEED', 'RICE', 'RYE', 'SAFFLOWER',
       'SORGHUM', 'SOYBEANS', 'SUGARBEETS', 'SUNFLOWER', 'SWEET POTATOES',
       'TARO', 'TOBACCO', 'WHEAT', 'ARTICHOKES', 'ASPARAGUS', 'BROCCOLI',
       'BRUSSELS SPROUTS', 'CABBAGE', 'CAULIFLOWER', 'CELERY',
       'SWEET CORN', 'PICKLES', 'GARLIC', 'GINGER ROOT', 'GREENS', 'OKRA',
       'ONIONS', 'PEPPERS', 'PUMPKINS', 'RADISHES', 'SQUASH',
       'STRAWBERRIES', 'APPLES', 'APRICOTS', 'BOYSENBERRIES',
       'RASPBERRIES', 'CHERRIES', 'GRAPES', 'ALMONDS', 'PISTACHIOS',
       'WALNUTS', 'PAPAYAS', 'PEACHES', 'GRAPEFRUIT', 'K-EARLY CITRUS',
       'LEMONS', 'LIMES', 'ORANGES', 'TEMPLES', 'TANGELOS', 'TANGERINES',
       'FEED GRAINS & HAY', 'CROPS, OTHER', 'FOOD GRAINS',
       'FRUIT & TREE NUT TOTALS'

### Data Profiling

Data profiling is the process of reviewing source data, understanding structure, content and interrelationships, and identifying potential for data projects

In [7]:
#filtering out the required crop type

crops = ['WHEAT', 'RICE','COTTON','CORN','SOYBEANS']
master_df.COMMODITY_DESC.isin(crops)
filter_crops= master_df[master_df.COMMODITY_DESC.isin(crops)]
filter_crops.head()

Unnamed: 0,SOURCE_DESC,SECTOR_DESC,GROUP_DESC,COMMODITY_DESC,CLASS_DESC,PRODN_PRACTICE_DESC,UTIL_PRACTICE_DESC,STATISTICCAT_DESC,UNIT_DESC,SHORT_DESC,...,LOCATION_DESC,YEAR,FREQ_DESC,BEGIN_CODE,END_CODE,REFERENCE_PERIOD_DESC,WEEK_ENDING,LOAD_TIME,VALUE,CV_%
706737,SURVEY,CROPS,FIELD CROPS,CORN,ALL CLASSES,IRRIGATED,ALL UTILIZATION PRACTICES,AREA PLANTED,ACRES,"CORN, IRRIGATED - ACRES PLANTED",...,"COLORADO, NORTHEAST, BOULDER",2002,ANNUAL,0,0,YEAR,,2012-01-01 00:00:00,3500,
706738,SURVEY,CROPS,FIELD CROPS,CORN,ALL CLASSES,IRRIGATED,ALL UTILIZATION PRACTICES,AREA PLANTED,ACRES,"CORN, IRRIGATED - ACRES PLANTED",...,"COLORADO, NORTHEAST, BOULDER",1997,ANNUAL,0,0,YEAR,,2012-01-01 00:00:00,6800,
706741,SURVEY,CROPS,FIELD CROPS,CORN,ALL CLASSES,IRRIGATED,ALL UTILIZATION PRACTICES,AREA PLANTED,ACRES,"CORN, IRRIGATED - ACRES PLANTED",...,"COLORADO, NORTHEAST, BOULDER",2010,ANNUAL,0,0,YEAR,,2015-06-03 15:08:31,3000,
706742,SURVEY,CROPS,FIELD CROPS,CORN,ALL CLASSES,IRRIGATED,ALL UTILIZATION PRACTICES,AREA PLANTED,ACRES,"CORN, IRRIGATED - ACRES PLANTED",...,"COLORADO, NORTHEAST, BOULDER",1995,ANNUAL,0,0,YEAR,,2012-01-01 00:00:00,7000,
706743,SURVEY,CROPS,FIELD CROPS,CORN,ALL CLASSES,IRRIGATED,ALL UTILIZATION PRACTICES,AREA PLANTED,ACRES,"CORN, IRRIGATED - ACRES PLANTED",...,"COLORADO, NORTHEAST, BOULDER",1996,ANNUAL,0,0,YEAR,,2012-01-01 00:00:00,7000,


In [8]:
filter_crops.shape

(3833173, 39)

In [9]:
filter_crops.columns

Index(['SOURCE_DESC', 'SECTOR_DESC', 'GROUP_DESC', 'COMMODITY_DESC',
       'CLASS_DESC', 'PRODN_PRACTICE_DESC', 'UTIL_PRACTICE_DESC',
       'STATISTICCAT_DESC', 'UNIT_DESC', 'SHORT_DESC', 'DOMAIN_DESC',
       'DOMAINCAT_DESC', 'AGG_LEVEL_DESC', 'STATE_ANSI', 'STATE_FIPS_CODE',
       'STATE_ALPHA', 'STATE_NAME', 'ASD_CODE', 'ASD_DESC', 'COUNTY_ANSI',
       'COUNTY_CODE', 'COUNTY_NAME', 'REGION_DESC', 'ZIP_5', 'WATERSHED_CODE',
       'WATERSHED_DESC', 'CONGR_DISTRICT_CODE', 'COUNTRY_CODE', 'COUNTRY_NAME',
       'LOCATION_DESC', 'YEAR', 'FREQ_DESC', 'BEGIN_CODE', 'END_CODE',
       'REFERENCE_PERIOD_DESC', 'WEEK_ENDING', 'LOAD_TIME', 'VALUE', 'CV_%'],
      dtype='object')

In [10]:
filter_crops.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3833173 entries, 706737 to 19112390
Data columns (total 39 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   SOURCE_DESC            object 
 1   SECTOR_DESC            object 
 2   GROUP_DESC             object 
 3   COMMODITY_DESC         object 
 4   CLASS_DESC             object 
 5   PRODN_PRACTICE_DESC    object 
 6   UTIL_PRACTICE_DESC     object 
 7   STATISTICCAT_DESC      object 
 8   UNIT_DESC              object 
 9   SHORT_DESC             object 
 10  DOMAIN_DESC            object 
 11  DOMAINCAT_DESC         object 
 12  AGG_LEVEL_DESC         object 
 13  STATE_ANSI             float64
 14  STATE_FIPS_CODE        int64  
 15  STATE_ALPHA            object 
 16  STATE_NAME             object 
 17  ASD_CODE               float64
 18  ASD_DESC               object 
 19  COUNTY_ANSI            float64
 20  COUNTY_CODE            float64
 21  COUNTY_NAME            object 
 22  REGION_DESC 

In [9]:
filter_crops.describe()

Unnamed: 0,STATE_ANSI,STATE_FIPS_CODE,ASD_CODE,COUNTY_ANSI,COUNTY_CODE,ZIP_5,WATERSHED_CODE,CONGR_DISTRICT_CODE,COUNTRY_CODE,YEAR,BEGIN_CODE,END_CODE
count,3739526.0,3833173.0,2218381.0,1840741.0,1980210.0,300614.0,3833173.0,0.0,3833173.0,3833173.0,3833173.0,3833173.0
mean,30.51904,31.92874,49.30453,97.26272,160.6932,51463.057156,20015.33,,9000.0,2005.906,6.55122,6.551396
std,14.41217,17.46798,26.22476,86.81253,245.1974,19956.443565,487812.6,,0.0,8.926106,12.85902,12.85903
min,1.0,0.0,10.0,1.0,1.0,1002.0,0.0,,9000.0,1990.0,0.0,0.0
25%,20.0,20.0,30.0,37.0,41.0,38341.0,0.0,,9000.0,1998.0,0.0,0.0
50%,30.0,30.0,50.0,79.0,85.0,53523.0,0.0,,9000.0,2007.0,0.0,0.0
75%,42.0,45.0,70.0,131.0,149.0,65330.0,0.0,,9000.0,2014.0,0.0,0.0
max,56.0,99.0,98.0,810.0,999.0,99999.0,20070000.0,,9000.0,2020.0,53.0,53.0


In [11]:
# describing the categortical variable

filter_crops.COUNTY_NAME.describe()

count                       1969586
unique                         1749
top       OTHER (COMBINED) COUNTIES
freq                         128813
Name: COUNTY_NAME, dtype: object

In [12]:
# list of columns

list_cols=[]
list_cols=list(filter_crops)
list_cols

['SOURCE_DESC',
 'SECTOR_DESC',
 'GROUP_DESC',
 'COMMODITY_DESC',
 'CLASS_DESC',
 'PRODN_PRACTICE_DESC',
 'UTIL_PRACTICE_DESC',
 'STATISTICCAT_DESC',
 'UNIT_DESC',
 'SHORT_DESC',
 'DOMAIN_DESC',
 'DOMAINCAT_DESC',
 'AGG_LEVEL_DESC',
 'STATE_ANSI',
 'STATE_FIPS_CODE',
 'STATE_ALPHA',
 'STATE_NAME',
 'ASD_CODE',
 'ASD_DESC',
 'COUNTY_ANSI',
 'COUNTY_CODE',
 'COUNTY_NAME',
 'REGION_DESC',
 'ZIP_5',
 'WATERSHED_CODE',
 'WATERSHED_DESC',
 'CONGR_DISTRICT_CODE',
 'COUNTRY_CODE',
 'COUNTRY_NAME',
 'LOCATION_DESC',
 'YEAR',
 'FREQ_DESC',
 'BEGIN_CODE',
 'END_CODE',
 'REFERENCE_PERIOD_DESC',
 'WEEK_ENDING',
 'LOAD_TIME',
 'VALUE',
 'CV_%']

In [13]:
#check for unique values of all the columns, it helps filter out the values of columns

for cols in list_cols:
    print( cols,':',  filter_crops[cols].unique())

SOURCE_DESC : ['SURVEY' 'CENSUS']
SECTOR_DESC : ['CROPS']
GROUP_DESC : ['FIELD CROPS']
COMMODITY_DESC : ['CORN' 'COTTON' 'RICE' 'SOYBEANS' 'WHEAT']
CLASS_DESC : ['ALL CLASSES' 'PIMA' 'COTTONSEED' 'UPLAND' 'LONG GRAIN' 'MEDIUM GRAIN'
 'SHORT GRAIN' 'SPRING, DURUM' 'SPRING, (EXCL DURUM)' 'WINTER' 'RED, HARD'
 'SPRING, RED, HARD' 'WHITE' 'WINTER, RED, HARD' 'WINTER, RED, SOFT'
 'MEDIUM-SHORT GRAIN' 'WINTER, WHITE, HARD' 'SPRING, WHITE, SOFT'
 'SPRING, WHITE, HARD' 'WINTER, WHITE, SOFT' 'SPRING, WHITE'
 'WINTER, WHITE' 'ROUGH, LONG GRAIN' 'ROUGH, MEDIUM GRAIN'
 'ROUGH, SHORT GRAIN' 'ROUGH' 'MILLED, BROKEN, BREWERS' 'MILLED, BROKEN'
 'MILLED' 'MILLED, WHOLE' 'MILLED, BROKEN, SCREENINGS'
 'MILLED, BROKEN, SECOND HEADS' 'MILLED, WHOLE, LONG GRAIN'
 'MILLED, WHOLE, MEDIUM GRAIN' 'MILLED, WHOLE, SHORT GRAIN'
 'LINT TO LINT & SEED' '(EXCL DURUM)' '(EXCL UPLAND)'
 'TRADITIONAL OR INDIAN']
PRODN_PRACTICE_DESC : ['IRRIGATED' 'NON-IRRIGATED' 'ALL PRODUCTION PRACTICES'
 'FOLLOWING ANOTHER CROP (DOUBL

STATE_FIPS_CODE : [ 8 10 20 30 31 35 38 40 46 48 56  1  4  5  6 12 13 22 28 29 37 45 47 51
 98 99  0 17  9 15 16 18 19 21 23 24 25 26 27 33 34 36 39 41 42 44 49 50
 53 54 55  2 32]
STATE_ALPHA : ['CO' 'DE' 'KS' 'MT' 'NE' 'NM' 'ND' 'OK' 'SD' 'TX' 'WY' 'AL' 'AZ' 'AR'
 'CA' 'FL' 'GA' 'LA' 'MS' 'MO' 'NC' 'SC' 'TN' 'VA' 'OT' 'US' nan 'IL' 'CT'
 'HI' 'ID' 'IN' 'IA' 'KY' 'ME' 'MD' 'MA' 'MI' 'MN' 'NH' 'NJ' 'NY' 'OH'
 'OR' 'PA' 'RI' 'UT' 'VT' 'WA' 'WV' 'WI' 'AK' 'NV']
STATE_NAME : ['COLORADO' 'DELAWARE' 'KANSAS' 'MONTANA' 'NEBRASKA' 'NEW MEXICO'
 'NORTH DAKOTA' 'OKLAHOMA' 'SOUTH DAKOTA' 'TEXAS' 'WYOMING' 'ALABAMA'
 'ARIZONA' 'ARKANSAS' 'CALIFORNIA' 'FLORIDA' 'GEORGIA' 'LOUISIANA'
 'MISSISSIPPI' 'MISSOURI' 'NORTH CAROLINA' 'SOUTH CAROLINA' 'TENNESSEE'
 'VIRGINIA' 'OTHER STATES' 'US TOTAL' nan 'ILLINOIS' 'CONNECTICUT'
 'HAWAII' 'IDAHO' 'INDIANA' 'IOWA' 'KENTUCKY' 'MAINE' 'MARYLAND'
 'MASSACHUSETTS' 'MICHIGAN' 'MINNESOTA' 'NEW HAMPSHIRE' 'NEW JERSEY'
 'NEW YORK' 'OHIO' 'OREGON' 'PENNSYLVANIA' 'RHO

WATERSHED_DESC : [nan 'CALIFORNIA, LAGUNA-SAN DIEGO COASTAL' 'CALIFORNIA, SOUTHERN MOJAVE'
 'CALIFORNIA, SALTON SEA' 'CALIFORNIA' 'CALIFORNIA, LOWER SACRAMENTO'
 'CALIFORNIA, TULARE-BUENA VISTA LAKES' 'CALIFORNIA, SAN JOAQUIN'
 'CALIFORNIA, SAN FRANCISCO BAY' 'RIO GRANDE, RIO GRANDE-ELEPHANT BUTTE'
 'RIO GRANDE, RIO GRANDE-CABALLO' 'RIO GRANDE, MINBRES'
 'RIO GRANDE, RIO GRANDE-FORT QUITMAN' 'RIO GRANDE, RIO GRANDE-AMISTAD'
 'RIO GRANDE, DEVILS' 'RIO GRANDE, RIO GRANDE CLOSED BASINS'
 'RIO GRANDE, UPPER PECOS' 'RIO GRANDE, LOWER PECOS'
 'RIO GRANDE, RIO GRANDE-FALCON' 'RIO GRANDE, LOWER RIO GRANDE'
 'LOWER COLORADO' 'LOWER COLORADO, LOWER COLORADO-LAKE MEAD'
 'LOWER COLORADO, LOWER COLORADO' 'LOWER COLORADO, BILL WILLIAMS'
 'LOWER COLORADO, UPPER GILA' 'LOWER COLORADO, MIDDLE GILA'
 'LOWER COLORADO, SAN PEDRO-WILLCOX' 'LOWER COLORADO, SANTA CRUZ'
 'LOWER COLORADO, SALT' 'LOWER COLORADO, VERDE'
 'LOWER COLORADO, LOWER GILA-AGUA FRIA' 'LOWER COLORADO, LOWER GILA'
 'LOWER COLORADO, RIO SO

COUNTRY_NAME : ['UNITED STATES']
LOCATION_DESC : ['COLORADO, NORTHEAST, BOULDER' 'COLORADO, NORTHEAST, JEFFERSON'
 'COLORADO, NORTHEAST, LARIMER' ... 60958 41021 85131]
YEAR : [2002 1997 2010 1995 1996 1998 1999 2001 2000 1993 1992 2003 1991 1994
 1990 2007 2006 2008 2005 2004 2016 2011 2017 2012 2013 2018 2014 2015
 2019 2009 2020]
FREQ_DESC : ['ANNUAL' 'POINT IN TIME' 'MONTHLY' 'WEEKLY']
BEGIN_CODE : [ 0  1  2  5 10 11 12  3  9  8  4  7  6 27 28 29 17 18 19 20 21 22 23 24
 25 26 30 31 32 33 34 35 36 37 38 39 40 41 42 43 16 15 44 45 46 13 14 47
 48 49 51 50 52 53]
END_CODE : [ 0  1  2  5 10 11 12  3  9  8  4  7  6 27 28 29 17 18 19 20 21 22 23 24
 25 26 30 31 32 33 34 35 36 37 38 39 40 41 42 43 16 15 44 45 46 13 14 47
 48 49 51 50 52 53]
REFERENCE_PERIOD_DESC : ['YEAR' 'FIRST OF JAN' 'FIRST OF FEB' 'FIRST OF MAY' 'MID OCT'
 'FIRST OF NOV' 'MID NOV' 'FIRST OF DEC' 'MID DEC' 'MID JAN'
 'FIRST OF MAR' 'MAY' 'FIRST OF OCT' 'MID SEP' 'FIRST OF SEP'
 'FIRST OF AUG' 'YEAR - AUG FORECAST' 'YE

VALUE : ['3,500' '6,800' '3,000' ... '1,374,134' '165,865' 913]
CV_% : [nan '20.6' '32.7' ... '99.0' '95.3' '99.2']


In [135]:
#area planted, area harvested, yield and production by crop

category = ['AREA PLANTED','AREA HARVESTED','PRODUCTION','YIELD']
filter_crops.STATISTICCAT_DESC.isin(category)

filter_category= filter_crops[filter_crops.STATISTICCAT_DESC.isin(category)]
filter_category.head()

Unnamed: 0,SOURCE_DESC,SECTOR_DESC,GROUP_DESC,COMMODITY_DESC,CLASS_DESC,PRODN_PRACTICE_DESC,UTIL_PRACTICE_DESC,STATISTICCAT_DESC,UNIT_DESC,SHORT_DESC,...,LOCATION_DESC,YEAR,FREQ_DESC,BEGIN_CODE,END_CODE,REFERENCE_PERIOD_DESC,WEEK_ENDING,LOAD_TIME,VALUE,CV_%
706737,SURVEY,CROPS,FIELD CROPS,CORN,ALL CLASSES,IRRIGATED,ALL UTILIZATION PRACTICES,AREA PLANTED,ACRES,"CORN, IRRIGATED - ACRES PLANTED",...,"COLORADO, NORTHEAST, BOULDER",2002,ANNUAL,0,0,YEAR,,2012-01-01 00:00:00,3500,
706738,SURVEY,CROPS,FIELD CROPS,CORN,ALL CLASSES,IRRIGATED,ALL UTILIZATION PRACTICES,AREA PLANTED,ACRES,"CORN, IRRIGATED - ACRES PLANTED",...,"COLORADO, NORTHEAST, BOULDER",1997,ANNUAL,0,0,YEAR,,2012-01-01 00:00:00,6800,
706741,SURVEY,CROPS,FIELD CROPS,CORN,ALL CLASSES,IRRIGATED,ALL UTILIZATION PRACTICES,AREA PLANTED,ACRES,"CORN, IRRIGATED - ACRES PLANTED",...,"COLORADO, NORTHEAST, BOULDER",2010,ANNUAL,0,0,YEAR,,2015-06-03 15:08:31,3000,
706742,SURVEY,CROPS,FIELD CROPS,CORN,ALL CLASSES,IRRIGATED,ALL UTILIZATION PRACTICES,AREA PLANTED,ACRES,"CORN, IRRIGATED - ACRES PLANTED",...,"COLORADO, NORTHEAST, BOULDER",1995,ANNUAL,0,0,YEAR,,2012-01-01 00:00:00,7000,
706743,SURVEY,CROPS,FIELD CROPS,CORN,ALL CLASSES,IRRIGATED,ALL UTILIZATION PRACTICES,AREA PLANTED,ACRES,"CORN, IRRIGATED - ACRES PLANTED",...,"COLORADO, NORTHEAST, BOULDER",1996,ANNUAL,0,0,YEAR,,2012-01-01 00:00:00,7000,


In [136]:
#only county level data

region = ['COUNTY']
filter_category.AGG_LEVEL_DESC.isin(region)

filter_region= filter_category[filter_category.AGG_LEVEL_DESC.isin(region)]
filter_region.head()

Unnamed: 0,SOURCE_DESC,SECTOR_DESC,GROUP_DESC,COMMODITY_DESC,CLASS_DESC,PRODN_PRACTICE_DESC,UTIL_PRACTICE_DESC,STATISTICCAT_DESC,UNIT_DESC,SHORT_DESC,...,LOCATION_DESC,YEAR,FREQ_DESC,BEGIN_CODE,END_CODE,REFERENCE_PERIOD_DESC,WEEK_ENDING,LOAD_TIME,VALUE,CV_%
706737,SURVEY,CROPS,FIELD CROPS,CORN,ALL CLASSES,IRRIGATED,ALL UTILIZATION PRACTICES,AREA PLANTED,ACRES,"CORN, IRRIGATED - ACRES PLANTED",...,"COLORADO, NORTHEAST, BOULDER",2002,ANNUAL,0,0,YEAR,,2012-01-01 00:00:00,3500,
706738,SURVEY,CROPS,FIELD CROPS,CORN,ALL CLASSES,IRRIGATED,ALL UTILIZATION PRACTICES,AREA PLANTED,ACRES,"CORN, IRRIGATED - ACRES PLANTED",...,"COLORADO, NORTHEAST, BOULDER",1997,ANNUAL,0,0,YEAR,,2012-01-01 00:00:00,6800,
706741,SURVEY,CROPS,FIELD CROPS,CORN,ALL CLASSES,IRRIGATED,ALL UTILIZATION PRACTICES,AREA PLANTED,ACRES,"CORN, IRRIGATED - ACRES PLANTED",...,"COLORADO, NORTHEAST, BOULDER",2010,ANNUAL,0,0,YEAR,,2015-06-03 15:08:31,3000,
706742,SURVEY,CROPS,FIELD CROPS,CORN,ALL CLASSES,IRRIGATED,ALL UTILIZATION PRACTICES,AREA PLANTED,ACRES,"CORN, IRRIGATED - ACRES PLANTED",...,"COLORADO, NORTHEAST, BOULDER",1995,ANNUAL,0,0,YEAR,,2012-01-01 00:00:00,7000,
706743,SURVEY,CROPS,FIELD CROPS,CORN,ALL CLASSES,IRRIGATED,ALL UTILIZATION PRACTICES,AREA PLANTED,ACRES,"CORN, IRRIGATED - ACRES PLANTED",...,"COLORADO, NORTHEAST, BOULDER",1996,ANNUAL,0,0,YEAR,,2012-01-01 00:00:00,7000,


In [137]:
#cleaning specific columns and rows to ensure the count of rows are same
filter_region.shape

(1870368, 39)

### Selecting, Dropping, Renaming and Sorting columns from pandas dataframe

In [138]:
# decision of not considering this column as a part of the dataset
# total row size =1870368
# DOMAINCAT_DESC= "not specified" = 1705838
# 91% of column values are not specified; dropped this column

output = filter_region.drop_duplicates()
output.groupby('DOMAINCAT_DESC').size()

DOMAINCAT_DESC
AREA HARVESTED: (1,000 OR MORE ACRES)      15792
AREA HARVESTED: (1.0 TO 24.9 ACRES)        31839
AREA HARVESTED: (100 TO 249 ACRES)         31513
AREA HARVESTED: (25.0 TO 99.9 ACRES)       34269
AREA HARVESTED: (250 TO 499 ACRES)         25483
AREA HARVESTED: (500 OR MORE ACRES)         5117
AREA HARVESTED: (500 TO 999 ACRES)         20517
NOT SPECIFIED                            1705838
dtype: int64

In [139]:
watershed = filter_region.drop_duplicates()
watershed.groupby('WATERSHED_DESC').size()

Series([], dtype: int64)

### Drop columns 

1. SECTOR_DESC=: single category in column
2. GROUP_DESC=: single category in column
3. DOMAINCAT_DESC=: 79% of column values are not specified; dropped this column
4. STATE_ANSI=: keeping the name of the state for cleaner dataset
5. STATE_FIPS_CODE=:keeping the name of the state for cleaner dataset
6. STATE_ALPHA=: Keeping the state name only
7. CONGR_DISTRICT_CODE=: Nan
8. WATERSHED_DESC=: keeping county data, taking column off the dataframe; different geographic location
9. WATERSHED_CODE=: keeping county data, taking column off the dataframe; different geographic location
10. COUNTRY_CODE=: single category in column
11. BEGIN_CODE=: not significant for visualization
12. END_CODE=: not significant for visualization
13. LOAD_TIME=:  metadata about the dataset
14. REFERENCE_PERIOD_DESC
15. WEEK_ENDING
16. SOURCE_DESC=: aggregating data for both census and survey
17. ASD_CODE=:agricultural statistics district, different geographic location, restraining to county
18. ZIP_5=: different geographic location 


In [140]:
filter_region=filter_region.drop(columns=['SECTOR_DESC','GROUP_DESC','DOMAINCAT_DESC','STATE_ANSI',
                              'STATE_FIPS_CODE','STATE_ALPHA','CONGR_DISTRICT_CODE','WATERSHED_DESC',
                              'WATERSHED_CODE','COUNTRY_CODE','BEGIN_CODE','END_CODE','LOAD_TIME','REFERENCE_PERIOD_DESC',
                            'WEEK_ENDING','SOURCE_DESC','ASD_CODE','ZIP_5'])

In [141]:
filter_region.columns

Index(['COMMODITY_DESC', 'CLASS_DESC', 'PRODN_PRACTICE_DESC',
       'UTIL_PRACTICE_DESC', 'STATISTICCAT_DESC', 'UNIT_DESC', 'SHORT_DESC',
       'DOMAIN_DESC', 'AGG_LEVEL_DESC', 'STATE_NAME', 'ASD_DESC',
       'COUNTY_ANSI', 'COUNTY_CODE', 'COUNTY_NAME', 'REGION_DESC',
       'COUNTRY_NAME', 'LOCATION_DESC', 'YEAR', 'FREQ_DESC', 'VALUE', 'CV_%'],
      dtype='object')

In [142]:
# renaming column names
filter_region.rename(columns={'COMMODITY_DESC': 'FIELD_CROPS',
                                'CLASS_DESC': 'CROP_TYPE',
                                'PRODN_PRACTICE_DESC':'PRODN_PRACTICE',
                                'UTIL_PRACTICE_DESC':'UTILIZATION',
                                'STATISTICCAT_DESC':'CATEGORY',
                                'UNIT_DESC':'UNIT',
                                'SHORT_DESC':'ACTIVITY',
                                'DOMAIN_DESC':'DOMAIN',
                                'AGG_LEVEL_DESC':'GEOGRAPHIC_LEVEL',
                                'ASD_DESC':'AGRICULTURAL_DISTT',
                                'FREQ_DESC':'FREQUENCY',
                                'VALUE':'UNIT_VALUE',
                                'CV_%':'CV%'
                                },inplace=True)


In [143]:
filter_region.columns

Index(['FIELD_CROPS', 'CROP_TYPE', 'PRODN_PRACTICE', 'UTILIZATION', 'CATEGORY',
       'UNIT', 'ACTIVITY', 'DOMAIN', 'GEOGRAPHIC_LEVEL', 'STATE_NAME',
       'AGRICULTURAL_DISTT', 'COUNTY_ANSI', 'COUNTY_CODE', 'COUNTY_NAME',
       'REGION_DESC', 'COUNTRY_NAME', 'LOCATION_DESC', 'YEAR', 'FREQUENCY',
       'UNIT_VALUE', 'CV%'],
      dtype='object')

### restructure dataframe

In [144]:
filter_region=filter_region[['YEAR','GEOGRAPHIC_LEVEL','COUNTRY_NAME','STATE_NAME','AGRICULTURAL_DISTT','REGION_DESC','LOCATION_DESC',
                            'COUNTY_ANSI', 'COUNTY_CODE','COUNTY_NAME', 'FIELD_CROPS', 'CROP_TYPE', 'PRODN_PRACTICE', 'UTILIZATION', 'CATEGORY',
                            'ACTIVITY', 'DOMAIN', 'FREQUENCY','UNIT','UNIT_VALUE', 'CV%']]

In [145]:
#sorted dataframe based on Year values
sorted_data=filter_region.sort_values(['YEAR'])

In [146]:
sorted_data.shape

(1870368, 21)

In [147]:
sorted_data.head()

Unnamed: 0,YEAR,GEOGRAPHIC_LEVEL,COUNTRY_NAME,STATE_NAME,AGRICULTURAL_DISTT,REGION_DESC,LOCATION_DESC,COUNTY_ANSI,COUNTY_CODE,COUNTY_NAME,...,CROP_TYPE,PRODN_PRACTICE,UTILIZATION,CATEGORY,ACTIVITY,DOMAIN,FREQUENCY,UNIT,UNIT_VALUE,CV%
14202660,1990,COUNTY,UNITED STATES,PENNSYLVANIA,SOUTHEASTERN,,"PENNSYLVANIA, SOUTHEASTERN, LEBANON",75.0,75.0,LEBANON,...,ALL CLASSES,ALL PRODUCTION PRACTICES,SILAGE,AREA HARVESTED,"CORN, SILAGE - ACRES HARVESTED",TOTAL,ANNUAL,ACRES,13300,
4895068,1990,COUNTY,UNITED STATES,OKLAHOMA,SOUTHWEST,,"OKLAHOMA, SOUTHWEST, KIOWA",75.0,75.0,KIOWA,...,ALL CLASSES,"NON-IRRIGATED, CONTINUOUS CROP",ALL UTILIZATION PRACTICES,AREA PLANTED,"WHEAT, NON-IRRIGATED, CONTINUOUS CROP - ACRES ...",TOTAL,ANNUAL,ACRES,274500,
7759973,1990,COUNTY,UNITED STATES,TEXAS,EAST TEXAS NORTH,,"TEXAS, EAST TEXAS NORTH, RED RIVER",387.0,387.0,RED RIVER,...,UPLAND,ALL PRODUCTION PRACTICES,ALL UTILIZATION PRACTICES,YIELD,"COTTON, UPLAND - YIELD, MEASURED IN LB / ACRE",TOTAL,ANNUAL,LB / ACRE,381,
4895092,1990,COUNTY,UNITED STATES,OKLAHOMA,SOUTHWEST,,"OKLAHOMA, SOUTHWEST, TILLMAN",141.0,141.0,TILLMAN,...,ALL CLASSES,"NON-IRRIGATED, CONTINUOUS CROP",ALL UTILIZATION PRACTICES,AREA PLANTED,"WHEAT, NON-IRRIGATED, CONTINUOUS CROP - ACRES ...",TOTAL,ANNUAL,ACRES,173500,
4895116,1990,COUNTY,UNITED STATES,OKLAHOMA,NORTH CENTRAL,,"OKLAHOMA, NORTH CENTRAL, ALFALFA",3.0,3.0,ALFALFA,...,ALL CLASSES,"NON-IRRIGATED, CONTINUOUS CROP",ALL UTILIZATION PRACTICES,AREA PLANTED,"WHEAT, NON-IRRIGATED, CONTINUOUS CROP - ACRES ...",TOTAL,ANNUAL,ACRES,309500,


In [148]:
sorted_data.tail()

Unnamed: 0,YEAR,GEOGRAPHIC_LEVEL,COUNTRY_NAME,STATE_NAME,AGRICULTURAL_DISTT,REGION_DESC,LOCATION_DESC,COUNTY_ANSI,COUNTY_CODE,COUNTY_NAME,...,CROP_TYPE,PRODN_PRACTICE,UTILIZATION,CATEGORY,ACTIVITY,DOMAIN,FREQUENCY,UNIT,UNIT_VALUE,CV%
3148966,2019,COUNTY,UNITED STATES,ALABAMA,MOUNTAINS & EASTERN VALLEY,,"ALABAMA, MOUNTAINS & EASTERN VALLEY, CHEROKEE",19.0,19.0,CHEROKEE,...,ALL CLASSES,ALL PRODUCTION PRACTICES,ALL UTILIZATION PRACTICES,PRODUCTION,"SOYBEANS - PRODUCTION, MEASURED IN BU",TOTAL,ANNUAL,BU,104000.0,
7193164,2019,COUNTY,UNITED STATES,NORTH CAROLINA,CENTRAL COASTAL,,"NORTH CAROLINA, CENTRAL COASTAL, PITT",147.0,147.0,PITT,...,ALL CLASSES,ALL PRODUCTION PRACTICES,ALL UTILIZATION PRACTICES,AREA PLANTED,CORN - ACRES PLANTED,TOTAL,ANNUAL,ACRES,24000.0,
3149020,2019,COUNTY,UNITED STATES,ALABAMA,MOUNTAINS & EASTERN VALLEY,,"ALABAMA, MOUNTAINS & EASTERN VALLEY, CULLMAN",43.0,43.0,CULLMAN,...,ALL CLASSES,ALL PRODUCTION PRACTICES,ALL UTILIZATION PRACTICES,PRODUCTION,"SOYBEANS - PRODUCTION, MEASURED IN BU",TOTAL,ANNUAL,BU,362000.0,
3148689,2019,COUNTY,UNITED STATES,ALABAMA,NORTHERN VALLEY,,"ALABAMA, NORTHERN VALLEY, LIMESTONE",83.0,83.0,LIMESTONE,...,ALL CLASSES,ALL PRODUCTION PRACTICES,ALL UTILIZATION PRACTICES,PRODUCTION,"SOYBEANS - PRODUCTION, MEASURED IN BU",TOTAL,ANNUAL,BU,1737000.0,
8141626,2019,COUNTY,UNITED STATES,MARYLAND,LOWER EASTERN SHORE,,"MARYLAND, LOWER EASTERN SHORE, SOMERSET",39.0,39.0,SOMERSET,...,ALL CLASSES,ALL PRODUCTION PRACTICES,ALL UTILIZATION PRACTICES,YIELD,"SOYBEANS - YIELD, MEASURED IN BU / ACRE",TOTAL,ANNUAL,BU / ACRE,45.5,


In [149]:
# exporting cleaned data to csv
now = datetime.now()
current_time=now.strftime("%m-%d-%Y,%H-%M-%S")

sorted_data.to_csv('data_exports/cleaned_data'+current_time+'.csv')

--------- this is the end of the file ----------------