Data Set Clean

In [8]:
# Dependencies and Setup
import pandas as pd
import numpy as np
import pandas._testing as tm


In [9]:
# Relative paths of the two input CSV files used by this notebook:
# the crop recommendation measurements and the crop family (taxonomy) lookup.
crop_recommendation_load, crop_family_names_load = (
    "../Resources/crop_recommendation.csv",
    "../Resources/crop_family_names.csv",
)


In [10]:
# Load each CSV into its own DataFrame (recommendation data first, family lookup second)
recommendation_data_df, family_data_df = (
    pd.read_csv(csv_path)
    for csv_path in (crop_recommendation_load, crop_family_names_load)
)

In [11]:
# Preview the first 5 rows of the recommendation data (head() defaults to 5 rows)
recommendation_data_df.head()

Unnamed: 0,N,P,K,temperature,humidity,ph,rainfall,label
0,90,42,43,20.879744,82.002744,6.502985,202.935536,rice
1,85,58,41,21.770462,80.319644,7.038096,226.655537,rice
2,60,55,44,23.004459,82.320763,7.840207,263.964248,rice
3,74,35,40,26.491096,80.158363,6.980401,242.864034,rice
4,78,42,42,20.130175,81.604873,7.628473,262.71734,rice


In [12]:
# Preview the first 5 rows of the crop family lookup (head() defaults to 5 rows)
family_data_df.head()

Unnamed: 0,crop_name,kingdom,order,family,genus,botanical_name
0,pigeon_pea,plantae,fabales,fabaceae,cajanus,cajanus cajan
1,papaya,plantae,brassicales,caricaceae,carica,carica papaya
2,chickpea,plantae,fabales,fabaceae,cicer,cicer arietinum
3,watermelon,plantae,cucurbitales,cucurbitaceae,citrullus,citrullus lanatus
4,orange,plantae,sapindales,rutaceae,citrus,citrus sinensis


In [13]:
# Summarise the recommendation data: column names, non-null counts,
# dtypes and memory usage (used here to check for missing values and
# to see which columns are numeric)
recommendation_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2200 entries, 0 to 2199
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   N            2200 non-null   int64  
 1   P            2200 non-null   int64  
 2   K            2200 non-null   int64  
 3   temperature  2200 non-null   float64
 4   humidity     2200 non-null   float64
 5   ph           2200 non-null   float64
 6   rainfall     2200 non-null   float64
 7   label        2200 non-null   object 
dtypes: float64(4), int64(3), object(1)
memory usage: 137.6+ KB


In [14]:
# Rename the 'label' column to 'crop_name' so it shares its name with the
# crop_name column of the family data, which is the key used for the merge.
# Reassigning the result (instead of inplace=True) keeps the step explicit
# and chainable; re-running the cell is still harmless.
recommendation_data_df = recommendation_data_df.rename(columns={'label': 'crop_name'})
recommendation_data_df.head()

Unnamed: 0,N,P,K,temperature,humidity,ph,rainfall,crop_name
0,90,42,43,20.879744,82.002744,6.502985,202.935536,rice
1,85,58,41,21.770462,80.319644,7.038096,226.655537,rice
2,60,55,44,23.004459,82.320763,7.840207,263.964248,rice
3,74,35,40,26.491096,80.158363,6.980401,242.864034,rice
4,78,42,42,20.130175,81.604873,7.628473,262.71734,rice


In [15]:
# Round the float measurement columns to 2 decimal places.
# DataFrame.round keeps the columns numeric (float64). Applying a string
# formatter such as '{:,.2f}'.format would instead turn every value into
# text (and insert thousands separators for values >= 1,000), which breaks
# any later arithmetic, sorting or modelling on these columns.
float_columns = ['temperature', 'humidity', 'ph', 'rainfall']
recommendation_data_df[float_columns] = recommendation_data_df[float_columns].round(2)


In [16]:
# Show the first 5 rows to confirm the float columns now carry two decimal places
recommendation_data_df.head()

Unnamed: 0,N,P,K,temperature,humidity,ph,rainfall,crop_name
0,90,42,43,20.88,82.0,6.5,202.94,rice
1,85,58,41,21.77,80.32,7.04,226.66,rice
2,60,55,44,23.0,82.32,7.84,263.96,rice
3,74,35,40,26.49,80.16,6.98,242.86,rice
4,78,42,42,20.13,81.6,7.63,262.72,rice


In [17]:
# Collect the distinct crop names found in the recommendation data as a plain list
distinct_crop_names = recommendation_data_df["crop_name"].unique()
crop_name_list = distinct_crop_names.tolist()
print(crop_name_list)

['rice', 'maize', 'chickpea', 'kidneybeans', 'pigeonpeas', 'mothbeans', 'mungbean', 'blackgram', 'lentil', 'pomegranate', 'banana', 'mango', 'grapes', 'watermelon', 'muskmelon', 'apple', 'orange', 'papaya', 'coconut', 'cotton', 'jute', 'coffee']


In [18]:
# Normalise crop names to a more readable, more common snake_case spelling.
# Each key is the spelling found in the raw data; each value is its replacement.
crop_name_fixes = {
    'kidneybeans': 'kidney_bean',
    'mungbean': 'mung_bean',
    'blackgram': 'urad_bean',
    'mothbeans': 'moth_bean',
    'pigeonpeas': 'pigeon_pea',
    'grapes': 'grape',
}
recommendation_data_df["crop_name"] = recommendation_data_df["crop_name"].replace(crop_name_fixes)


In [19]:
# List the distinct crop names again to confirm the replacements were applied
cleaned_crop_names = recommendation_data_df["crop_name"]
cleaned_crop_names.unique()

array(['rice', 'maize', 'chickpea', 'kidney_bean', 'pigeon_pea',
       'moth_bean', 'mung_bean', 'urad_bean', 'lentil', 'pomegranate',
       'banana', 'mango', 'grape', 'watermelon', 'muskmelon', 'apple',
       'orange', 'papaya', 'coconut', 'cotton', 'jute', 'coffee'],
      dtype=object)

In [20]:
# Combine recommendation_data_df and family_data_df into one dataframe.
# - how="left" keeps every recommendation row; a crop with no entry in the
#   family data gets NaN in the taxonomy columns.
# - on="crop_name" joins on the single shared key column.
# - validate="many_to_one" makes pandas raise MergeError if a crop_name occurs
#   more than once in family_data_df, which would otherwise silently
#   duplicate recommendation rows.
recommend_family_data_df = pd.merge(
    recommendation_data_df,
    family_data_df,
    how="left",
    on="crop_name",
    validate="many_to_one",
)
recommend_family_data_df.head()

Unnamed: 0,N,P,K,temperature,humidity,ph,rainfall,crop_name,kingdom,order,family,genus,botanical_name
0,90,42,43,20.88,82.0,6.5,202.94,rice,plantae,poales,poaceae,oryza,oryza sativa
1,85,58,41,21.77,80.32,7.04,226.66,rice,plantae,poales,poaceae,oryza,oryza sativa
2,60,55,44,23.0,82.32,7.84,263.96,rice,plantae,poales,poaceae,oryza,oryza sativa
3,74,35,40,26.49,80.16,6.98,242.86,rice,plantae,poales,poaceae,oryza,oryza sativa
4,78,42,42,20.13,81.6,7.63,262.72,rice,plantae,poales,poaceae,oryza,oryza sativa


In [21]:
# Write the merged dataset to a CSV file, leaving out the DataFrame index
clean_output_path = '../Resources/crop_recommendation_family_clean.csv'
recommend_family_data_df.to_csv(clean_output_path, index=False)