# Data Cleaning

### Import libraries

In [20]:
import pandas as pd
import numpy as np

### Load the raw data ready for cleaning

In [21]:
# Read the raw climate/agriculture CSV into a DataFrame, then preview
# the first rows to sanity-check the columns and values.
raw_csv_path = '../data/climate-agriculture.csv'
df = pd.read_csv(raw_csv_path)
df.head()

Unnamed: 0,Year,Country,Region,Crop_Type,Average_Temperature_C,Total_Precipitation_mm,CO2_Emissions_MT,Crop_Yield_MT_per_HA,Extreme_Weather_Events,Irrigation_Access_%,Pesticide_Use_KG_per_HA,Fertilizer_Use_KG_per_HA,Soil_Health_Index,Adaptation_Strategies,Economic_Impact_Million_USD
0,2001,India,West Bengal,Corn,1.55,447.06,15.22,1.737,8,14.54,10.08,14.78,83.25,Water Management,808.13
1,2024,China,North,Corn,3.23,2913.57,29.82,1.737,8,11.05,33.06,23.25,54.02,Crop Rotation,616.22
2,2001,France,Ile-de-France,Wheat,21.11,1301.74,25.75,1.719,5,84.42,27.41,65.53,67.78,Water Management,796.96
3,2001,Canada,Prairies,Coffee,27.85,1154.36,13.91,3.89,5,94.06,14.38,87.58,91.39,No Adaptation,790.32
4,1998,India,Tamil Nadu,Sugarcane,2.19,1627.48,11.81,1.08,9,95.75,44.35,88.08,49.61,Crop Rotation,401.72


### Check the shape of the data

In [22]:
# Show the frame's dimensions as a (rows, columns) tuple.
df.shape

(10000, 15)

As we can see, there are 10,000 rows and 15 columns. This is probably small enough not to cause any problems; however, to be safe, I will take a random sample of 9,000 rows.

In [23]:
# Draw a random sample of rows; the fixed seed makes the same rows come
# back on every run.
sample_size = 9000
sample_seed = 42
df = df.sample(n=sample_size, random_state=sample_seed)

#### Checking the shape again to confirm the sampling

In [24]:
# Fail fast if the sampling step did not leave exactly 9,000 rows, rather
# than relying on reading the displayed shape by eye.
assert len(df) == 9000, f"expected 9000 rows after sampling, got {len(df)}"
df.shape

(9000, 15)

The data has been successfully sampled. 

### Checking the data set again

In [25]:
# Preview the first rows of the sampled frame, including their index labels.
df.head()

Unnamed: 0,Year,Country,Region,Crop_Type,Average_Temperature_C,Total_Precipitation_mm,CO2_Emissions_MT,Crop_Yield_MT_per_HA,Extreme_Weather_Events,Irrigation_Access_%,Pesticide_Use_KG_per_HA,Fertilizer_Use_KG_per_HA,Soil_Health_Index,Adaptation_Strategies,Economic_Impact_Million_USD
6252,2001,Brazil,Southeast,Rice,2.64,514.15,21.96,0.639,4,15.95,25.39,42.58,85.01,Drought-resistant Crops,116.07
4684,1995,India,Punjab,Vegetables,13.98,1940.83,6.11,4.1,6,74.79,33.24,74.95,96.05,Drought-resistant Crops,1026.14
1731,1996,Brazil,South,Rice,34.76,247.44,8.42,2.52,2,52.69,6.56,10.86,56.99,Water Management,837.28
4742,1994,China,Central,Corn,19.1,2042.7,4.88,1.52,8,93.98,42.48,52.78,65.33,No Adaptation,369.9
4521,2018,Nigeria,North Central,Vegetables,-2.34,2716.87,23.37,1.593,6,22.09,41.06,8.44,99.04,Drought-resistant Crops,555.35


As we can see, the index is no longer in sequential order: each sampled row has kept its original index label.

### Reset the index

In [28]:
# Replace the existing row labels with a fresh 0..n-1 index.
# drop=True discards the old labels instead of inserting them as a column.
# Reassigning (instead of inplace=True) makes the rebinding of `df` explicit
# and keeps the call chainable.
df = df.reset_index(drop=True)

#### Check to see if the index has been successfully reset

In [29]:
# Preview the first rows again; the index labels should now start at 0.
df.head()

Unnamed: 0,Year,Country,Region,Crop_Type,Average_Temperature_C,Total_Precipitation_mm,CO2_Emissions_MT,Crop_Yield_MT_per_HA,Extreme_Weather_Events,Irrigation_Access_%,Pesticide_Use_KG_per_HA,Fertilizer_Use_KG_per_HA,Soil_Health_Index,Adaptation_Strategies,Economic_Impact_Million_USD
0,2001,Brazil,Southeast,Rice,2.64,514.15,21.96,0.639,4,15.95,25.39,42.58,85.01,Drought-resistant Crops,116.07
1,1995,India,Punjab,Vegetables,13.98,1940.83,6.11,4.1,6,74.79,33.24,74.95,96.05,Drought-resistant Crops,1026.14
2,1996,Brazil,South,Rice,34.76,247.44,8.42,2.52,2,52.69,6.56,10.86,56.99,Water Management,837.28
3,1994,China,Central,Corn,19.1,2042.7,4.88,1.52,8,93.98,42.48,52.78,65.33,No Adaptation,369.9
4,2018,Nigeria,North Central,Vegetables,-2.34,2716.87,23.37,1.593,6,22.09,41.06,8.44,99.04,Drought-resistant Crops,555.35


The index has been successfully reset. 