## Exploratory Data Analysis

Import all necessary modules

In [1]:
import numpy as np
import pandas as pd

#### 1. Load dataset and display n samples

In [2]:
# Utility method to load data from a file (csv, excel, text file as table)
def load_data(fpath, ftype='csv'):

    if ftype == 'excel':
        data = pd.read_excel(fpath)
    elif ftype == 'text':
        data = pd.read_table(fpath)
    else:
        data = pd.read_csv(fpath)

    return data

In [3]:
# Load dataset
data_path = ''
train_data = load_data(fpath = data_path + 'Train.csv', ftype = 'csv')
train_data.sample(5)

Unnamed: 0,ID,District,Block,CultLand,CropCultLand,LandPreparationMethod,CropTillageDate,CropTillageDepth,CropEstMethod,RcNursEstDate,...,Harv_method,Harv_date,Harv_hand_rent,Threshing_date,Threshing_method,Residue_length,Residue_perc,Stubble_use,Acre,Yield
2764,ID_41XI5A0QEHLZ,Vaishali,Mahua,20,5,TractorPlough FourWheelTracRotavator,2022-06-10,5,Manual_PuddledRandom,2022-06-14,...,hand,2022-10-17,400.0,2022-10-28,hand,30,10,plowed_in_soil,0.090909,4
2274,ID_FNUQ0UTCD1EL,Gaya,Gurua,20,20,TractorPlough FourWheelTracRotavator,2022-07-25,5,Manual_PuddledRandom,2022-06-21,...,hand,2022-11-17,120.0,2022-12-18,hand,30,10,plowed_in_soil,0.37037,800
778,ID_U1G5W1Q5SUYO,Vaishali,Garoul,30,6,TractorPlough,2022-07-02,6,Manual_PuddledRandom,2022-06-14,...,hand,2022-10-13,1200.0,2022-10-28,hand,30,10,plowed_in_soil,0.272727,540
3633,ID_PQ2TT4169UCX,Vaishali,Mahua,12,7,TractorPlough FourWheelTracRotavator,2022-06-23,4,Manual_PuddledRandom,2022-06-02,...,hand,2022-11-03,800.0,2022-11-18,machine,30,10,plowed_in_soil,0.181818,320
3136,ID_1HJOF8GN9ZLF,Jamui,Khaira,10,10,TractorPlough,2022-07-17,4,Manual_PuddledRandom,2022-07-27,...,hand,2022-12-05,130.0,2022-12-15,machine,28,10,plowed_in_soil,0.136364,240


#### 2. List all features from dataset

In [4]:
# get features
features = train_data.columns.values
print("Feature List: ", features)

Feature List:  ['ID' 'District' 'Block' 'CultLand' 'CropCultLand' 'LandPreparationMethod'
 'CropTillageDate' 'CropTillageDepth' 'CropEstMethod' 'RcNursEstDate'
 'SeedingSowingTransplanting' 'SeedlingsPerPit' 'NursDetFactor'
 'TransDetFactor' 'TransplantingIrrigationHours'
 'TransplantingIrrigationSource' 'TransplantingIrrigationPowerSource'
 'TransIrriCost' 'StandingWater' 'OrgFertilizers' 'Ganaura' 'CropOrgFYM'
 'PCropSolidOrgFertAppMethod' 'NoFertilizerAppln' 'CropbasalFerts'
 'BasalDAP' 'BasalUrea' 'MineralFertAppMethod' 'FirstTopDressFert'
 '1tdUrea' '1appDaysUrea' '2tdUrea' '2appDaysUrea'
 'MineralFertAppMethod.1' 'Harv_method' 'Harv_date' 'Harv_hand_rent'
 'Threshing_date' 'Threshing_method' 'Residue_length' 'Residue_perc'
 'Stubble_use' 'Acre' 'Yield']


#### 3. Display attribute info and differenciate categorical & continuous features

In [5]:
# display info
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3870 entries, 0 to 3869
Data columns (total 44 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   ID                                  3870 non-null   object 
 1   District                            3870 non-null   object 
 2   Block                               3870 non-null   object 
 3   CultLand                            3870 non-null   int64  
 4   CropCultLand                        3870 non-null   int64  
 5   LandPreparationMethod               3870 non-null   object 
 6   CropTillageDate                     3870 non-null   object 
 7   CropTillageDepth                    3870 non-null   int64  
 8   CropEstMethod                       3870 non-null   object 
 9   RcNursEstDate                       3787 non-null   object 
 10  SeedingSowingTransplanting          3870 non-null   object 
 11  SeedlingsPerPit                     3581 no

In [6]:
# find the type of features

# utility method to get categorial featutes
def get_categorical_features(features, data):

    categorical_features = []
    for feature in features:
        if data[feature].dtype == 'object':
            categorical_features.append(feature)
    return categorical_features

# utility method to get continuous featutes
def get_continuous_features(features, data):

    continuous_features = []
    for feature in features:
        if data[feature].dtype == 'int64' or data[feature].dtype == 'float64':
            continuous_features.append(feature)
    
    return continuous_features

In [7]:
cat_features = get_categorical_features(features, train_data)
con_features = get_continuous_features(features, train_data)

print("Categorical Features: ", cat_features)
print("Continuous Features: ", con_features)

print("No. of Features: ", len(features))
print("No. of Categorical Features: ", len(cat_features))
print("No. of Continuous Features: ", len(con_features))

Categorical Features:  ['ID', 'District', 'Block', 'LandPreparationMethod', 'CropTillageDate', 'CropEstMethod', 'RcNursEstDate', 'SeedingSowingTransplanting', 'NursDetFactor', 'TransDetFactor', 'TransplantingIrrigationSource', 'TransplantingIrrigationPowerSource', 'OrgFertilizers', 'PCropSolidOrgFertAppMethod', 'CropbasalFerts', 'MineralFertAppMethod', 'FirstTopDressFert', 'MineralFertAppMethod.1', 'Harv_method', 'Harv_date', 'Threshing_date', 'Threshing_method', 'Stubble_use']
Continuous Features:  ['CultLand', 'CropCultLand', 'CropTillageDepth', 'SeedlingsPerPit', 'TransplantingIrrigationHours', 'TransIrriCost', 'StandingWater', 'Ganaura', 'CropOrgFYM', 'NoFertilizerAppln', 'BasalDAP', 'BasalUrea', '1tdUrea', '1appDaysUrea', '2tdUrea', '2appDaysUrea', 'Harv_hand_rent', 'Residue_length', 'Residue_perc', 'Acre', 'Yield']
No. of Features:  44
No. of Categorical Features:  23
No. of Continuous Features:  21


#### 4. Missing Value Analysis & Imputation

In [8]:
# Utility method to impute categorical features
from sklearn.impute import SimpleImputer

def identify_and_impute_missing_cat(features, data):

    # Finding the no of missing values for each categorical feature
    missing_values_cat = data[features].isna().sum()

    # Display missing values
    print("Misisng Value Analysis: Categorical Features\n", missing_values_cat)

    # Imputing with the most frequent value (mode)
    mode_imputer = SimpleImputer(strategy='most_frequent')
    data[features] = mode_imputer.fit_transform(data[features])

    return data

# Utility method to impute continuous features
def identify_and_impute_missing_con(features, data):

    # Finding the no of missing values for each continuous feature
    missing_values_con = data[features].isna().sum()

    # Display missing values
    print("Misisng Value Analysis: Continuous Features\n", missing_values_con)

    # Imputing with the mean
    mean_imputer = SimpleImputer(strategy='mean')
    data[features] = mean_imputer.fit_transform(data[features])

    return data



In [9]:
# Imputing missing values for categorical features
train_data = identify_and_impute_missing_cat(cat_features, train_data)


Misisng Value Analysis: Categorical Features
 ID                                       0
District                                 0
Block                                    0
LandPreparationMethod                    0
CropTillageDate                          0
CropEstMethod                            0
RcNursEstDate                           83
SeedingSowingTransplanting               0
NursDetFactor                          289
TransDetFactor                         289
TransplantingIrrigationSource          115
TransplantingIrrigationPowerSource     503
OrgFertilizers                        1335
PCropSolidOrgFertAppMethod            1337
CropbasalFerts                         188
MineralFertAppMethod                     0
FirstTopDressFert                      485
MineralFertAppMethod.1                 481
Harv_method                              0
Harv_date                                0
Threshing_date                           0
Threshing_method                         0
Stubble_

In [10]:
# Imputing missing values for continuous features
train_data = identify_and_impute_missing_con(con_features, train_data)

Misisng Value Analysis: Continuous Features
 CultLand                           0
CropCultLand                       0
CropTillageDepth                   0
SeedlingsPerPit                  289
TransplantingIrrigationHours     193
TransIrriCost                    882
StandingWater                    238
Ganaura                         2417
CropOrgFYM                      2674
NoFertilizerAppln                  0
BasalDAP                         543
BasalUrea                       1704
1tdUrea                          556
1appDaysUrea                     556
2tdUrea                         2694
2appDaysUrea                    2700
Harv_hand_rent                   252
Residue_length                     0
Residue_perc                       0
Acre                               0
Yield                              0
dtype: int64


In [11]:
# Data info after imputation
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3870 entries, 0 to 3869
Data columns (total 44 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   ID                                  3870 non-null   object 
 1   District                            3870 non-null   object 
 2   Block                               3870 non-null   object 
 3   CultLand                            3870 non-null   float64
 4   CropCultLand                        3870 non-null   float64
 5   LandPreparationMethod               3870 non-null   object 
 6   CropTillageDate                     3870 non-null   object 
 7   CropTillageDepth                    3870 non-null   float64
 8   CropEstMethod                       3870 non-null   object 
 9   RcNursEstDate                       3870 non-null   object 
 10  SeedingSowingTransplanting          3870 non-null   object 
 11  SeedlingsPerPit                     3870 no