### Exploratory Data Analysis

Import all necessary modules

In [1]:
import numpy as np
import pandas as pd

1. Load dataset and display n samples

In [2]:
def load_data(fpath, ftype='csv'):
    """Read a tabular file into a pandas DataFrame.

    Parameters
    ----------
    fpath : path or file-like
        Location of the file; passed straight to the pandas reader.
    ftype : str, default 'csv'
        'excel' uses ``pd.read_excel``, 'text' uses ``pd.read_table``;
        any other value is read with ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file.
    """
    # Handle the two special formats first; everything else is treated as CSV.
    if ftype == 'excel':
        return pd.read_excel(fpath)
    if ftype == 'text':
        return pd.read_table(fpath)
    return pd.read_csv(fpath)

In [3]:
# Load the training set; data_path is a prefix prepended to the file name
# (empty string means the file is read from the current working directory).
data_path = ''
train_data = load_data(fpath=data_path + 'Train.csv', ftype='csv')

# Preview five random rows. A fixed random_state makes the preview identical
# on every "Restart & Run All", instead of changing with each execution.
train_data.sample(5, random_state=42)

Unnamed: 0,ID,District,Block,CultLand,CropCultLand,LandPreparationMethod,CropTillageDate,CropTillageDepth,CropEstMethod,RcNursEstDate,...,Harv_method,Harv_date,Harv_hand_rent,Threshing_date,Threshing_method,Residue_length,Residue_perc,Stubble_use,Acre,Yield
1849,ID_6D9J1SXQX87D,Gaya,Wazirganj,39,14,WetTillagePuddling FourWheelTracRotavator,2022-06-25,4,Manual_PuddledRandom,2022-06-23,...,hand,2022-10-29,7931.0,2022-11-01,machine,21,10,plowed_in_soil,0.111111,228
2991,ID_SSOOU6OY827U,Jamui,Jamui,8,8,TractorPlough,2022-08-06,4,Manual_PuddledRandom,2022-07-10,...,hand,2022-11-15,400.0,2023-01-13,machine,26,10,plowed_in_soil,0.181818,318
457,ID_X42SPGNS296M,Jamui,Jamui,9,9,TractorPlough,2022-07-25,4,Manual_PuddledRandom,2022-06-26,...,hand,2022-11-10,300.0,2022-12-20,machine,28,10,plowed_in_soil,0.136364,238
2001,ID_WZZ8DJA85XJJ,Jamui,Jamui,4,4,TractorPlough,2022-07-23,4,Manual_PuddledRandom,2022-06-26,...,hand,2022-11-08,200.0,2023-01-17,machine,25,10,plowed_in_soil,0.090909,80
1428,ID_K88Y6URSCAZW,Vaishali,Mahua,14,10,WetTillagePuddling TractorPlough BullockPlough...,2022-07-06,5,Manual_PuddledRandom,2022-06-25,...,hand,2022-10-15,300.0,2022-10-25,hand,29,10,plowed_in_soil,0.136364,325


2. List all features from dataset

In [4]:
# Collect the column labels of the training frame as a NumPy array
features = train_data.columns.to_numpy()
print("Feature List: ", features)

Feature List:  ['ID' 'District' 'Block' 'CultLand' 'CropCultLand' 'LandPreparationMethod'
 'CropTillageDate' 'CropTillageDepth' 'CropEstMethod' 'RcNursEstDate'
 'SeedingSowingTransplanting' 'SeedlingsPerPit' 'NursDetFactor'
 'TransDetFactor' 'TransplantingIrrigationHours'
 'TransplantingIrrigationSource' 'TransplantingIrrigationPowerSource'
 'TransIrriCost' 'StandingWater' 'OrgFertilizers' 'Ganaura' 'CropOrgFYM'
 'PCropSolidOrgFertAppMethod' 'NoFertilizerAppln' 'CropbasalFerts'
 'BasalDAP' 'BasalUrea' 'MineralFertAppMethod' 'FirstTopDressFert'
 '1tdUrea' '1appDaysUrea' '2tdUrea' '2appDaysUrea'
 'MineralFertAppMethod.1' 'Harv_method' 'Harv_date' 'Harv_hand_rent'
 'Threshing_date' 'Threshing_method' 'Residue_length' 'Residue_perc'
 'Stubble_use' 'Acre' 'Yield']


3. Display attribute info and differentiate categorical & continuous features

In [5]:
# Print a summary of the training frame: index range, column names,
# non-null counts per column and each column's dtype.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3870 entries, 0 to 3869
Data columns (total 44 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   ID                                  3870 non-null   object 
 1   District                            3870 non-null   object 
 2   Block                               3870 non-null   object 
 3   CultLand                            3870 non-null   int64  
 4   CropCultLand                        3870 non-null   int64  
 5   LandPreparationMethod               3870 non-null   object 
 6   CropTillageDate                     3870 non-null   object 
 7   CropTillageDepth                    3870 non-null   int64  
 8   CropEstMethod                       3870 non-null   object 
 9   RcNursEstDate                       3787 non-null   object 
 10  SeedingSowingTransplanting          3870 non-null   object 
 11  SeedlingsPerPit                     3581 no

In [6]:
# find the type of features

# utility method to get categorical features
def get_categorical_features(features, data):
    """Return the names in ``features`` whose column in ``data`` is categorical.

    A column counts as categorical when its dtype is ``object``, a pandas
    string dtype, or ``category``. Checking only ``dtype == 'object'`` would
    silently skip columns stored with the latter two dtypes.

    Parameters
    ----------
    features : iterable of str
        Column names to inspect; each must exist in ``data``.
    data : pandas.DataFrame
        Frame holding the columns.

    Returns
    -------
    list
        Matching names, in the order they appear in ``features``.
    """
    categorical_features = []
    for feature in features:
        dtype = data[feature].dtype
        if (
            pd.api.types.is_object_dtype(dtype)
            or pd.api.types.is_string_dtype(dtype)
            or isinstance(dtype, pd.CategoricalDtype)
        ):
            categorical_features.append(feature)
    return categorical_features

# utility method to get continuous features
def get_continuous_features(features, data):
    """Return the names in ``features`` whose column in ``data`` is numeric.

    A column counts as continuous when its dtype is any integer or floating
    point type. Matching only the exact strings 'int64' / 'float64' would
    drop columns stored as int32, float32 or pandas nullable Int64/Float64.
    Boolean columns are not treated as continuous.

    Parameters
    ----------
    features : iterable of str
        Column names to inspect; each must exist in ``data``.
    data : pandas.DataFrame
        Frame holding the columns.

    Returns
    -------
    list
        Matching names, in the order they appear in ``features``.
    """
    continuous_features = []
    for feature in features:
        dtype = data[feature].dtype
        if pd.api.types.is_integer_dtype(dtype) or pd.api.types.is_float_dtype(dtype):
            continuous_features.append(feature)

    return continuous_features

In [7]:
# Split the column names into the two dtype-based groups
cat_features = get_categorical_features(features, train_data)
con_features = get_continuous_features(features, train_data)

# Report each group followed by the counts, one line per entry
for label, value in (
    ("Categorical Features: ", cat_features),
    ("Continuous Features: ", con_features),
    ("No. of Features: ", len(features)),
    ("No. of Categorical Features: ", len(cat_features)),
    ("No. of Continuous Features: ", len(con_features)),
):
    print(label, value)

Categorical Features:  ['ID', 'District', 'Block', 'LandPreparationMethod', 'CropTillageDate', 'CropEstMethod', 'RcNursEstDate', 'SeedingSowingTransplanting', 'NursDetFactor', 'TransDetFactor', 'TransplantingIrrigationSource', 'TransplantingIrrigationPowerSource', 'OrgFertilizers', 'PCropSolidOrgFertAppMethod', 'CropbasalFerts', 'MineralFertAppMethod', 'FirstTopDressFert', 'MineralFertAppMethod.1', 'Harv_method', 'Harv_date', 'Threshing_date', 'Threshing_method', 'Stubble_use']
Continuous Features:  ['CultLand', 'CropCultLand', 'CropTillageDepth', 'SeedlingsPerPit', 'TransplantingIrrigationHours', 'TransIrriCost', 'StandingWater', 'Ganaura', 'CropOrgFYM', 'NoFertilizerAppln', 'BasalDAP', 'BasalUrea', '1tdUrea', '1appDaysUrea', '2tdUrea', '2appDaysUrea', 'Harv_hand_rent', 'Residue_length', 'Residue_perc', 'Acre', 'Yield']
No. of Features:  44
No. of Categorical Features:  23
No. of Continuous Features:  21
