In [9]:
import pandas as pd

#Config for Displaying Pipeline
from sklearn import set_config
set_config(display='diagram')

# Make and Compose Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

# Preprocessing 
from sklearn.preprocessing import OneHotEncoder, KBinsDiscretizer, MinMaxScaler
from sklearn.impute import SimpleImputer

#Model
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

#GridSearch and train/validation split
from sklearn.model_selection import GridSearchCV, train_test_split

In [2]:
# Read the training CSV and the test CSV (both expected in the working directory).
train, test = (
    pd.read_csv(csv_name)
    for csv_name in ('train_yaOffsB.csv', 'test_pFkWwen.csv')
)

### Divide Categorical and Numerical Features

In [3]:
# Cast these four columns to object dtype in both frames, so that a dtype-based
# split of the features treats them as categorical rather than numerical.
coded_columns = ['Crop_Type', 'Soil_Type', 'Pesticide_Use_Category', 'Season']
for frame in (train, test):
    for column in coded_columns:
        frame[column] = frame[column].astype('object')

In [4]:
# Separate the target from the predictors; the ID column is removed from the
# predictors together with the Crop_Damage label.
y = train['Crop_Damage']
X = train.drop(columns=['ID', 'Crop_Damage'])

In [5]:
# Split the predictor columns by dtype kind: signed-integer ('i') and float ('f')
# columns are numerical, everything else is categorical.
numeric_kinds = ['i', 'f']
numerical_features = [
    name
    for name, col_dtype in X.dtypes.items()
    # The 'ID' exclusion is kept as a guard in case X still carries that column.
    if col_dtype.kind in numeric_kinds and name != 'ID'
]
categorical_features = [
    name
    for name, col_dtype in X.dtypes.items()
    if col_dtype.kind not in numeric_kinds
]

In [6]:
# Show which columns ended up in each feature group.
print(f'Numerical : {numerical_features}')
print(f'Categorical : {categorical_features}')

Numerical : ['Estimated_Insects_Count', 'Number_Doses_Week', 'Number_Weeks_Used', 'Number_Weeks_Quit']
Categorical : ['Crop_Type', 'Soil_Type', 'Pesticide_Use_Category', 'Season']


### Divide Data into Train and Validation

In [7]:
# Hold out 30% of the labelled rows for validation. train_test_split comes from
# the notebook's import cell. Stratifying on y keeps the Crop_Damage class
# proportions the same in both parts, and the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0, stratify=y
)

### Make Pipeline with Column Transformer

In [10]:
# Numerical branch: fill missing values with the column median, rescale with
# MinMaxScaler, then discretise each feature into five requested bins.
numeric_steps = make_pipeline(
    SimpleImputer(strategy='median'),
    MinMaxScaler(),
    KBinsDiscretizer(n_bins=5),
)

# Categorical branch: replace missing values with the literal 'missing', then
# one-hot encode. handle_unknown='ignore' lets categories that were not seen
# during fitting pass through transform without raising.
categorical_steps = make_pipeline(
    SimpleImputer(strategy='constant', fill_value='missing'),
    OneHotEncoder(categories='auto', handle_unknown='ignore'),
)

# Route each feature group through its own branch.
preprocessor = make_column_transformer(
    (numeric_steps, numerical_features),
    (categorical_steps, categorical_features),
)

### Make Pipeline for Building Random Forest Model

In [11]:
# Candidate values for each random-forest hyper-parameter (5 * 3 * 4 * 3 = 180
# combinations in total).
n_estimators_options = [100, 150, 200, 300, 500]
max_depth_options = [5, 10, 15]
min_samples_leaf_options = [1, 2, 5, 10]
max_leaf_nodes_options = [2, 5, 10]

# The "randomforestclassifier__" prefix addresses the forest step inside a
# pipeline built with make_pipeline.
grid_param = [
    {
        "randomforestclassifier__n_estimators": n_estimators_options,
        "randomforestclassifier__max_depth": max_depth_options,
        "randomforestclassifier__min_samples_leaf": min_samples_leaf_options,
        "randomforestclassifier__max_leaf_nodes": max_leaf_nodes_options,
    }
]

In [12]:
# Chain the preprocessing and the classifier into one estimator, so the
# preprocessing is re-fitted together with the forest whenever the pipeline is fit.
# random_state is fixed because a random forest is stochastic: without a seed the
# selected hyper-parameters, the reported scores and the submission file would
# differ from one run to the next.
RF_Model = make_pipeline(
    preprocessor,
    RandomForestClassifier(n_jobs=-1, random_state=0),
)

In [13]:
# Configure the exhaustive search over grid_param with 3-fold cross-validation,
# using all available cores. This only builds the search object; fitting
# happens in a separate step.
gridsearch = GridSearchCV(
    estimator=RF_Model,
    param_grid=grid_param,
    cv=3,
    n_jobs=-1,
    verbose=0,
)

In [14]:
# Run the search on the training split. As the last expression of the cell, the
# fitted search object is also what the notebook displays.
gridsearch.fit(X=X_train, y=y_train)

  'decreasing the number of bins.' % jj)


In [15]:
# Score the fitted search on the held-out split (labelled "Test") and then on
# the training split, to compare the two.
holdout_score = gridsearch.score(X_test, y_test)
print(f'Test : {holdout_score:.3f}')
train_score = gridsearch.score(X_train, y_train)
print(f'Train : {train_score:.3f}')

Test : 0.840
Train : 0.839


### Predict on the Test Set

In [16]:
# The model was fitted without the ID column, so drop it before predicting.
sub_test = test.drop(columns=['ID'])

In [17]:
# Predict with the fitted search object and store the labels as integers.
raw_predictions = gridsearch.predict(sub_test)
sub_test_pred = raw_predictions.astype(int)

In [18]:
# Pair every test ID with its predicted Crop_Damage label and write the
# submission file without the DataFrame index.
submission_columns = {'ID': test['ID'], 'Crop_Damage': sub_test_pred}
AllSub = pd.DataFrame(submission_columns)

AllSub.to_csv("AV_JH_MK_in_Agri_RF_sub2.csv", index=False)

In [19]:
# AV leaderboard (LB) score for this submission: 0.83