In [2]:
import pandas as pd

#Config for Displaying Pipeline
from sklearn import set_config
set_config(display='diagram')

# Make and Compose Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

# Preprocessing 
from sklearn.preprocessing import OneHotEncoder, KBinsDiscretizer
from sklearn.impute import SimpleImputer

#Model
from sklearn.ensemble import RandomForestClassifier

In [3]:
# Read the training and test CSVs (paths are relative to the working directory).
train, test = (
    pd.read_csv(csv_name)
    for csv_name in ('train_yaOffsB.csv', 'test_pFkWwen.csv')
)

### Divide Categorical and Numerical Features

In [11]:
# Cast the columns below to `object` in both frames so that the dtype-based
# feature split treats them as categorical rather than numerical.
for frame in (train, test):
    for column in ('Crop_Type', 'Soil_Type', 'Pesticide_Use_Category', 'Season'):
        frame[column] = frame[column].astype('object')

In [25]:
# Features: every training column except 'ID' and the target column.
X = train.drop(columns=['ID', 'Crop_Damage'])
# Target: the 'Crop_Damage' column.
y = train['Crop_Damage']

In [26]:
# Partition the feature columns by dtype kind: integer ('i') and float ('f')
# columns are numerical, every other kind is categorical.
numeric_kinds = {'i', 'f'}
numerical_features = [
    name for name, dtype in X.dtypes.items()
    if name != 'ID' and dtype.kind in numeric_kinds
]
categorical_features = [
    name for name, dtype in X.dtypes.items()
    if dtype.kind not in numeric_kinds
]

In [27]:
# Show which columns ended up in each group.
print(f'Numerical : {numerical_features}')
print(f'Categorical : {categorical_features}')

Numerical : ['Estimated_Insects_Count', 'Number_Doses_Week', 'Number_Weeks_Used', 'Number_Weeks_Quit']
Categorical : ['Crop_Type', 'Soil_Type', 'Pesticide_Use_Category', 'Season']


### Divide Data into Train and Validation

In [28]:
# Import the train_test_split helper function.
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for validation; the fixed random_state keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

### Make Pipeline with Column Transformer

In [29]:
# Numerical columns: fill gaps with the column median, then discretise each
# column into 3 bins.
numeric_steps = make_pipeline(
    SimpleImputer(strategy='median'),
    KBinsDiscretizer(n_bins=3),
)

# Categorical columns: fill gaps with the literal 'missing', then one-hot
# encode; categories not seen during fit are ignored at transform time.
categorical_steps = make_pipeline(
    SimpleImputer(strategy='constant', fill_value='missing'),
    OneHotEncoder(categories='auto', handle_unknown='ignore'),
)

# Route each group of columns through its own sub-pipeline.
preprocessor = make_column_transformer(
    (numeric_steps, numerical_features),
    (categorical_steps, categorical_features),
)

### Make Pipeline for Building Random Forest Model

In [30]:
# Chain preprocessing and the classifier so both are fitted together.
# random_state pins the forest's bootstrap sampling and per-split feature
# selection so the reported scores and the submission file are reproducible
# across runs; 0 matches the seed already used for the train/validation split.
RF_Model = make_pipeline(
    preprocessor,
    RandomForestClassifier(n_estimators=100, random_state=0),
)

In [31]:
# Fit the preprocessing steps and the random forest on the training split only.
RF_Model.fit(X_train, y_train)

In [32]:
# Score the fitted pipeline on the held-out split and on the training split,
# then report both to three decimals.
holdout_accuracy = RF_Model.score(X_test, y_test)
training_accuracy = RF_Model.score(X_train, y_train)
print(f'Test : {holdout_accuracy:.3f}')
print(f'Train : {training_accuracy:.3f}')

Test : 0.843
Train : 0.842


### Prediction of Test

In [33]:
# 'ID' is not a model feature (it was dropped from X as well), so remove it
# before predicting.
sub_test = test.drop(columns=['ID'])

In [34]:
# Predict a class for every test row and cast the result to int.
sub_test_pred = RF_Model.predict(sub_test).astype(int)

In [35]:
# Build the submission frame: the test IDs next to their predicted class.
AllSub = test[['ID']].assign(Crop_Damage=sub_test_pred)

# Write it out without the DataFrame index column.
AllSub.to_csv("AV_JH_MK_in_Agri_RF_sub1.csv", index=False)

In [48]:
#AV LB Score - 0.77033