In [20]:
import pandas as pd
import json
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV,TimeSeriesSplit

1. READING THE JSON FILE

In [21]:
f=open('algoparams_from_ui.txt')
data=json.load(f)

2. READING THE CSV FILE

In [22]:
df=pd.read_csv('iris.csv')

3. READING THE TARGET VARIABLE AND PREDICTION TYPE

In [23]:
target_col=data['design_state_data']['target']['target']
pred_type=data['design_state_data']['target']['prediction_type']

4. HANDLING MISSING VALUES

In [24]:
fh=data['design_state_data']['feature_handling']
for i in fh:
  feature_name=fh[i]['feature_name']
  feature_details=fh[i]['feature_details']
  if len(feature_details)>2 and feature_details['missing_values']=="Impute":
    impute_with=feature_details['impute_with']
    if impute_with=="Average of values":
      df[feature_name]=df[feature_name].fillna(df[feature_name].mean())
    else:
      df[feature_name]=df[feature_name].fillna(df[feature_name].max())


5. FEATURE REDUCTION

Tree-based (Random Forest):

In [25]:
feat_to_keep=int(data['design_state_data']['feature_reduction']['num_of_features_to_keep'])
trees=int(data['design_state_data']['feature_reduction']['num_of_trees'])
depth=int(data['design_state_data']['feature_reduction']['depth_of_trees'])
df = pd.get_dummies(df, columns=['species'])

rf=RandomForestRegressor(
    n_estimators=trees,
    max_depth=depth,
    max_features=feat_to_keep)

In [26]:
X=df.drop(target_col,axis=1)
y=df[target_col]

rf.fit(X,y)

In [27]:
imp=rf.feature_importances_
selected_features=df.columns[::-1][imp.argsort()[::-1]]

In [28]:
print(selected_features)

Index(['petal_width', 'species_Iris-setosa', 'sepal_width', 'petal_length',
       'species_Iris-versicolor', 'species_Iris-virginica'],
      dtype='object')


6. PREDICTION MODEL

In [29]:
model=data['design_state_data']['algorithms']['RandomForestRegressor']
if model['is_selected'] and pred_type=='Regression':
    a=df.drop(target_col,axis=1)
    b=df[target_col]
    b=b.astype(float)
    min_trees=int(model['min_trees'])
    max_trees=int(model['max_trees'])
    min_depth=int(model['min_depth'])
    max_depth=int(model['max_depth'])
    min_sample_value=int(model['min_samples_per_leaf_min_value'])
    max_sample_value=int(model['min_samples_per_leaf_max_value'])
    param_grid={'n_estimators':list(range(min_trees,max_trees+1)),
                'max_depth':list(range(min_depth,max_depth+1)),
                'min_samples_split': list(range(min_sample_value,max_sample_value+1)),
                'min_samples_leaf': list(range(min_sample_value,max_sample_value+1))}
    strategy=data['design_state_data']['hyperparameters']['cross_validation_stratergy']
    cv_num=int(data['design_state_data']['hyperparameters']['num_of_folds'])
    rf_reg=RandomForestRegressor(random_state=0)
    test_size = int(len(a) * 0.2)
    gs=GridSearchCV(estimator=rf_reg,param_grid=param_grid,cv=TimeSeriesSplit(n_splits=cv_num//2,test_size=test_size),scoring='neg_root_mean_squared_error')
    gs.fit(a,b)
    best_params=gs.best_params_
    score=-gs.best_score_

In [30]:
print(best_params)
print(score)


{'max_depth': 20, 'min_samples_leaf': 5, 'min_samples_split': 5, 'n_estimators': 13}
0.3788061138762633
