In [1]:
import pandas as pd
import numpy as np
from scipy.stats import uniform, randint
from copy import deepcopy

from os import path, getcwd, listdir, chdir
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as sk
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV

import lightgbm as lgb
import xgboost as xgb

In [2]:
# import statsmodels.api as sm
# from statsmodels.formula.api import ols

In [3]:
# Random seed reused by the row shuffle and by both train_test_split calls below.
SEED = 121

Load and preprocess the data

In [4]:
# Read the tab-separated training table and give its eleven columns fixed names.
data_raw = pd.read_csv('./data/train_set_v3.txt', sep='\t')
data_raw.columns = [
    'hybrid', 'trial', 'year', 'location', 'yield',
    'inbred1', 'inbred2',
    'genetic_inb1', 'genetic_inb2',
    'trial_cl1', 'trial_cl2',
]

# Drop the first column ('hybrid'), mark missing values as 'UNK', then shuffle
# every row without replacement using the shared seed.
# NOTE(review): fillna runs before 'yield' is popped, so a missing target value
# would become the string 'UNK' -- confirm the target column has no gaps.
data = (
    data_raw.iloc[:, 1:]
    .fillna('UNK')
    .sample(n=len(data_raw), replace=False, random_state=SEED)
)

# Separate the regression target from the features; X and data are the same object.
y = data.pop('yield')
X = data

Apply the encoding to the whole feature set before splitting it into train, validation and test sets, so that every split shares the same category-to-column mapping.

In [5]:
le = LabelEncoder()
ohe = OneHotEncoder()

# Step 1: refit the single LabelEncoder on each column in turn to get integer codes
# (after this, `le` stays fitted on whichever column was processed last).
X_labels = X.apply(le.fit_transform)

# Step 2: one-hot encode the integer codes; X is rebound to the encoder's output.
X = ohe.fit_transform(X_labels)

In [6]:
# Carve 3.6% of the rows off as a hold-out pool, then split that pool so that
# 45% of it becomes the test set and the remainder the validation set.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.036, random_state=SEED
)
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.45, random_state=SEED
)

# Report the size of each partition.
for target, label in (
    (y_train, 'train examples'),
    (y_val, 'validation examples'),
    (y_test, 'test examples'),
):
    print(len(target), label)



523367 train examples
10749 validation examples
8796 test examples


In [9]:
def run_regression(regressor, X_train, X_val, y_train, y_val, results, 
                   n_iter=1, early_stopping_rounds=100, cv=5, n_jobs=-1, SEED=121):
    """Tune ``regressor`` with a randomized hyper-parameter search and record the outcome.

    Parameters
    ----------
    regressor : class
        Estimator class (not an instance). It is instantiated with ``silent=True``
        and its ``__name__`` is used as the key under which results are stored.
    X_train, y_train
        Data passed to the search's ``fit`` and also used as the first evaluation set.
    X_val, y_val
        Second evaluation set, named ``'val'`` in the fit keywords.
    results : dict
        Mutated in place. ``results[regressor.__name__]`` receives a deep copy of the
        fitted search object and ``results['history'][regressor.__name__]`` a deep
        copy of the dict handed to ``lgb.record_evaluation``.
    n_iter, cv, n_jobs
        Forwarded to ``RandomizedSearchCV``.
    early_stopping_rounds : int
        Forwarded to the estimator's ``fit`` through the search's fit keywords.
    SEED : int
        ``random_state`` of the randomized search.

    Returns
    -------
    tuple
        ``(best_estimator, history)``: the search's ``best_estimator_`` and the
        evaluation-history dict, so callers no longer have to dig them out of
        ``results``.

    Notes
    -----
    NOTE(review): the search space (``num_leaves``, ``lambda_l2``, ``boosting_type``)
    and the fit keywords (``eval_names``, ``lgb.record_evaluation``) use LightGBM
    naming; other estimator classes will likely reject them -- confirm before
    passing anything but a LightGBM regressor.
    """
    gb_r = regressor(silent=True)

    # NOTE(review): these ranges are very large; together with n_jobs=-1 they are a
    # plausible source of worker crashes from memory exhaustion -- confirm on real data.
    param_dist = {"max_depth": randint(low=100, high=500),
                  "learning_rate" : uniform(loc=0.01, scale=0.1),
                  "num_leaves": randint(low=3_000, high=10_000),
                  "n_estimators": randint(low=2_000, high=10_000),
                  'lambda_l2': [0, 0.0001, .001, 0.01],
                  'boosting_type': ['gbdt', 'dart'], # 'goss'
                 }
    rand_search = RandomizedSearchCV(gb_r,  
                                     param_distributions=param_dist, 
                                     n_jobs=n_jobs,
                                     cv=cv, scoring="neg_root_mean_squared_error", 
                                     verbose=0,
                                     n_iter=n_iter, return_train_score=True,
                                     random_state=SEED
                                    )

    # Filled in by the record_evaluation callback while the estimator is fitted.
    history = {}
    fit_params = {
        'early_stopping_rounds': early_stopping_rounds,
        'verbose': 0,
        'eval_metric': "rmse", 
        'eval_set': [[X_train, y_train], [X_val, y_val]],
        'eval_names': ['train', 'val'],
        'callbacks':[lgb.record_evaluation(history)]
    }
    rand_search.fit(X_train, y_train, **fit_params)

    model_best_ = rand_search.best_estimator_
    name = regressor.__name__
    results[name] = deepcopy(rand_search)
    # Fixed: the original called the misspelled `deepcoy`, which raised NameError
    # only after the whole (expensive) search had already finished.
    results.setdefault('history', {})[name] = deepcopy(history)
    return model_best_, history


# d_train = lgb.Dataset(X_train, label=y_train)
# d_val = lgb.Dataset(X_val, label=y_val)
# params = {"max_depth": 100, 
#           "learning_rate" : 0.001, 
#           "num_leaves": 2000,  
#           "n_estimators": 7000,
#           "metric": "rmse"}

# model_lgb = lgb.train(params, d_train, valid_sets=[d_train, d_val], valid_names = ['train', 'val'])
# best_params = model_best_.get_params()
# best_model = lgb_r.train(best_params, d_train)
# preds = best_model.predict(X_test)

In [10]:
# Regressor classes to tune, keyed by display name.
# NOTE(review): run_regression builds LightGBM-style search and fit settings for
# every class passed in -- confirm the XGBoost and RandomForest entries accept them.
regressors_dict = dict(
    LGBMRegressor=lgb.LGBMRegressor,
    XGBRegressor=xgb.XGBRegressor,
    RandomForestRegressor=RandomForestRegressor,
)

# Filled in by run_regression, which stores each search under the class name.
results = {}
for name, regressor in regressors_dict.items():
    print('treating...... ', name)
    run_regression(
        regressor, X_train, X_val, y_train, y_val, results,
        n_iter=20, early_stopping_rounds=100, cv=5, n_jobs=-1, SEED=121,
    )

TerminatedWorkerError: A worker process managed by the executor was unexpectedly terminated. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker.

In [None]:
# `model_best_` is a local variable of run_regression and does not exist at notebook
# scope. Read the fitted search that run_regression stored in `results` under the
# regressor class name, and predict with its best estimator.
preds = results['LGBMRegressor'].best_estimator_.predict(X_test)

In [None]:
# Show the class name; run_regression uses `regressor.__name__` as the key in `results`.
lgb.LGBMRegressor.__name__

In [None]:
# Plot the recorded validation and training RMSE curves.
# NOTE(review): no notebook-level `history` is assigned in the visible cells;
# run_regression keeps a local dict of that name and stores a copy under
# results['history'] -- confirm which object this cell is meant to read.
plt.plot(history['val']['rmse'])
plt.plot(history['train']['rmse'])

In [None]:
# Display the predictions computed for X_test.
preds

In [None]:
# Predicted vs. actual target on the test split, drawn on an explicit Axes.
fig, ax = plt.subplots()
ax.scatter(preds, y_test)
# Red reference line y = x between 50 and 300: points on it are perfect predictions.
ax.plot([50, 300], [50, 300], 'r-')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')

In [None]:
# NOTE(review): looks like an unfinished attribute lookup; `model_lgb` is only
# assigned in commented-out code earlier in the notebook, so this cell cannot run
# on a fresh kernel -- consider deleting it.
model_lgb.eva

In [None]:
# Scratch cell: show the argument list the running Python process was started with.
import sys
sys.argv

In [None]:
!pip install pydot

In [None]:
# Ask TensorFlow which physical devices of type 'GPU' it can see.
import tensorflow as tf
gpus = tf.config.experimental.list_physical_devices('GPU')

In [None]:
# Display the device list returned by the TensorFlow query.
gpus