In [1]:
# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Data Science
import numpy as np
import pandas as pd
from useful import *

# Machine Learning
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from xgboost.sklearn import XGBRegressor
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor

# Others
from tqdm.notebook import tqdm
from ast import literal_eval

import warnings
warnings.filterwarnings('ignore')

In [2]:
# List of data sets to use for making predictions. 
# The code can take a combination of data sets 4, 5, 6 or 7
data_name = ["4", "5"]
random_state = 64

# Column names to drop from the input data (can be improve)
to_drop_full = ["sentinel 1 date", "sentinel 2 date", "landsat date", 'landsat qa_pixel', 'sentinel 2 SCL', "sentinel 2 blue", "sentinel 2 green", "sentinel 2 red",
          "sentinel 2 nir", "landsat blue", "landsat green", "landsat red", "landsat nir08", "landsat lwir11",
          'sentinel 2 ari1', 'sentinel 2 ari2', 'sentinel 2 cri1', 'sentinel 2 cri2', 'sentinel 2 rdvi',
          'sentinel 2 gndvi', 'sentinel 2 savi', 'sentinel 2 evi', 'sentinel 2 tvi', 'sentinel 2 B05', 'sentinel 2 B06',
       'sentinel 2 B07', 'sentinel 2 B8A', 'sentinel 1 rvi', 'sentinel 1 vh', 'sentinel 1 vv']

to_drop_medium = ["sentinel 1 date", "sentinel 2 date", "landsat date", 'landsat qa_pixel', 'sentinel 2 SCL', "sentinel 2 blue", "sentinel 2 green", "sentinel 2 red"
                  , "landsat blue", "landsat green", "landsat red", 'sentinel 2 ari1', 'sentinel 2 ari2', 'sentinel 2 cri1', 'sentinel 2 cri2', "sentinel 2 nir"]

to_drop_small = ["sentinel 1 date", "sentinel 2 date", "landsat date", 'sentinel 2 SCL', 'landsat qa_pixel']

# I don't remember which one is the best but to_drop_full and to_drop_medium should be similar and the best
to_drop = to_drop_medium

In [3]:
crop_yield_data = pd.read_csv("Crop_Yield_Data_challenge_2.csv")
sat_datas = []

# Import the previously imported and save dataset from the import data notebook
for name in data_name:
    sat_datas.append(pd.read_csv("data/data_" + name + ".csv").drop(to_drop, axis=1))

# Remove nan value and transform string of list into a list (can be improve)
for col in sat_datas[0].columns:
    for sat_data in sat_datas:
        sat_data[col] = sat_data[col].str.replace(', nan', '')
        sat_data[col] = sat_data[col].str.replace('nan, ', '')
        sat_data[col] = sat_data[col].apply(literal_eval)

# Extract the statistical features of each dataset (can be improve)
features = []
for sat_data in sat_datas:
    features.append(np.array(generate_statistical_features_v4(sat_data)))

# Concatenate everything
y = crop_yield_data['Rice Yield (kg/ha)'].values
X = np.concatenate(features, axis = 1)

  0%|          | 0/557 [00:00<?, ?it/s]

  0%|          | 0/557 [00:00<?, ?it/s]

## Multiple regression type

In [4]:
# In this code block I will train 100 time for different random_state, 4 different model and get theirs score
# I will then find the x best one and train them again using another random_state for the data split but the same random_state for the model
# I will then use them to get the submission

scores = []
pbar = tqdm(range(100))
for random_state in pbar:
    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)
    
    # Scale the data
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)
    
    # Regressors to train, parameter found with RandomizedSearchCV (can be improve)
    regressors = [
        ["XGBRegressor", XGBRegressor(
            random_state=random_state,
            colsample_bytree=.7,
            gamma=0,
            learning_rate=.05,
            max_depth=3,
            min_child_weight=1,
            n_estimators=100,
            subsample=.8)],
        ["ElasticNet", ElasticNet(
            random_state=random_state,
            alpha=0.01,
            l1_ratio=0.8,
            max_iter=1000,
            tol=0.1)],
        ["GradientBoostingRegressor", GradientBoostingRegressor(
            random_state=random_state, 
            criterion='squared_error', 
            learning_rate=0.05, 
            max_depth=3, 
            n_estimators=150,
            max_features=1,
            min_samples_leaf=1,
            min_samples_split=3,
            subsample=.6)],
        ["RandomForestRegressor", RandomForestRegressor(
            random_state=random_state,
            n_estimators=50, 
            max_depth=3,
            bootstrap=False,
            max_features="log2",
            min_samples_leaf=1,
            min_samples_split=5)]
    ]
    
    # train and evaluate each model
    for regressor in regressors:
        pbar.set_description("Processing {} for random_state {}".format(regressor[0], random_state))
        regressor[1].fit(X_train, y_train)
        s_in, s_out = get_score(regressor[1], X_train, y_train, X_test, y_test, False)
        scores.append([regressor[0], random_state, s_in, s_out])

  0%|          | 0/100 [00:00<?, ?it/s]

In [5]:
# Find the 150 best random state using mean score of train data and 2 * test data
n_best = 150
best_items = sorted(scores, key=lambda x: (x[3] + 2*x[2]), reverse=True)[:n_best]

total_in = 0
total_out = 0
for nested_list in best_items:
    total_in += nested_list[2]
    total_out += nested_list[3]

mean_in = total_in / len(best_items)
mean_out = total_out / len(best_items)

print("Mean in: {}, mean out: {}".format(mean_in, mean_out))

Mean in: 0.8536755660382582, mean out: 0.6521361212612233


In [None]:
# Train the 150 best model using a new random_state for slipt
regressors = []
scs = []
pbar = tqdm(best_items)
for s in pbar:
    random_state = s[1]
    pbar.set_description("Processing {} for random_state {}".format(s[0], random_state))
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.0001, random_state=random_state+1)
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)
    scs.append(sc)
    if s[0] == "RandomForestRegressor":
        regressor = RandomForestRegressor(
            random_state=random_state,
            n_estimators=50, 
            max_depth=3,
            bootstrap=False,
            max_features="log2",
            min_samples_leaf=1,
            min_samples_split=5)
    elif s[0] == "XGBRegressor":
        regressor = XGBRegressor(
            random_state=random_state,
            colsample_bytree=.7,
            gamma=0,
            learning_rate=.05,
            max_depth=3,
            min_child_weight=1,
            n_estimators=100,
            subsample=.8)
    elif s[0] == "GradientBoostingRegressor":
        regressor = GradientBoostingRegressor(
            random_state=random_state, 
            criterion='squared_error', 
            learning_rate=0.05, 
            max_depth=3, 
            n_estimators=150,
            max_features=1,
            min_samples_leaf=1,
            min_samples_split=3,
            subsample=.6)
    else:
        regressor = ElasticNet(
            random_state=random_state,
            alpha=0.01,
            l1_ration=0.8,
            max_iter=1000,
            tol=0.1)
        
    regressor.fit(X_train, y_train)
    regressors.append(regressor)

  0%|          | 0/150 [00:00<?, ?it/s]

In [None]:
# Save models to a picle file
import pickle
pickle.dump(regressors, open('model.pkl', 'wb'))
pickle.dump(scs, open('scalers.pkl', 'wb'))