In [5]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import itertools
import time
import itertools as it
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

from sklearn.model_selection import train_test_split, cross_val_predict, cross_val_score, cross_validate
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc, precision_recall_curve
from sklearn.linear_model import Ridge, RidgeCV, Lasso, LassoCV
from sklearn.preprocessing import StandardScaler 
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score, roc_curve, auc, precision_score, recall_score, confusion_matrix
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV, cross_val_predict, RandomizedSearchCV
from sklearn.ensemble import BaggingRegressor,BaggingClassifier,RandomForestRegressor,RandomForestClassifier, AdaBoostRegressor, AdaBoostClassifier, GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.impute import KNNImputer
from sklearn import preprocessing
from sklearn import metrics
from sklearn.feature_selection import SequentialFeatureSelector, SelectKBest, f_regression
import warnings
np.warnings.filterwarnings('ignore')

In [6]:
# Read the training and test tables from CSV files in the working directory.
# The generator is consumed left to right, so `train` is read before `test`.
train, test = (
    pd.read_csv(csv_name)
    for csv_name in ('kaggle_train.csv', 'kaggle_test.csv')
)

In [7]:
# Split the training table into predictors and the target column "y".
X_train = train.drop(columns="y")
y_train = train["y"]

# The test table is used as-is for the test predictors (same object, no copy).
X_test = test

# Remember the column labels so they can be re-attached after transformers
# that return plain arrays.
X_train_columns, X_test_columns = X_train.columns, X_test.columns

In [15]:
# Scale the data: the scaler is fitted on the training predictors only and the
# same fitted transform is then applied to both train and test.
# NOTE: X_train / X_test are overwritten here, so re-running this cell operates
# on the already-scaled frames rather than on the raw data.
scaler = StandardScaler().fit(X_train)
X_train = pd.DataFrame(scaler.transform(X_train), columns = X_train_columns)
X_test = pd.DataFrame(scaler.transform(X_test), columns = X_train_columns)

# Impute the missing values with a KNN imputer.
# The imputer is fitted once on the training data and reused for the test data,
# mirroring how the scaler is handled above. Fitting a second, independent
# imputer on the test set would fill test gaps from test-set neighbours only,
# making train and test preprocessing inconsistent.
imputer = KNNImputer(n_neighbors = 8).fit(X_train)
X_imputed = pd.DataFrame(imputer.transform(X_train), columns = X_train.columns)
X_test_imputed = pd.DataFrame(imputer.transform(X_test), columns = X_test.columns)

# SelectKBest to select the best 500 features of the dataset (scored with f_regression)
# Fit using the imputed X_train data
# get_support() gives a Boolean array: True for the predictors selected, False for the predictors removed
selected_features = SelectKBest(f_regression, k = 500).fit(X_imputed, y_train).get_support()

In [16]:
# Keep only the training columns flagged True in the SelectKBest support mask
X_train_small = X_imputed.loc[:, selected_features]

In [17]:
# Apply the same SelectKBest support mask to the imputed test columns
X_test_small = X_test_imputed.loc[:, selected_features]

In [6]:
# Coarse search: randomized sampling over a wide random-forest parameter space
# Candidate values per hyper-parameter; max_depth covers 1, 21, 41, ..., 201.
param_grid = dict(
    max_depth=list(range(1, 202, 20)),
    n_estimators=[100, 200, 400, 800],
    min_samples_split=[2, 4, 8],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2'],
    bootstrap=[True, False],
)

# Two shuffled folds with a fixed seed so the split is repeatable.
cv = KFold(n_splits=2, shuffle=True, random_state=1)

# Sample 50 parameter combinations and score each by negated mean absolute
# error across the folds above, using all available cores.
rf_coarse_grid_search = RandomizedSearchCV(
    RandomForestRegressor(random_state=1),
    param_grid,
    n_iter=50,
    scoring='neg_mean_absolute_error',
    cv=cv,
    n_jobs=-1,
    verbose=1,
    random_state=1,
)

rf_coarse_grid_result = rf_coarse_grid_search.fit(X_train_small, y_train)

# Tabulate the per-candidate cross-validation results for inspection
rf_cv_results = pd.DataFrame(rf_coarse_grid_result.cv_results_)

Fitting 2 folds for each of 50 candidates, totalling 100 fits


In [7]:
# Report the winning candidate of the randomized search above.
# The search was scored with 'neg_mean_absolute_error', so best_score_ is the
# negated MAE (a value closer to zero is better).
print('Best MAE Score Through Grid Search : {:.3f}'.format(rf_coarse_grid_search.best_score_))
print('Best Parameters : ', rf_coarse_grid_search.best_params_)

Best MAE Score Through Grid Search : -5.367
Best Parameters :  {'n_estimators': 400, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 201, 'bootstrap': False}


In [18]:
# Final random forest, configured with the best parameters printed by the
# coarse search plus a cap of 4000 leaf nodes per tree.
rf_model = RandomForestRegressor(
    n_estimators=400,
    max_depth=201,
    max_features='sqrt',
    min_samples_split=2,
    min_samples_leaf=4,
    max_leaf_nodes=4000,
    bootstrap=False,
    random_state=1,
    n_jobs=-1,
    verbose=1,
)
rf_model.fit(X_train_small, y_train)

# Predict the target for the selected test features
pred = rf_model.predict(X_test_small)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    3.9s
[Parallel(n_jobs=-1)]: Done 400 out of 400 | elapsed:    8.6s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 400 out of 400 | elapsed:    0.3s finished


In [19]:
# Wrap the predictions in a Series named 'y' and raise every value <= 1 to 1
y = pd.Series(pred, name='y')
y.loc[y <= 1] = 1

# Pair each test id with its prediction; the column labels come from the
# Series names ('id' and 'y'), then write the file without the row index.
sub = pd.concat([test['id'], y], axis=1)
sub.to_csv('submission.csv', index=False)