Machine Learning Algorithms
----------

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import preprocessing

# Build the modelling frame: the label column followed by the base features.
df_model = df_model_saved[['label'] + features_base]

# Rescale every column (label included) with a min-max scaler.
# NOTE(review): the scaler is fitted on all rows before the train/validation
# split, so validation rows influence the scaling -- confirm this is intended.
min_max_scaler = preprocessing.MinMaxScaler()
np_scaled = min_max_scaler.fit_transform(df_model)
df_model_norm = pd.DataFrame(np_scaled, columns=['label'] + features_base)

# Recode the scaled label: truthy (non-zero) values become +1, zero becomes -1.
df_model_norm['label'] = df_model_norm['label'].map(lambda x: 1 if x else -1)
train, validation = train_test_split(df_model_norm, test_size=0.2)
# Renumber the validation rows 0..n-1 so that they align with `prediction`
# (which has a fresh default index) in the join below. The previous index is
# kept as an `index` column, exactly as the in-place call did.
validation = validation.reset_index()

# Cross-validated logistic regression over a log-spaced grid of C values
# (10^-10 .. 10^9), selecting by ROC AUC.
model = LogisticRegressionCV(
    Cs=list(np.power(10.0, np.arange(-10, 10))),
    penalty='l2',  # 'l1', 'l2'
    scoring='roc_auc',  # 'accuracy', 'roc_auc', 'neg_log_loss'
    cv=5,
    random_state=777,
    max_iter=10000,
    fit_intercept=True,
    solver='newton-cg',  # 'newton-cg', 'lbfgs', 'sag', 'liblinear'
    # NOTE(review): tol=10 is far looser than scikit-learn's default of 1e-4;
    # confirm this stopping tolerance is intended.
    tol=10
)
model_fit = model.fit(train[features_base], train['label'])
# Coefficients weighted by each feature's standard deviation on the training set.
feature_importance = np.std(train[features_base], 0) * model_fit.coef_[0]

# The first column is the probability that the entry has the -1 label
# and the second column is the probability that the entry has the +1 label.
prediction = pd.DataFrame(
    model_fit.predict_proba(validation[features_base]),
    columns=['NotCompleted', 'Completed'],
)
print('For all market, accuracy %.3f'%(model_fit.score(validation[features_base], validation['label'])))

df_check = validation[['label']].join(prediction)

# Sweep the decision threshold over 1/30 .. 20/30 and keep the one with the
# highest F1 score, together with its precision and recall.
i_m, F1 = 0, 0
precision, recall = 0, 0
Y_true = df_check['label']  # loop-invariant, so looked up once
# Divide by 30.0 (not 30) so the thresholds stay fractional even under
# integer division, matching the `i/10.0` style used elsewhere in this file.
for i in [j / 30.0 for j in range(1, 21)]:
    Y_predict = df_check['Completed'].map(lambda x: 1 if x > i else -1)
    f = metrics.f1_score(Y_true, Y_predict)
    p = metrics.precision_score(Y_true, Y_predict)
    r = metrics.recall_score(Y_true, Y_predict)
    if f > F1:
        i_m, F1 = i, f
        precision, recall = p, r
print('Maximum F1 score is %.3f at %.3f with precision %.3f and recall %.3f.'%(F1, i_m, precision, recall))

### Random Forest Classification

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Build the modelling frame: the label column followed by the base features.
df_model = df_model_saved[['label'] + features_base]

# Rescale every column (label included) with a min-max scaler.
# NOTE(review): the scaler is fitted on all rows before the train/validation
# split, so validation rows influence the scaling -- confirm this is intended.
min_max_scaler = preprocessing.MinMaxScaler()
np_scaled = min_max_scaler.fit_transform(df_model)
df_model_norm = pd.DataFrame(np_scaled, columns=['label'] + features_base)

# Recode the scaled label: truthy (non-zero) values become +1, zero becomes -1.
df_model_norm['label'] = df_model_norm['label'].map(lambda x: 1 if x else -1)
train, validation = train_test_split(df_model_norm, test_size=0.2)
# Renumber the validation rows 0..n-1; the previous index is kept as an
# `index` column, exactly as the in-place call did.
validation = validation.reset_index()

# Exhaustive grid search over random-forest hyper-parameters with 8-fold CV.
model = RandomForestClassifier(n_jobs=-1)
parameters = {
    "n_estimators": [20, 50, 100, 200, 300],
    "max_depth": [4, 8, 16, 24, 32],
    "min_samples_leaf": [1, 2, 4, 8, 16],
    # 'auto' was only an alias of 'sqrt' for RandomForestClassifier, so listing
    # both doubled the grid without adding a distinct setting; the alias is
    # also rejected by scikit-learn >= 1.3.
    "max_features": ['sqrt'],
    "criterion": ['gini', 'entropy'],
    "bootstrap": [True, False],
}
model_cv = GridSearchCV(estimator=model, param_grid=parameters, cv=8)
model_cv.fit(train[features_base], train['label'])
print(model_cv.best_params_)

### XGBoost Regression And Random Forest Regression

In [None]:
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# GridSearchCV is otherwise only imported in another cell; importing it here
# lets this cell run on its own.
from sklearn.model_selection import GridSearchCV

# What model to use: 'xgb' (XGBRegressor) or 'rf' (RandomForestRegressor)
flag = 'xgb'

# Get train, test and predict DataFrame
train_data = sales[features_cols]
train_label = sales['log_diff']

train_data, test_data, train_label, test_label = train_test_split(
    train_data, train_label, test_size=0.05, random_state=42
)

# Rows to predict: those whose fscl_mn_id equals time_filter.
# NOTE(review): these rows come from `sales`, the same frame the training
# split is drawn from, so most of them are also training rows -- confirm
# that predicting in-sample is intended.
predict_mask = sales.fscl_mn_id == time_filter
predict_data = sales[predict_mask][features_cols]
predict_label = sales[predict_mask]['log_diff']

# Grid search and cross validation
if flag == 'xgb':
    # NOTE(review): newer XGBoost releases replace `silent` with `verbosity`;
    # check against the installed version.
    xgb = XGBRegressor(n_jobs=-1, silent=1, subsample=0.8, eval_metric='rmse')
    params = {
        "max_depth": [3, 4, 5, 6, 7, 8],
        "n_estimators": [20, 40, 60, 80, 100, 120, 140, 160, 180, 200, 240, 280, 320, 400],
        "min_child_weight": [1, 3, 5],
        "gamma": [i/10.0 for i in range(3, 6)],
        "colsample_bytree": [i/10.0 for i in range(6, 11)],
    }
    grid = GridSearchCV(estimator=xgb, param_grid=params, cv=8)
elif flag == 'rf':
    rf = RandomForestRegressor(n_jobs=-1)
    params = {
        "n_estimators": [20, 40, 60, 80, 100, 120, 140, 160, 180, 200],
        "max_depth": [4, 6, 8, 10, 12],
        "min_samples_leaf": [1, 2, 4, 8, 10, 12],
        # NOTE(review): scikit-learn 1.0 renamed these criteria to
        # 'squared_error' / 'absolute_error' and later removed the old names;
        # update if running on a newer release.
        "criterion": ['mse', 'mae'],
    }
    grid = GridSearchCV(estimator=rf, param_grid=params, cv=8)
else:
    # Fail loudly instead of hitting a NameError below, or silently reusing a
    # `grid` left over from an earlier run of this cell.
    raise ValueError("Unknown model flag %r; expected 'xgb' or 'rf'." % (flag,))

grid.fit(train_data, train_label)

print(grid.best_params_)

# Get prediction
ypred = grid.best_estimator_.predict(predict_data)