In [None]:
#!sudo hostname -s 127.0.0.1

import findspark
# findspark.init() has to run before `import pyspark`; it points Python at the
# Spark installation given below.
# NOTE(review): this is a machine-specific Homebrew path; adjust it (or rely on
# SPARK_HOME) when running on another machine.
findspark.init('/usr/local/Cellar/apache-spark/2.4.0/libexec')

import pyspark
from pyspark.sql import SQLContext

import pandas as pd

# Use the classmethod form of getOrCreate so this cell can be re-run safely.
# Constructing SparkContext(...) directly (as the previous code did before
# calling .getOrCreate() on the instance) raises
# "Cannot run multiple SparkContexts at once" as soon as a context already
# exists in the kernel.
sc = pyspark.SparkContext.getOrCreate(pyspark.SparkConf().setAppName("testApp"))
sqlContext = SQLContext(sc)

In [None]:
#Train spark.ml.lr model
# https://towardsdatascience.com/machine-learning-with-pyspark-and-mllib-solving-a-binary-classification-problem-96396065d2aa

from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.types import *

# Toy data set: one integer feature and a binary label per record.
d = [{'name': 'A1', 'features': 60, 'lbl':1},
     {'name': 'B1', 'features': 55, 'lbl':1},
     {'name': 'A1', 'features': 30, 'lbl':0},
     {'name': 'B1', 'features': 20, 'lbl':0},
     {'name': 'A1', 'features': 10, 'lbl':0},
     {'name': 'B1', 'features': 30, 'lbl':0},
     {'name': 'A1', 'features': 25, 'lbl':0},
     {'name': 'B1', 'features': 49, 'lbl':0}]
schema = StructType([
    StructField("name", StringType(), True),
    StructField("features", IntegerType(), True),
    StructField("lbl", IntegerType(), True)
    ])
training = sqlContext.createDataFrame(sc.parallelize(d), schema)
training = training.select(training['features'], training['lbl'])
# training = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")

# spark.ml estimators need the features packed into a single Vector column;
# the raw integer `features` column cannot be fed to LogisticRegression as-is.
# The original `features` and `lbl` columns are kept and the assembled vector
# is added alongside them as `features_vec`.
assembler = VectorAssembler(inputCols=['features'], outputCol='features_vec')
training = assembler.transform(training)

# The label column is called `lbl` here, not the estimator default `label`,
# so both column names are passed explicitly.
lr = LogisticRegression(featuresCol='features_vec', labelCol='lbl',
                        maxIter=10, regParam=0.3, elasticNetParam=0.8)
lrModel = lr.fit(training)
print("Coefficients: " + str(lrModel.coefficients))
print("Intercept: " + str(lrModel.intercept))

# mlr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8, family="multinomial")
# mlrModel = mlr.fit(training)
# print("Multinomial coefficients: " + str(mlrModel.coefficientMatrix))
# print("Multinomial intercepts: " + str(mlrModel.interceptVector))

In [None]:
#Choose the right threshold for spark.ml.lr model

# Training summary attached to the fitted model.
trainingSummary = lrModel.summary

# Print the objective value recorded for each iteration, one per line.
objectiveHistory = trainingSummary.objectiveHistory
print("objectiveHistory:")
for objective in objectiveHistory:
    print(objective)

# Show the ROC curve points (a DataFrame) followed by the area under it.
trainingSummary.roc.show()
print("areaUnderROC: {}".format(trainingSummary.areaUnderROC))

# Find the threshold whose F-Measure is the largest and set it on the
# `lr` estimator.
fMeasure = trainingSummary.fMeasureByThreshold
_max_f_df = fMeasure.groupBy().max('F-Measure').select('max(F-Measure)')
maxFMeasure = _max_f_df.first()
_rows_at_max = fMeasure.filter(fMeasure['F-Measure'] == maxFMeasure['max(F-Measure)'])
bestThreshold = _rows_at_max.select('threshold').first()['threshold']
lr.setThreshold(bestThreshold)

In [None]:
# Column names used by the pandas / scikit-learn style cells.
lbl_col = 'lbl'
all_col = ['features']

# Build a pandas frame from the toy records `d`, repeated 100 times and
# re-indexed 0..n-1.
df = pd.concat([pd.DataFrame(d)] * 100, ignore_index=True)
df

In [None]:
# https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.over_sampling.SMOTE.html
# https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.over_sampling.SMOTENC.html

# BalancedRandomForestClassifier is used below, so it is imported here next to
# SMOTE; without it the cell fails with a NameError.
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.over_sampling import SMOTE

# NOTE(review): `sampling_ratio`, `k_neighbors` and `X_train` are not defined in
# any cell visible in this notebook, and `y_train` is only assigned in a later
# cell -- confirm they are defined before this cell runs.
# Oversample the training data with SMOTE.
X_train_smote, y_train_smote = SMOTE(sampling_strategy=sampling_ratio, k_neighbors = k_neighbors, n_jobs = 4).fit_resample(X_train, y_train)

# Fit a balanced random forest (150 trees, fixed seed) on the resampled data.
brf = BalancedRandomForestClassifier(n_estimators=150, random_state=0, n_jobs=-1, verbose=1)
brf.fit(X_train_smote, y_train_smote)

# Predict on the feature columns of the pandas frame `df`.
y_pred_brf = brf.predict(df[all_col])


In [None]:
import xgboost as xgb
from sklearn import metrics
from sklearn.model_selection import GridSearchCV

# Select feature columns and label column from the train and test frames.
# NOTE(review): `training_df`, `testing_df` and `numeric_col` are not defined in
# any cell visible here -- confirm they exist before this cell runs.
(train, y_train, test, y_test) = (
    training_df[numeric_col],
    training_df[lbl_col],
    testing_df[numeric_col],
    testing_df[lbl_col],
)

def auc(m, train, test):
    """Return ``(train_auc, test_auc)`` ROC-AUC scores for classifier ``m``.

    ``m`` must expose ``predict_proba``; column 1 of its output is used as the
    score. The true labels are read from the notebook-level variables
    ``y_train`` and ``y_test`` rather than being passed in.
    """
    train_auc = metrics.roc_auc_score(y_train, m.predict_proba(train)[:, 1])
    test_auc = metrics.roc_auc_score(y_test, m.predict_proba(test)[:, 1])
    return (train_auc, test_auc)

## Parameter Tuning
# model = xgb.XGBClassifier()
# param_dist = {"max_depth": [20,50],
#               "min_child_weight" : [3,6],
#               "n_estimators": [10,20],
#               "learning_rate": [0.1,0.16],}
# grid_search = GridSearchCV(model, param_grid=param_dist, cv = 2, verbose=10, n_jobs=-1)
# grid_search.fit(train, y_train)
# print(grid_search.best_estimator_)

# joblib is needed for joblib.dump below; it was not imported in any visible
# cell, which made this cell fail with a NameError after training.
import joblib

# Hyper-parameters for the final model.
max_depth, min_child_weight, n_estimators, learning_r = 25, 1, 200, 0.2
# NOTE(review): `verbose` is passed through unchanged from the original code;
# confirm it is a keyword the installed xgboost version actually honours.
model = xgb.XGBClassifier(max_depth=max_depth, learning_rate=learning_r, min_child_weight=min_child_weight,  n_estimators=n_estimators, n_jobs=-1, verbose=1) 
model.fit(train,y_train)

# Save model to file
# NOTE(review): `boosting_dataset_dir` is not defined in any visible cell, and
# the path is built by plain string concatenation, so the directory presumably
# has to end with a path separator -- confirm.
file_path = boosting_dataset_dir+"maxdepth_{}_minchildweight_{}_estimators_{}_lr_{}.joblib".format(max_depth,min_child_weight,n_estimators,learning_r) 
joblib.dump(model, file_path)
## Load model
# loaded_model = joblib.load(file_path)

# (train AUC, test AUC) for the fitted model.
print(auc(model, train, test))

# Column 1 of predict_proba on the test features, as a 0-indexed Series.
predicted_probabilities = pd.Series(model.predict_proba(test)[:,1]).reset_index(drop=True)
