In [2]:
import datetime
import pandas as pd
import numpy as np

from pyspark import SparkContext
from pyspark import SQLContext
from pyspark.sql.session import SparkSession


from pyspark.sql import DataFrameStatFunctions as statFunc

from pyspark.sql.functions import row_number
from pyspark.sql.window import Window
from pyspark.sql.functions import desc

from pyspark.sql import functions as F
from pyspark.sql import types as T
from pyspark.sql.types import IntegerType,FloatType,DoubleType

In [3]:
%local
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
# Show pandas floats with five decimal places.
pd.options.display.float_format = '{:.5f}'.format

## 1 Read data

In [3]:
# Load the labelled rate-level data, keeping only rows with a positive
# hotel_id and a non-null USD rate amount, then report the row count.
has_valid_hotel = F.col("hotel_id") > 0
has_rate_amount = F.col("src_rate_amount_usd").isNotNull()
search_df = (
    sqlContext.read
    .parquet('s3://ege-ds-workshops-corp/yixli/data_preparation/bk_rate_all_usd_df_with_label')
    .filter(has_valid_hotel)
    .filter(has_rate_amount)
)
print(search_df.count())

3887964

In [None]:
def getData(search_df,columns):
    """Select the modelling columns and add the derived helper columns.

    Args:
        search_df: Spark DataFrame of rate-level rows.
        columns: list of column names to keep from ``search_df``.

    Returns:
        Spark DataFrame with the selected columns plus:
          * ``rate_n``: count of ``rate_index`` values over the window
            partitioned by message_id, hotel_id, check_in_date,
            check_out_date and tuid;
          * ``row_index``: a unique row identifier (added only when the input
            does not already provide one);
        and with ``rate_index`` and the flag columns cast to integer and
        ``star_rating`` cast to float.
    """
    # create data_df with features we need
    data_df = search_df.select(columns)

    # add rate number
    search_key = Window.partitionBy("message_id","hotel_id","check_in_date","check_out_date",'tuid')
    data_df = data_df.withColumn('rate_n', F.count('rate_index').over(search_key))

    # pipData selects 'row_index', and the recorded NA-check output lists it
    # right after rate_n, but nothing visible in the notebook creates it.
    # Generate it here so a fresh Restart & Run All works. The ids are unique
    # but not consecutive.
    # NOTE(review): the original generator of row_index is not visible; confirm
    # that a monotonically increasing id is an acceptable substitute.
    if 'row_index' not in data_df.columns:
        data_df = data_df.withColumn('row_index', F.monotonically_increasing_id())

    # convert column types
    data_df = data_df.\
                 withColumn("rate_index", data_df.rate_index.cast("integer")).\
                withColumn("star_rating", data_df.star_rating.cast("float")).\
                withColumn("eligible_for_loyalty", data_df.eligible_for_loyalty.cast("integer")).\
                withColumn("free_breakfast", data_df.free_breakfast.cast("integer")).\
                withColumn("free_wifi", data_df.free_wifi.cast("integer")).\
                withColumn("free_parking", data_df.free_parking.cast("integer")).\
                withColumn("refundable", data_df.refundable.cast("integer"))
    return data_df

In [None]:
# Columns kept from the raw search data: identifiers, search filters,
# rate attributes, monetary amounts and the label.
columns = [
    'hotel_id', 'check_in_date', 'check_out_date', 'tuid', 'rate_type',
    'message_id', 'hotel_result_index', 'rate_index', 'message_date',
    'score_1', 'city', 'star_rating',
    'filter_want_in_policy_rates_only', 'filter_eligible_for_loyalty',
    'filter_free_breakfast', 'filter_free_wifi', 'filter_free_parking',
    'eligible_for_loyalty', 'free_breakfast', 'free_wifi', 'free_parking',
    'refundable',
    'src_rate_amount_usd', 'src_commission_base_usd', 'src_supply_revenue_usd',
    'label',
]
data_df = getData(search_df, columns)

## 2 Fill missing values

In [20]:
from pyspark.ml.feature import Imputer

In [21]:
# Count NULLs per column: F.when produces a value only for NULL rows,
# so F.count tallies exactly those rows.
null_counts = [
    F.count(F.when(F.col(name).isNull(), name)).alias(name)
    for name in data_df.columns
]
data_df.select(null_counts).show()

+--------+-------------+--------------+----+---------+----------+------------------+----------+------------+-------+----+-----------+--------------------------------+---------------------------+---------------------+----------------+-------------------+--------------------+--------------+---------+------------+----------+-------------------+-----------------------+----------------------+-----+------+---------+
|hotel_id|check_in_date|check_out_date|tuid|rate_type|message_id|hotel_result_index|rate_index|message_date|score_1|city|star_rating|filter_want_in_policy_rates_only|filter_eligible_for_loyalty|filter_free_breakfast|filter_free_wifi|filter_free_parking|eligible_for_loyalty|free_breakfast|free_wifi|free_parking|refundable|src_rate_amount_usd|src_commission_base_usd|src_supply_revenue_usd|label|rate_n|row_index|
+--------+-------------+--------------+----+---------+----------+------------------+----------+------------+-------+----+-----------+--------------------------------+------

In [None]:
def fillNa(data_df,fillcol):
    """Impute missing values in the given columns, overwriting them in place.

    Args:
        data_df: Spark DataFrame to impute.
        fillcol: list of column names; each is used as both the Imputer input
            and output column, so no new columns are created.

    Returns:
        Spark DataFrame with the imputed columns (Imputer default settings).
    """
    fitted_imputer = Imputer(inputCols=fillcol, outputCols=fillcol).fit(data_df)
    return fitted_imputer.transform(data_df)

In [22]:
# star_rating is the only column imputed; data_df is rebound to the imputed frame.
fillcol = ['star_rating']
data_df = fillNa(data_df,fillcol)

## 3 Feature transformation

In [23]:
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler,StandardScaler
def transformFeatures(categoricalColumns,booleanColumns,numericCols):
    """Build the (unfitted) feature-engineering stages for a Pipeline.

    Args:
        categoricalColumns: column names to string-index and one-hot encode.
        booleanColumns: column names passed to the assembler unchanged.
        numericCols: column names passed to the assembler unchanged.

    Returns:
        List of stages, in order: one StringIndexer + OneHotEncoderEstimator
        pair per categorical column, a VectorAssembler writing ``_features``,
        and a StandardScaler writing ``features`` (scaled by standard
        deviation, not mean-centred).
    """
    stages = []
    encodedCols = []
    for name in categoricalColumns:
        # Index the category strings, then one-hot encode the index.
        indexer = StringIndexer(inputCol=name, outputCol=name + "Index")
        onehot = OneHotEncoderEstimator(inputCols=[indexer.getOutputCol()],
                                        outputCols=[name + "classVec"])
        # Stages are only collected here; they run when the pipeline is fitted.
        stages.extend([indexer, onehot])
        encodedCols.append(name + "classVec")

    assembler = VectorAssembler(inputCols=encodedCols + booleanColumns + numericCols,
                                outputCol="_features")
    scaler = StandardScaler(inputCol="_features", outputCol="features",
                            withStd=True, withMean=False)
    stages.extend([assembler, scaler])
    return stages

In [None]:
# No categorical features are encoded in this run.
categoricalColumns = []
# Only the rate-attribute flags are fed to the model. The search-filter flags
# ('filter_want_in_policy_rates_only', 'filter_eligible_for_loyalty',
#  'filter_free_breakfast', 'filter_free_wifi', 'filter_free_parking')
# appear only in a commented-out alternative list and are not used.
booleanColumns = [
    "eligible_for_loyalty",
    "free_breakfast",
    "free_wifi",
    "free_parking",
    "refundable",
]
numericCols = [
    "src_rate_amount_usd",
    "rate_index",
    "star_rating",
    'rate_n',
]
stages = transformFeatures(categoricalColumns, booleanColumns, numericCols)

## 4 Split train, test data

In [24]:
from pyspark.ml import Pipeline
def pipData(stages,data_df):
    """Fit the feature pipeline on ``data_df`` and return the transformed rows.

    The feature columns are taken from the notebook-level lists
    ``categoricalColumns``, ``booleanColumns`` and ``numericCols``.

    Args:
        stages: list of pipeline stages (as built by ``transformFeatures``).
        data_df: Spark DataFrame containing ``label`` and the feature columns.

    Returns:
        Spark DataFrame with the selected columns plus the columns produced by
        the pipeline stages. The fitted pipeline model itself is not returned.
    """
    # row_index is carried along only when the input has it, so the function
    # also works on frames that were built without that identifier.
    keyCols = ['label'] + [c for c in ['row_index'] if c in data_df.columns]
    samples_df = data_df.select(keyCols + categoricalColumns + booleanColumns + numericCols)
    pipelineModel = Pipeline(stages = stages).fit(samples_df)
    samples_tm_df = pipelineModel.transform(samples_df)
    return samples_tm_df

In [25]:
# Transform the features, then split 80/20 with a fixed seed.
samples_tm_df = pipData(stages, data_df)
train, test = samples_tm_df.randomSplit([0.8, 0.2], seed = 917)

train_count = train.count()
test_count = test.count()
print("Training Dataset Count: " + str(train_count))
print("Test Dataset Count: " + str(test_count))

Training Dataset Count: 3119666
Test Dataset Count: 780281

### 4.1 Downsampling the training data

In [None]:
def downSampling(df,downSamplingLabel,fraction):
    """Down-sample one class of a binary-labelled DataFrame.

    Args:
        df: Spark DataFrame with a ``label`` column.
        downSamplingLabel: label value of the class to thin out. The kept
            class is selected as ``1 - downSamplingLabel``, so labels are
            expected to be 0/1.
        fraction: sampling fraction applied to the down-sampled class.

    Returns:
        Union of all rows of the kept class and the sampled rows of the other
        class (sampled without replacement, seed 917).
    """
    label = F.col('label')
    kept_rows = df.where(label == (1-downSamplingLabel))
    thinned_rows = df.where(label == downSamplingLabel).sample(False, fraction, seed = 917)
    return kept_rows.union(thinned_rows)

In [26]:
# Keep every label-1 row and a 40% sample of the label-0 rows.
train_final = downSampling(train, downSamplingLabel=0, fraction=0.4)

In [30]:
# Class balance of the down-sampled training set.
train_final.groupBy('label').count().show()

+-----+-------+
|label|  count|
+-----+-------+
|    1| 359271|
|    0|1104352|
+-----+-------+

## 5 Modeling and Evaluation

In [None]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
def fitLogisticRegression(train):
    """Fit a logistic regression on ``train``.

    Args:
        train: Spark DataFrame with a ``label`` column and a ``features``
            vector column.

    Returns:
        The fitted logistic regression model.
    """
    estimator = LogisticRegression(labelCol="label", featuresCol="features")
    return estimator.fit(train)

def evaluate(model, test):
    """Score ``test`` with ``model`` and compute the area under the ROC curve.

    Args:
        model: fitted model exposing ``transform``.
        test: Spark DataFrame to score.

    Returns:
        Tuple ``(predictions, AUC)``: the scored DataFrame and the
        areaUnderROC computed from its ``rawPrediction`` column.
    """
    scored = model.transform(test)
    auc_evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction",
                                                  metricName="areaUnderROC")
    return scored, auc_evaluator.evaluate(scored)

### 5.1 Baseline: Logistic regression

In [35]:
# Baseline: fit on the full training split and evaluate on the test split.
lrModel = fitLogisticRegression(train)
predictions, AUC = evaluate(lrModel, test)
print(AUC)

0.7410485623542459

In [36]:
# Row counts per (predicted class, true label) pair on the test split.
predictions.groupBy('prediction','label').count().show()

+----------+-----+------+
|prediction|label| count|
+----------+-----+------+
|       0.0|    0|690657|
|       1.0|    0|   208|
|       0.0|    1| 88931|
|       1.0|    1|   485|
+----------+-----+------+

In [30]:
# Coefficients of the currently bound logistic regression model.
lrModel.coefficients

DenseVector([-0.1405, 0.1943, 0.1313, -0.258, 0.1077, -0.5577, -0.8095, 0.0812, -0.1401])

### 5.2 Baseline: Logistic regression with downsampling

In [35]:
# Refit on the down-sampled training set; this rebinds lrModel, predictions and AUC.
# NOTE(review): the recorded AUC for this cell is identical to the 5.1 value and
# both cells show execution count 35 - the stored output looks stale; re-run to confirm.
lrModel = fitLogisticRegression(train_final)
predictions, AUC = evaluate(lrModel, test)
print(AUC)

0.7410485623542459

In [40]:
# Row counts per (predicted class, true label) pair for the down-sampled model.
predictions.groupBy('prediction','label').count().show()

+----------+-----+------+
|prediction|label| count|
+----------+-----+------+
|       0.0|    0|666394|
|       1.0|    0| 24471|
|       0.0|    1| 72499|
|       1.0|    1| 16917|
+----------+-----+------+

### 5.3 Random Forest with downsampling

In [41]:
from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder, TrainValidationSplit
from pyspark.ml.evaluation import BinaryClassificationEvaluator
# Initiate the RF model. The seed is fixed (same value as the split and the
# sampling seeds used elsewhere in this notebook) so repeated runs build the
# same forest; without it every run gives slightly different metrics.
rf = RandomForestClassifier(featuresCol='features', labelCol='label', seed=917)

# Hyper-parameter grid searched by cross validation: 2 depths x 2 bin counts,
# with the number of trees held at 100.
paramGrid = (ParamGridBuilder()
             .addGrid(rf.maxDepth, [4,6])
             .addGrid(rf.maxBins, [20,40])
             .addGrid(rf.numTrees, [100])
             .build())

def cv_train_model(model,paramGrid,evaluator,numFolds,trainData_new):
    """Tune ``model`` over ``paramGrid`` with k-fold cross validation.

    Args:
        model: estimator to tune.
        paramGrid: parameter maps to try.
        evaluator: evaluator used to compare parameter settings.
        numFolds: number of cross-validation folds.
        trainData_new: Spark DataFrame to fit on.

    Returns:
        The fitted cross-validator model; its best model is printed as a
        side effect.
    """
    cross_validator = CrossValidator(estimator=model,
                                     estimatorParamMaps=paramGrid,
                                     evaluator=evaluator,
                                     numFolds=numFolds)
    fitted_cv = cross_validator.fit(trainData_new)
    print (fitted_cv.bestModel)
    return fitted_cv

In [46]:
# `evaluator` is otherwise only a local variable inside evaluate(), so it must
# be defined here for this cell to run on a fresh kernel. It uses the same
# metric as evaluate(): area under ROC on the rawPrediction column.
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction",
                                          metricName="areaUnderROC")
# train model (5-fold cross validation on the down-sampled training set)
rfCvModel = cv_train_model(rf,paramGrid,evaluator,5,train_final)
# evaluate on test data
rf_cv_model_test_prediction, AUC = evaluate(rfCvModel,test)
print(AUC)

0.7565660886231497

In [47]:
# Row counts per (predicted class, true label) pair for the cross-validated RF model.
rf_cv_model_test_prediction.groupBy('prediction','label').count().show()

+----------+-----+------+
|prediction|label| count|
+----------+-----+------+
|       0.0|    0|675436|
|       1.0|    0| 15429|
|       0.0|    1| 73379|
|       1.0|    1| 16037|
+----------+-----+------+

In [None]:
# 10% sample (without replacement, fixed seed) of the RF test predictions;
# the next cell exports it to the local kernel with `%%spark -o`.
sample_predictions = rf_cv_model_test_prediction.sample(
    withReplacement=False, fraction=0.1, seed=917)

In [89]:
%%spark -o sample_predictions

### ROC curve and AUC

In [91]:
%%local
from sklearn.metrics import roc_curve,auc
import matplotlib.pyplot as plt

# Each exported probability entry is indexed as dv['values'][i];
# element 1 is used as the score for the positive class (pos_label=1).
labels = sample_predictions["label"]
probabilities = sample_predictions["probability"]
prob = [dv['values'][1] for dv in probabilities]

fpr, tpr, thresholds = roc_curve(labels, prob, pos_label=1)
roc_auc = auc(fpr, tpr)

# ROC curve with the chance diagonal for reference.
fig1 = plt.figure(figsize=(7,7))
plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([-0.01, 1.0])
plt.ylim([-0.01, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc="lower right")
plt.savefig('image/roc3.png')

### Distribution of probabilities

In [92]:
%%local
# Density of the predicted probability, one curve per true label.
# `prob` is the list built in the ROC cell; it is stored on the local frame
# as a new column. (A plain histogram / single overall density were the
# alternatives previously left commented out here.)
fig2 = plt.figure(figsize=(7,7))
sample_predictions['prob'] = prob
prob_by_label = sample_predictions.groupby("label").prob
prob_by_label.plot(kind='density', xlim=[-0.01,1.01])
plt.title('Distribution of predicted probabilities')
plt.xlabel('Predict. prob.')
plt.ylabel('Prob')
plt.legend(loc="lower right")
plt.xlim(0,1)
plt.savefig('image/prob3.png')

### Feature importance

In [48]:
# extract feature importance from RF model
def extractFeatureImportance(featureImp, df, featuresCol):
    """Pair each assembled feature with its importance score.

    Args:
        featureImp: indexable collection of importance scores, addressed by
            feature position.
        df: DataFrame whose schema carries ``ml_attr`` metadata on
            ``featuresCol``.
        featuresCol: name of the assembled vector column (e.g. ``_features``).

    Returns:
        pandas DataFrame of the attribute entries (including ``idx`` and
        ``name``) plus a ``score`` column, sorted by score descending, with
        scores rounded to 4 decimals and the '_importantclassVec' /
        'classVec' markers stripped from the names.
    """
    attrs_by_kind = df.schema[featuresCol].metadata["ml_attr"]["attrs"]

    # Flatten the per-kind attribute lists into one list of attribute dicts.
    attr_rows = []
    for kind in attrs_by_kind:
        attr_rows.extend(attrs_by_kind[kind])

    def score_at(position):
        return featureImp[position]

    ranked = pd.DataFrame(attr_rows)
    ranked['score'] = ranked['idx'].apply(score_at)
    ranked = ranked.sort_values('score', ascending = False)
    ranked['score'] = ranked['score'].round(4)
    ranked['name'] = [
        raw_name.replace('_importantclassVec','').replace('classVec','')
        for raw_name in ranked['name']
    ]
    return ranked

In [50]:
# Feature importances of the best cross-validated RF model. Feature names are
# read from the "_features" column metadata of the scored test frame.
feature_importance = extractFeatureImportance(rfCvModel.bestModel.featureImportances, 
                              rf_cv_model_test_prediction, "_features")
feature_importance

   idx                  name   score
6    6            rate_index  0.4731
8    8                rate_n  0.2618
5    5   src_rate_amount_usd  0.1246
7    7           star_rating  0.0653
0    0  eligible_for_loyalty  0.0268
3    3          free_parking  0.0207
1    1        free_breakfast  0.0148
2    2             free_wifi  0.0096
4    4            refundable  0.0032

In [None]:
def extractWeights(model,feature_importance):
    """List a linear model's coefficients next to their feature names.

    Args:
        model: fitted model exposing ``coefficients``.
        feature_importance: pandas DataFrame with ``idx`` and ``name``
            columns; names are taken in ``idx`` order and paired
            positionally with the coefficients.

    Returns:
        Spark DataFrame with columns ``weights`` and ``features``, ordered by
        absolute weight, largest first.
    """
    names_by_idx = feature_importance.sort_values(by=['idx'])['name'].values
    pairs = [(float(weight), name) for weight, name in zip(model.coefficients, names_by_idx)]
    weightsDF = sqlContext.createDataFrame(sc.parallelize(pairs), ['weights','features'])
    # The helper column exists only to sort by magnitude and is dropped again.
    return (
        weightsDF
        .withColumn('weights_abs', F.abs(F.col('weights')))
        .orderBy(["weights_abs"], ascending=False)
        .drop("weights_abs")
    )

In [51]:
# extract weights from LR model
weightsDF = extractWeights(lrModel,feature_importance)
# Show every row untruncated. The row count comes from feature_importance
# (one row per feature); `features` exists only inside extractWeights and is
# not defined at notebook scope.
weightsDF.show(len(feature_importance),False)

+--------------------+--------------------+
|weights             |features            |
+--------------------+--------------------+
|-0.7580464341332018 |rate_index          |
|-0.5986107251283161 |src_rate_amount_usd |
|-0.28631360350362706|free_parking        |
|-0.1942755570299147 |rate_n              |
|-0.19101800670441726|eligible_for_loyalty|
|0.14795973395282574 |free_breakfast      |
|0.136140129616197   |free_wifi           |
|0.09772318452500461 |refundable          |
|0.09125523433152447 |star_rating         |
+--------------------+--------------------+

## 6 Prediction

### 6.1 Read rate-level data

In [52]:
# read rate level df (last search, hotels below PL, both booked and not booked)
# This frame is the input to getData() in the prediction step below.
last_search_df = sqlContext.read.parquet('s3://ege-ds-workshops-corp/yixli/data_preparation/rate_all_usd_2019')

### 6.2 Transform data

In [None]:
# Build the prediction frame with the same column list and the same
# star_rating imputation as the training section. Note that `columns`,
# `fillcol` and `data_df` are rebound here.
columns = [
    'hotel_id', 'check_in_date', 'check_out_date', 'tuid', 'rate_type',
    'message_id', 'hotel_result_index', 'rate_index', 'message_date',
    'score_1', 'city', 'star_rating',
    'filter_want_in_policy_rates_only', 'filter_eligible_for_loyalty',
    'filter_free_breakfast', 'filter_free_wifi', 'filter_free_parking',
    'eligible_for_loyalty', 'free_breakfast', 'free_wifi', 'free_parking',
    'refundable',
    'src_rate_amount_usd', 'src_commission_base_usd', 'src_supply_revenue_usd',
    'label',
]
data_df = getData(last_search_df, columns)

# Fill NA
fillcol = ['star_rating']
data_df = fillNa(data_df, fillcol)

### 6.3 Pipe data into the model

In [None]:
def predictData(model,data_df,pipelineModel=None):
    """Run the feature pipeline and ``model`` on ``data_df`` and add ``prob``.

    Args:
        model: fitted classifier (or cross-validator model) exposing
            ``transform`` and producing a ``probability`` vector column.
        data_df: Spark DataFrame containing the raw feature columns.
        pipelineModel: optional already-fitted feature pipeline. When omitted,
            a pipeline is built from the notebook-level ``stages`` and fitted
            on ``data_df`` itself.

    Returns:
        Spark DataFrame with the model's prediction columns plus ``prob``,
        the predicted probability of label 1.
    """
    if pipelineModel is None:
        # The original code called `pipeline.fit(...)`, but `pipeline` only
        # exists as a local inside pipData, so it raised NameError here.
        # NOTE(review): fitting on data_df re-estimates the scaler on the
        # prediction data, so features are scaled differently than at training
        # time. Prefer passing the pipeline model fitted on the training data
        # (pipData currently does not return it).
        pipelineModel = Pipeline(stages=stages).fit(data_df)
    data_tm_df = pipelineModel.transform(data_df)
    prediction = model.transform(data_tm_df)
    # Extract the probability of the positive class (label 1). Element 1 is the
    # same element the ROC cell uses for pos_label=1; the previous code took
    # element 0, i.e. the probability of label 0.
    # NOTE(review): confirm that label 1 is the "rate picked up" class.
    unlist = F.udf(lambda x: float(list(x)[1]), DoubleType())
    prediction = prediction.withColumn('prob',unlist('probability'))
    return prediction

In [62]:
# Score the prediction frame with the cross-validated RF model.
prediction = predictData(rfCvModel,data_df)

In [66]:
# calculate rate revenue: supply revenue weighted by the predicted probability
weighted_revenue = F.col('src_supply_revenue_usd') * F.col('prob')
prediction = prediction.withColumn('rate_revenue', weighted_revenue)

In [68]:
# save prediction: drop the vector / raw-score columns, then write a single
# parquet part, overwriting any previous output.
model_vector_cols = ['_features', 'features', 'rawPrediction', 'probability']
prediction_output = prediction.drop(*model_vector_cols)
# NOTE: `dir` shadows the Python builtin; the name is kept in case later cells read it.
dir = 's3://ege-ds-workshops-corp/yixli/prediction/'
single_part = prediction_output.repartition(1)
single_part.write.mode('overwrite').parquet(dir+'prediction')