In [2]:
import datetime
import pandas as pd
import numpy as np

from pyspark import SparkContext
from pyspark import SQLContext
from pyspark.sql.session import SparkSession


from pyspark.sql import DataFrameStatFunctions as statFunc

from pyspark.sql.functions import row_number
from pyspark.sql.window import Window
from pyspark.sql.functions import desc

from pyspark.sql import functions as F
from pyspark.sql import types as T
from pyspark.sql.types import IntegerType,FloatType,DoubleType

In [3]:
%local
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
pd.options.display.float_format = '{:.5f}'.format

## Read data

In [4]:
search_df = sqlContext.read.parquet('s3://ege-ds-workshops-corp/yixli/data_preparation/bk_rate_all_usd_df_with_label')

# load_date between '20200301' and '20200401'

search_df = search_df.\
            filter(F.col("hotel_id")>0).\
            filter(F.col("src_rate_amount_usd").isNotNull())
print(search_df.count())

3899947

In [5]:
search_df.show(10)

+--------------------+--------+-------------+--------------+--------+---------+----------+--------------------+------------------+-----------+--------------+-------------------+----------------+-----------+--------------------------------+---------------------------+---------------------+----------------+-------------------+--------------------+--------------+---------+------------+----------+-------------------+-----------------------+----------------------+-----+
|          message_id|hotel_id|check_in_date|check_out_date|    tuid|rate_type|rate_index|        message_date|hotel_result_index|hotel_index|bk_hotel_index|            score_1|            city|star_rating|filter_want_in_policy_rates_only|filter_eligible_for_loyalty|filter_free_breakfast|filter_free_wifi|filter_free_parking|eligible_for_loyalty|free_breakfast|free_wifi|free_parking|refundable|src_rate_amount_usd|src_commission_base_usd|src_supply_revenue_usd|label|
+--------------------+--------+-------------+--------------+

In [6]:
search_df.printSchema()

root
 |-- message_id: string (nullable = true)
 |-- hotel_id: integer (nullable = true)
 |-- check_in_date: date (nullable = true)
 |-- check_out_date: date (nullable = true)
 |-- tuid: integer (nullable = true)
 |-- rate_type: string (nullable = true)
 |-- rate_index: integer (nullable = true)
 |-- message_date: timestamp (nullable = true)
 |-- hotel_result_index: integer (nullable = true)
 |-- hotel_index: integer (nullable = true)
 |-- bk_hotel_index: integer (nullable = true)
 |-- score_1: double (nullable = true)
 |-- city: string (nullable = true)
 |-- star_rating: string (nullable = true)
 |-- filter_want_in_policy_rates_only: boolean (nullable = true)
 |-- filter_eligible_for_loyalty: boolean (nullable = true)
 |-- filter_free_breakfast: boolean (nullable = true)
 |-- filter_free_wifi: boolean (nullable = true)
 |-- filter_free_parking: boolean (nullable = true)
 |-- eligible_for_loyalty: boolean (nullable = true)
 |-- free_breakfast: boolean (nullable = true)
 |-- free_wifi: b

In [7]:
search_df.groupBy('label').count().show()

+-----+-------+
|label|  count|
+-----+-------+
|    1| 448687|
|    0|3451260|
+-----+-------+

In [8]:
search_df.groupBy("eligible_for_loyalty").count().show()

+--------------------+-------+
|eligible_for_loyalty|  count|
+--------------------+-------+
|                true| 655551|
|               false|3244396|
+--------------------+-------+

In [9]:
search_df.groupBy("free_breakfast").count().show()

+--------------+-------+
|free_breakfast|  count|
+--------------+-------+
|          true|1933153|
|         false|1966794|
+--------------+-------+

In [10]:
search_df.groupBy("free_wifi").count().show()

+---------+-------+
|free_wifi|  count|
+---------+-------+
|     true|3584138|
|    false| 315809|
+---------+-------+

In [11]:
search_df.groupBy("free_parking").count().show()

+------------+-------+
|free_parking|  count|
+------------+-------+
|        true|1669856|
|       false|2230091|
+------------+-------+

In [12]:
search_df.groupBy("refundable").count().show()

+----------+-------+
|refundable|  count|
+----------+-------+
|      true|2835990|
|     false|1063957|
+----------+-------+

In [13]:
search_df.groupBy("rate_type").count().show()

+---------+-------+
|rate_type|  count|
+---------+-------+
|     EPRA| 222840|
|     EPRM| 464910|
|     ESRM|1876222|
|     ESRA| 804932|
|      GDS| 531043|
+---------+-------+

In [93]:
search_sample = search_df.select('rate_type','src_supply_revenue_usd').sample(False, 0.1, 917)

In [94]:
%%spark -o search_sample

In [97]:
%local
fig3 = plt.figure(figsize=(7,7))
search_sample = search_sample.sort_values(by=['rate_type'])
fig3 = sns.boxplot(x='rate_type', y='src_supply_revenue_usd',data=search_sample)
fig3.set(xlabel='rate_type', ylabel='rate_revenue(USD)')
fig3.set_ylim(bottom=0,top=80)
plt.tight_layout()
plt.savefig('image/rate_revenue_by_type.png')

In [102]:
%local
# Median supply revenue per rate type.
# The reducer is named by string ('median') instead of passing np.median:
# recent pandas releases deprecate handing NumPy callables to groupby().agg()
# and emit a FutureWarning, while the string form selects the same reduction.
search_sample = search_sample.groupby('rate_type').agg({'src_supply_revenue_usd': 'median'}).reset_index()

In [103]:
%local
search_sample





VBox(children=(HBox(children=(HTML(value='Type:'), Button(description='Table', layout=Layout(width='70px'), st…

Output()

In [14]:
data_df = search_df.select('hotel_id', 'check_in_date', 'check_out_date', 'tuid', 'rate_type', 'message_id', 'hotel_result_index', 'rate_index', 'message_date', 'score_1', 'city', 'star_rating', 'filter_want_in_policy_rates_only', 'filter_eligible_for_loyalty', 'filter_free_breakfast', 'filter_free_wifi', 'filter_free_parking', 'eligible_for_loyalty', 'free_breakfast', 'free_wifi', 'free_parking', 'refundable', 'src_rate_amount_usd', 'src_commission_base_usd', 'src_supply_revenue_usd','label')
                         

In [15]:
data_df.count()

3899947

In [16]:
data_df.printSchema()

root
 |-- hotel_id: integer (nullable = true)
 |-- check_in_date: date (nullable = true)
 |-- check_out_date: date (nullable = true)
 |-- tuid: integer (nullable = true)
 |-- rate_type: string (nullable = true)
 |-- message_id: string (nullable = true)
 |-- hotel_result_index: integer (nullable = true)
 |-- rate_index: integer (nullable = true)
 |-- message_date: timestamp (nullable = true)
 |-- score_1: double (nullable = true)
 |-- city: string (nullable = true)
 |-- star_rating: string (nullable = true)
 |-- filter_want_in_policy_rates_only: boolean (nullable = true)
 |-- filter_eligible_for_loyalty: boolean (nullable = true)
 |-- filter_free_breakfast: boolean (nullable = true)
 |-- filter_free_wifi: boolean (nullable = true)
 |-- filter_free_parking: boolean (nullable = true)
 |-- eligible_for_loyalty: boolean (nullable = true)
 |-- free_breakfast: boolean (nullable = true)
 |-- free_wifi: boolean (nullable = true)
 |-- free_parking: boolean (nullable = true)
 |-- refundable: bool

In [17]:
data_df = data_df.withColumn('rate_n',F.count('rate_index').\
                                                  over(Window.partitionBy("message_id","hotel_id","check_in_date","check_out_date",'tuid')))

In [18]:
data_df = data_df.withColumn('row_index', row_number().over(Window.orderBy(F.monotonically_increasing_id())))

In [19]:
data_df = data_df.\
             withColumn("rate_index", data_df.rate_index.cast("integer")).\
            withColumn("star_rating", data_df.star_rating.cast("float")).\
            withColumn("eligible_for_loyalty", data_df.eligible_for_loyalty.cast("integer")).\
            withColumn("free_breakfast", data_df.free_breakfast.cast("integer")).\
            withColumn("free_wifi", data_df.free_wifi.cast("integer")).\
            withColumn("free_parking", data_df.free_parking.cast("integer")).\
            withColumn("refundable", data_df.refundable.cast("integer"))

## Fill missing values

In [20]:
from pyspark.ml.feature import Imputer

In [21]:
data_df.select([F.count(F.when(F.col(c).isNull(), c)).alias(c) for c in data_df.columns]).show()

+--------+-------------+--------------+----+---------+----------+------------------+----------+------------+-------+----+-----------+--------------------------------+---------------------------+---------------------+----------------+-------------------+--------------------+--------------+---------+------------+----------+-------------------+-----------------------+----------------------+-----+------+---------+
|hotel_id|check_in_date|check_out_date|tuid|rate_type|message_id|hotel_result_index|rate_index|message_date|score_1|city|star_rating|filter_want_in_policy_rates_only|filter_eligible_for_loyalty|filter_free_breakfast|filter_free_wifi|filter_free_parking|eligible_for_loyalty|free_breakfast|free_wifi|free_parking|refundable|src_rate_amount_usd|src_commission_base_usd|src_supply_revenue_usd|label|rate_n|row_index|
+--------+-------------+--------------+----+---------+----------+------------------+----------+------------+-------+----+-----------+--------------------------------+------

In [22]:
imputer = Imputer(inputCols=['star_rating'],
outputCols=['star_rating'])
data_df = imputer.fit(data_df).transform(data_df)

## Features

In [23]:
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler,StandardScaler

categoricalColumns = []
#booleanColumns = ['filter_want_in_policy_rates_only', 'filter_eligible_for_loyalty', 'filter_free_breakfast', 'filter_free_wifi', 'filter_free_parking',
 #                 "eligible_for_loyalty","free_breakfast","free_wifi","free_parking","refundable"]
booleanColumns = ["eligible_for_loyalty","free_breakfast","free_wifi","free_parking","refundable"]
numericCols =["src_rate_amount_usd","rate_index","star_rating",'rate_n']
stages = [] # stages in our Pipeline

for categoricalCol in categoricalColumns:
    # Category Indexing with StringIndexer
    stringIndexer = StringIndexer(inputCol=categoricalCol, outputCol=categoricalCol + "Index")
    
    encoder = OneHotEncoderEstimator(inputCols=[stringIndexer.getOutputCol()], outputCols=[categoricalCol + "classVec"])
    # Add stages.  These are not run here, but will run all at once later on.
    stages += [stringIndexer, encoder]


assemblerInputs = [c + "classVec" for c in categoricalColumns] + booleanColumns + numericCols
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="_features")
stages += [assembler]

# Standardize Features
scaler = StandardScaler(inputCol="_features", 
                            outputCol="features", 
                            withStd=True, withMean=False)
stages += [scaler]



## Train, test data

In [24]:
from pyspark.ml import Pipeline
pipeline = Pipeline(stages = stages)

samples_df = data_df.select(['label','row_index'] + categoricalColumns + booleanColumns + numericCols)

pipelineModel = pipeline.fit(samples_df)
samples_tm_df = pipelineModel.transform(samples_df)

In [25]:
train, test = samples_tm_df.randomSplit([0.8, 0.2], seed = 917)
print("Training Dataset Count: " + str(train.count()))
print("Test Dataset Count: " + str(test.count()))

Training Dataset Count: 3119666
Test Dataset Count: 780281

## Downsampling the training data

In [26]:
train_1= train.where(F.col('label')==1)

In [27]:
train_0=train.where(F.col('label')==0).sample(False, 0.4, seed = 917)

In [28]:
train_final = train_0.union(train_1)

In [29]:
train_final.count()

1463623

In [30]:
train_final.groupBy('label').count().show()

+-----+-------+
|label|  count|
+-----+-------+
|    1| 359271|
|    0|1104352|
+-----+-------+

In [31]:
train.groupBy('label').count().show()

+-----+-------+
|label|  count|
+-----+-------+
|    0|2760395|
|    1| 359271|
+-----+-------+

In [32]:
test.groupBy('label').count().show()

+-----+------+
|label| count|
+-----+------+
|    0|690865|
|    1| 89416|
+-----+------+

## Baseline: Logistic regression

In [33]:
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(labelCol="label", featuresCol="features")
lrModel = lr.fit(train)

In [34]:
predictions = lrModel.transform(test)

In [35]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction",metricName="areaUnderROC")
evaluator.evaluate(predictions)

0.7410485623542459

In [36]:
predictions.groupBy('prediction','label').count().show()

+----------+-----+------+
|prediction|label| count|
+----------+-----+------+
|       0.0|    0|690657|
|       1.0|    0|   208|
|       0.0|    1| 88931|
|       1.0|    1|   485|
+----------+-----+------+

In [30]:
lrModel.coefficients

DenseVector([-0.1405, 0.1943, 0.1313, -0.258, 0.1077, -0.5577, -0.8095, 0.0812, -0.1401])

## Baseline: Logistic regression with downsampling

In [37]:
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(labelCol="label", featuresCol="features")
lrModel = lr.fit(train_final)

In [38]:
predictions = lrModel.transform(test)

In [39]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction",metricName="areaUnderROC")
evaluator.evaluate(predictions)

0.7411949428892107

In [40]:
predictions.groupBy('prediction','label').count().show()

+----------+-----+------+
|prediction|label| count|
+----------+-----+------+
|       0.0|    0|666394|
|       1.0|    0| 24471|
|       0.0|    1| 72499|
|       1.0|    1| 16917|
+----------+-----+------+

## Random Forest

In [41]:
from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder, TrainValidationSplit
from pyspark.ml.evaluation import BinaryClassificationEvaluator
# Initiate RF model
rf = RandomForestClassifier(featuresCol='features', labelCol='label')

# Build evaluator
evaluator =  BinaryClassificationEvaluator()

# Set parameters
paramGrid = (ParamGridBuilder()
             .addGrid(rf.maxDepth, [4,6])
             .addGrid(rf.maxBins, [20,40])
             .addGrid(rf.numTrees, [100])
             .build())

In [42]:
def cv_train_model(model, paramGrid, evaluator, numFolds, trainData_new):
    """Fit `model` with k-fold cross-validation over a parameter grid.

    Args:
        model: estimator handed to CrossValidator.
        paramGrid: parameter maps to search over.
        evaluator: evaluator used to score each parameter map.
        numFolds: number of cross-validation folds.
        trainData_new: dataset the cross-validator is fitted on.

    Returns:
        The object returned by ``CrossValidator.fit``; its ``bestModel``
        attribute is printed before returning.
    """
    cross_validator = CrossValidator(
        estimator=model,
        estimatorParamMaps=paramGrid,
        evaluator=evaluator,
        numFolds=numFolds,
    )
    fitted_cv = cross_validator.fit(trainData_new)
    print(fitted_cv.bestModel)
    return fitted_cv

In [43]:
rfCvModel = cv_train_model(rf,paramGrid,evaluator,5,train_final)

RandomForestClassificationModel (uid=RandomForestClassifier_c5c95cdca4ff) with 100 trees

## Evaluate

In [44]:
from pyspark.mllib.evaluation import BinaryClassificationMetrics, MulticlassMetrics

In [45]:
# Evaluate on training data
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction",metricName="areaUnderROC")
rf_cv_model_train_prediction = rfCvModel.transform(train_final)
evaluator.evaluate(rf_cv_model_train_prediction)

0.7561615231337737

In [46]:
rf_cv_model_test_prediction = rfCvModel.transform(test)
evaluator.evaluate(rf_cv_model_test_prediction)

0.7565660886231497

In [47]:
rf_cv_model_test_prediction.groupBy('prediction','label').count().show()

+----------+-----+------+
|prediction|label| count|
+----------+-----+------+
|       0.0|    0|675436|
|       1.0|    0| 15429|
|       0.0|    1| 73379|
|       1.0|    1| 16037|
+----------+-----+------+

In [89]:
%%spark -o sample_predictions

### ROC curve and AUC

In [91]:
%%local
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

# Each exported probability is a dict whose 'values' entry holds the per-class
# scores; element 1 is used as the score of the positive class (pos_label=1).
labels = sample_predictions["label"]
probabilities = sample_predictions["probability"]
prob = [dv['values'][1] for dv in probabilities]

fpr, tpr, thresholds = roc_curve(labels, prob, pos_label=1)
roc_auc = auc(fpr, tpr)

fig1 = plt.figure(figsize=(7, 7))
plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([-0.01, 1.0])
plt.ylim([-0.01, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc="lower right")
plt.savefig('image/roc3.png')

### Distribution of probabilities

In [92]:
%%local
fig2 = plt.figure(figsize=(7,7))
#plt.hist(prob, bins=100, alpha=0.5)
sample_predictions['prob'] = prob
sample_predictions.groupby("label").prob.plot(kind='density', xlim=[-0.01,1.01])
#sample_predictions.prob.plot(kind='density', xlim=[-0.01,1.01])
plt.title('Distribution of predicted probabilities')
plt.xlabel('Predict. prob.'); plt.ylabel('Prob');
plt.legend(loc="lower right");
plt.xlim(0,1)
plt.savefig('image/prob3.png')

In [96]:
%%local
sample_predictions['prob'].head(50)

0     0.279381
1     0.325438
2     0.089923
3     0.137710
4     0.176192
5     0.096528
6     0.088336
7     0.443244
8     0.088336
9     0.092025
10    0.170920
11    0.091934
12    0.090866
13    0.090990
14    0.090990
15    0.123913
16    0.090677
17    0.097914
18    0.095333
19    0.173949
20    0.081866
21    0.091432
22    0.097769
23    0.089016
24    0.082004
25    0.090106
26    0.090096
27    0.082861
28    0.086894
29    0.118066
30    0.125085
31    0.275512
32    0.090990
33    0.183322
34    0.089239
35    0.086169
36    0.097769
37    0.135722
38    0.190099
39    0.158392
40    0.091159
41    0.132281
42    0.122867
43    0.099318
44    0.091159
45    0.080789
46    0.095902
47    0.198132
48    0.086894
49    0.097769
Name: prob, dtype: float64

## Feature importance

In [48]:
def extract_feature_importance(featureImp, df, featuresCol):
    """Pair every assembled feature with its importance score.

    Args:
        featureImp: indexable collection of importance scores, looked up by
            each attribute's ``idx``.
        df: object whose ``schema[featuresCol].metadata["ml_attr"]["attrs"]``
            maps attribute groups to lists of attribute records.
        featuresCol: name of the column carrying that metadata
            (e.g. ``_features``).

    Returns:
        pandas DataFrame with one row per attribute record plus a ``score``
        column, sorted by ``score`` in descending order.
    """
    attr_groups = df.schema[featuresCol].metadata["ml_attr"]["attrs"]
    # Flatten all attribute groups into a single list of records.
    attributes = [record for group in attr_groups for record in attr_groups[group]]
    varlist = pd.DataFrame(attributes)
    varlist['score'] = varlist['idx'].apply(lambda position: featureImp[position])
    ranked = varlist.sort_values('score', ascending=False)
    return ranked

In [49]:
feature_importance = extract_feature_importance(rfCvModel.bestModel.featureImportances, 
                          rf_cv_model_test_prediction, "_features")
feature_importance['score'] = feature_importance['score'].round(4)
feature_importance['name'] = list(map(lambda x: x.replace('_importantclassVec','').replace('classVec',''),feature_importance['name']))

In [50]:
feature_importance

   idx                  name   score
6    6            rate_index  0.4731
8    8                rate_n  0.2618
5    5   src_rate_amount_usd  0.1246
7    7           star_rating  0.0653
0    0  eligible_for_loyalty  0.0268
3    3          free_parking  0.0207
1    1        free_breakfast  0.0148
2    2             free_wifi  0.0096
4    4            refundable  0.0032

In [51]:
# Pair each logistic-regression coefficient with its feature name; the names
# are ordered by assembler index (`idx`) so they line up with the coefficients.
weights = lrModel.coefficients
features = feature_importance.sort_values(by=['idx'])['name'].values

weight_rows = [(float(weight), name) for weight, name in zip(weights, features)]

# Sort by absolute weight (largest first) via a temporary helper column.
weightsDF = (
    sqlContext.createDataFrame(sc.parallelize(weight_rows), ['weights', 'features'])
    .withColumn('weights_abs', F.abs(F.col('weights')))
    .orderBy(["weights_abs"], ascending=False)
    .drop("weights_abs")
)
weightsDF.show(len(features), False)

+--------------------+--------------------+
|weights             |features            |
+--------------------+--------------------+
|-0.7580464341332018 |rate_index          |
|-0.5986107251283161 |src_rate_amount_usd |
|-0.28631360350362706|free_parking        |
|-0.1942755570299147 |rate_n              |
|-0.19101800670441726|eligible_for_loyalty|
|0.14795973395282574 |free_breakfast      |
|0.136140129616197   |free_wifi           |
|0.09772318452500461 |refundable          |
|0.09125523433152447 |star_rating         |
+--------------------+--------------------+

## Prediction

### read last search data

In [52]:
last_search_df = sqlContext.read.parquet('s3://ege-ds-workshops-corp/yixli/data_preparation/rate_all_usd_2019')

In [53]:
last_search_df.printSchema()

root
 |-- message_id: string (nullable = true)
 |-- hotel_id: integer (nullable = true)
 |-- check_in_date: date (nullable = true)
 |-- check_out_date: date (nullable = true)
 |-- tuid: integer (nullable = true)
 |-- rate_index: integer (nullable = true)
 |-- message_date: timestamp (nullable = true)
 |-- hotel_result_index: integer (nullable = true)
 |-- hotel_index: integer (nullable = true)
 |-- bk_hotel_index: integer (nullable = true)
 |-- score_1: double (nullable = true)
 |-- city: string (nullable = true)
 |-- star_rating: string (nullable = true)
 |-- filter_want_in_policy_rates_only: boolean (nullable = true)
 |-- filter_eligible_for_loyalty: boolean (nullable = true)
 |-- filter_free_breakfast: boolean (nullable = true)
 |-- filter_free_wifi: boolean (nullable = true)
 |-- filter_free_parking: boolean (nullable = true)
 |-- rate_type: string (nullable = true)
 |-- eligible_for_loyalty: boolean (nullable = true)
 |-- free_breakfast: boolean (nullable = true)
 |-- free_wifi: b

### transform data

In [54]:
data_df = last_search_df.select('hotel_id', 'check_in_date', 'check_out_date', 'tuid', 'rate_type', 'message_id', 'hotel_index','hotel_result_index', 'rate_index', 'message_date', 'score_1', 'city', 'star_rating', 'filter_want_in_policy_rates_only', 'filter_eligible_for_loyalty', 'filter_free_breakfast', 'filter_free_wifi', 'filter_free_parking', 'eligible_for_loyalty', 'free_breakfast', 'free_wifi', 'free_parking', 'refundable', 'bk_hotel_index', 'src_rate_amount_usd', 'src_commission_base_usd', 'src_supply_revenue_usd')
                         

In [55]:
data_df = data_df.withColumn('rate_n',F.count('rate_index').\
                                                  over(Window.partitionBy("message_id","hotel_id","check_in_date","check_out_date",'tuid')))

In [56]:
# Add a consecutive row number, ordered by a generated monotonically
# increasing id.
# NOTE(review): the window has an ORDER BY but no PARTITION BY; Spark warns
# that such a window moves all rows into a single partition, and this frame
# holds 72,917,266 rows (see the count output further down) — confirm the
# cost is acceptable or whether a consecutive index is really required.
data_df = data_df.withColumn('row_index', row_number().over(Window.orderBy(F.monotonically_increasing_id())))

In [57]:
data_df.show()

+--------+-------------+--------------+--------+---------+--------------------+-----------+------------------+----------+--------------------+--------------------+------------------+-----------+--------------------------------+---------------------------+---------------------+----------------+-------------------+--------------------+--------------+---------+------------+----------+--------------+-------------------+-----------------------+----------------------+------+---------+
|hotel_id|check_in_date|check_out_date|    tuid|rate_type|          message_id|hotel_index|hotel_result_index|rate_index|        message_date|             score_1|              city|star_rating|filter_want_in_policy_rates_only|filter_eligible_for_loyalty|filter_free_breakfast|filter_free_wifi|filter_free_parking|eligible_for_loyalty|free_breakfast|free_wifi|free_parking|refundable|bk_hotel_index|src_rate_amount_usd|src_commission_base_usd|src_supply_revenue_usd|rate_n|row_index|
+--------+-------------+--------

In [58]:
data_df = data_df.\
             withColumn("rate_index", data_df.rate_index.cast("integer")).\
            withColumn("star_rating", data_df.star_rating.cast("float")).\
            withColumn("eligible_for_loyalty", data_df.eligible_for_loyalty.cast("boolean").cast("integer")).\
            withColumn("free_breakfast", data_df.free_breakfast.cast("boolean").cast("integer")).\
            withColumn("free_wifi", data_df.free_wifi.cast("boolean").cast("integer")).\
            withColumn("free_parking", data_df.free_parking.cast("boolean").cast("integer")).\
            withColumn("refundable", data_df.refundable.cast("boolean").cast("integer"))

In [59]:
data_df.count()

72917266

In [60]:
# Fill missing star_rating values in place (input and output column are the same).
# NOTE(review): this fits a new Imputer on the scoring data, so the fill value
# is computed from this dataset rather than from the data the model was
# trained on — confirm that is intended, or keep the ImputerModel fitted at
# training time and reuse it here.
imputer = Imputer(inputCols=['star_rating'],
outputCols=['star_rating'])
data_df = imputer.fit(data_df).transform(data_df)

In [61]:
data_df.select([F.count(F.when(F.col(c).isNull(), c)).alias(c) for c in data_df.columns]).show()

+--------+-------------+--------------+----+---------+----------+-----------+------------------+----------+------------+-------+----+-----------+--------------------------------+---------------------------+---------------------+----------------+-------------------+--------------------+--------------+---------+------------+----------+--------------+-------------------+-----------------------+----------------------+------+---------+
|hotel_id|check_in_date|check_out_date|tuid|rate_type|message_id|hotel_index|hotel_result_index|rate_index|message_date|score_1|city|star_rating|filter_want_in_policy_rates_only|filter_eligible_for_loyalty|filter_free_breakfast|filter_free_wifi|filter_free_parking|eligible_for_loyalty|free_breakfast|free_wifi|free_parking|refundable|bk_hotel_index|src_rate_amount_usd|src_commission_base_usd|src_supply_revenue_usd|rate_n|row_index|
+--------+-------------+--------------+----+---------+----------+-----------+------------------+----------+------------+-------+--

### Pipe data into the model

In [62]:
# Transform the scoring data with the pipeline model that was fitted on the
# training samples. Refitting the pipeline here would recompute the
# StandardScaler statistics from the scoring data, putting the features on a
# different scale from the one the random forest was trained on.
data_tm_df = pipelineModel.transform(data_df)

In [63]:
prediction=rfCvModel.transform(data_tm_df)

In [64]:
prediction.count()

72917266

In [65]:
# Extract P(label == 1) from the classifier's probability vector.
# The vector is indexed by class label, so element 0 is the probability of
# label 0; the score used downstream to weight revenue must be element 1,
# matching the ROC cell, which also reads index 1 as the positive-class score.
unlist = F.udf(lambda x: float(list(x)[1]), DoubleType())
prediction = prediction.withColumn('prob',unlist('probability'))

In [66]:
prediction = prediction.\
withColumn('rate_revenue',F.col('src_supply_revenue_usd')*F.col('prob'))

In [67]:
prediction_output = prediction.drop('_features','features','rawPrediction','probability')

In [None]:
# Write the scored rows to S3 as a single Parquet output, replacing any
# previous run. The target path is held in `output_dir` rather than `dir`,
# which would shadow the builtin dir() for the rest of the notebook session.
# NOTE(review): repartition(1) funnels all 72,917,266 rows through one
# partition — confirm a single output file is actually required.
output_dir = 's3://ege-ds-workshops-corp/yixli/prediction/'
prediction_output.repartition(1).write.mode('overwrite').parquet(output_dir + 'prediction')