In [2]:
import datetime
import pandas as pd
import numpy as np

from pyspark import SparkContext
from pyspark import SQLContext
from pyspark.sql.session import SparkSession


from pyspark.sql import DataFrameStatFunctions as statFunc

from pyspark.sql.functions import row_number
from pyspark.sql.window import Window
from pyspark.sql.functions import desc

from pyspark.sql import functions as F
from pyspark.sql import types as T
from pyspark.sql.types import IntegerType,FloatType,DoubleType

In [None]:
%local
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
pd.options.display.float_format = '{:.5f}'.format

## Read data

In [3]:
file_loc = "s3://ege-ds-workshops-corp/yixli/data_understanding/09-15-2020_booking_search_df_label.csv"
print("Collecting search the data...")
search_df = sqlContext.read.format('csv').\
            options(header='True', inferSchema='True', delimiter=',').\
            load(file_loc)

# load_date between '20200301' and '20200401'

search_df = search_df.\
            filter(F.col("hotel_id")>0).\
            filter(F.col("src_rate_amount_usd").isNotNull())
print(search_df.count())

Collecting search the data...
1905038

In [4]:
search_df.show(10)

+--------------------+--------+-------------------+-------------------+--------+---------+----------+------------------+--------------------+-----------+----------------+-----------+--------------------------------+---------------------------+---------------------+----------------+-------------------+--------------------+--------------+---------+------------+----------+-------------------+-----------------------+----------------------+-------------------+-----+
|          message_id|hotel_id|      check_in_date|     check_out_date|    tuid|rate_type|rate_index|hotel_result_index|        message_date|hotel_index|            city|star_rating|filter_want_in_policy_rates_only|filter_eligible_for_loyalty|filter_free_breakfast|filter_free_wifi|filter_free_parking|eligible_for_loyalty|free_breakfast|free_wifi|free_parking|refundable|src_rate_amount_usd|src_commission_base_usd|src_supply_revenue_usd|            score_1|label|
+--------------------+--------+-------------------+-----------------

In [5]:
search_df.printSchema()

root
 |-- message_id: string (nullable = true)
 |-- hotel_id: integer (nullable = true)
 |-- check_in_date: timestamp (nullable = true)
 |-- check_out_date: timestamp (nullable = true)
 |-- tuid: integer (nullable = true)
 |-- rate_type: string (nullable = true)
 |-- rate_index: integer (nullable = true)
 |-- hotel_result_index: integer (nullable = true)
 |-- message_date: timestamp (nullable = true)
 |-- hotel_index: integer (nullable = true)
 |-- city: string (nullable = true)
 |-- star_rating: string (nullable = true)
 |-- filter_want_in_policy_rates_only: boolean (nullable = true)
 |-- filter_eligible_for_loyalty: boolean (nullable = true)
 |-- filter_free_breakfast: boolean (nullable = true)
 |-- filter_free_wifi: boolean (nullable = true)
 |-- filter_free_parking: boolean (nullable = true)
 |-- eligible_for_loyalty: boolean (nullable = true)
 |-- free_breakfast: boolean (nullable = true)
 |-- free_wifi: boolean (nullable = true)
 |-- free_parking: boolean (nullable = true)
 |-- r

In [6]:
search_df.groupBy('label').count().show()

+-----+-------+
|label|  count|
+-----+-------+
|    1| 208654|
|    0|1696384|
+-----+-------+

In [7]:
search_df.groupBy("eligible_for_loyalty").count().show()

+--------------------+-------+
|eligible_for_loyalty|  count|
+--------------------+-------+
|                true| 329694|
|               false|1575344|
+--------------------+-------+

In [8]:
search_df.groupBy("free_breakfast").count().show()

+--------------+------+
|free_breakfast| count|
+--------------+------+
|          true|929894|
|         false|975144|
+--------------+------+

In [9]:
search_df.groupBy("free_wifi").count().show()

+---------+-------+
|free_wifi|  count|
+---------+-------+
|     true|1744806|
|    false| 160232|
+---------+-------+

In [10]:
search_df.groupBy("free_parking").count().show()

+------------+-------+
|free_parking|  count|
+------------+-------+
|        true| 741489|
|       false|1163549|
+------------+-------+

In [11]:
search_df.groupBy("refundable").count().show()

+----------+-------+
|refundable|  count|
+----------+-------+
|      true|1401978|
|     false| 503060|
+----------+-------+

In [12]:
search_df.groupBy("rate_type").count().show()

+---------+------+
|rate_type| count|
+---------+------+
|     EPRA|113646|
|     EPRM|238846|
|     ESRM|900725|
|     ESRA|378476|
|      GDS|273345|
+---------+------+

In [13]:
data_df = search_df.select('hotel_id', 'check_in_date', 'check_out_date', 'tuid', 'rate_type', 'message_id', 'hotel_result_index', 'rate_index', 'message_date', 'score_1', 'city', 'star_rating', 'filter_want_in_policy_rates_only', 'filter_eligible_for_loyalty', 'filter_free_breakfast', 'filter_free_wifi', 'filter_free_parking', 'eligible_for_loyalty', 'free_breakfast', 'free_wifi', 'free_parking', 'refundable', 'src_rate_amount_usd', 'src_commission_base_usd', 'src_supply_revenue_usd','label')
                         

In [14]:
data_df.count()

1905038

In [15]:
data_df.printSchema()

root
 |-- hotel_id: integer (nullable = true)
 |-- check_in_date: timestamp (nullable = true)
 |-- check_out_date: timestamp (nullable = true)
 |-- tuid: integer (nullable = true)
 |-- rate_type: string (nullable = true)
 |-- message_id: string (nullable = true)
 |-- hotel_result_index: integer (nullable = true)
 |-- rate_index: integer (nullable = true)
 |-- message_date: timestamp (nullable = true)
 |-- score_1: double (nullable = true)
 |-- city: string (nullable = true)
 |-- star_rating: string (nullable = true)
 |-- filter_want_in_policy_rates_only: boolean (nullable = true)
 |-- filter_eligible_for_loyalty: boolean (nullable = true)
 |-- filter_free_breakfast: boolean (nullable = true)
 |-- filter_free_wifi: boolean (nullable = true)
 |-- filter_free_parking: boolean (nullable = true)
 |-- eligible_for_loyalty: boolean (nullable = true)
 |-- free_breakfast: boolean (nullable = true)
 |-- free_wifi: boolean (nullable = true)
 |-- free_parking: boolean (nullable = true)
 |-- refund

In [16]:
data_df = data_df.withColumn('rate_n',F.count('rate_index').\
                                                  over(Window.partitionBy("message_id","hotel_id","check_in_date","check_out_date",'tuid')))

In [17]:
data_df = data_df.withColumn('row_index', row_number().over(Window.orderBy(F.monotonically_increasing_id())))

In [18]:
data_df = data_df.\
             withColumn("rate_index", data_df.rate_index.cast("integer")).\
            withColumn("star_rating", data_df.star_rating.cast("float")).\
            withColumn("eligible_for_loyalty", data_df.eligible_for_loyalty.cast("integer")).\
            withColumn("free_breakfast", data_df.free_breakfast.cast("integer")).\
            withColumn("free_wifi", data_df.free_wifi.cast("integer")).\
            withColumn("free_parking", data_df.free_parking.cast("integer")).\
            withColumn("refundable", data_df.refundable.cast("integer"))

## Fill missing values

In [19]:
from pyspark.ml.feature import Imputer

In [20]:
data_df.select([F.count(F.when(F.col(c).isNull(), c)).alias(c) for c in data_df.columns]).show()

+--------+-------------+--------------+----+---------+----------+------------------+----------+------------+-------+----+-----------+--------------------------------+---------------------------+---------------------+----------------+-------------------+--------------------+--------------+---------+------------+----------+-------------------+-----------------------+----------------------+-----+------+---------+
|hotel_id|check_in_date|check_out_date|tuid|rate_type|message_id|hotel_result_index|rate_index|message_date|score_1|city|star_rating|filter_want_in_policy_rates_only|filter_eligible_for_loyalty|filter_free_breakfast|filter_free_wifi|filter_free_parking|eligible_for_loyalty|free_breakfast|free_wifi|free_parking|refundable|src_rate_amount_usd|src_commission_base_usd|src_supply_revenue_usd|label|rate_n|row_index|
+--------+-------------+--------------+----+---------+----------+------------------+----------+------------+-------+----+-----------+--------------------------------+------

In [21]:
imputer = Imputer(inputCols=['star_rating'],
outputCols=['star_rating'])
data_df = imputer.fit(data_df).transform(data_df)

## Features

In [22]:
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler,StandardScaler

categoricalColumns = []
#booleanColumns = ['filter_want_in_policy_rates_only', 'filter_eligible_for_loyalty', 'filter_free_breakfast', 'filter_free_wifi', 'filter_free_parking',
 #                 "eligible_for_loyalty","free_breakfast","free_wifi","free_parking","refundable"]
booleanColumns = ["eligible_for_loyalty","free_breakfast","free_wifi","free_parking","refundable"]
numericCols =["src_rate_amount_usd","rate_index","star_rating",'rate_n']
stages = [] # stages in our Pipeline

for categoricalCol in categoricalColumns:
    # Category Indexing with StringIndexer
    stringIndexer = StringIndexer(inputCol=categoricalCol, outputCol=categoricalCol + "Index")
    
    encoder = OneHotEncoderEstimator(inputCols=[stringIndexer.getOutputCol()], outputCols=[categoricalCol + "classVec"])
    # Add stages.  These are not run here, but will run all at once later on.
    stages += [stringIndexer, encoder]


assemblerInputs = [c + "classVec" for c in categoricalColumns] + booleanColumns + numericCols
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="_features")
stages += [assembler]

# Standardize Features
scaler = StandardScaler(inputCol="_features", 
                            outputCol="features", 
                            withStd=True, withMean=False)
stages += [scaler]



## Train, test data

In [23]:
from pyspark.ml import Pipeline
pipeline = Pipeline(stages = stages)

samples_df = data_df.select(['label','row_index'] + categoricalColumns + booleanColumns + numericCols)

pipelineModel = pipeline.fit(samples_df)
samples_tm_df = pipelineModel.transform(samples_df)

In [24]:
train, test = samples_tm_df.randomSplit([0.8, 0.2], seed = 917)
print("Training Dataset Count: " + str(train.count()))
print("Test Dataset Count: " + str(test.count()))

Training Dataset Count: 1523403
Test Dataset Count: 381635

## downsampling Train data

In [25]:
train_1= train.where(F.col('label')==1)

In [26]:
train_0=train.where(F.col('label')==0).sample(False, 0.8, seed = 917)

In [27]:
train_final = train_0.union(train_1)

In [28]:
train_final.count()

1251393

In [29]:
train_final.groupBy('label').count().show()

+-----+-------+
|label|  count|
+-----+-------+
|    1| 167143|
|    0|1084250|
+-----+-------+

In [30]:
train.groupBy('label').count().show()

+-----+-------+
|label|  count|
+-----+-------+
|    0|1356260|
|    1| 167143|
+-----+-------+

In [31]:
test.groupBy('label').count().show()

+-----+------+
|label| count|
+-----+------+
|    0|340124|
|    1| 41511|
+-----+------+

## Baseline: Logistic regression

In [26]:
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(labelCol="label", featuresCol="features")
lrModel = lr.fit(train)

In [27]:
predictions = lrModel.transform(test)

In [28]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction",metricName="areaUnderROC")
evaluator.evaluate(predictions)

0.7326286182347924

In [29]:
predictions.groupBy('prediction','label').count().show()

+----------+-----+------+
|prediction|label| count|
+----------+-----+------+
|       0.0|    0|340112|
|       1.0|    0|    12|
|       0.0|    1| 41492|
|       1.0|    1|    19|
+----------+-----+------+

In [30]:
lrModel.coefficients

DenseVector([-0.1405, 0.1943, 0.1313, -0.258, 0.1077, -0.5577, -0.8095, 0.0812, -0.1401])

## Baseline: Logistic regression with downsampling

In [164]:
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(labelCol="label", featuresCol="features")
lrModel = lr.fit(train_final)

In [165]:
predictions = lrModel.transform(test)

In [166]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction",metricName="areaUnderROC")
evaluator.evaluate(predictions)

0.7731538738431155

In [195]:
predictions.groupBy('prediction','label').count().show()

+----------+-----+-----+
|prediction|label|count|
+----------+-----+-----+
|       0.0|    0|51843|
|       1.0|    0|  268|
|       0.0|    1| 7023|
|       1.0|    1|  412|
+----------+-----+-----+

## Random Forest

In [32]:
from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder, TrainValidationSplit
from pyspark.ml.evaluation import BinaryClassificationEvaluator
# Initiate RF model
rf = RandomForestClassifier(featuresCol='features', labelCol='label')

# Build evaluator
evaluator =  BinaryClassificationEvaluator()

# Set parameters
paramGrid = (ParamGridBuilder()
             .addGrid(rf.maxDepth, [4,6])
             .addGrid(rf.maxBins, [20,40])
             .addGrid(rf.numTrees, [100])
             .build())

In [33]:
def cv_train_model(model,paramGrid,evaluator,numFolds,trainData_new):
    cv = CrossValidator(estimator=model, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=numFolds)
    cv_Model = cv.fit(trainData_new)
    print (cv_Model.bestModel)
    return cv_Model

In [34]:
rfCvModel = cv_train_model(rf,paramGrid,evaluator,5,train_final)

RandomForestClassificationModel (uid=RandomForestClassifier_e12de6e9e80a) with 100 trees

## Evaluate

In [35]:
from pyspark.mllib.evaluation import BinaryClassificationMetrics, MulticlassMetrics

In [36]:
# Evaluate on training data
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction",metricName="areaUnderROC")
rf_cv_model_train_prediction = rfCvModel.transform(train_final)
evaluator.evaluate(rf_cv_model_train_prediction)

0.7353243454288807

In [37]:
rf_cv_model_test_prediction = rfCvModel.transform(test)
evaluator.evaluate(rf_cv_model_test_prediction)

0.7351254232301984

In [38]:
rf_cv_model_test_prediction.groupBy('prediction','label').count().show()

+----------+-----+------+
|prediction|label| count|
+----------+-----+------+
|       0.0|    0|339659|
|       1.0|    0|   465|
|       0.0|    1| 40222|
|       1.0|    1|  1289|
+----------+-----+------+

In [39]:
sample_predictions = rf_cv_model_test_prediction.sample(False, 0.1, 917).select(['probability','prediction','label'])

In [40]:
sample_predictions.groupBy('prediction','label').count().show()

+----------+-----+-----+
|prediction|label|count|
+----------+-----+-----+
|       0.0|    0|34184|
|       1.0|    0|   61|
|       0.0|    1| 4058|
|       1.0|    1|  124|
+----------+-----+-----+

In [93]:
%%spark -o sample_predictions

### AOC

In [94]:
%%local
from sklearn.metrics import roc_curve,auc
import matplotlib.pyplot as plt
labels = sample_predictions["label"]
probabilities = sample_predictions["probability"]
prob = []
for dv in probabilities:
    prob.append(dv['values'][1])
fpr, tpr, thresholds = roc_curve(labels, prob, pos_label=1);
roc_auc = auc(fpr, tpr)

fig1 = plt.figure(figsize=(7,7))
plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([-0.01, 1.0]); plt.ylim([-0.01, 1.05]);
plt.xlabel('False Positive Rate'); plt.ylabel('True Positive Rate');
plt.title('ROC Curve'); plt.legend(loc="lower right");
plt.savefig('image/roc3.png')


No positive samples in y_true, true positive value should be meaningless



### Distribution of probabilities

In [95]:
%%local
fig2 = plt.figure(figsize=(7,7))
#plt.hist(prob, bins=100, alpha=0.5)
sample_predictions['prob'] = prob
#sample_predictions.groupby("label").prob.plot(kind='density', xlim=[-0.01,1.01])
sample_predictions.prob.plot(kind='density', xlim=[-0.01,1.01])
plt.title('Distribution of predicted probabilities')
plt.xlabel('Predict. prob.'); plt.ylabel('Prob');
plt.legend(loc="lower right");
plt.xlim(0,1)
plt.savefig('image/prob3.png')

In [96]:
%%local
sample_predictions['prob'].head(50)

0     0.279381
1     0.325438
2     0.089923
3     0.137710
4     0.176192
5     0.096528
6     0.088336
7     0.443244
8     0.088336
9     0.092025
10    0.170920
11    0.091934
12    0.090866
13    0.090990
14    0.090990
15    0.123913
16    0.090677
17    0.097914
18    0.095333
19    0.173949
20    0.081866
21    0.091432
22    0.097769
23    0.089016
24    0.082004
25    0.090106
26    0.090096
27    0.082861
28    0.086894
29    0.118066
30    0.125085
31    0.275512
32    0.090990
33    0.183322
34    0.089239
35    0.086169
36    0.097769
37    0.135722
38    0.190099
39    0.158392
40    0.091159
41    0.132281
42    0.122867
43    0.099318
44    0.091159
45    0.080789
46    0.095902
47    0.198132
48    0.086894
49    0.097769
Name: prob, dtype: float64

## Feature importance

In [41]:
def extract_feature_importance(featureImp, df, featuresCol):
    list_extract = []
    # featuresCol: _features
    for i in df.schema[featuresCol].metadata["ml_attr"]["attrs"]:
        list_extract = list_extract + df.schema[featuresCol].metadata["ml_attr"]["attrs"][i]
    varlist = pd.DataFrame(list_extract)
    varlist['score'] = varlist['idx'].apply(lambda x: featureImp[x])
    return(varlist.sort_values('score', ascending = False))

In [42]:
feature_importance = extract_feature_importance(rfCvModel.bestModel.featureImportances, 
                          rf_cv_model_test_prediction, "_features")
feature_importance['score'] = feature_importance['score'].round(4)
feature_importance['name'] = list(map(lambda x: x.replace('_importantclassVec','').replace('classVec',''),feature_importance['name']))

In [43]:
feature_importance

   idx                  name   score
6    6            rate_index  0.4207
8    8                rate_n  0.3401
5    5   src_rate_amount_usd  0.1351
7    7           star_rating  0.0491
0    0  eligible_for_loyalty  0.0190
3    3          free_parking  0.0143
1    1        free_breakfast  0.0096
2    2             free_wifi  0.0085
4    4            refundable  0.0037

In [41]:
weights = lrModel.coefficients
features = feature_importance.sort_values(by=['idx'])['name'].values
weightsDF = sqlContext.createDataFrame(sc.parallelize([(float(w),f) for w,f in zip(weights,features)]),['weights','features'])
weightsDF = weightsDF.withColumn('weights_abs',F.abs(F.col('weights')))
weightsDF = weightsDF.orderBy(["weights_abs"], ascending=False)
weightsDF = weightsDF.drop("weights_abs")
weightsDF.show(len(features),False)

+--------------------+--------------------+
|weights             |features            |
+--------------------+--------------------+
|-0.8094602753722356 |rate_index          |
|-0.5577338679701562 |src_rate_amount_usd |
|-0.2579508157967993 |free_parking        |
|0.19426739534977197 |free_breakfast      |
|-0.14050021928435147|eligible_for_loyalty|
|-0.14010274984564297|rate_n              |
|0.13132930238483456 |free_wifi           |
|0.10773519280113225 |refundable          |
|0.08115696961609643 |star_rating         |
+--------------------+--------------------+

## Revenue Estimation

### Distribution

In [47]:
pipelineModel = pipeline.fit(data_df)
data_tm_df = pipelineModel.transform(data_df)

In [48]:
prediction=rfCvModel.transform(data_tm_df)

In [49]:
prediction.count()

626342

In [50]:
unlist = F.udf(lambda x: float(list(x)[0]), DoubleType())
prediction = prediction.withColumn('prob',unlist('probability'))

In [51]:
prediction = prediction.\
withColumn('rate_revenue',F.col('src_supply_revenue_usd')*F.col('prob'))

In [52]:
prediction.groupBy('rate_index').agg(F.avg('rate_revenue')).show()

+----------+------------------+
|rate_index| avg(rate_revenue)|
+----------+------------------+
|        12| 28.83182900985359|
|         1| 16.91943537783474|
|        13| 31.01234302352645|
|         6| 22.82916454626308|
|         3| 19.66595451684039|
|         5| 21.20914505655305|
|        15|27.781647165293577|
|         9|25.299044825786595|
|         4| 20.49437485992719|
|         8|26.347761419022756|
|         7| 24.71274292593404|
|        10|27.769850834028922|
|        11|30.194028476709857|
|        14|33.871021014550976|
|         2|19.086816367428007|
|         0|16.981607203050803|
+----------+------------------+

In [53]:
prediction.groupBy('rate_index').agg(F.avg('prob')).show()

+----------+------------------+
|rate_index|         avg(prob)|
+----------+------------------+
|        12|0.8622389454502336|
|         1|0.7502280411264969|
|        13|0.8537654304733346|
|         6|0.8626642558968243|
|         3|0.8513638587915634|
|         5|0.8672811815904204|
|        15|0.8622388717281845|
|         9|0.8624870487612759|
|         4|0.8716466568975798|
|         8|0.8606705267798901|
|         7|0.8593412080040556|
|        10|0.8626716901996918|
|        11|0.8651692593011434|
|        14|0.8609508675856052|
|         2|0.8483944872003494|
|         0|0.7310739912784043|
+----------+------------------+

### Hotel Revenue

In [57]:
prediction = prediction.withColumn('hotel_revenue',F.sum('rate_revenue').over(Window.partitionBy("message_id","hotel_id","check_in_date","check_out_date",'tuid')))

In [58]:
prediction = prediction.withColumn('avg_hotel_revenue',F.col('hotel_revenue')/F.col('rate_n'))

In [59]:
prediction.groupBy('rate_n').agg(F.avg('hotel_revenue')).show()

+------+------------------+
|rate_n|avg(hotel_revenue)|
+------+------------------+
|    26| 603.8902043972981|
|    29| 833.8151694814751|
|    65|1242.4796288109503|
|    19| 482.9106341353049|
|    54|1460.6621186212285|
|   112|2125.3521509345615|
|    22| 523.7344539313586|
|   196|  560.696132480702|
|     7|145.25917143582183|
|    77|1446.8746241860304|
|    34| 784.7667281611959|
|   184|1403.3732164000162|
|   126| 2213.054178504973|
|    50| 928.2385198014697|
|   110|2076.9903440686326|
|   190|1587.0596468542012|
|    57|1022.1995036414669|
|    32| 786.9972061763493|
|    43| 917.5051128294874|
|    84|1473.0890284644922|
+------+------------------+
only showing top 20 rows

In [312]:
prdiction_output = prediction.drop('rate_typeIndex','rate_typeclassVec','_features','features','rawPrediction','probability')

In [313]:
dir = 's3://ege-ds-workshops-corp/yixli/data_understanding/'
datestamp = datetime.datetime.now().strftime('%m-%d-%Y')
prdiction_output.repartition(1).write.format('com.databricks.spark.csv').mode('overwrite')\
.save(dir+datestamp+'_prediction.csv',header = 'true')

In [60]:
sample_predictions = prediction.select("message_id","hotel_id",'tuid',"check_in_date","check_out_date",'rate_index','rate_type','prob','rate_revenue','rate_n','hotel_revenue')

In [61]:
%%spark -o sample_predictions

In [62]:
%%local
fig3 = plt.figure(figsize=(7,7))
sample_predictions = sample_predictions.sort_values(by=['rate_type'])
data = sample_predictions['rate_n']
fig3 = sns.distplot(data,hist=True)
fig3.set(xlabel='rate_n', ylabel='distribution')
fig3.set_xlim(0,60)
plt.tight_layout()
plt.savefig('image/rate_n.png')

In [63]:
%%local
fig3 = plt.figure(figsize=(7,7))
sample_predictions = sample_predictions.sort_values(by=['rate_type'])
fig3 = sns.boxplot(x='rate_type', y='rate_revenue',data=sample_predictions)
fig3.set(xlabel='rate_type', ylabel='rate_revenue')
fig3.set_ylim(bottom=0,top=80)
plt.tight_layout()
plt.savefig('image/rate_revenue_by_type.png')

In [64]:
%%local
fig3 = plt.figure(figsize=(12,7))
sample_predictions = sample_predictions.sort_values(by=['rate_index'])
fig3 = sns.boxplot(x='rate_index', y='rate_revenue',data=sample_predictions,hue='rate_type')
fig3.set(xlabel='rate_index', ylabel='rate_revenue')
fig3.set_ylim(bottom=0,top=80)
fig3.set_xlim(-0.5,5.5)
plt.tight_layout()
plt.savefig('image/rate_revenue_by_index.png')

In [65]:
%%local
fig4 = plt.figure(figsize=(7,7))
sample_predictions = sample_predictions.sort_values(by=['rate_n'])
fig4 = sns.scatterplot(x='rate_n', y='hotel_revenue',data=sample_predictions)
fig4.set(xlabel='rate_n', ylabel='hotel_revenue')
fig4.set_ylim(bottom=0,top=500)
fig4.set_xlim(0,15)
plt.tight_layout()
plt.savefig('image/hotel_revenue_by_n.png')

## Take top three rates

In [66]:
top_three_prediction = prediction.withColumn('rate_rn',row_number().over(Window.partitionBy("message_id","hotel_id","check_in_date","check_out_date",'tuid').orderBy(desc('prob')))).\
filter(F.col('rate_rn')<=3)

In [67]:
top_three_prediction.count()

264580

In [68]:
top_three_prediction = top_three_prediction.withColumn('trun_rate_n',F.count('rate_index').over(Window.partitionBy("message_id","hotel_id","check_in_date","check_out_date",'tuid')))

In [69]:
top_three_prediction = top_three_prediction.withColumn('trun_hotel_revenue',F.sum('rate_revenue').over(Window.partitionBy("message_id","hotel_id","check_in_date","check_out_date",'tuid')))

In [70]:
top_three_prediction = top_three_prediction.withColumn('avg_trun_hotel_revenue',F.col('trun_hotel_revenue')/F.col('trun_rate_n'))

In [71]:
top_three_prediction.select('avg_trun_hotel_revenue').show()

+----------------------+
|avg_trun_hotel_revenue|
+----------------------+
|    13.495824833038249|
|    13.495824833038249|
|    11.015897350139992|
|     25.59143799213101|
|     25.59143799213101|
|     25.59143799213101|
|    14.826111837550519|
|    14.826111837550519|
|    14.826111837550519|
|    1.8741063642918618|
|    16.135821634895468|
|    16.135821634895468|
|    16.135821634895468|
|     9.329387306835898|
|     9.329387306835898|
|     9.329387306835898|
|     6.100250178435805|
|     6.100250178435805|
|     6.100250178435805|
|     22.98847553859661|
+----------------------+
only showing top 20 rows

In [126]:
sample_predictions =top_three_prediction.\
select("message_id","hotel_id","check_in_date","check_out_date",'rate_index','rate_type','prob',
       'rate_revenue','trun_rate_n','trun_hotel_revenue','avg_trun_hotel_revenue',)

In [127]:
%%spark -o sample_predictions

In [82]:
%%local
fig4 = plt.figure(figsize=(7,7))
sample_predictions = sample_predictions.sort_values(by=['trun_rate_n'])
fig4 = sns.scatterplot(x='trun_rate_n', y='trun_hotel_revenue',data=sample_predictions)
fig4.set(xlabel='trun_rate_n', ylabel='trun_hotel_revenue')
fig4.set_ylim(bottom=0,top=100)
plt.tight_layout()
plt.savefig('image/trun_hotel_revenue_by_n.png')

In [66]:
%%local
sample_predictions['avg_trun_hotel_revenue'].head(50)

1084   10.12112
2226   13.11251
1305   18.23883
23     18.74732
286     7.33658
215    16.84115
287     4.03053
27      8.02256
756     4.10109
2289    7.18578
1042   15.21596
637    10.05006
2154    9.91894
49      9.12699
2194    8.75786
949    16.40713
1200   20.71793
103    29.96671
279     7.66790
1197    4.56407
686    20.52672
701     3.11842
767    12.13597
1875   14.10611
1539   35.33625
1775    7.05864
1771    8.18157
2201   11.84151
202    15.06691
1785   10.61459
1401    9.35314
1869   12.60943
297    20.35567
246     8.21420
1068    8.74658
1097    9.62761
7      17.53554
1833   16.24088
735    14.94201
1849   18.70016
107     8.21524
2236   16.85825
1701   15.51707
2184   11.89425
1005   16.14439
1560   15.85690
347    31.86053
62     20.42924
996     5.10500
725    11.89969
Name: avg_trun_hotel_revenue, dtype: float64

In [128]:
%%local
fig4 = plt.figure(figsize=(7,7))
data = sample_predictions['avg_trun_hotel_revenue']
fig4 = sns.distplot(data,hist=True)
fig4.set(xlabel='avg_trun_hotel_revenue', ylabel='distribution')
fig4.set_xlim(0,100)
plt.tight_layout()
plt.savefig('image/avg_trun_hotel_revenue_top3.png')

## Take top seven rates

In [72]:
top_seven_prediction = prediction.withColumn('rate_rn',row_number().over(Window.partitionBy("message_id","hotel_id","check_in_date","check_out_date","tuid").orderBy(desc('prob')))).\
filter(F.col('rate_rn')<=7)

In [73]:
top_seven_prediction = top_seven_prediction.withColumn('trun_rate_n',F.count('rate_index').over(Window.partitionBy("message_id","hotel_id","check_in_date","check_out_date",'tuid')))

In [74]:
top_seven_prediction  = top_seven_prediction.withColumn('trun_hotel_revenue',F.sum('rate_revenue').over(Window.partitionBy("message_id","hotel_id","check_in_date","check_out_date",'tuid')))

In [75]:
top_seven_prediction  = top_seven_prediction .withColumn('avg_trun_hotel_revenue',F.col('trun_hotel_revenue')/F.col('trun_rate_n'))

In [88]:
sample_predictions =top_seven_prediction .\
select("message_id","hotel_id","check_in_date","check_out_date",'rate_index','rate_type','prob',
       'rate_revenue','trun_rate_n','trun_hotel_revenue','avg_trun_hotel_revenue')

In [76]:
top_three_prediction = top_three_prediction.withColumnRenamed('avg_trun_hotel_revenue','avg_trun_hotel_revenue_top3')

In [77]:
top_seven_prediction = top_seven_prediction.withColumnRenamed('avg_trun_hotel_revenue','avg_trun_hotel_revenue_top7')

In [84]:
sample_predictions = prediction.select("message_id","hotel_id","check_in_date","check_out_date","tuid",'avg_hotel_revenue','normalized_hotel_revenue').\
join(top_three_prediction .\
select("message_id","hotel_id","check_in_date","check_out_date","tuid",'avg_trun_hotel_revenue_top3'),
    ["message_id","hotel_id","check_in_date","check_out_date","tuid"],how='left')

In [85]:
sample_predictions = sample_predictions.join(top_seven_prediction .\
select("message_id","hotel_id","check_in_date","check_out_date","tuid",'avg_trun_hotel_revenue_top7'),
    ["message_id","hotel_id","check_in_date","check_out_date","tuid"],how='left')

In [86]:
sample_predictions.count()

10927480

In [87]:
sample_predictions = sample_predictions.sample(False,0.1,seed=917)

In [88]:
%%spark -o sample_predictions

In [89]:
%%local
fig4 = plt.figure(figsize=(7,7))
data = sample_predictions['avg_trun_hotel_revenue']
fig4 = sns.distplot(data,hist=True)
fig4.set(xlabel='avg_trun_hotel_revenue', ylabel='distribution')
#fig4.set_ylim(bottom=0,top=10)
plt.tight_layout()
plt.savefig('image/avg_trun_hotel_revenue_top7.png')

In [89]:
%%local
fig4 = plt.figure(figsize=(7,7))
fig4 = sns.distplot(sample_predictions['avg_trun_hotel_revenue_top3'],color="skyblue", hist=False,label="top 3")
fig4 = sns.distplot(sample_predictions['avg_trun_hotel_revenue_top7'],color="steelblue",  hist=False,label="top 7")
fig4 = sns.distplot(sample_predictions['avg_hotel_revenue'],color="red", hist=False, label="all")
fig4 = sns.distplot(sample_predictions['normalized_hotel_revenue'],color="green",  hist=False,label="normalized")
fig4.set(xlabel='hotel_revenue', ylabel='distribution')
fig4.set_xlim(0,100)
plt.legend()
plt.tight_layout()
plt.savefig('image/avg_trun_hotel_revenue.png')

## Normalization

In [80]:
prediction = prediction.withColumn('sum_prob',F.sum('prob').over(Window.partitionBy("message_id","hotel_id","check_in_date","check_out_date",'tuid')))

In [81]:
prediction = prediction.withColumn('normalized_prob',F.col('prob')/F.col('sum_prob'))

In [82]:
prediction = prediction.withColumn('normalized_rate_revenue',F.col('normalized_prob')*F.col('src_supply_revenue_usd'))

In [83]:
prediction = prediction.withColumn('normalized_hotel_revenue',F.sum('normalized_rate_revenue').over(Window.partitionBy("message_id","hotel_id","check_in_date","check_out_date",'tuid')))

In [142]:
sample_predictions =prediction .\
select("message_id","hotel_id","check_in_date","check_out_date",'normalized_hotel_revenue')

In [143]:
sample_predictions = sample_predictions.sample(False,0.1,seed=917)

In [144]:
%%spark -o sample_predictions

In [145]:
%%local
fig4 = plt.figure(figsize=(7,7))
fig4 = sns.distplot(sample_predictions['normalized_hotel_revenue'],color="green", label="normalized_hotel_revenue")
fig4.set(xlabel='normalized_hotel_revenue', ylabel='distribution')
fig4.set_xlim(0,100)
plt.legend()
plt.tight_layout()
plt.savefig('image/normalized_hotel_revenue.png')





## Addtional revenue estimation on last search lead to a booking

### read last search data

In [44]:
#Oct
#file_loc = "s3://ege-ds-workshops-corp/yixli/data_understanding/09-22-2020_last_search_df.csv"

#Nov
file_loc = "s3://ege-ds-workshops-corp/yixli/data_understanding/09-23-2020_last_search_score_df.csv"

#Dec
#file_loc = "s3://ege-ds-workshops-corp/yixli/data_understanding/09-01-2020_last_search_score_df.csv"

last_search_df = sqlContext.read.format('csv').\
            options(header='True', inferSchema='True', delimiter=',').\
            load(file_loc)



last_search_df = last_search_df.\
            filter(F.col("hotel_id")>0).\
            filter(F.col('src_rate_amount_usd').isNotNull())
print(last_search_df.count())

Collecting search the data...
3814935

In [45]:
last_search_df.printSchema()

root
 |-- message_id: string (nullable = true)
 |-- tuid: integer (nullable = true)
 |-- hotel_id: integer (nullable = true)
 |-- check_in_date: timestamp (nullable = true)
 |-- check_out_date: timestamp (nullable = true)
 |-- hotel_index: integer (nullable = true)
 |-- bk_hotel_index: integer (nullable = true)
 |-- rate_index: integer (nullable = true)
 |-- message_date: timestamp (nullable = true)
 |-- hotel_result_index: integer (nullable = true)
 |-- city: string (nullable = true)
 |-- star_rating: string (nullable = true)
 |-- filter_want_in_policy_rates_only: boolean (nullable = true)
 |-- filter_eligible_for_loyalty: boolean (nullable = true)
 |-- filter_free_breakfast: boolean (nullable = true)
 |-- filter_free_wifi: boolean (nullable = true)
 |-- filter_free_parking: boolean (nullable = true)
 |-- rate_type: string (nullable = true)
 |-- eligible_for_loyalty: boolean (nullable = true)
 |-- free_breakfast: boolean (nullable = true)
 |-- free_wifi: boolean (nullable = true)
 |--

### transform data

In [46]:
data_df = last_search_df.select('hotel_id', 'check_in_date', 'check_out_date', 'tuid', 'rate_type', 'message_id', 'hotel_index','hotel_result_index', 'rate_index', 'message_date', 'score_1', 'city', 'star_rating', 'filter_want_in_policy_rates_only', 'filter_eligible_for_loyalty', 'filter_free_breakfast', 'filter_free_wifi', 'filter_free_parking', 'eligible_for_loyalty', 'free_breakfast', 'free_wifi', 'free_parking', 'refundable', 'bk_hotel_index', 'src_rate_amount_usd', 'src_commission_base_usd', 'src_supply_revenue_usd')
                         

In [47]:
data_df = data_df.withColumn('rate_n',F.count('rate_index').\
                                                  over(Window.partitionBy("message_id","hotel_id","check_in_date","check_out_date",'tuid')))

In [48]:
data_df = data_df.withColumn('row_index', row_number().over(Window.orderBy(F.monotonically_increasing_id())))

In [49]:
data_df.show()

+--------+-------------------+-------------------+--------+---------+--------------------+-----------+------------------+----------+--------------------+-------------------+--------------+-----------+--------------------------------+---------------------------+---------------------+----------------+-------------------+--------------------+--------------+---------+------------+----------+--------------+-------------------+-----------------------+----------------------+------+---------+
|hotel_id|      check_in_date|     check_out_date|    tuid|rate_type|          message_id|hotel_index|hotel_result_index|rate_index|        message_date|            score_1|          city|star_rating|filter_want_in_policy_rates_only|filter_eligible_for_loyalty|filter_free_breakfast|filter_free_wifi|filter_free_parking|eligible_for_loyalty|free_breakfast|free_wifi|free_parking|refundable|bk_hotel_index|src_rate_amount_usd|src_commission_base_usd|src_supply_revenue_usd|rate_n|row_index|
+--------+----------

In [50]:
data_df = data_df.\
             withColumn("rate_index", data_df.rate_index.cast("integer")).\
            withColumn("star_rating", data_df.star_rating.cast("float")).\
            withColumn("eligible_for_loyalty", data_df.eligible_for_loyalty.cast("boolean").cast("integer")).\
            withColumn("free_breakfast", data_df.free_breakfast.cast("boolean").cast("integer")).\
            withColumn("free_wifi", data_df.free_wifi.cast("boolean").cast("integer")).\
            withColumn("free_parking", data_df.free_parking.cast("boolean").cast("integer")).\
            withColumn("refundable", data_df.refundable.cast("boolean").cast("integer"))

In [51]:
data_df.count()

3814935

In [52]:
imputer = Imputer(inputCols=['star_rating'],
outputCols=['star_rating'])
data_df = imputer.fit(data_df).transform(data_df)

In [53]:
data_df.select([F.count(F.when(F.col(c).isNull(), c)).alias(c) for c in data_df.columns]).show()

+--------+-------------+--------------+----+---------+----------+-----------+------------------+----------+------------+-------+----+-----------+--------------------------------+---------------------------+---------------------+----------------+-------------------+--------------------+--------------+---------+------------+----------+--------------+-------------------+-----------------------+----------------------+------+---------+
|hotel_id|check_in_date|check_out_date|tuid|rate_type|message_id|hotel_index|hotel_result_index|rate_index|message_date|score_1|city|star_rating|filter_want_in_policy_rates_only|filter_eligible_for_loyalty|filter_free_breakfast|filter_free_wifi|filter_free_parking|eligible_for_loyalty|free_breakfast|free_wifi|free_parking|refundable|bk_hotel_index|src_rate_amount_usd|src_commission_base_usd|src_supply_revenue_usd|rate_n|row_index|
+--------+-------------+--------------+----+---------+----------+-----------+------------------+----------+------------+-------+--

### pip data into model

In [54]:
pipelineModel = pipeline.fit(data_df)
data_tm_df = pipelineModel.transform(data_df)

In [55]:
prediction=rfCvModel.transform(data_tm_df)

In [56]:
prediction.count()

3814935

In [57]:
unlist = F.udf(lambda x: float(list(x)[0]), DoubleType())
prediction = prediction.withColumn('prob',unlist('probability'))

In [58]:
prediction = prediction.\
withColumn('rate_revenue',F.col('src_supply_revenue_usd')*F.col('prob'))

In [59]:
prediction_output = prediction.drop('_features','features','rawPrediction','probability')

In [60]:
dir = 's3://ege-ds-workshops-corp/yixli/data_understanding/'
datestamp = datetime.datetime.now().strftime('%m-%d-%Y')
prediction_output.repartition(1).write.format('com.databricks.spark.csv').mode('overwrite')\
.save(dir+datestamp+'_prediction.csv',header = 'true')

In [63]:
#Oct
#file_loc = "s3://ege-ds-workshops-corp/yixli/data_understanding/09-22-2020_prediction.csv"

#Nov
file_loc = "s3://ege-ds-workshops-corp/yixli/data_understanding/09-23-2020_prediction.csv"

#Dec
#file_loc = "s3://ege-ds-workshops-corp/yixli/data_understanding/09-16-2020_prediction.csv"
prediction = sqlContext.read.format('csv').\
            options(header='True', inferSchema='True', delimiter=',').\
            load(file_loc)


prediction = prediction.\
            filter(F.col("hotel_id")>0).\
            filter(F.col('src_rate_amount_usd').isNotNull())
print(prediction.count())

3814935

In [64]:
prediction.dropDuplicates().count()

3814935

In [62]:
prediction.show()

+--------+-------------------+-------------------+--------+---------+--------------------+-----------+------------------+----------+--------------------+-------------------+--------------+-----------+--------------------------------+---------------------------+---------------------+----------------+-------------------+--------------------+--------------+---------+------------+----------+--------------+-------------------+-----------------------+----------------------+------+---------+--------------------+--------------------+--------------------+--------------------+----------+------------------+------------------+
|hotel_id|      check_in_date|     check_out_date|    tuid|rate_type|          message_id|hotel_index|hotel_result_index|rate_index|        message_date|            score_1|          city|star_rating|filter_want_in_policy_rates_only|filter_eligible_for_loyalty|filter_free_breakfast|filter_free_wifi|filter_free_parking|eligible_for_loyalty|free_breakfast|free_wifi|free_parkin

### top seven hotel revenue

In [65]:
top_seven_prediction = prediction.withColumn('rate_rn',row_number().over(Window.partitionBy("message_id","hotel_id","check_in_date","check_out_date",'tuid').orderBy(desc('prob')))).\
filter(F.col('rate_rn')<=7)

In [66]:
top_seven_prediction.count()

2350065

In [67]:
top_seven_prediction = top_seven_prediction.withColumn('trun_rate_n',F.count('rate_index').over(Window.partitionBy("message_id","hotel_id","check_in_date","check_out_date",'tuid')))

In [68]:
top_seven_prediction = top_seven_prediction.withColumn('trun_hotel_revenue',F.sum('rate_revenue').over(Window.partitionBy("message_id","hotel_id","check_in_date","check_out_date",'tuid')))

In [69]:
top_seven_prediction = top_seven_prediction.withColumn('avg_trun_hotel_revenue',F.col('trun_hotel_revenue')/F.col('trun_rate_n'))

### normalized hotel revenue

In [70]:
prediction = prediction.withColumn('sum_prob',F.sum('prob').over(Window.partitionBy("message_id","hotel_id","check_in_date","check_out_date",'tuid')))

In [71]:
prediction = prediction.withColumn('normalized_prob',F.col('prob')/F.col('sum_prob'))

In [72]:
prediction = prediction.withColumn('normalized_rate_revenue',F.col('normalized_prob')*F.col('src_supply_revenue_usd'))

In [73]:
prediction = prediction.withColumn('normalized_hotel_revenue',F.sum('normalized_rate_revenue').over(Window.partitionBy("message_id","hotel_id","check_in_date","check_out_date",'tuid')))

In [74]:
prediction.select("message_id","hotel_id","check_in_date","check_out_date","tuid",'normalized_hotel_revenue','hotel_index','score_1','bk_hotel_index').\
dropDuplicates().count()

414259

In [98]:
revenue_prediction = prediction.select("message_id","hotel_id","check_in_date","check_out_date","tuid",'normalized_hotel_revenue','hotel_index','score_1','bk_hotel_index').\
join(top_seven_prediction .\
select("message_id","hotel_id","check_in_date","check_out_date","tuid",'avg_trun_hotel_revenue','hotel_index','score_1','bk_hotel_index'),
    ["message_id","hotel_id","check_in_date","check_out_date","tuid",'hotel_index','score_1','bk_hotel_index'],how='left').dropDuplicates()

In [99]:
revenue_prediction.count()

938873

In [100]:
revenue_prediction.filter(F.col('bk_hotel_index')==F.col('hotel_index')).count()

257042

In [101]:
revenue_prediction = revenue_prediction.filter(F.col('score_1').isNotNull())

### Bands

In [102]:
band_df = revenue_prediction.\
    groupby("message_id", "tuid").\
    agg(F.count("hotel_id").alias("n"),F.max("score_1").alias("ub"),F.min("score_1").alias("lb")).\
    filter(F.col("n")>=1).\
    filter(F.col("n")<=30).\
    withColumn("n_band", F.lit(5)).\
    withColumn("sz_band", F.expr("(ub - lb)/n_band")).\
    select("message_id", "tuid", "sz_band", "ub", "lb")

In [103]:
revenue_prediction2 = revenue_prediction.\
    join(band_df, ["message_id", "tuid"]).\
    withColumn("band", F.when(F.col("sz_band") == 0, 1).otherwise(F.expr("int((score_1-lb)/sz_band)")))

In [104]:
revenue_prediction2.groupby("band").agg(F.count("*").alias("n")).show(10)

+----+------+
|band|     n|
+----+------+
|   1|115277|
|   3| 79754|
|   5|182478|
|   4| 79811|
|   2| 79355|
|   0|288001|
+----+------+

In [105]:
revenue_prediction2.count()

824676

In [106]:
revenue_prediction2.filter(F.col('bk_hotel_index')==F.col('hotel_index')).count()

197908

## New revenue with top 7

In [84]:
revenue_prediction2.filter(F.col('bk_hotel_index')==F.col('hotel_index')).\
select('avg_trun_hotel_revenue','normalized_hotel_revenue').\
agg(F.sum('avg_trun_hotel_revenue'),F.sum('normalized_hotel_revenue')).show()

name 'f' is not defined
Traceback (most recent call last):
NameError: name 'f' is not defined



In [85]:
revenue_prediction2 = revenue_prediction2.\
 withColumn("band_index_top7", row_number().over(Window.partitionBy("message_id", "tuid","band").orderBy(F.desc('avg_trun_hotel_revenue')))).\
withColumn("new_hotel_index1", row_number().over(Window.partitionBy("message_id", "tuid").\
                                                orderBy(F.asc("band"),F.asc("band_index_top7"))))

In [86]:
revenue_prediction2.filter(F.col('bk_hotel_index')==F.col('new_hotel_index1')).\
select('avg_trun_hotel_revenue','normalized_hotel_revenue').\
agg(F.sum('avg_trun_hotel_revenue'),F.sum('normalized_hotel_revenue')).show()

+---------------------------+-----------------------------+
|sum(avg_trun_hotel_revenue)|sum(normalized_hotel_revenue)|
+---------------------------+-----------------------------+
|         1969910.5313579082|            2533349.576271525|
+---------------------------+-----------------------------+

In [87]:
revenue_prediction2.filter(F.col('hotel_index')!=F.col('new_hotel_index1')).count()

316712

## Normalized revenue

In [88]:
revenue_prediction2.filter(F.col('bk_hotel_index')==F.col('hotel_index')).\
select('normalized_hotel_revenue').\
agg(F.sum('normalized_hotel_revenue')).show()

+-----------------------------+
|sum(normalized_hotel_revenue)|
+-----------------------------+
|           2255355.0818443066|
+-----------------------------+

In [89]:
revenue_prediction2 = revenue_prediction2.\
 withColumn("band_index_normalized", row_number().over(Window.partitionBy("message_id", "tuid","band").orderBy(F.desc('normalized_hotel_revenue')))).\
withColumn("new_hotel_index2", row_number().over(Window.partitionBy("message_id", "tuid").\
                                                orderBy(F.asc("band"),F.asc("band_index_normalized"))))

In [90]:
revenue_prediction2.filter(F.col('bk_hotel_index')==F.col('new_hotel_index2')).\
select('normalized_hotel_revenue').\
agg(F.sum('normalized_hotel_revenue')).show()

+-----------------------------+
|sum(normalized_hotel_revenue)|
+-----------------------------+
|            2538956.009340824|
+-----------------------------+

In [91]:
revenue_prediction2.filter(F.col('hotel_index')!=F.col('new_hotel_index2')).count()

316655

In [92]:
dir = 's3://ege-ds-workshops-corp/yixli/data_understanding/'
datestamp = datetime.datetime.now().strftime('%m-%d-%Y')
revenue_prediction2.repartition(1).write.format('com.databricks.spark.csv').mode('overwrite')\
.save(dir+datestamp+'_revenue_prediction.csv',header = 'true')

## Compared by rate type