In [1]:
# Load the sparkmagic extension; it provides the %spark / %%spark magics used by every cell below.
%load_ext sparkmagic.magics
from dsx_core_utils import proxy_util,dsxhi_util
# Configure sparkmagic's Livy connection, then list the Livy endpoints available
# (the URL printed here is the one passed to `%spark add -u` further down).
proxy_util.configure_proxy_livy() 
dsxhi_util.list_livy_endpoints()

success configuring sparkmagic livy.
['https://qlawsbidlhe02a.ad.datalake.foc.zone:8445/gateway/dsx/livy2/v1']


In [2]:
%%spark config
{"executorCores": 2, "numExecutors": 10, "executorMemory": "15g", 
 "driverMemory": "12g", "proxyUser": "aliu-", "driverCores": 1, 
 "conf": {"spark.yarn.appMasterEnv.THEANO_FLAGS": "base_compiledir=${PWD}/.theano"}}

In [3]:
%spark add -s inactivePurchaseLeadGen -k -l python -u https://qlawsbidlhe02a.ad.datalake.foc.zone:8445/gateway/dsx/livy2/v1

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
20924,application_1582863878751_118416,pyspark,idle,Link,Link,✔


SparkSession available as 'spark'.


In [4]:
%%spark

import pyspark
import os, sys

from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.functions import lower
import pyspark.sql.functions as F
from pyspark.sql.functions import last_day, rand, dense_rank, last_day, col, size, length, when, upper, unix_timestamp, avg, substring, lower, udf, sum, count, lit, mean, concat, countDistinct, desc, from_unixtime, row_number, year, month, to_date, upper, months_between
from pyspark.sql import DataFrameStatFunctions as statFunc
from pyspark.sql.types import *
from pyspark.sql.window import Window
from functools import reduce

from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import GBTClassifier

from pyspark.ml.feature import VectorIndexer, VectorAssembler, StringIndexer, QuantileDiscretizer
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import BinaryClassificationEvaluator

from os.path import expanduser, join, abspath
import time
import pandas as pd

# The remote session is already exposed as `spark` (see "SparkSession available as 'spark'"
# after `%spark add`); getOrCreate() hands back that existing session instead of starting a new one.
spark = SparkSession.builder.getOrCreate()

In [25]:
%%spark

# Storage locations used throughout the notebook. Every value keeps its trailing '/'
# because file names are appended to it with plain string concatenation.

# Artefacts of the purchase-payoff model: saved pipelines, train/test extracts, score files.
(model_save_path,
 training_data_path,
 test_data_path,
 result_path) = (
    '/dev/projects/retention_models/purchase_payoff/models/',
    '/dev/projects/retention_models/purchase_payoff/training/data/train/',
    '/dev/projects/retention_models/purchase_payoff/training/data/test/',
    '/dev/projects/retention_models/purchase_payoff/training/results/mover_6/',
)

# Shared inputs: monthly preprocessed snapshots and the observed payoff labels.
(monthly_prep_path,
 actual_path) = (
    '/dev/projects/retention_models/monthly_snapshot/monthly_preprocessed/',
    '/dev/projects/retention_models/actual_value/',
)

# 1. Load Preprocessed Data

### Data for Mover Purchase V1.1 3-6

In [6]:
%%spark

# Quarter-end snapshots that make up the modelling population, oldest first.
# Snapshots up to 2018-09 have their `gcid` column removed before the union; the later
# ones are read as-is.
# NOTE(review): the later files are assumed not to contain `gcid` (the original cell never
# dropped it there) -- confirm against the parquet schemas.
months_with_gcid = ['201612', '201703', '201706', '201709',
                    '201712', '201803', '201806', '201809']
months_without_gcid = ['201812', '201903', '201906', '201909', '201912', '202003']


def load_monthly_snapshot(month, drop_gcid=False):
    """Read `monthly_preprocessed_<month>.parquet` from `monthly_prep_path`.

    month     -- snapshot tag as it appears in the file name, e.g. '201612'
    drop_gcid -- when True, remove the `gcid` column from the returned DataFrame
    """
    snapshot = spark.read.parquet(monthly_prep_path + 'monthly_preprocessed_' + month + '.parquet')
    return snapshot.drop('gcid') if drop_gcid else snapshot


monthly_snapshots = [load_monthly_snapshot(month, drop_gcid=True) for month in months_with_gcid] \
                  + [load_monthly_snapshot(month) for month in months_without_gcid]

# DataFrame.union matches columns by POSITION, not by name, so every snapshot must expose
# the same columns in the same order. The three dropped columns are removed after stacking.
df_preprocessed = reduce(lambda stacked, snapshot: stacked.union(snapshot), monthly_snapshots)\
                            .drop('borrowerage_bucket','p_married_exp','p_31_50_exp')

In [7]:
%%spark

# Sanity check: list the columns and types of the stacked snapshots before the labels are joined.
df_preprocessed.printSchema()

root
 |-- servicecalendardate: date (nullable = true)
 |-- ln_no: string (nullable = true)
 |-- og_note_dt: timestamp (nullable = true)
 |-- ct_age_exp: integer (nullable = true)
 |-- ct_1_exp: string (nullable = true)
 |-- ct_2_exp: string (nullable = true)
 |-- ct_3_exp: string (nullable = true)
 |-- home_value_exp: double (nullable = true)
 |-- p_edu_hs_exp: integer (nullable = true)
 |-- personnum_per_room_exp: double (nullable = true)
 |-- mosaic_group_refi_exp: string (nullable = true)
 |-- mosaic_group_pur_exp: string (nullable = true)
 |-- ratespread_min_exp: double (nullable = true)
 |-- ratespread_min_pur_exp: double (nullable = true)
 |-- ln_ann_int_rt: double (nullable = true)
 |-- loantypedescription_exp: string (nullable = true)
 |-- loanamortizationtype: string (nullable = true)
 |-- ageinmon_exp: double (nullable = true)
 |-- og_mtg_am_exp: double (nullable = true)
 |-- currentcltv_exp: double (nullable = true)
 |-- orig_fico_exp: integer (nullable = true)
 |-- LiveYear

### Actual Payoff Data

In [8]:
%%spark

# Observed outcomes: keep only the two join keys and the `purchasepayoff_3_6` label column.
actual = spark.read.parquet(actual_path + 'Actual_payoff_20200505.parquet')\
                .select('servicecalendardate', 'ln_no', 'purchasepayoff_3_6')

# Show a single row to confirm the layout of the label table.
actual.show(1)

+-------------------+----------+------------------+
|servicecalendardate|     ln_no|purchasepayoff_3_6|
+-------------------+----------+------------------+
|         2016-10-31|1232492399|               0.0|
+-------------------+----------+------------------+
only showing top 1 row

### Join actual data with preprocessed data

In [9]:
%%spark

# Name of the label column; it is passed to the split, pipeline and evaluation helpers below.
target = 'purchasepayoff_3_6'

In [10]:
%%spark

# Attach the label to every (snapshot date, loan) row. The inner join drops feature rows that
# have no label row; dropDuplicates() then removes rows that are identical in every column.
df_all = df_preprocessed.join(actual, on=['servicecalendardate', 'ln_no'], how='inner').dropDuplicates()

In [11]:
%%spark

# Row count and number of positive labels per snapshot date, newest first.
# `count` and `sum` are the pyspark.sql.functions versions imported above, not the Python builtins.
# NOTE(review): in the recorded output 2019-03-31 is missing altogether and the two most recent
# snapshots show 0 and 44 positives -- presumably their outcome window had not closed yet; confirm.
df_all.groupby('servicecalendardate').agg(count('ln_no'), sum(target)).orderBy(col('servicecalendardate').desc()).show()

+-------------------+------------+-----------------------+
|servicecalendardate|count(ln_no)|sum(purchasepayoff_3_6)|
+-------------------+------------+-----------------------+
|         2020-03-31|     1867297|                    0.0|
|         2019-12-31|     1802243|                   44.0|
|         2019-09-30|     1763391|                11183.0|
|         2019-06-30|     1822001|                13684.0|
|         2018-12-31|     1726017|                21914.0|
|         2018-09-30|     1683583|                14008.0|
|         2018-06-30|     1619695|                12303.0|
|         2018-03-31|     1577779|                17018.0|
|         2017-12-31|     1530004|                19389.0|
|         2017-09-30|     1468441|                12459.0|
|         2017-06-30|     1423608|                11440.0|
|         2017-03-31|     1379094|                14992.0|
|         2016-12-31|     1326375|                16483.0|
+-------------------+------------+----------------------

# 2. Build Model

### Under-sample negative values

In [12]:
%%spark

def data_split(df, train_months, target, rate, split_weights=(0.7, 0.3), seed=123):
    """Build under-sampled training and validation sets.

    Rows whose `servicecalendardate` is in `train_months` are split by label. Positives
    (label == 1.0) and negatives (label == 0.0) are each divided into train/validation
    parts using `split_weights`; the negatives of both parts are then under-sampled so
    that roughly a fraction `rate` of them is kept. Positives are never under-sampled.

    Parameters
    ----------
    df : DataFrame containing `servicecalendardate` and the `target` column.
    train_months : list of snapshot dates to keep.
    target : name of the numeric 0/1 label column.
    rate : fraction of negatives to keep, used as the weights [rate, 1 - rate].
    split_weights : (train, validation) weights for `randomSplit`; default 70/30.
    seed : seed shared by every `randomSplit` call so the split is reproducible.

    Returns
    -------
    (train, validate) : two DataFrames; `train` is marked for caching.
    """
    in_scope = df.where(col('servicecalendardate').isin(train_months))
    weights = list(split_weights)

    # Compare against numeric literals: the label is numeric, so the former string
    # literals '1.' / '0.' only matched through Spark's implicit string-to-number cast.
    positives = in_scope.filter(in_scope[target] == 1.0)
    negatives = in_scope.filter(in_scope[target] == 0.0)

    train_pos, validate_pos = positives.randomSplit(weights, seed=seed)
    train_neg_all, validate_neg_all = negatives.randomSplit(weights, seed=seed)

    # Under-sample the negatives; the discarded remainder is not needed.
    keep_vs_discard = [rate, 1 - rate]
    train_neg, _ = train_neg_all.randomSplit(keep_vs_discard, seed=seed)
    validate_neg, _ = validate_neg_all.randomSplit(keep_vs_discard, seed=seed)

    # `union` replaces the deprecated alias `unionAll`; both stack rows by column position.
    train = train_pos.union(train_neg)
    validate = validate_pos.union(validate_neg)

    # The training set is reused by three model fits, so keep it cached.
    train.cache()

    return train, validate

In [13]:
%%spark

# Six consecutive quarter-end snapshots (2017-09 through 2018-12) used for fitting and validation.
train_months = ['2017-09-30', '2017-12-31', '2018-03-31', '2018-06-30', '2018-09-30', '2018-12-31']

# rate=0.025: keep about 2.5% of the negatives in each split; positives are kept in full.
df_train, df_validate = data_split(df_all, train_months, target, 0.025)

In [14]:
%%spark

# Class balance of the under-sampled training set, per snapshot date and label value.
df_train.groupby(['servicecalendardate', target]).count().orderBy('servicecalendardate', target).show()

+-------------------+------------------+-----+
|servicecalendardate|purchasepayoff_3_6|count|
+-------------------+------------------+-----+
|         2017-09-30|               0.0|25381|
|         2017-09-30|               1.0| 8723|
|         2017-12-31|               0.0|26400|
|         2017-12-31|               1.0|13637|
|         2018-03-31|               0.0|27208|
|         2018-03-31|               1.0|11930|
|         2018-06-30|               0.0|28176|
|         2018-06-30|               1.0| 8680|
|         2018-09-30|               0.0|29555|
|         2018-09-30|               1.0| 9811|
|         2018-12-31|               0.0|29972|
|         2018-12-31|               1.0|15293|
+-------------------+------------------+-----+

### Create Pipeline

In [15]:
%%spark

def create_pipeline(target, *arg, df=None):
    """Build estimators and matching ML pipelines for the requested model types.

    Parameters
    ----------
    target : name of the label column (used as `labelCol` and excluded from the features).
    *arg : any of "lr", "rf", "gbt". When none of these is given, "gbt" is used.
    df : DataFrame whose `dtypes` decide which columns become features. Defaults to the
         notebook-level `df_train`, which is what this function always used before.

    Returns
    -------
    A flat tuple: the selected estimators in the fixed order lr, rf, gbt, followed by
    their pipelines in the same order, e.g. `(lr, gbt, pipeline_lr, pipeline_gbt)`.

    Feature selection
    -----------------
    * categorical features: columns of dtype 'string'
    * numeric features: columns whose dtype starts with 'int' or 'double'
      (so e.g. 'bigint' or 'float' columns are not picked up)
    * a column is skipped when its name ENDS WITH any entry of `exclude_cols`; this is
      a suffix match, so 'ln_purpose_type_exp' is kept while 'ln_purpose_type' is not.
    """
    if df is None:
        df = df_train  # backward-compatible default: the global training frame

    exclude_cols = ('servicecalendardate', 'ln_no', 'og_note_dt', 'mosaic_group_pur_exp', 'ratespread_min_pur_exp',
                    'ln_purpose_type', 'investornameshort', 'ln_ann_int_rt', target)

    def _is_feature(name):
        # Plain `not` instead of `~`: bitwise inversion of a bool yields -1 / -2 and only
        # happened to work when and-ed with another bool.
        return not name.endswith(exclude_cols)

    cat_cols = [name for name, dtype in df.dtypes
                if dtype == 'string' and _is_feature(name)]
    num_cols = [name for name, dtype in df.dtypes
                if dtype.startswith(('int', 'double')) and _is_feature(name)]

    # One StringIndexer per categorical column; 'keep' sends unseen or invalid values to an
    # extra index instead of failing at transform time.
    indexers = [StringIndexer(inputCol=name, outputCol=name + '_idx').setHandleInvalid('keep')
                for name in cat_cols]

    # Indexed categoricals first, then the numeric columns, assembled into one vector.
    assembler = VectorAssembler(inputCols=[name + '_idx' for name in cat_cols] + num_cols,
                                outputCol='vectFeatures')

    stages = indexers + [assembler]

    estimators = {
        'lr': LogisticRegression(maxIter=100, regParam=0.1, elasticNetParam=0.0, fitIntercept=True,
                                 featuresCol='vectFeatures', labelCol=target),
        'rf': RandomForestClassifier(numTrees=250, maxDepth=5, featuresCol='vectFeatures', labelCol=target),
        'gbt': GBTClassifier(maxIter=100, featuresCol='vectFeatures', labelCol=target),
    }

    # Canonical order lr -> rf -> gbt regardless of the order the caller used; fall back
    # to gbt when nothing recognisable was requested.
    requested = [key for key in ('lr', 'rf', 'gbt') if key in arg] or ['gbt']

    chosen = [estimators[key] for key in requested]
    pipelines = [Pipeline(stages=stages + [estimator]) for estimator in chosen]

    return tuple(chosen) + tuple(pipelines)

In [16]:
%%spark

def fit_pipeline(pipeline, training_dataset):
    """Fit `pipeline` on `training_dataset` and return the result of `pipeline.fit`."""
    return pipeline.fit(training_dataset)
    
def persist_modelpersist_(pipeline_model, model_name, model_save_path=model_save_path):
    """Save a fitted pipeline model to `model_save_path + model_name`.

    The two parts are joined by plain concatenation, so `model_save_path` must already
    end with a path separator. The default is the notebook-level `model_save_path` as it
    was when this function was defined.
    """
    destination = model_save_path + model_name
    pipeline_model.save(destination)

In [17]:
%%spark

# Request all three model types; the result is unpacked as estimators first, then their pipelines.
lr, rf, gbt, pipeline_lr, pipeline_rf, pipeline_gbt = create_pipeline(target, "lr", "rf", "gbt")

In [18]:
%%spark

# Logistic Regression
# Times the fit of the whole pipeline (indexers, assembler and estimator) on df_train;
# the recorded run took about 200 seconds.
start = time.time()
model_pipeline_lr = fit_pipeline(pipeline_lr, df_train)
end = time.time()

print('Logistic Regression Training Time:', end - start)

Logistic Regression Training Time: 200.39102458953857

In [19]:
%%spark

# Random Forest
# Times the fit of the whole pipeline (indexers, assembler and estimator) on df_train;
# the recorded run took about 209 seconds.
start = time.time()
model_pipeline_rf = fit_pipeline(pipeline_rf, df_train)
end = time.time()

print('Random Forest Training Time:', end - start)

Random Forest Training Time: 209.19338965415955

In [20]:
%%spark

# Gradient Boosting
# Times the fit of the whole pipeline (indexers, assembler and estimator) on df_train;
# by far the slowest of the three: about 3027 seconds (~50 minutes) in the recorded run.
start = time.time()
model_pipeline_gbt = fit_pipeline(pipeline_gbt, df_train)
end = time.time()

print('Gradient Boosting Training Time:', end - start)

Gradient Boosting Training Time: 3026.6029171943665

### Save Models

In [21]:
%%spark

# Persist the three fitted pipelines under model_save_path, one directory name per model.
# NOTE(review): these go through PipelineModel.save(), which is expected to fail when the
# target path already exists -- re-running this cell likely needs new names; confirm.
persist_modelpersist_(model_pipeline_lr, 'Mover_3to6_V1_1_LR_20200515')
persist_modelpersist_(model_pipeline_rf, 'Mover_3to6_V1_1_RF_20200515')
persist_modelpersist_(model_pipeline_gbt, 'Mover_3to6_V1_1_GBT_20200515')

### Read Models

In [22]:
%%spark

# Reload the saved pipelines, overwriting the in-memory variables from the training cells.
# With the models on disk, everything below can run without repeating the fits.
model_pipeline_lr = PipelineModel.load(path = model_save_path+'Mover_3to6_V1_1_LR_20200515')
model_pipeline_rf = PipelineModel.load(path = model_save_path+'Mover_3to6_V1_1_RF_20200515')
model_pipeline_gbt = PipelineModel.load(path = model_save_path+'Mover_3to6_V1_1_GBT_20200515')

# 3. Variable Importance

In [23]:
%%spark
def extractFeatureImp(model_pipeline, df_train, featuresCol):
    """Return the features of a fitted pipeline ranked by importance.

    Parameters
    ----------
    model_pipeline : fitted pipeline whose LAST stage exposes `featureImportances`.
    df_train : dataset passed through `model_pipeline.transform`; only the schema of the
               result is read, to recover the metadata of the assembled feature vector.
    featuresCol : name of the assembled feature-vector column, e.g. "vectFeatures".

    Returns
    -------
    pandas.DataFrame with one row per feature: the attribute fields stored in the column
    metadata (including `idx`) plus `score`, the importance looked up at position `idx`.
    Rows are sorted by `score`, highest first.
    """
    featureImp = model_pipeline.stages[-1].featureImportances

    # The vector column's metadata groups its attributes by type; flatten every group
    # into a single list of attribute records.
    attrs_by_type = model_pipeline.transform(df_train).schema[featuresCol].metadata["ml_attr"]["attrs"]
    list_extract = [attr for group in attrs_by_type.values() for attr in group]

    varlist = pd.DataFrame(list_extract)
    # `idx` is the feature's position in the vector, which is also its index in featureImp.
    varlist['score'] = varlist['idx'].apply(lambda x: featureImp[x])

    return varlist.sort_values('score', ascending=False)

In [24]:
%%spark

# Importances come from the last stage of the GBT pipeline; df_train is only transformed so
# that the feature names can be read from the schema metadata.
gbt_feature_importance = extractFeatureImp(model_pipeline_gbt, df_train, "vectFeatures")

# Lift pandas' row/column display limits inside this block only, so the full table is printed.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(gbt_feature_importance)

    idx                         name  \
5    13                 ageinmon_exp   
7    15              currentcltv_exp   
10   18           LiveYears_long_exp   
4    12           ratespread_min_exp   
8    16                orig_fico_exp   
6    14                og_mtg_am_exp   
1     9               home_value_exp   
17    3    mosaic_group_refi_exp_idx   
13   21                    ln_tr_exp   
20    6      ln_purpose_type_exp_idx   
0     8                   ct_age_exp   
9    17          LiveYears_short_exp   
3    11       personnum_per_room_exp   
21    7  og_occupy_stat_type_exp_idx   
19    5     loanamortizationtype_idx   
2    10                 p_edu_hs_exp   
18    4  loantypedescription_exp_idx   
16    2                 ct_3_exp_idx   
12   20         issingleborrower_exp   
15    1                 ct_2_exp_idx   
14    0                 ct_1_exp_idx   
11   19            LiveYears_grp_exp   

                                                 vals     score  
5            

# 4. ROC

### Predictions

In [26]:
%%spark

def predict(model_pipeline, data):
    """Score `data` with `model_pipeline` and add a `pred` column.

    `pred` is the element at index 1 of the model's `probability` vector, converted to
    a plain double so it can be sorted, bucketed and written to CSV.
    """
    scored = model_pipeline.transform(data)

    # A Python UDF is used to pull a single element out of the ML vector column.
    take_index_one = udf(lambda vec: float(vec[1]), DoubleType())

    return scored.withColumn('pred', take_index_one(scored['probability']))

### ROC

In [27]:
%%spark

def roc_auc(df, target):
    """Print the area under the ROC curve of the LR, RF and GBT pipelines on `df`.

    Uses the notebook-level `model_pipeline_lr`, `model_pipeline_rf` and
    `model_pipeline_gbt`. The label column `target` is renamed to 'label', which is the
    column name a default-constructed BinaryClassificationEvaluator reads. Every line is
    prefixed with "Test" whatever dataset is passed in. Nothing is returned.
    """
    fitted_models = (
        ("Test Area Under ROC - LR: ", model_pipeline_lr),
        ("Test Area Under ROC - RF: ", model_pipeline_rf),
        ("Test Area Under ROC - GBT: ", model_pipeline_gbt),
    )
    eval_columns = (target, 'rawPrediction', 'prediction', 'probability')

    # Build the three (lazy) evaluation frames up front, then evaluate them one by one.
    eval_frames = [
        (prefix, predict(model, df).select(*eval_columns).withColumnRenamed(target, 'label'))
        for prefix, model in fitted_models
    ]

    for prefix, frame in eval_frames:
        evaluator = BinaryClassificationEvaluator()
        auc = evaluator.evaluate(frame, {evaluator.metricName: "areaUnderROC"})
        print(prefix + str(auc))
    print("\n")

In [28]:
%%spark

# Evaluation snapshots; none of these dates is in train_months. df_test keeps every row of
# those months (no under-sampling), unlike df_train / df_validate.
test_months = ['2017-06-30', '2019-06-30', '2019-09-30']
df_test = df_all.where(col('servicecalendardate').isin(test_months))

# Evaluated in the order train, validation, test. roc_auc() labels every line
# "Test Area Under ROC", so the three printed groups must be read in that order.
for df in [df_train, df_validate, df_test]:
    roc_auc(df, target)

Test Area Under ROC - LR: 0.6108436508396088
Test Area Under ROC - RF: 0.6352566049206047
Test Area Under ROC - GBT: 0.6785597367147729


Test Area Under ROC - LR: 0.6065183020070458
Test Area Under ROC - RF: 0.6287321804683813
Test Area Under ROC - GBT: 0.6548889379759892


Test Area Under ROC - LR: 0.5958793753206999
Test Area Under ROC - RF: 0.6220089997108489
Test Area Under ROC - GBT: 0.6399175834062103

# 5. Capture Rate

In [29]:
%%spark

def deciles(df, model, target):
    """Build a capture-rate table by score decile.

    `df` is scored with `model`, the `pred` score is cut into 10 quantile buckets and
    the bucket index is flipped (10 - index), so the numbering runs in the opposite
    direction to the discretizer's own bucket index. The returned DataFrame has one row
    per decile, sorted by decile:

    decile      -- flipped bucket number
    decile_cnt  -- number of `ln_no` values in the decile
    payoff_cnt  -- sum of `target` in the decile
    cum_sum     -- running share of the total `target` sum, up to and including this decile
    """
    scored = predict(model, df)

    discretizer = QuantileDiscretizer(numBuckets=10, inputCol="pred", outputCol="decile",
                                      relativeError=0.00001, handleInvalid="error")
    bucketed = discretizer.fit(scored).transform(scored)
    ranked = bucketed.withColumn('decile', (10 - F.col('decile')).cast('int'))

    # Denominator of the capture rate: all positives in the scored data.
    total_target = ranked.select(F.sum(target)).collect()[0][0]

    # Running window over the aggregated rows, from the first decile to the current one.
    running = Window.orderBy('decile').rangeBetween(Window.unboundedPreceding, 0)

    per_decile = ranked.groupBy('decile').agg(
        F.count('ln_no').alias('decile_cnt'),
        F.sum(target).alias('payoff_cnt'),
    )
    cumulative_share = F.sum('payoff_cnt').over(running) / total_target

    return per_decile.withColumn('cum_sum', cumulative_share).sort('decile')

In [56]:
%%spark

# Snapshot used for the capture-rate tables below: the first entry of test_months.
date_label = test_months[0]
print(date_label)

2017-06-30

In [57]:
%%spark
# Capture table for the logistic-regression pipeline on the full `date_label` snapshot of df_all.
print(date_label)
deciles(df_all.filter(col('servicecalendardate') == date_label), model_pipeline_lr, target).show()

2017-06-30
+------+----------+----------+-------------------+
|decile|decile_cnt|payoff_cnt|            cum_sum|
+------+----------+----------+-------------------+
|     1|    142344|    2074.0|0.18129370629370628|
|     2|    142374|    1603.0| 0.3214160839160839|
|     3|    142366|    1429.0| 0.4463286713286713|
|     4|    142370|    1206.0| 0.5517482517482517|
|     5|    142326|    1107.0|  0.648513986013986|
|     6|    142378|    1058.0| 0.7409965034965035|
|     7|    142362|     943.0| 0.8234265734265734|
|     8|    142347|     818.0|   0.89493006993007|
|     9|    142377|     679.0| 0.9542832167832168|
|    10|    142364|     523.0|                1.0|
+------+----------+----------+-------------------+

In [58]:
%%spark
# Capture table for the random-forest pipeline on the full `date_label` snapshot of df_all.
print(date_label)
deciles(df_all.filter(col('servicecalendardate') == date_label), model_pipeline_rf, target).show()

2017-06-30
+------+----------+----------+-------------------+
|decile|decile_cnt|payoff_cnt|            cum_sum|
+------+----------+----------+-------------------+
|     1|    142357|    2256.0| 0.1972027972027972|
|     2|    142372|    1739.0| 0.3492132867132867|
|     3|    142359|    1464.0|0.47718531468531467|
|     4|    142333|    1287.0| 0.5896853146853147|
|     5|    142362|    1125.0| 0.6880244755244755|
|     6|    142470|     979.0| 0.7736013986013986|
|     7|    142270|     928.0| 0.8547202797202798|
|     8|    142355|     727.0| 0.9182692307692307|
|     9|    142357|     597.0| 0.9704545454545455|
|    10|    142373|     338.0|                1.0|
+------+----------+----------+-------------------+

In [None]:
%%spark
# Capture table for the gradient-boosting pipeline on the full `date_label` snapshot of df_all.
# This cell has no execution count or output in the saved notebook.
print(date_label)
deciles(df_all.filter(col('servicecalendardate') == date_label), model_pipeline_gbt, target).show()

# 6. Predictions

In [None]:
%%spark

def fullMonthPrediction(df):
    """Score `df` with all three pipelines and return one row of scores per loan.

    Returns a DataFrame with the columns `ln_no`, `logRegProb`, `randForProb` and
    `gbtProb`. The three score sets are left-joined on `ln_no` alone, starting from the
    logistic-regression scores, and exact duplicate rows are removed at the end.
    Because the join key is only `ln_no`, `df` should hold a single snapshot so that
    each loan appears once.
    """
    def _scores(model, score_name):
        # Keep only the key and the score, renamed so the three sets can sit side by side.
        return predict(model, df).select('ln_no', 'pred').withColumnRenamed('pred', score_name)

    lr_scores = _scores(model_pipeline_lr, 'logRegProb')
    rf_scores = _scores(model_pipeline_rf, 'randForProb')
    gbt_scores = _scores(model_pipeline_gbt, 'gbtProb')

    combined = lr_scores.join(rf_scores, on='ln_no', how='left')
    combined = combined.join(gbt_scores, on='ln_no', how='left')

    return combined.dropDuplicates()

In [None]:
%%spark

def pred_save(df, servicedate, filename, result_path = result_path):
    """Score one snapshot of `df` and write the predictions as CSV.

    Rows with `servicecalendardate == servicedate` are scored by fullMonthPrediction and
    written, with a header row, to `result_path + filename`. coalesce(1) reduces the
    output to a single partition before writing.
    """
    snapshot_rows = df.where(col('servicecalendardate') == servicedate)
    predictions = fullMonthPrediction(snapshot_rows)

    destination = result_path + filename
    predictions.coalesce(1).write.csv(destination, header=True)

In [None]:
%%spark

def pred_save_newmonth(preprocessed_file, filename, monthly_prep_path=monthly_prep_path, result_path=result_path):
    """Score a preprocessed snapshot file and write the predictions as CSV.

    Reads `monthly_prep_path + preprocessed_file` (parquet), scores it with
    fullMonthPrediction, prints the number of input rows, then writes the predictions
    with a header row to `result_path + filename` as a single partition.
    """
    snapshot = spark.read.parquet(monthly_prep_path + preprocessed_file)
    predictions = fullMonthPrediction(snapshot)

    # Input row count, printed before the write.
    input_rows = snapshot.count()
    print(input_rows)

    destination = result_path + filename
    predictions.coalesce(1).write.csv(destination, header=True)

In [None]:
%%spark

# 2017-6
pred_save(df_all, '2017-06-30', 'pred_MoverV1_1_jun17_20200514.csv')

# 2017-9
pred_save(df_all, '2017-09-30', 'pred_MoverV1_1_sep17_20200514.csv')

# 2017-12
pred_save(df_all, '2017-12-31', 'pred_MoverV1_1_dec17_20200514.csv')

# 2018-3
pred_save(df_all, '2018-03-31', 'pred_MoverV1_1_mar18_20200514.csv')

# 2018-6
pred_save(df_all, '2018-06-30', 'pred_MoverV1_1_jun18_20200514.csv')

# 2018-9
pred_save(df_all, '2018-09-30', 'pred_MoverV1_1_sep18_20200514.csv')

# 2018-12
pred_save(df_all, '2018-12-31', 'pred_MoverV1_1_dec18_20200514.csv')

# 2019-03
pred_save(df_all, '2019-03-31', 'pred_MoverV1_1_mar19_20200514.csv')

# 2019-06
pred_save(df_all, '2019-06-30', 'pred_MoverV1_1_jun19_20200514.csv')

# 2019-09
pred_save(df_all, '2019-09-30', 'pred_MoverV1_1_sep19_20200514.csv')

In [None]:
%%spark

# 2018-1
pred_save_newmonth('monthly_preprocessed_201801.parquet', 'pred_MoverV3_2_jan18_20190722.csv')

# 2018-2
pred_save_newmonth('monthly_preprocessed_201802.parquet', 'pred_MoverV3_2_feb18_20190722.csv')

# 2018-4
pred_save_newmonth('monthly_preprocessed_201804.parquet', 'pred_MoverV3_2_apr18_20190722.csv')

# 2018-5
pred_save_newmonth('monthly_preprocessed_201805.parquet', 'pred_MoverV3_2_may18_20190722.csv')

# 2018-7
pred_save_newmonth('monthly_preprocessed_201807.parquet', 'pred_MoverV3_2_jul18_20190722.csv')

# 2018-8
pred_save_newmonth('monthly_preprocessed_201808.parquet', 'pred_MoverV3_2_aug18_20190722.csv')

# 2018-10
pred_save_newmonth('monthly_preprocessed_201810.parquet', 'pred_MoverV3_2_oct18_20190722.csv')

# 2018-11
pred_save_newmonth('monthly_preprocessed_201811.parquet', 'pred_MoverV3_2_nov18_20190722.csv')

# 2019-1
pred_save_newmonth('monthly_preprocessed_201901.parquet', 'pred_MoverV3_2_jan19_20190722.csv')

# 2019-2
pred_save_newmonth('monthly_preprocessed_201902.parquet', 'pred_MoverV3_2_feb19_20190722.csv')

# 2019-4
pred_save_newmonth('monthly_preprocessed_201904.parquet', 'pred_MoverV3_2_apr19_20190722.csv')

# 2019-5
pred_save_newmonth('monthly_preprocessed_201905.parquet', 'pred_MoverV3_2_may19_20190722.csv')
