In [1]:
#import sys
#sys.path.append('jars/mleap/python')
#import mleap.pyspark
#from mleap.pyspark.spark_support import SimpleSparkSerializer
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder, StandardScaler, RFormula, VectorIndexer
from pyspark.ml.regression import GBTRegressor, GBTRegressionModel
from pyspark.ml.regression import RandomForestRegressor, DecisionTreeRegressor
from pyspark.ml import Pipeline, PipelineModel
from pyspark.sql.types import *
from pyspark.ml.linalg import Vectors
import json

In [2]:
# Spark resource settings, registered as JVM system properties so that a
# SparkConf built afterwards picks them up.
# NOTE(review): these only affect a context created AFTER this cell runs. If
# the kernel has already started one, they are not applied to it, and
# 'spark.driver.memory' in particular cannot be changed once the driver JVM
# is up - confirm how this notebook is launched.
# NOTE(review): 'spark.storage.memoryFraction' is a legacy memory-manager
# option; verify that the Spark version in use still honours it.
SPARK_SETTINGS = (
    ('spark.driver.memory', '2g'),
    ('spark.driver.cores', '3'),
    ('spark.executor.memory', '2g'),
    ('spark.executor.cores', '3'),
    ('spark.driver.memoryOverhead', '1g'),
    ('spark.storage.memoryFraction', '0.9'),
)
for conf_key, conf_value in SPARK_SETTINGS:
    SparkContext.setSystemProperty(conf_key, conf_value)

# Bind the session and context explicitly instead of relying on the kernel to
# inject `spark` / `sc`, so the notebook survives "Restart & Run All".
# getOrCreate() returns the already-running session when there is one.
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

In [3]:
# Load every file under ../abt/data/ as CSV: the first line of each file
# supplies the column names and the column types are inferred from the data.
data_df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('../abt/data/*')
)

# Show the inferred schema so it can be compared with the one declared by hand
# for the new row further down.
data_df.printSchema()

root
 |-- idmovie: integer (nullable = true)
 |-- runtime: integer (nullable = true)
 |-- director: string (nullable = true)
 |-- genres: string (nullable = true)
 |-- rating: double (nullable = true)
 |-- budgetvalue: long (nullable = true)



In [10]:
### BUILD A NEW ROW AND UNION IT WITH THE TRAINING DATASET,
### THEN APPLY THE FEATURE-ENGINEERING PIPELINE TO THE RESULT

# The single movie to score.
# NOTE(review): 'budgetvalue' looks like the value being predicted, so 0 is
# presumably just a placeholder - confirm.
d = {
    'idmovie': 77777,
    'runtime': 90,
    'director': 'Alfred Hitchcock',
    'genres': 'Drama',
    'rating': 3.5,
    'budgetvalue': 0,
}

# Explicit schema for the new row; it mirrors the column names and types that
# printSchema() reported for data_df.
schema = StructType([
    StructField("idmovie", IntegerType(), True),
    StructField("runtime", IntegerType(), True),
    StructField("director", StringType(), True),
    StructField("genres", StringType(), True),
    StructField("rating", DoubleType(), True),
    StructField("budgetvalue", LongType(), True),
])

# Serialise the row to JSON and read it back through Spark's JSON reader so
# the values are parsed according to `schema`.
djson = [json.dumps(d)]
dRDD  = sc.parallelize(djson)
ddf   = spark.read.json(dRDD, schema=schema)

# union() matches columns by POSITION, not by name. data_df's column order
# comes from the CSV headers, so project it into ddf's column order first;
# otherwise a reordered CSV would silently put values in the wrong fields.
ddf_union = ddf.union(data_df.select(*ddf.columns))

# Load the (unfitted) pipeline definition and fit it on the combined data,
# presumably so that the categorical stages have seen the new row's
# director / genres values before transforming it.
# NOTE(review): because the pipeline is re-fitted here on data that includes
# the new row, the fitted stages may differ from the ones used when the model
# in ../gbt_model/ was trained - confirm, or persist the fitted PipelineModel
# at training time and load that instead.
gbt_pipeline        = Pipeline.load('../budget_prediction_pipeline/')
gbt_pipeline_loaded = gbt_pipeline.fit(ddf_union)
ddf_features_df     = gbt_pipeline_loaded.transform(ddf_union)

# Display the resulting DataFrame's schema summary as the cell output.
ddf_features_df


DataFrame[idmovie: int, runtime: int, director: string, genres: string, rating: double, budgetvalue: bigint, director_indexed: double, genres_indexed: double, director_indexed_encoded: vector, genres_indexed_encoded: vector, continuous_features: vector, features: vector]

In [11]:
### LOAD THE TRAINED MODEL AND SCORE THE NEW ROW
gbt_model_load = GBTRegressionModel.load('../gbt_model/')

# Score only the row that was appended for prediction (matched by its id).
gbt_model_pred = gbt_model_load.transform(
    ddf_features_df.where('idmovie = 77777')
)

# Sanity check: print how many rows were scored.
print(gbt_model_pred.count())

# Show the identifying columns next to the prediction, with the prediction
# cast to a fixed two-decimal value for readability.
prediction_columns = [
    'idmovie',
    'director',
    'genres',
    'runtime',
    'cast(prediction as Decimal(20,2)) as prediction',
]
gbt_model_pred.selectExpr(*prediction_columns).show()


1
+-------+----------------+------+-------+----------+
|idmovie|        director|genres|runtime|prediction|
+-------+----------------+------+-------+----------+
|  77777|Alfred Hitchcock| Drama|     90|3686936.58|
+-------+----------------+------+-------+----------+



In [None]:
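### MLEAP SERIALIZATION (DISABLED)
# Requires the MLeap imports that are commented out in the first cell.
# NOTE(review): `gbt_model` and `data_features_df` are not defined in the cells shown here - confirm before re-enabling.
#gbt_model.serializeToBundle("jar:file:jars/movies_gbt.zip", gbt_model.transform(data_features_df))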