In [2]:
#import sys
#sys.path.append('jars/mleap/python')
#import mleap.pyspark
#from mleap.pyspark.spark_support import SimpleSparkSerializer
from pyspark import SparkContext
from pyspark.sql import SparkSession,SQLContext
from pyspark.sql.functions import *
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder, StandardScaler, RFormula, VectorIndexer
from pyspark.ml.regression import GBTRegressor, GBTRegressionModel
from pyspark.ml.regression import RandomForestRegressor, DecisionTreeRegressor
from pyspark.ml import Pipeline, PipelineModel
from pyspark.sql.types import *
from pyspark.ml.linalg import Vectors

In [3]:
SparkContext.setSystemProperty('spark.driver.memory', '2g')
SparkContext.setSystemProperty('spark.driver.cores', '3')
SparkContext.setSystemProperty('spark.executor.memory', '2g')
SparkContext.setSystemProperty('spark.executor.cores', '3')
SparkContext.setSystemProperty('spark.driver.memoryOverhead', '1g')
SparkContext.setSystemProperty('spark.storage.memoryFraction', '0.9')

In [4]:
data_df = spark.read.csv('../abt/data/*',header=True, inferSchema=True)
data_df.printSchema()

root
 |-- idmovie: integer (nullable = true)
 |-- runtime: integer (nullable = true)
 |-- director: string (nullable = true)
 |-- genres: string (nullable = true)
 |-- rating: double (nullable = true)
 |-- budgetvalue: long (nullable = true)



## Summary Statistics

#### Evaluating continuous features

In [5]:
data_df.summary().select('summary','budgetvalue', 'runtime','rating').show()

+-------+--------------------+------------------+------------------+
|summary|         budgetvalue|           runtime|            rating|
+-------+--------------------+------------------+------------------+
|  count|               18495|             18495|             18495|
|   mean|6.0121800647904836E7|100.06520681265206|3.1008229251148713|
| stddev| 7.384994399567893E8|34.253022733437874|0.6089458576840447|
|    min|                   1|                 0|               0.5|
|    25%|             3000000|                91|              2.77|
|    50%|            11500000|               101|              3.17|
|    75%|            34000000|               116|              3.52|
|    max|         35000000000|               359|               5.0|
+-------+--------------------+------------------+------------------+



#### Evaluating categorical features

In [6]:
data_df.registerTempTable('movie')

In [7]:
### EVALUATING DIRECTORS
dfsql = sqlContext.sql("""select director, count(*) as total, sum(budgetvalue) as budgetval 
                            from (select distinct director, 
                                                  budgetvalue 
                                  from movie) as tmp
                            group by director
                            order by total desc
                      """)
dfsql.show()

+-----------------+-----+----------+
|         director|total| budgetval|
+-----------------+-----+----------+
| Steven Spielberg|   27|1738950000|
|      Woody Allen|   23| 369950000|
|  Martin Scorsese|   20| 783900000|
|       Ron Howard|   19|1153602000|
|     Ridley Scott|   19|1277900000|
|Steven Soderbergh|   19| 489900000|
|   Clint Eastwood|   19| 637300000|
|   John Carpenter|   16| 217235000|
| Alfred Hitchcock|   15|  32405322|
|   Brian De Palma|   15| 333300000|
|       Rob Reiner|   14| 476000000|
|       Wes Craven|   14| 200530000|
|  Joel Schumacher|   13| 533000000|
|  Robert Zemeckis|   13| 988000000|
|        Spike Lee|   13| 214320000|
|   Barry Levinson|   13| 384000000|
|        John Ford|   13|  25226000|
|   Stephen Frears|   13| 285750000|
|     Renny Harlin|   12| 542000000|
|      Walter Hill|   12| 296200000|
+-----------------+-----+----------+
only showing top 20 rows



In [8]:
### EVALUATING GENRES
dfsql = sqlContext.sql("""select genres, count(*) as total, sum(budgetvalue) as budgetval 
                            from movie
                            group by genres
                            order by total desc
                      """)
dfsql.show()

+------------------+-----+------------+
|            genres|total|   budgetval|
+------------------+-----+------------+
|             Drama| 4002|208000414565|
|            Comedy| 2477|106865019824|
|          Thriller| 2182|119472614301|
|            Action| 1739|186063609838|
|           Romance| 1325| 65783279946|
|            Horror| 1143| 24919813634|
|             Crime| 1060| 35068607740|
|         Adventure|  915| 98451049103|
|            Sci-Fi|  784| 52212339263|
|           Fantasy|  543| 71902986518|
|           Mystery|  536| 17964721321|
|          Children|  383| 18381639741|
|               War|  315| 48729442096|
|       Documentary|  230|   539092614|
|           Musical|  220|  8494231643|
|         Animation|  203| 26464556091|
|           Western|  158|  2892246885|
|              IMAX|  120| 15454100000|
|(no genres listed)|  116|  4026313753|
|         Film-Noir|   44|   266624107|
+------------------+-----+------------+



In [9]:
dfsql = sqlContext.sql("""select * from movie where director is not null""")
dfsql.count()
dfsql.show()

+-------+-------+--------------------+---------+------+-----------+
|idmovie|runtime|            director|   genres|rating|budgetvalue|
+-------+-------+--------------------+---------+------+-----------+
|     26|    123|       Oliver Parker|    Drama|  3.61|   11000000|
|    474|    128|   Wolfgang Petersen|   Action|  3.72|   40000000|
|    474|    128|   Wolfgang Petersen| Thriller|  3.72|   40000000|
|   1677|    107|        Sidney Lumet|   Comedy|  2.61|   12000000|
|   1677|    107|        Sidney Lumet|    Drama|  2.61|   12000000|
|   1806|     91|        John Roberts|Adventure|  2.83|   23000000|
|   1806|     91|        John Roberts| Children|  2.83|   23000000|
|   1806|     91|        John Roberts|   Comedy|  2.83|   23000000|
|   2529|    112|Franklin J. Schaf...|   Action|   3.6|    5800000|
|   2529|    112|Franklin J. Schaf...|    Drama|   3.6|    5800000|
|   2529|    112|Franklin J. Schaf...|   Sci-Fi|   3.6|    5800000|
|   2927|     86|          David Lean|    Drama|

#### Defining Features

In [26]:
#'runtime', 'rating','budgetvalue'
continuous_cols  = ['rating', 'budgetvalue']
categorical_cols = ['director','genres']

In [27]:
def featuresCreation(df):

    categorical_indexers = [
        StringIndexer(inputCol=column, outputCol="{0}_indexed".format(column), handleInvalid='keep')
        for column in categorical_cols
    ]

    categorical_features = [
        OneHotEncoder(
            inputCol=indexer.getOutputCol(),
            outputCol="{0}_encoded".format(indexer.getOutputCol()))
        for indexer in categorical_indexers
    ]

    continuous_feature = [VectorAssembler(
        inputCols=[column for column in continuous_cols],
        outputCol="continuous_features"
    )]

    #continuous_feature_standard = [StandardScaler(inputCol='continuous_features', outputCol='scaled_continuous_features')]

    #all_features = continuous_feature_standard + categorical_features
    all_features = continuous_feature + categorical_features

    universal_assembler = VectorAssembler(
        inputCols=[feature.getOutputCol() for feature in all_features],
        outputCol="features"
    )

    #estimator = categorical_indexers + \
    #            categorical_features + \
    #            continuous_feature + \
    #            continuous_feature_standard + \
    #            [universal_assembler]

    estimator = categorical_indexers + \
                categorical_features + \
                continuous_feature + \
                [universal_assembler]
    
    universal_pipeline = Pipeline(stages=estimator)
    #universal_pipeline.save('budget_prediction_pipeline/')
    data_features_df   = universal_pipeline.fit(df).transform(df)
    
    return data_features_df


In [28]:
data_features_df = featuresCreation(data_df)

In [38]:
data_features_df.show()

+-------+-------+--------------------+---------+------+-----------+----------------+--------------+------------------------+----------------------+-------------------+--------------------+
|idmovie|runtime|            director|   genres|rating|budgetvalue|director_indexed|genres_indexed|director_indexed_encoded|genres_indexed_encoded|continuous_features|            features|
+-------+-------+--------------------+---------+------+-----------+----------------+--------------+------------------------+----------------------+-------------------+--------------------+
|     26|    123|       Oliver Parker|    Drama|  3.61|   11000000|           407.0|           0.0|      (4413,[407],[1.0])|        (20,[0],[1.0])|       [3.61,1.1E7]|(4435,[0,1,409,44...|
|    474|    128|   Wolfgang Petersen|   Action|  3.72|   40000000|            47.0|           3.0|       (4413,[47],[1.0])|        (20,[3],[1.0])|       [3.72,4.0E7]|(4435,[0,1,49,441...|
|    474|    128|   Wolfgang Petersen| Thriller|  3.72|

In [29]:
(train_df, test_df) = data_features_df.randomSplit([0.80,0.20])

In [30]:
### GRADIENT BOOSTING ##############################################################################################
gbt                = GBTRegressor(featuresCol='features',labelCol='budgetvalue', maxDepth=4, maxIter=40)
gbt_model          = gbt.fit(train_df)
print("### MODEL TRAINED")
gbt_pred           = gbt_model.transform(test_df)
print("### MODEL PREDICTED")
gbt_evaluator_rmse = RegressionEvaluator(labelCol='budgetvalue', predictionCol='prediction', metricName='rmse')
gbt_rmse           = gbt_evaluator_rmse.evaluate(gbt_pred)
gbt_evaluator_r2   = RegressionEvaluator(labelCol='budgetvalue', predictionCol='prediction', metricName='r2')
gbt_r2             = gbt_evaluator_r2.evaluate(gbt_pred)

print('### GRADIENT BOOSTED RMSE:')
print(gbt_rmse)
print('### GRADIENT BOOSTED r2:')
print(gbt_r2)

### MODEL TRAINED
### MODEL PREDICTED
### GRADIENT BOOSTED RMSE:
287649740.6247738
### GRADIENT BOOSTED r2:
0.9203895171923617


In [37]:
### SAVING THE MODEL
gbt_model.save('../gbt_model/')

In [99]:
### MLEAP SERIALIZATION
#gbt_model.serializeToBundle("jar:file:jars/movies_gbt.zip", gbt_model.transform(data_features_df))

In [34]:
### RANDOM FOREST ##################################################################################################
rf = RandomForestRegressor(featuresCol='features', labelCol='budgetvalue', numTrees=15)

rf_model        = rf.fit(train_df)
rf_pred         = rf_model.transform(test_df)
rf_evaluator    = RegressionEvaluator(labelCol='budgetvalue', predictionCol='prediction', metricName='rmse')
rf_rmse         = rf_evaluator.evaluate(rf_pred)
rf_evaluator_r2 = RegressionEvaluator(labelCol='budgetvalue', predictionCol='prediction', metricName='r2')
rf_r2           = rf_evaluator_r2.evaluate(rf_pred)

print('### RANDOM FOREST RMSE:')
print(rf_rmse)
print('### RANDOM FOREST r2:')
print(rf_r2)
#rf_pred.withColumn('pred', rf_pred['prediction'].cast(DecimalType(30,2))).show(truncate=False)

### RANDOM FOREST RMSE:
606585637.5503325
### RANDOM FOREST r2:
0.645981033089057


In [32]:
### DECISION TREE ###################################################################################################
dt              = DecisionTreeRegressor(featuresCol='features', labelCol='budgetvalue')
dt_model        = dt.fit(train_df)
dt_pred         = dt_model.transform(test_df)
dt_evaluator    = RegressionEvaluator(labelCol='budgetvalue', predictionCol='prediction', metricName='rmse')
dt_rmse         = dt_evaluator.evaluate(dt_pred)
dt_evaluator_r2 = RegressionEvaluator(labelCol='budgetvalue', predictionCol='prediction', metricName='r2')
dt_r2           = dt_evaluator_r2.evaluate(dt_pred)

print('### DECISION TREE RMSE:')
print(dt_rmse)
print('### RANDOM FOREST r2:')
print(dt_r2)

### DECISION TREE RMSE:
262467966.77666622
### RANDOM FOREST r2:
0.9337181077789628


In [35]:
from pyspark.sql.types import DecimalType
print('DECISION TREE #########################')
dt_pred.selectExpr('idmovie','director','genres','rating','budgetvalue', 'cast(prediction as Decimal(38,0)) as prediction').show(10)
#################################################################################
print('RANDOM FOREST #########################')
rf_pred.selectExpr('idmovie','director','genres','rating','budgetvalue', 'cast(prediction as Decimal(38,0)) as prediction').show(10)
#################################################################################
print('GRADIENT BOOSTED #########################')
gbt_pred.selectExpr('idmovie','director','genres','rating','budgetvalue', 'cast(prediction as Decimal(38,0)) as prediction').show(10)


DECISION TREE #########################
+-------+---------------+---------+------+-----------+----------+
|idmovie|       director|   genres|rating|budgetvalue|prediction|
+-------+---------------+---------+------+-----------+----------+
|      1|  John Lasseter|Adventure|  3.89|   30000000|  32626287|
|      1|  John Lasseter| Children|  3.89|   30000000|  32626287|
|      3|  Howard Deutch|   Comedy|  3.17|   25000000|  32626287|
|     10|Martin Campbell|   Action|  3.43|   60000000|  79045324|
|     11|     Rob Reiner|  Romance|  3.66|   62000000|  79045324|
|     15|   Renny Harlin|  Romance|  2.73|   98000000|  79045324|
|     19| Steve Oedekerk|   Comedy|  2.64|   30000000|  32626287|
|     23| Richard Donner|    Crime|  3.16|   50000000|  32626287|
|     43|Michael Hoffman|    Drama|  3.47|   19000000|   5827821|
|     50|   Bryan Singer| Thriller|  4.29|    6000000|   5827821|
+-------+---------------+---------+------+-----------+----------+
only showing top 10 rows

RANDOM FOR

In [36]:
print(gbt_model.toDebugString)

GBTRegressionModel (uid=GBTRegressor_7f39628e7341) with 40 trees
  Tree 0 (weight 1.0):
    If (feature 1225 in {0.0})
     If (feature 3996 in {0.0})
      If (feature 1 <= 1.825E8)
       If (feature 1 <= 5.95E7)
        Predict: 1.3283135428387718E7
       Else (feature 1 > 5.95E7)
        Predict: 9.814174126637554E7
      Else (feature 1 > 1.825E8)
       If (feature 684 in {0.0})
        Predict: 7.30041992825986E8
       Else (feature 684 not in {0.0})
        Predict: 3.0E10
     Else (feature 3996 not in {0.0})
      Predict: 2.0E10
    Else (feature 1225 not in {0.0})
     Predict: 3.5E10
  Tree 1 (weight 0.1):
    If (feature 2837 in {0.0})
     If (feature 109 in {0.0})
      If (feature 2084 in {0.0})
       If (feature 375 in {0.0})
        Predict: -1.0240254120664967E7
       Else (feature 375 not in {0.0})
        Predict: 5.2881429534874525E9
      Else (feature 2084 not in {0.0})
       Predict: 8.539916014348027E9
     Else (feature 109 not in {0.0})
      If (featu

In [97]:
import pandas as pd
def ExtractFeatureImp(featureImp, dataset, featuresCol):
    list_extract = []
    for i in dataset.schema[featuresCol].metadata["ml_attr"]["attrs"]:
        list_extract = list_extract + dataset.schema[featuresCol].metadata["ml_attr"]["attrs"][i]
    varlist = pd.DataFrame(list_extract)
    varlist['score'] = varlist['idx'].apply(lambda x: featureImp[x])
    return(varlist.sort_values('score', ascending = False))

ExtractFeatureImp(gbt_model.featureImportances, train_df, "features").head(50)
#print(rf_model.featureImportances)

Unnamed: 0,idx,name,score
0,0,continuous_features_rating,0.249237
374,374,director_indexed_encoded_Chan-wook Park,0.050678
4424,4424,genres_indexed_encoded_Mystery,0.033493
108,108,director_indexed_encoded_Bong Joon Ho,0.032679
4417,4417,genres_indexed_encoded_Action,0.022335
2083,2083,director_indexed_encoded_Ajay Pannalal,0.021404
3976,3976,director_indexed_encoded_Jung Huh,0.020639
2955,2955,director_indexed_encoded_Lajos Koltai,0.020469
17,17,director_indexed_encoded_Hayao Miyazaki,0.020276
835,835,director_indexed_encoded_Rohit Shetty,0.019701


In [None]:
### PMML EXAMPLE
#formula            = RFormula(formula = "budgetvalue ~ .")
#gbt                = GBTRegressor(featuresCol='features', labelCol='budgetvalue',seed=9)
#join_pipeline_gbt  = [formula, gbt]
#gbt_pipeline       = Pipeline(stages = join_pipeline_gbt)
#gbt_final          = gbt_pipeline.fit(train_df_2) 
#
#
#### EXPORTING THE MODEL
#PMMLBuilder(sc, data_df, gbt_final) \
#    .putOption(gbt, "compact", True) \
#    .putOption(gbt, "keep_predictionCol", True) \
#    .buildFile("gbt_budget.pmml")