In [1]:
from pathlib import Path
from Pipeline.Processor import Processor
from Modelling.H2OSparkAutoML import H2OSparkAutoML
from Pipeline.Components import RemoveColumns, CastToInt, ExtractTextFeatures, CleanUp
from pyspark.ml.feature import StandardScaler, VectorAssembler, QuantileDiscretizer
from petastorm.unischema import Unischema, UnischemaField
from IPython.core.interactiveshell import InteractiveShell

# Make IPython display the value of every top-level expression in a cell,
# not only the last one (the default setting is "last_expr").
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# Build the processing pipeline on top of the "GitHubData" PostgreSQL JDBC source.
# NOTE(review): the second and third arguments look like a database user name and
# password committed in plain text -- confirm against Processor's signature and, if so,
# read them from the environment (e.g. os.environ / getpass) instead of hard-coding them.
# NOTE(review): load_percent=10 presumably loads only a 10% sample of the data -- TODO confirm.
pipeline = Processor("jdbc:postgresql:GitHubData", "program", "DatabaseAccess", load_percent=10)

In [3]:
# Distribution of the raw "stars" column: row count per star value, largest groups first.
(
    pipeline.raw_data
    .groupBy("stars")
    .count()
    .orderBy("count", ascending=False)
    .show()
)

+-----+-------+
|stars|  count|
+-----+-------+
|    0|4028273|
|    1| 240455|
|    2|  63891|
|    3|  30552|
|    4|  18236|
|    5|  12535|
|    6|   8930|
|    7|   6918|
|    8|   5547|
|    9|   4458|
|   10|   3758|
|   11|   3141|
|   12|   2618|
|   13|   2345|
|   14|   2123|
|   15|   1790|
|   16|   1690|
|   17|   1498|
|   18|   1385|
|   19|   1204|
+-----+-------+
only showing top 20 rows



In [4]:
# --- Column pruning ---
# Alternative popularity metrics are not used, so they are removed up front.
unused_columns = ("id", "forkcount", "watchs", "users", "issues", "pullrequests", "commits")
drop = RemoveColumns(*unused_columns)

# --- Boolean flags ---
# Booleans are converted to integers (0/1) to avoid type-casting problems when testing.
boolean_columns = ("fork", "locked", "archived")
cast_to_int = CastToInt(*boolean_columns)

# --- Target scaling ---
# StandardScaler works on vector columns, so "stars" is first assembled into a
# one-element vector ("starsVector"); the scaled result is written to "target".
assembler = VectorAssembler(inputCols=["stars"], outputCol="starsVector")
scaler = StandardScaler(inputCol="starsVector", outputCol="target")

# --- Text features ---
# Features are extracted from "readme"; the text columns are dropped afterwards.
# NOTE(review): "titles" is presumably a column produced by ExtractTextFeatures -- confirm.
extract_text_features = ExtractTextFeatures("readme")
text_columns = ("readme", "titles", "name", "description", "topics")
drop_new = RemoveColumns(*text_columns)

# Clean the vector-valued "target" column.
# NOTE(review): presumably this unwraps the vector into a plain number -- confirm in CleanUp.
cleanup = CleanUp("target")

In [5]:
# Hand every stage to the pipeline in the order it was defined above;
# show=True is passed so the cell also produces a preview of the processed data.
stages = (drop, cast_to_int, assembler, scaler, extract_text_features, drop_new, cleanup)
pipeline.process_data(*stages, show=True)

+----------+----------------+-------------------+-------------------+----+--------+------+-----+--------------------+------+
|   license| primarylanguage|        createddate|     lastpusheddate|fork|archived|locked|split|              target|length|
+----------+----------------+-------------------+-------------------+----+--------+------+-----+--------------------+------+
|       mit|Jupyter Notebook|2016-06-18 00:51:28|2016-08-19 17:33:09|   0|       0|     0|  5.0| 0.05115268489844716|  5667|
|       mit|            Java|2016-07-13 18:31:17|2017-06-12 01:41:16|   0|       0|     0|  6.0|  0.6342932927407448|  3164|
|     other|             CSS|2016-07-13 18:53:48|2020-06-14 07:38:04|   0|       0|     0|  1.0|                 0.0|   538|
|apache-2.0|              Go|2016-07-13 18:31:01|2017-05-24 00:43:35|   0|       0|     0|  4.0|0.030691610939068296|   247|
|apache-2.0|            Java|2016-07-13 18:32:19|2018-05-03 00:27:30|   0|       0|     0|  1.0|                 0.0|   250|


In [None]:
# Train models
# The AutoML wrapper is given the train and test splits exposed by the pipeline.
# NOTE(review): the unit of max_runtime=10 (seconds? minutes?) is not visible here --
# confirm it and consider naming the value as a constant in a configuration cell.
automl = H2OSparkAutoML(pipeline.train_data, pipeline.test_data)
model, results = automl.train_models(max_runtime=10)

In [None]:
# Print the results object returned by train_models.
results.show()

In [None]:
# The trailing semicolon suppresses the display of the call's return value.
# NOTE(review): the leaderboard is not assigned, so anything shown here must be printed
# by get_leaderboard itself -- confirm that discarding the return value is intended.
automl.get_leaderboard();

In [None]:
# Visualise the best model of the AutoML run (going by the method name; its
# implementation is not part of this notebook).
automl.visualise_best_model()