In [1]:
from pyspark import SparkContext
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import col, udf, split, row_number, last, max
from pyspark.sql.window import Window
from pyspark.sql.types import DateType, StringType, IntegerType
from pyspark.ml.regression import LinearRegression, DecisionTreeRegressor, IsotonicRegression, RandomForestRegressor
from pyspark.ml.linalg import Vectors, DenseVector, VectorUDT
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

In [2]:
from datetime import datetime, timedelta, date

In [3]:
sc = SparkContext()
sqlc = SparkSession(sc)

In [4]:
change_to_month_func = udf(lambda record: int(datetime.strftime(datetime.strptime(record, '%d/%m/%Y'), '%Y%m')), IntegerType())
change_to_date_func = udf(lambda record: datetime.strptime(str(record), '%Y%m'), DateType())
to_vector = udf(lambda record: Vectors.dense(record), VectorUDT())
to_vectors = udf(lambda col_a, col_b: Vectors.sparse(col_a, col_b))

In [5]:
reading = sqlc.read.csv('Admissions 2015-16.csv', header=True)
reading_mod = reading.withColumn('Date', change_to_month_func(col('Removal Date'))).drop('Removal Date'
                                        ).withColumnRenamed('count(Removal Date)', 'patients_removed')

In [None]:
reading_mod.show(5)

In [6]:
grouped = reading_mod.groupby('Date').agg({'Date': 'count'})
grouped_with_date = grouped.withColumn('Date', change_to_date_func(col('Date')))
window_row = Window().orderBy('Date')
grouped_new = grouped_with_date.withColumn('id', row_number().over(window_row))

In [8]:
grouped_new_1 = grouped_new.withColumn('id', to_vector(col('id')))
testing_df = grouped_new_1.where(col('Date') > datetime(2016,4,2))
training_df = grouped_new_1.filter(col('Date') < datetime(2016,6,2))

In [8]:
lr = LinearRegression(maxIter=100, regParam=0.01, elasticNetParam = 1.0, labelCol="count(Date)", featuresCol="id")
model_lr = lr.fit(training_df.select('count(Date)', 'id'))
model_lr.coefficients
transformed_lr = model_lr.transform(testing_df)

In [8]:
dt = DecisionTreeRegressor(maxDepth=3, labelCol="count(Date)", featuresCol="id")
model_dt = dt.fit(training_df.select('count(Date)', 'id'))
transformed_dt = model_dt.transform(testing_df)

In [9]:
dt = DecisionTreeRegressor()
params = ParamGridBuilder().baseOn({dt.labelCol: "count(Date)"}).baseOn({dt.featuresCol: "id"}).addGrid(
                                    dt.maxDepth, [3, 5, 7]).build()
regev_dt = RegressionEvaluator(labelCol='count(Date)')
# regev_dtk.evaluate(transformed_dtk.select("count(Date)", "prediction"), {regev_dtk.metricName: "mae"})
cv = CrossValidator(estimator=dt, estimatorParamMaps=params, evaluator=regev_dt)
cv_model = cv.fit(training_df.select('count(Date)', 'id'))
transformed_dt_cv = cv_model.transform(testing_df)
regev_dt.evaluate(transformed_dt_cv, {regev_dt.metricName: "mae"})

97.0

In [23]:
import os 
transformed_dt_cv.select('Date', 'prediction').write.format('csv').save(os.path.join(os.curdir, 'predictions'), mode='overwrite', header=True)

In [12]:
transformed_dt_cv.select('Date', to_vector(col('id')), 'prediction').show(10)

+----------+------------+----------+
|      Date|<lambda>(id)|prediction|
+----------+------------+----------+
|2016-05-01|      [12.0]|    2103.0|
|2016-06-01|      [13.0]|    2103.0|
+----------+------------+----------+



In [None]:
iso = IsotonicRegression(labelCol="count(Date)", featuresCol="id")
model_iso = iso.fit(training_df.select("count(Date)", "id"))
transformed_iso = model_iso.transform(testing_df)

In [None]:
rf = RandomForestRegressor(numTrees=3, maxDepth=5,labelCol="count(Date)", featuresCol="id")
model_rf = rf.fit(training_df.select("count(Date)", "id"))
transformed_rd = model_rf.transform(testing_df)
transformed_rd.show(10)

In [6]:
grouped_doctor = reading_mod.groupby('Date', 'Doctor').agg({'Doctor': 'count'})
grouped_doctor_with_date = grouped_doctor.withColumn('Date', change_to_date_func(col('Date')))
window_row_doctor = Window().partitionBy('Doctor').orderBy('Date')
grouped_doctor_with_date_new = grouped_doctor_with_date.withColumn('id', row_number().over(window_row_doctor))

In [7]:
strindexer = StringIndexer(inputCol="Doctor", outputCol="Doctor_idx")
model_strindexer = strindexer.fit(grouped_doctor_with_date_new)
grouped_doctor_with_date_new_indexed = model_strindexer.transform(grouped_doctor_with_date_new)
assembler = VectorAssembler(inputCols=["id", "Doctor_idx"], outputCol="features")
grouped_doctor_with_date_new_1 = assembler.transform(grouped_doctor_with_date_new_indexed)

In [8]:
training_df = grouped_doctor_with_date_new_1
testing_df = grouped_doctor_with_date_new_1.where(col('Date') > datetime(2016,4,2))

In [None]:
lrd = LinearRegression(maxIter=1000, regParam=0.01, elasticNetParam=0.0, labelCol="count(Doctor)", featuresCol="features")
model_lrd = lrd.fit(training_df.select("features", "count(Doctor)"))
transformed_lrd = model_lrd.transform(testing_df)

In [None]:
regev = RegressionEvaluator(labelCol="count(Doctor)")
regev.evaluate(transformed_lrd.select("count(Doctor)", "prediction"), {regev.metricName: "mae"})

In [None]:
isod = IsotonicRegression(labelCol="count(Doctor)", featuresCol="features")
model_isod = isod.fit(training_df.select("features", "count(Doctor)"))
transformed_isod = model_isod.transform(testing_df)
transformed_isod.show(100)

In [None]:
grouped_doctor_with_date_new_1.show(10)

In [None]:
tr = grouped_doctor_with_date_new_1.where(col('Doctor_idx') >= 0.0).where(col(
        'Doctor_idx') < 2.0)
te = grouped_doctor_with_date_new_1.where(col('Date') > datetime(2016,4,2)).where(col('Doctor_idx') >= 0.0
        ).where(col('Doctor_idx') < 2.0)
dtk = DecisionTreeRegressor(maxDepth=32, labelCol="count(Doctor)", featuresCol="features")
model_dtk = dtk.fit(tr)
transformed_dtk = model_dtk.transform(te)

In [None]:
#max_val_doctor = grouped_doctor_with_date_new_1.select('Doctor_idx').rdd.max()[0]

training_doctor_df = grouped_doctor_with_date_new_1.where(col('Date') < datetime(2016,4,2))
testing_doctor_df = grouped_doctor_with_date_new_1.where(col('Date') > datetime(2016,4,2))


dtk = DecisionTreeRegressor(maxDepth=3, labelCol="count(Doctor)", featuresCol="features", maxBins=178)
model_dtk = dtk.fit(training_doctor_df)
transformed_dtk = model_dtk.transform(testing_doctor_df)
regev_dtk = RegressionEvaluator(labelCol="count(Doctor)")
regev_dtk.evaluate(transformed_dtk.select("count(Doctor)", "prediction"), {regev_dtk.metricName: "mae"})

In [None]:
transformed_dtk.show(100)

In [None]:
rfd = RandomForestRegressor(numTrees=10, maxDepth=3,labelCol="count(Doctor)", featuresCol="features", maxBins=178)
model_rfd = rfd.fit(training_doctor_df.select("count(Doctor)", "features"))
transformed_rfd = model_rfd.transform(testing_doctor_df)
regev_rfd = RegressionEvaluator(labelCol="count(Doctor)")
regev_rfd.evaluate(transformed_rfd.select("count(Doctor)", "prediction"), {regev_rfd.metricName: "r2"})
#transformed_rfd.show(10)

In [None]:
grouped_specialty = reading_mod.groupby('Date', 'Specialty').agg({'Specialty': 'count'})
grouped_specialty_with_date = grouped_specialty.withColumn('Date', change_to_date_func(col('Date')))
window_row_specialty = Window().partitionBy('Specialty').orderBy('Date')
grouped_specialty_with_date_new = grouped_specialty_with_date.withColumn('id', 
                                                                         row_number().over(window_row_specialty))
strindexer = StringIndexer(inputCol="Specialty", outputCol="Specialty_idx")
model_strindexer = strindexer.fit(grouped_specialty_with_date_new)
grouped_specialty_with_date_new_indexed = model_strindexer.transform(grouped_specialty_with_date_new)
assembler = VectorAssembler(inputCols=["id", "Specialty_idx"], outputCol="features")
grouped_specialty_with_date_new_1 = assembler.transform(grouped_specialty_with_date_new_indexed)
training_specialty_df = grouped_specialty_with_date_new_1
testing_specialty_df = grouped_specialty_with_date_new_1.where(col('Date') > datetime(2016,4,2))
lrs = LinearRegression(maxIter=1000, regParam=0.01, elasticNetParam=0.0, labelCol="count(Specialty)", 
                       featuresCol="features")
model_lrs = lrs.fit(training_specialty_df.select("features", "count(Specialty)"))
transformed_lrs = model_lrs.transform(testing_specialty_df)
regev_s = RegressionEvaluator(labelCol="count(Specialty)")
regev_s.evaluate(transformed_lrs.select("count(Specialty)", "prediction"), {regev_s.metricName: "mae"})

In [None]:
transformed_lrs.show(100)

In [None]:
isos = IsotonicRegression(labelCol="count(Specialty)", featuresCol="features")
model_isos = isos.fit(training_specialty_df.select("features", "count(Specialty)"))
transformed_isos = model_isos.transform(testing_specialty_df)
regev_s = RegressionEvaluator(labelCol="count(Specialty)")
regev_s.evaluate(transformed_isos.select("count(Specialty)", "prediction"), {regev_s.metricName: "mae"})

In [None]:
transformed_isos.show(100)

In [None]:
dts = DecisionTreeRegressor(maxDepth=3, labelCol="count(Specialty)", featuresCol="features")
model_dts = dts.fit(training_specialty_df.select('count(Specialty)', 'features'))
transformed_dts = model_dts.transform(testing_specialty_df)
regev_dts = RegressionEvaluator(labelCol="count(Specialty)")
regev_dts.evaluate(transformed_dts.select("count(Specialty)", "prediction"), {regev_dts.metricName: "mae"})

In [None]:
transformed_dts.show(100)

In [None]:
grouped_hospital = reading_mod.groupby('Date', 'Hospital').agg({'Hospital': 'count'})
grouped_hospital_with_date = grouped_hospital.withColumn('Date', change_to_date_func(col('Date')))
window_row_hospital = Window().partitionBy('Hospital').orderBy('Date')
grouped_hospital_with_date_new = grouped_hospital_with_date.withColumn('id', 
                                                                         row_number().over(window_row_hospital))
strindexer = StringIndexer(inputCol="Hospital", outputCol="Hospital_idx")
model_strindexer = strindexer.fit(grouped_hospital_with_date_new)
grouped_hospital_with_date_new_indexed = model_strindexer.transform(grouped_hospital_with_date_new)
assembler = VectorAssembler(inputCols=["id", "Hospital_idx"], outputCol="features")
grouped_hospital_with_date_new_1 = assembler.transform(grouped_hospital_with_date_new_indexed)
training_hospital_df = grouped_hospital_with_date_new_1
testing_hospital_df = grouped_hospital_with_date_new_1.where(col('Date') > datetime(2016,4,2))
lrh = LinearRegression(maxIter=1000, regParam=0.0, elasticNetParam=1.0, labelCol="count(Hospital)", 
                       featuresCol="features")
model_lrh = lrh.fit(training_hospital_df.select("features", "count(Hospital)"))
transformed_lrh = model_lrh.transform(testing_hospital_df)
regev_h = RegressionEvaluator(labelCol="count(Hospital)")
regev_h.evaluate(transformed_lrh.select("count(Hospital)", "prediction"), {regev_s.metricName: "mae"})

In [None]:
transformed_lrh.show(100)

In [None]:
isoh = IsotonicRegression(labelCol="count(Hospital)", featuresCol="features")
model_isoh = isoh.fit(training_hospital_df.select("features", "count(Hospital)"))
transformed_isoh = model_isoh.transform(testing_hospital_df)
regev_h = RegressionEvaluator(labelCol="count(Hospital)")
regev_h.evaluate(transformed_isoh.select("count(Hospital)", "prediction"), {regev_s.metricName: "mae"})

In [None]:
transformed_isoh.show(100)

In [None]:
dth = DecisionTreeRegressor(maxDepth=3, labelCol="count(Hospital)", featuresCol="features")
model_dth = dth.fit(training_hospital_df.select('count(Hospital)', 'features'))
transformed_dth = model_dth.transform(testing_hospital_df)
regev_dth = RegressionEvaluator(labelCol="count(Hospital)")
regev_dth.evaluate(transformed_dth.select("count(Hospital)", "prediction"), {regev_dth.metricName: "mae"})

In [None]:
transformed_dth.show(100)

In [None]:
grouped_proc = reading_mod.groupby('Date', 'Procedure').agg({'Procedure': 'count'})
grouped_proc_with_date = grouped_proc.withColumn('Date', change_to_date_func(col('Date')))
window_row_proc = Window().partitionBy('Procedure').orderBy('Date')
grouped_proc_with_date_new = grouped_proc_with_date.withColumn('id', row_number().over(window_row_proc))
strindexer = StringIndexer(inputCol="Procedure", outputCol="Procedure_idx")
model_strindexer = strindexer.fit(grouped_proc_with_date_new)
grouped_proc_with_date_new_indexed = model_strindexer.transform(grouped_proc_with_date_new)
assembler = VectorAssembler(inputCols=["id", "Procedure_idx"], outputCol="features")
grouped_proc_with_date_new_1 = assembler.transform(grouped_proc_with_date_new_indexed)
training_proc_df = grouped_proc_with_date_new_1
testing_proc_df = grouped_proc_with_date_new_1.where(col('Date') > datetime(2016,4,2))
dtproc = DecisionTreeRegressor(maxDepth=5, labelCol="count(Procedure)", featuresCol="features", maxBins=128)
model_dtproc = dtproc.fit(training_proc_df.select('count(Procedure)', 'features'))
transformed_dtproc = model_dtproc.transform(testing_proc_df)
regev_dtproc = RegressionEvaluator(labelCol="count(Procedure)")
regev_dtproc.evaluate(transformed_dtproc.select("count(Procedure)", "prediction"), {regev_dtproc.metricName: "mae"})

In [None]:
transformed_dtproc.show(10)

In [22]:
grouped_pr = reading_mod.groupby('Date', 'Priority').agg({'Priority': 'count'})
grouped_pr_with_date = grouped_pr.withColumn('Date', change_to_date_func(col('Date')))
window_row_pr = Window().partitionBy('Priority').orderBy('Date')
grouped_pr_with_date_new = grouped_pr_with_date.withColumn('id', row_number().over(window_row_pr))
strindexer = StringIndexer(inputCol="Priority", outputCol="Priority_idx")
model_strindexer = strindexer.fit(grouped_pr_with_date_new)
grouped_pr_with_date_new_indexed = model_strindexer.transform(grouped_pr_with_date_new)
assembler = VectorAssembler(inputCols=["id", "Priority_idx"], outputCol="features")
grouped_pr_with_date_new_1 = assembler.transform(grouped_pr_with_date_new_indexed)
training_pr_df = grouped_pr_with_date_new_1
testing_pr_df = grouped_pr_with_date_new_1.where(col('Date') > datetime(2016,4,2))
dtpr = DecisionTreeRegressor(maxDepth=3, labelCol="count(Priority)", featuresCol="features")
model_dtpr = dtpr.fit(training_pr_df.select('count(Priority)', 'features'))
transformed_dtpr = model_dtpr.transform(testing_pr_df)
regev_dtpr = RegressionEvaluator(labelCol="count(Priority)")
regev_dtpr.evaluate(transformed_dtpr.select("count(Priority)", "prediction"), {regev_dtpr.metricName: "r2"})

NameError: name 'reading_mod' is not defined

In [None]:
transformed_dtpr.show(100)

In [None]:
training_df.show(10)

In [11]:
last_date = training_df.orderBy('Date').select(last('Date')).collect()[0][0]
#prediction_dates = [date(last_date.year, last_date.month + i, 1) for i in xrange(1, 3 + 1)]

last_id = training_df.orderBy('id').select(max('id')).collect()[0][0]
prediction_ids = list(Row(float(last_id + i), date(last_date.year, last_date.month + i, 1)) for i in range(1, 3 + 1))

#prediction_rdd = sc.parallelize(prediction_ids)
prediction_df = sqlc.createDataFrame(prediction_ids, ['id', 'Date'])

In [None]:
prediction_ids

In [None]:
training_df.orderBy('id').select(max('id')).collect()[0][0][0]

In [None]:
prediction_df = sqlc.createDataFrame([[14.0], [15.0], [16.0]], ['id'])

In [12]:
prediction_df.select(to_vector(col('id'))).show(1)

+------------+
|<lambda>(id)|
+------------+
|      [14.0]|
+------------+
only showing top 1 row



In [1]:
from admissions import predict_admissions

In [2]:
df = predict_admissions('Admissions 2015-16.csv', predict_period=4)

In [3]:
df.show()

+----------+------+----------+
|      Date|    id|prediction|
+----------+------+----------+
|2016-07-01|[14.0]|    2103.0|
|2016-08-01|[15.0]|    2103.0|
|2016-09-01|[16.0]|    2103.0|
+----------+------+----------+



In [5]:
x = [Row(max(id)=13)]

SyntaxError: keyword can't be an expression (<ipython-input-5-3831f2340711>, line 1)

In [4]:
all_df = df.collect()

In [7]:
len(all_df)

3

In [21]:
for i in xrange(0, len(all_df)):
    print "Prediction for %s-%s is %s" %(all_df[i]['Date'].month, all_df[i]['Date'].year, str(int(all_df[i]['prediction'])))

Prediction for 7-2016 is 2103
Prediction for 8-2016 is 2103
Prediction for 9-2016 is 2103


In [19]:
val = 'm'
print 'hi %s %ss' %(val, val)

hi m ms


In [9]:
sp = training_df.select('Doctor', 'Doctor_idx')

In [13]:
new = prediction_df.join(sp)

In [15]:
new.show(100)

+----+----------+-----------------+----------+
|  id|      Date|           Doctor|Doctor_idx|
+----+----------+-----------------+----------+
|14.0|2016-07-01|   Nathan Russell|      67.0|
|14.0|2016-07-01|   Carol Buckland|      82.0|
|14.0|2016-07-01| Connor Churchill|      64.0|
|14.0|2016-07-01|        Owen Gill|      45.0|
|14.0|2016-07-01|    Neil Rampling|      79.0|
|14.0|2016-07-01|   Jason Paterson|      97.0|
|14.0|2016-07-01|    Wendy Ellison|       8.0|
|14.0|2016-07-01|     Alan Coleman|      76.0|
|14.0|2016-07-01|   Brandon Fraser|      85.0|
|14.0|2016-07-01|    Sophie Miller|      20.0|
|14.0|2016-07-01|    Oliver Greene|     100.0|
|14.0|2016-07-01|      Owen Slater|      62.0|
|14.0|2016-07-01|     Julia Fisher|      88.0|
|14.0|2016-07-01|   Harry Springer|      44.0|
|14.0|2016-07-01|     Connor Brown|      41.0|
|14.0|2016-07-01|     Charles Bell|      90.0|
|14.0|2016-07-01|Alexander Coleman|      63.0|
|14.0|2016-07-01|   Blake Campbell|     111.0|
|14.0|2016-07