In [78]:
from pyspark import SparkContext, SQLContext
from pyspark.sql.functions import col, udf, split, row_number
from pyspark.sql.window import Window
from pyspark.sql.types import DateType, StringType
from pyspark.ml.regression import LinearRegression
from pyspark.ml.linalg import Vectors, DenseVector

In [2]:
from datetime import datetime

In [4]:
sc = SparkContext()

In [5]:
sqlc = SQLContext(sc)

In [58]:
reading = sqlc.read.csv('Admissions 2015-16.csv', header=True)

In [59]:
reading.show(5)

+------------+--------+---------------+-----------+-------------+--------------+--------+------------+
|Removal Date|Hospital|      Specialty|  Procedure|       Doctor|Patient Number|Priority|Waiting Days|
+------------+--------+---------------+-----------+-------------+--------------+--------+------------+
|  03/08/2015|     Mel|General Surgery|Lap Banding|Joseph Miller|     111365825|       C|        3396|
|  06/07/2015|     Mel|General Surgery|Lap Banding|Joseph Miller|     109970143|       C|        3356|
|  03/08/2015|     Mel|General Surgery|Lap Banding|Joseph Miller|     106770523|       C|        3244|
|  19/08/2015|     Mel|General Surgery|Lap Banding|Joseph Miller|     111176864|       C|        3229|
|  09/08/2015|     Mel|General Surgery|Lap Banding|Joseph Miller|     107085813|       C|        3190|
+------------+--------+---------------+-----------+-------------+--------------+--------+------------+
only showing top 5 rows



In [69]:
change_to_month_func = udf(lambda record: datetime.strftime(datetime.strptime(record, '%d/%m/%Y'), '%m-%Y'), StringType())

In [70]:
reading_mod = reading.withColumn('Date', change_to_month_func(col('Removal Date'))).drop('Removal Date').withColumnRenamed('count(Removal Date)', 'patients_removed')

In [71]:
grouped = reading_mod.groupby('Date').agg({'Date': 'count'})

In [72]:
grouped.orderBy('Date').show(100)

+-------+-----------+
|   Date|count(Date)|
+-------+-----------+
|01-2016|       1094|
|02-2016|       1443|
|03-2016|       1364|
|04-2016|       1808|
|05-2016|       2200|
|06-2015|         86|
|06-2016|       2006|
|07-2015|       1217|
|08-2015|       1230|
|09-2015|       1305|
|10-2015|       1191|
|11-2015|       1384|
|12-2015|       1143|
+-------+-----------+



In [92]:
change_to_date_func = udf(lambda record: datetime.strptime(record, '%m-%Y'), DateType())

In [93]:
grouped_with_date = grouped.withColumn('Date', change_to_date_func(col('Date')))

In [94]:
window_row = Window().orderBy('Date')

In [95]:
grouped_new = grouped_with_date.withColumn('id', row_number().over(window_row))

In [97]:
grouped_new.show(100)

+----------+-----------+---+
|      Date|count(Date)| id|
+----------+-----------+---+
|2015-06-01|         86|  1|
|2015-07-01|       1217|  2|
|2015-08-01|       1230|  3|
|2015-09-01|       1305|  4|
|2015-10-01|       1191|  5|
|2015-11-01|       1384|  6|
|2015-12-01|       1143|  7|
|2016-01-01|       1094|  8|
|2016-02-01|       1443|  9|
|2016-03-01|       1364| 10|
|2016-04-01|       1808| 11|
|2016-05-01|       2200| 12|
|2016-06-01|       2006| 13|
+----------+-----------+---+

