In [1]:
# Create (or reuse) the Spark session used by the rest of the notebook
import time

from pyspark.sql import SparkSession

# Configure the builder step by step: master "local", application name "flight".
builder = SparkSession.builder.master("local").appName("flight")
# NOTE(review): this option looks like a placeholder key/value -- confirm it is needed.
builder = builder.config("spark.some.config.option", "some-value")
spark = builder.getOrCreate()

In [2]:
# Load the combined CSV from S3 and report how long the read took.
csv_path = 's3://516ml/combined_csv.csv'
# First row is a header; column types are inferred from the data.
read_options = {'header': 'true', 'inferSchema': 'true'}

start_time = time.time()
raw_data = spark.read.csv(csv_path, **read_options)
elapsed = time.time() - start_time
print("--- %s seconds ---" % elapsed)

--- 32.1862530708 seconds ---


In [3]:
import numpy as np
from pyspark.sql.functions import when
from pyspark.sql import functions as F

# Turn ArrDelay into a binary label: 1.0 when the arrival delay is positive,
# 0.0 otherwise.
# NOTE(review): rows whose ArrDelay is null also take the `otherwise` branch
# and are labelled 0.0 -- confirm that this is the intended treatment.
raw_data = raw_data.withColumn("ArrDelay", when(raw_data.ArrDelay > 0, 1.0).otherwise(0.0))

# Columns handled by the later imputation / feature-assembly steps
# (the label "ArrDelay" is included here as well).
cols = ["DayOfWeek", "DepTime", "AirTime", "ArrDelay", "DepDelay", "Distance",
        "CarrierDelay", "WeatherDelay", "SecurityDelay", "Cancelled"]

# Cast every column to float in ONE projection. Calling withColumn once per
# column in a loop adds a separate projection to the plan for each column,
# which bloats the query plan on wide frames; a single select produces the
# same columns, in the same order, with the same names.
# NOTE(review): this casts all columns, not only `cols`; values that are not
# numeric presumably become null after the cast -- verify against the schema.
raw_data = raw_data.select(
    [F.col(name).cast("float").alias(name) for name in raw_data.columns]
)

In [4]:
from pyspark.ml.feature import Imputer

# Fill missing values in the selected columns, writing the result back into
# the same column names (input and output column lists are identical).
# No strategy is passed, so the Imputer's default is used.
# NOTE(review): the imputer is fitted on the full dataset, before the
# train/test split performed later in the notebook -- confirm this is acceptable.
imputer = Imputer(
    inputCols=cols,
    outputCols=cols,
)
model = imputer.fit(raw_data)
raw_data = model.transform(raw_data)

In [5]:
from pyspark.ml.feature import VectorAssembler

# The label must not be part of the feature vector, so drop it from the
# column list (in-place removal: `cols` itself no longer contains "ArrDelay").
cols.remove("ArrDelay")

# Pack the remaining columns into a single vector column named "features".
assembler = VectorAssembler().setInputCols(cols).setOutputCol("features")
raw_data = assembler.transform(raw_data)

In [6]:
from pyspark.ml.feature import StandardScaler

# Scale the assembled "features" vector into a new "Scaled_features" column.
# Only the column names are set; all other scaler parameters keep their defaults.
# NOTE(review): the scaler is fitted on the full dataset, before the
# train/test split performed later in the notebook -- confirm this is acceptable.
standardscaler = StandardScaler(inputCol="features", outputCol="Scaled_features")
scaler_model = standardscaler.fit(raw_data)
raw_data = scaler_model.transform(raw_data)

In [7]:
# 80/20 train/test split; the fixed seed is passed so the split can be repeated.
split_weights = [0.8, 0.2]
train, test = raw_data.randomSplit(split_weights, seed=12345)

In [8]:
# Class balance of the training set, based on the binary ArrDelay label.
label_column = train.select("ArrDelay")

# Total number of training rows (kept as a float for the divisions below).
dataset_size = float(label_column.count())
# Rows labelled as delayed.
numPositives = label_column.where('ArrDelay == 1').count()

# Percentage of delayed rows, and the number of non-delayed rows.
per_ones = (float(numPositives) / dataset_size) * 100
numNegatives = float(dataset_size - numPositives)

In [9]:
# Fraction of training rows that belong to the negative (label 0) class.
BalancingRatio = numNegatives / dataset_size


In [10]:
from pyspark.sql.functions import when

# Per-row weights: positive rows get the share of negatives, and negative rows
# get the complementary share.
positive_weight = BalancingRatio
negative_weight = 1 - BalancingRatio
is_positive = train["ArrDelay"] == 1
train = train.withColumn(
    "classWeights",
    when(is_positive, positive_weight).otherwise(negative_weight),
)


In [11]:
from pyspark.ml.classification import LogisticRegression

# Weighted logistic regression on the scaled features, limited to 10 iterations.
lr = LogisticRegression(
    featuresCol="Scaled_features",
    labelCol="ArrDelay",
    weightCol="classWeights",
    maxIter=10,
)

# Time only the fit itself.
start_time = time.time()
model = lr.fit(train)
elapsed = time.time() - start_time
print("--- %s seconds ---" % elapsed)

# Score both splits with the fitted model.
predict_train = model.transform(train)
predict_test = model.transform(test)

--- 189.773783922 seconds ---


In [12]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# BinaryClassificationEvaluator uses areaUnderROC as its default metric.
# As of now we keep that default and only set the input columns.
evaluator = (
    BinaryClassificationEvaluator()
    .setRawPredictionCol("rawPrediction")
    .setLabelCol("ArrDelay")
)

In [None]:
# Report the evaluator's metric for the train predictions, then the test predictions.
for message, predictions in (
    ("The area under ROC for train set is {}", predict_train),
    ("The area under ROC for test set is {}", predict_test),
):
    print(message.format(evaluator.evaluate(predictions)))

The area under ROC for train set is 0.906081833739


In [None]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Hyper-parameter grid for `lr`: 3 x 2 x 3 x 3 = 54 combinations.
#
# aggregationDepth is deliberately NOT part of the search. It only sets the
# depth of the tree aggregation Spark uses while fitting (an execution-tuning
# knob), so it is not expected to change the fitted model or its AUC.
# Searching over it tripled the grid (162 combinations) and therefore the
# number of models cross-validation has to train, without giving the
# evaluator anything to choose between.
paramGrid = (
    ParamGridBuilder()
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])   # L2 only, mixed, L1 only
    .addGrid(lr.fitIntercept, [False, True])
    .addGrid(lr.maxIter, [10, 100, 1000])
    .addGrid(lr.regParam, [0.01, 0.5, 2.0])
    .build()
)

# https://spark.apache.org/docs/2.1.0/ml-tuning.html

# K-fold cross validation

In [None]:
# Create 5-fold CrossValidator.
# A fixed seed makes the fold assignment reproducible between runs, so that
# cross-validation results can be compared; the value matches the one used
# for the train/test split.
cv = CrossValidator(
    estimator=lr,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=5,
    seed=12345,
)
start_time = time.time()
# Run cross validations
cvModel = cv.fit(train)
# this will likely take a fair amount of time because of the amount of models that we're creating and testing
# predict_train=cvModel.transform(train)
# predict_test=cvModel.transform(test)
print("--- %s seconds ---" % (time.time() - start_time))

# print("The area under ROC for train set after CV  is {}".format(evaluator.evaluate(predict_train)))
# print("The area under ROC for test set after CV  is {}".format(evaluator.evaluate(predict_test)))

In [16]:
# Show the dataset shape as a (rows, columns) tuple.
row_count = raw_data.count()
column_count = len(raw_data.columns)
print((row_count, column_count))

(11534325, 31)
