# MAST30034 Project 1
## Statistical Modelling

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.formula.api import ols, glm

In [3]:
from functools import reduce 
from pyspark.sql import DataFrame
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

# Build (or re-attach to) the Spark session that executes every Spark job in
# this notebook. The settings are applied to the builder in the order listed.
spark_settings = {
    "spark.sql.repl.eagerEval.enabled": True,
    "spark.sql.parquet.cacheMetadata": "true",
    "spark.sql.session.timeZone": "Etc/UTC",
    "spark.executor.memory": "2g",
    "spark.driver.memory": "4g",
}

session_builder = SparkSession.builder.appName("MAST30034 Tutorial 1")
for setting_name, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)

spark = session_builder.getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/08/14 18:11:38 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/08/14 18:11:38 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [None]:
# Build the modelling table by attaching the weather record of the pickup date
# to every taxi trip, then persist it for the modelling cells further down.
taxi = spark.read.parquet('../../Project 1/DataFrames/taxi')
weather = spark.read.parquet('../../Project 1/DataFrames/weather')

# Reduce the pickup timestamp to a calendar date so it can be compared with the
# weather table's `date` column, then discard the raw timestamp columns.
taxi = taxi.withColumn("pickup_date", to_date(col("pickup_time")))
taxi = taxi.drop("pickup_time", "dropoff_time")

# Inner join: trips whose pickup date has no matching weather row are dropped.
# NOTE(review): presumably `weather` holds one row per date - confirm, since
# duplicated dates would duplicate trips.
sdf = taxi.join(weather, taxi.pickup_date == weather.date, "inner")
sdf = sdf.drop('date')  # equal to pickup_date on every joined row

# Overwrite any earlier export so that re-running this cell (for example with
# Restart & Run All) does not fail because the output directory already exists.
sdf.write.mode("overwrite").parquet("../../Project 1/DataFrames/stats_modelling")

## Logistic regression

In [166]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import RFormula

# Assemble the predictors into a single `features` vector column and expose
# `tip_amount` as the `label` column expected by Spark ML estimators.
formula = RFormula(
    formula=(
        "tip_amount ~ fare_amount + passenger_count + pickup_location + trip_distance_km"
        " + temp + dew_point + pressure + wind_speed + wind_direction"
    ),
    featuresCol="features",
    labelCol="label",
)
output = formula.fit(sdf).transform(sdf)
model_sdf = output.select("label", "features")

# Binarise the target: 1.0 when a tip was given, 0.0 otherwise. Comparing and
# casting (instead of passing non-positive amounts through unchanged) guarantees
# that only the two class values can occur, even if a negative tip amount is
# present in the data.
model_sdf = model_sdf.withColumn("label", (col("label") > 0).cast("double"))

# Split the data into train and test (80% / 20%, seeded with 1234)
splits = model_sdf.randomSplit([0.8, 0.2], 1234)
train, test = splits

# Fit the model with the estimator's default settings. `test` is held out but
# not scored in this cell.
lrModel = LogisticRegression().fit(train)

# The fitted parameters can be inspected through `lrModel.coefficients` and
# `lrModel.intercept`; the training summary object is shown as the cell output.
lrModel.summary

                                                                                

<pyspark.ml.classification.BinaryLogisticRegressionTrainingSummary at 0x7f7d732b5f40>

In [None]:
# Ordinary least squares fits of `tip_amount` on three predictor sets:
# weather only, trip attributes only, and both combined.
# NOTE(review): `df` is not defined in any visible cell of this notebook -
# presumably a pandas DataFrame holding the joined taxi/weather data; confirm
# where it is created before relying on this cell.
weather_terms = "temp + dew_point + pressure + wind_speed + wind_direction"
taxi_terms = "pickup_location + passenger_count + fare_amount + trip_distance_km"

fit_weather = ols(formula=f"tip_amount ~ {weather_terms}", data=df).fit()

fit_taxi = ols(formula=f"tip_amount ~ {taxi_terms}", data=df).fit()

fit_all = ols(formula=f"tip_amount ~ {weather_terms} + {taxi_terms}", data=df).fit()


## Multilayer Perceptron

In [180]:
# Percentage of trips whose recorded tip is exactly zero.
zero_tip_trips = sdf.filter(sdf.tip_amount == 0).count()
all_trips = sdf.count()
(zero_tip_trips / all_trips) * 100


34.220367188411906

As over 34% of people don't give a tip, for simplicity the perceptron will be trained to classify each instance as either 0 (didn't tip) or 1 (tipped).

In [7]:
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import RFormula
# Reload the joined taxi/weather table from its parquet export.
sdf = spark.read.parquet("../../Project 1/DataFrames/stats_modelling")

# Four predictors are assembled into the `features` vector; the perceptron
# cells below use an input layer of width 4.
# NOTE(review): this assumes each predictor is numeric and so contributes
# exactly one feature - confirm against the table schema.
formula = RFormula(
    formula="tip_amount ~ fare_amount + trip_distance_km + temp + trip_time_min",
    featuresCol="features",
    labelCol="label",
)
output = formula.fit(sdf).transform(sdf)
model_sdf = output.select("label", "features")

# Binary integer target: 1 when a tip was given, 0 otherwise. Comparing and
# casting (instead of passing non-positive amounts through unchanged)
# guarantees that only the two class values can occur, even if a negative tip
# amount is present in the data.
model_sdf = model_sdf.withColumn("label", (col("label") > 0).cast("int"))

                                                                                

In [14]:

# Randomly partition the rows 80% / 20% (seeded with 1) into the training set
# and the held-out test set.
splits = model_sdf.randomSplit([0.8, 0.2], 1)
train, test = splits

                                                                                

In [None]:
# Compare networks of increasing depth. Every candidate has 4 inputs and 2
# output classes, is trained for 10 iterations with the same seed, and is
# scored by accuracy on `test`. One accuracy is printed per candidate, in the
# order listed.
candidate_layers = [
    [4, 11, 9, 2],
    [4, 11, 9, 7, 2],
    [4, 11, 9, 7, 5, 2],
    [4, 11, 9, 7, 5, 3, 2],
]

# The evaluator is configured once and reused for every candidate.
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")

for layers in candidate_layers:
    trainer = MultilayerPerceptronClassifier(maxIter=10, layers=layers, seed=1)
    model = trainer.fit(train)
    result = model.transform(test)
    predictionAndLabels = result.select("prediction", "label")
    print(evaluator.evaluate(predictionAndLabels))

The above shows that a smaller number of hidden layers is optimal for this neural network.

In [None]:
# Grid search over the widths (5, 7, 9, 11) of two hidden layers.
# `outputs` collects the test accuracies in loop order, so entry k belongs to
# i = 5 + 2 * (k // 4) and j = 5 + 2 * (k % 4).
# NOTE(review): the architecture is selected on `test`, the same split that is
# later used to report accuracy; the cross-validation listed in the notes at
# the end of the notebook would avoid that.
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
outputs = []
for i in range(5, 12, 2):
    for j in range(5, 12, 2):
        layers = [4, i, j, 2]
        trainer = MultilayerPerceptronClassifier(maxIter=10, layers=layers, seed=1)
        model = trainer.fit(train)
        result = model.transform(test)
        predictionAndLabels = result.select("prediction", "label")
        # Score once and reuse the value: each evaluate() call launches a
        # separate Spark job over the predictions.
        accuracy = evaluator.evaluate(predictionAndLabels)
        outputs.append(accuracy)
        print(accuracy)
print(outputs)

[0.6601338272229761, 0.6863059061696939, 0.6762855892135433, 0.6739921146359164, 0.6912520742346965, 0.6683100568205782, 0.6584308096387014, 0.6810819615388531, 0.6840938739359775, 0.6918410769004789, 0.6939643951524354, 0.6945868473523239, 0.6719982315376751, 0.6832823591520106, 0.6888291645033544, 0.6851671676825145]

In [None]:
# Grid search over the widths (5, 7, 9, 11) of the second and third hidden
# layers, with the first hidden layer fixed at 13 units.
# `outputs` collects the test accuracies in loop order, so entry k belongs to
# i = 5 + 2 * (k // 4) and j = 5 + 2 * (k % 4).
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
outputs = []
for i in range(5, 12, 2):
    for j in range(5, 12, 2):
        layers = [4, 13, i, j, 2]
        trainer = MultilayerPerceptronClassifier(maxIter=10, layers=layers, seed=1)
        model = trainer.fit(train)
        result = model.transform(test)
        predictionAndLabels = result.select("prediction", "label")
        # Score once and reuse the value: each evaluate() call launches a
        # separate Spark job over the predictions.
        accuracy = evaluator.evaluate(predictionAndLabels)
        outputs.append(accuracy)
        print(accuracy)
print(outputs)

[0.6841971311934357, 0.6864775168229341, 0.6784278137239076, 0.6584308096387014, 0.6846799679466203, 0.6591521561133387, 0.6921828438793897, 0.6577356410603211, 0.680402790563741, 0.6810179711257804, 0.6584308096387014, 0.6854158576969559, 0.6876933346713147, 0.6615634312241221, 0.6798225138633776, 0.6833623471683515]

From the above outputs, it can be seen that the optimal neural network is [4, 9, 11, 2]:

In [34]:
# Retrain the selected architecture (4 inputs, hidden layers of 9 and 11 units,
# 2 output classes) with up to 100 iterations and report its accuracy on `test`.
layers = [4, 9, 11, 2]
trainer = MultilayerPerceptronClassifier(
    layers=layers,
    maxIter=100,
    seed=1,
)
model = trainer.fit(train)
result = model.transform(test)
predictionAndLabels = result.select("prediction", "label")
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
test_accuracy = evaluator.evaluate(predictionAndLabels)
print(test_accuracy)



0.7223296000744616


                                                                                

- 5 fold cross validation
- think of a way to graph it