In [None]:
#Loretta Gray 7.7 Survival Regression Commented Hw6

In [1]:
'''
Initialize Spark and import necessary libraries:
- findspark: Helps find and initialize PySpark.
- SparkSession: The entry point for using Spark with DataFrames.
- Vectors: Provides feature vector utilities for machine learning.
- AFTSurvivalRegression: Implements Accelerated Failure Time (AFT) survival regression for time-to-event analysis.
'''

import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors
from pyspark.ml.regression import AFTSurvivalRegression


In [3]:
'''
Download the WHAS500 dataset from the web archive:
- urllib.request: Module for opening and reading URLs.
- urlretrieve: Downloads the file from the specified URL and saves it locally.
- The file is saved as 'whas500.txt' in the current working directory.
'''

import urllib.request

# Download the .txt file
url = "https://web.archive.org/web/20170517071528/http://www.umass.edu/statdata/statdata/data/whas500.txt"
urllib.request.urlretrieve(url, "whas500.txt")

print("File downloaded as whas500.txt")


File downloaded as whas500.txt


In [5]:
'''
Read the first 5 lines of the downloaded file to check its contents:
- open("whas500.txt", "r"): Opens the 'whas500.txt' file in read mode.
- file.readline().strip(): Reads each line and removes any leading/trailing whitespace.
- The loop runs 5 times to print the first 5 lines of the file.
'''

with open("whas500.txt", "r") as file:
    for _ in range(5):
        print(file.readline().strip())



<!DOCTYPE html>
<html>
<head>
<title>Wayback Machine</title>
<script src="//archive.org/includes/athena.js" type="text/javascript"></script>


In [7]:
'''
Initialize a Spark session:
- findspark.init(): Initializes the findspark module to locate Spark.
- SparkSession.builder.appName("Survival Regression"): Creates a Spark session named "Survival Regression" for your Spark application.
- getOrCreate(): Either retrieves an existing Spark session or creates a new one if none exists.
'''

import findspark
findspark.init()

from pyspark.sql import SparkSession

# Create Spark session
spark = SparkSession.builder.appName("Survival Regression").getOrCreate()



Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/02/13 04:00:13 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [9]:
'''
Load the downloaded dataset into a Spark DataFrame:
- spark.read.option("delimiter", " "): Specifies that the delimiter used in the dataset is a space (" ").
- .csv(): Reads the file 'whas500.txt' from the specified path and loads it into a Spark DataFrame.
- inferSchema=True: Automatically infers the data types for each column.
- header=True: Uses the first row as the header, which contains column names.
- df.show(5): Displays the first 5 rows of the loaded DataFrame to verify the content.
'''

df = spark.read.option("delimiter", " ").csv("/Users/ellegreyllc/Desktop/whas500.txt", inferSchema=True, header=True)
df.show(5)



                                                                                

+--------------+--------------------+
|     <!DOCTYPE|               html>|
+--------------+--------------------+
|        <html>|                NULL|
|        <head>|                NULL|
|<title>Wayback|     Machine</title>|
|       <script|src="//archive.or...|
|       <script|type="text/javasc...|
+--------------+--------------------+
only showing top 5 rows



In [11]:
'''
Create or retrieve a Spark session:
- SparkSession.builder: Initiates the process of creating a Spark session.
- .appName("Survival Regression"): Sets the name of the Spark session as "Survival Regression".
- .getOrCreate(): Retrieves the existing session if one exists, or creates a new session if none is active.
'''

spark = SparkSession \
    .builder \
    .appName("Survival Regression") \
    .getOrCreate()


In [13]:
'''
Logged Time To Failure Data
label = unit of time, say months, equipment fails censor = 1 means occured, 
say time (by the label) to failure, uncensored censor = 0 means censored, 
failure not occured, say time (by the label) to maintenance features contains
feature columns, such as machine age and temperature, more example such as

Haeat Attack Study: https://web.archive.org/web/20170517071528/http://www.umass.edu/statdata/statdata/data/whas500.txt
'''

'\nLogged Time To Failure Data\nlabel = unit of time, say months, equipment fails censor = 1 means occured, \nsay time (by the label) to failure, uncensored censor = 0 means censored, \nfailure not occured, say time (by the label) to maintenance features contains\nfeature columns, such as machine age and temperature, more example such as\n\nHaeat Attack Study: https://web.archive.org/web/20170517071528/http://www.umass.edu/statdata/statdata/data/whas500.txt\n'

In [15]:
'''
Create a training DataFrame for the survival regression model:
- spark.createDataFrame(): Creates a DataFrame from the given data.
- The data consists of tuples representing:
  - label: The time to failure (or event).
  - censor: 1.0 if the event (failure) occurred (uncensored), 0.0 if the event did not occur (censored).
  - features: A dense vector of feature values (e.g., machine age, temperature).
- .toDF("label", "censor", "features"): Assigns column names to the DataFrame.
'''

training = spark.createDataFrame((
    (1.218, 1.0, Vectors.dense(1.560, -0.605)), 
    (2.949, 0.0, Vectors.dense(0.346, 2.158)),
    (3.627, 0.0, Vectors.dense(1.380, 0.231)), 
    (0.273, 1.0, Vectors.dense(0.520, 1.151)), 
    (4.199, 0.0, Vectors.dense(0.795, -0.226))
)).toDF("label", "censor", "features")



In [17]:
training.show()

+-----+------+--------------+
|label|censor|      features|
+-----+------+--------------+
|1.218|   1.0| [1.56,-0.605]|
|2.949|   0.0| [0.346,2.158]|
|3.627|   0.0|  [1.38,0.231]|
|0.273|   1.0|  [0.52,1.151]|
|4.199|   0.0|[0.795,-0.226]|
+-----+------+--------------+



In [19]:
#Predict 2 quantile time to failure at 30% chance and at 60% chance

In [21]:
'''
Set the quantile probabilities for prediction:
- quantileProbabilities: A tuple that defines the probabilities at which we want to predict the time-to-event (failure or maintenance).
  - 0.3: 30% probability for the first quantile.
  - 0.9: 90% probability for the second quantile.
'''

quantileProbabilities = (0.3, 0.9)


In [23]:
#train the model with training data above with AFTSurvivalRegression

In [25]:
'''
Train the AFTSurvivalRegression model with specified parameters:
- AFTSurvivalRegression(quantileProbabilities=quantileProbabilities, quantilesCol="quantiles", censorCol="censor", featuresCol="features", labelCol="label"):
  - quantileProbabilities: The probabilities at which we want to predict the time-to-event (failure or maintenance).
  - quantilesCol: The name of the column where the quantiles will be stored in the output.
  - censorCol: The name of the column indicating whether the event was censored (1 for occurred, 0 for censored).
  - featuresCol: The name of the column containing the feature vector (e.g., machine age, temperature).
  - labelCol: The name of the column representing the label (time-to-event or failure).
- .fit(training): Fits the model to the provided training data.
'''

aft = AFTSurvivalRegression(
    quantileProbabilities=quantileProbabilities,
    quantilesCol="quantiles",
    censorCol="censor",
    featuresCol="features",
    labelCol="label"
)

model = aft.fit(training)


25/02/13 04:01:17 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS


In [27]:

# Print the coefficients, intercept and scale parameter for AFT survival regression

In [29]:
'''
Print the model parameters:
- model.coefficients: Displays the coefficients of the model, which represent the weights of each feature in the model.
- model.intercept: Displays the intercept of the model, which is the bias term in the regression.
- model.scale: Displays the scale parameter, which is used in the survival regression model to model the distribution of the event times.
'''

print("Coefficients: {}".format(model.coefficients))
print("Intercept: {}".format(model.intercept))
print("Scale: {}".format(model.scale))



Coefficients: [-0.4963068060199874,0.1984439397592834]
Intercept: 2.6380905631560227
Scale: 1.5472326865488455


In [31]:
'''
transform the data based on model
prediction = time unit to fail when censor = 1 uncensored prediction = time 
unit to other event such as maintenance when censor = 0 (Censored) 1st 
element of quantiles = time unit at 30% chance 2nd element of
quantiles = time unit at 60% chance
'''

'\ntransform the data based on model\nprediction = time unit to fail when censor = 1 uncensored prediction = time \nunit to other event such as maintenance when censor = 0 (Censored) 1st \nelement of quantiles = time unit at 30% chance 2nd element of\nquantiles = time unit at 60% chance\n'

In [33]:
'''
Transform the training data using the trained model:
- .transform(training): Applies the trained model to the training data. It generates predictions and quantiles based on the model's calculations.
- .show(truncate=False): Displays the resulting DataFrame with predictions and quantiles without truncating the values for better readability.
'''

model.transform(training).show(truncate=False)


+-----+------+--------------+------------------+---------------------------------------+
|label|censor|features      |prediction        |quantiles                              |
+-----+------+--------------+------------------+---------------------------------------+
|1.218|1.0   |[1.56,-0.605] |5.7189965530299   |[1.1603295951029091,20.78508418847804] |
|2.949|0.0   |[0.346,2.158] |18.076458028588913|[3.6675401061563893,65.69696247756175] |
|3.627|0.0   |[1.38,0.231]  |7.3818753657635   |[1.497711770733379,26.828640220976947] |
|0.273|1.0   |[0.52,1.151]  |13.577581299077895|[2.754761130759772,49.3462739066917]   |
|4.199|0.0   |[0.795,-0.226]|9.013093216625732 |[1.8286702406091546,32.757127857843415]|
+-----+------+--------------+------------------+---------------------------------------+

