In [None]:
%sql
SELECT * FROM `oasis_ml_ws`.`default`.`oasis_longitudinal_demographics_oas2_2_csv`;

Subject_ID,MRI_ID,Group,Visit,MR_Delay,Gender,Hand,Age,EDUC,SES,MMSE,CDR,eTIV,nWBV,ASF
OAS2_0001,OAS2_0001_MR1,Nondemented,1,0,M,R,87,14,2.0,27.0,0.0,1987,0.696,0.883
OAS2_0001,OAS2_0001_MR2,Nondemented,2,457,M,R,88,14,2.0,30.0,0.0,2004,0.681,0.876
OAS2_0002,OAS2_0002_MR1,Demented,1,0,M,R,75,12,,23.0,0.5,1678,0.736,1.046
OAS2_0002,OAS2_0002_MR2,Demented,2,560,M,R,76,12,,28.0,0.5,1738,0.713,1.01
OAS2_0002,OAS2_0002_MR3,Demented,3,1895,M,R,80,12,,22.0,0.5,1698,0.701,1.034
OAS2_0004,OAS2_0004_MR1,Nondemented,1,0,F,R,88,18,3.0,28.0,0.0,1215,0.71,1.444
OAS2_0004,OAS2_0004_MR2,Nondemented,2,538,F,R,90,18,3.0,27.0,0.0,1200,0.718,1.462
OAS2_0005,OAS2_0005_MR1,Nondemented,1,0,M,R,80,12,4.0,28.0,0.0,1689,0.712,1.039
OAS2_0005,OAS2_0005_MR2,Nondemented,2,1010,M,R,83,12,4.0,29.0,0.5,1701,0.711,1.032
OAS2_0005,OAS2_0005_MR3,Nondemented,3,1603,M,R,85,12,4.0,30.0,0.0,1699,0.705,1.033


Read the file into a dataframe.

In [None]:
df = spark.read.format("csv").option("header", "true").load("/FileStore/tables/oasis_longitudinal_demographics_OAS2-2.csv")

Remove null values from dataframe.

In [None]:
df_without_nulls = df.na.drop()
df_without_nulls.show()

+----------+-------------+-----------+-----+--------+---+----+---+----+---+----+---+----+-----+-----+
|Subject ID|       MRI ID|      Group|Visit|MR Delay|M/F|Hand|Age|EDUC|SES|MMSE|CDR|eTIV| nWBV|  ASF|
+----------+-------------+-----------+-----+--------+---+----+---+----+---+----+---+----+-----+-----+
| OAS2_0001|OAS2_0001_MR1|Nondemented|    1|       0|  M|   R| 87|  14|  2|  27|  0|1987|0.696|0.883|
| OAS2_0001|OAS2_0001_MR2|Nondemented|    2|     457|  M|   R| 88|  14|  2|  30|  0|2004|0.681|0.876|
| OAS2_0004|OAS2_0004_MR1|Nondemented|    1|       0|  F|   R| 88|  18|  3|  28|  0|1215|0.710|1.444|
| OAS2_0004|OAS2_0004_MR2|Nondemented|    2|     538|  F|   R| 90|  18|  3|  27|  0|1200|0.718|1.462|
| OAS2_0005|OAS2_0005_MR1|Nondemented|    1|       0|  M|   R| 80|  12|  4|  28|  0|1689|0.712|1.039|
| OAS2_0005|OAS2_0005_MR2|Nondemented|    2|    1010|  M|   R| 83|  12|  4|  29|0.5|1701|0.711|1.032|
| OAS2_0005|OAS2_0005_MR3|Nondemented|    3|    1603|  M|   R| 85|  12|  4|  30|  

Split data 80/20 for training data and testing data for ML model. 

In [None]:
split_ratio = [0.8, 0.2]
training_data, test_data = df_without_nulls.randomSplit(split_ratio, seed=42)

training_data.show()
test_data.show()

+----------+-------------+-----------+-----+--------+---+----+---+----+---+----+---+----+-----+-----+
|Subject ID|       MRI ID|      Group|Visit|MR Delay|M/F|Hand|Age|EDUC|SES|MMSE|CDR|eTIV| nWBV|  ASF|
+----------+-------------+-----------+-----+--------+---+----+---+----+---+----+---+----+-----+-----+
| OAS2_0001|OAS2_0001_MR1|Nondemented|    1|       0|  M|   R| 87|  14|  2|  27|  0|1987|0.696|0.883|
| OAS2_0001|OAS2_0001_MR2|Nondemented|    2|     457|  M|   R| 88|  14|  2|  30|  0|2004|0.681|0.876|
| OAS2_0004|OAS2_0004_MR2|Nondemented|    2|     538|  F|   R| 90|  18|  3|  27|  0|1200|0.718|1.462|
| OAS2_0005|OAS2_0005_MR1|Nondemented|    1|       0|  M|   R| 80|  12|  4|  28|  0|1689|0.712|1.039|
| OAS2_0005|OAS2_0005_MR2|Nondemented|    2|    1010|  M|   R| 83|  12|  4|  29|0.5|1701|0.711|1.032|
| OAS2_0008|OAS2_0008_MR1|Nondemented|    1|       0|  F|   R| 93|  14|  2|  30|  0|1272|0.698|1.380|
| OAS2_0009|OAS2_0009_MR1|   Demented|    1|       0|  M|   R| 68|  12|  2|  27|0.

Prep for Machine Learning

In [None]:
from pyspark.ml.feature import StringIndexer, VectorAssembler, Imputer
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml import Pipeline
from pyspark.sql.functions import col, when

# Step 1: Create a DataFrame with the desired columns
selected_columns = ['M/F', 'MMSE', 'eTIV', 'nWBV', 'ASF', 'Group']
df_selected = df.select(selected_columns)

# Step 2: Convert the columns to the correct data type
df_selected = df_selected.withColumn('MMSE', col('MMSE').cast('float'))
df_selected = df_selected.withColumn('eTIV', col('eTIV').cast('float'))
df_selected = df_selected.withColumn('nWBV', col('nWBV').cast('float'))
df_selected = df_selected.withColumn('ASF', col('ASF').cast('float'))

# Step 3: Fill null values in numeric columns 
numeric_columns = ['MMSE', 'eTIV', 'nWBV', 'ASF']
imputer = Imputer(inputCols=numeric_columns, outputCols=numeric_columns)
df_selected = imputer.fit(df_selected).transform(df_selected)

# Step 4: Convert the categorical column 'M/F' to numeric
indexer = StringIndexer(inputCol='M/F', outputCol='gender_index')

# Step 5: Convert the 'Group' column to numeric
df_selected = df_selected.withColumn('Group', when(col('Group') == 'demented', 1).otherwise(0))

# Step 6: Assemble the features into a vector column
assembler = VectorAssembler(inputCols=['gender_index', 'MMSE', 'eTIV', 'nWBV', 'ASF'], outputCol='features')

# Step 7: Split the data into training and test sets
train_data, test_data = df_selected.randomSplit([0.8, 0.2])

# Step 8: Create a RandomForestClassifier instance
rf = RandomForestClassifier(labelCol='Group', featuresCol='features')

# Step 9: Build the pipeline and fit the model on the training data
pipeline = Pipeline(stages=[indexer, assembler, rf])
model = pipeline.fit(train_data)

# Step 10: Make predictions on the test data
predictions = model.transform(test_data)

In [None]:
df_selected.show()
train_data.show()
test_data.show()
predictions.show()

+---+----+------+-----+-----+-----+
|M/F|MMSE|  eTIV| nWBV|  ASF|Group|
+---+----+------+-----+-----+-----+
|  M|27.0|1987.0|0.696|0.883|    0|
|  M|30.0|2004.0|0.681|0.876|    0|
|  M|23.0|1678.0|0.736|1.046|    0|
|  M|28.0|1738.0|0.713| 1.01|    0|
|  M|22.0|1698.0|0.701|1.034|    0|
|  F|28.0|1215.0| 0.71|1.444|    0|
|  F|27.0|1200.0|0.718|1.462|    0|
|  M|28.0|1689.0|0.712|1.039|    0|
|  M|29.0|1701.0|0.711|1.032|    0|
|  M|30.0|1699.0|0.705|1.033|    0|
|  M|28.0|1357.0|0.748|1.293|    0|
|  M|27.0|1365.0|0.727|1.286|    0|
|  M|27.0|1372.0| 0.71|1.279|    0|
|  F|30.0|1272.0|0.698| 1.38|    0|
|  F|29.0|1257.0|0.703|1.396|    0|
|  M|27.0|1457.0|0.806|1.205|    0|
|  M|24.0|1480.0|0.791|1.186|    0|
|  F|30.0|1447.0|0.769|1.213|    0|
|  F|29.0|1482.0|0.752|1.184|    0|
|  F|29.0|1333.0|0.748|1.316|    0|
+---+----+------+-----+-----+-----+
only showing top 20 rows

+---+----+------+-----+-----+-----+
|M/F|MMSE|  eTIV| nWBV|  ASF|Group|
+---+----+------+-----+-----+-----+
| 

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Step 11: Evaluate the model's accuracy on the test data
evaluator = MulticlassClassificationEvaluator(labelCol='Group', predictionCol='prediction', metricName='accuracy')
accuracy = evaluator.evaluate(predictions)

print(f"Accuracy Rate: {accuracy * 100:.2f}%")

Accuracy Rate: 100.00%


Test Trained model on random dataset for accuracy and predictions. 

In [None]:
# Load the test data from the CSV file

test_data_random = spark.read.csv('/FileStore/tables/oasis_longitudinal_demographics_OAS2-2.csv', header=True, inferSchema=True)

In [None]:
# Transform the test data to match the format expected by the trained model

selected_features = ['M/F', 'MMSE', 'eTIV', 'nWBV', 'ASF']
test_data_selected = test_data.select(selected_features)

In [None]:
# Make predictions using the trained model

predictions = model.transform(test_data_selected)

In [None]:
# Display the results

predictions.show()

+---+----+----+-----+-----+------------+--------------------+-------------+-----------+----------+
|M/F|MMSE|eTIV| nWBV|  ASF|gender_index|            features|rawPrediction|probability|prediction|
+---+----+----+-----+-----+------------+--------------------+-------------+-----------+----------+
|  M|  27|1987|0.696|0.883|         1.0|[1.0,27.0,1987.0,...|       [20.0]|      [1.0]|       0.0|
|  M|  30|2004|0.681|0.876|         1.0|[1.0,30.0,2004.0,...|       [20.0]|      [1.0]|       0.0|
|  M|  23|1678|0.736|1.046|         1.0|[1.0,23.0,1678.0,...|       [20.0]|      [1.0]|       0.0|
|  M|  28|1738|0.713| 1.01|         1.0|[1.0,28.0,1738.0,...|       [20.0]|      [1.0]|       0.0|
|  M|  22|1698|0.701|1.034|         1.0|[1.0,22.0,1698.0,...|       [20.0]|      [1.0]|       0.0|
|  F|  28|1215| 0.71|1.444|         0.0|[0.0,28.0,1215.0,...|       [20.0]|      [1.0]|       0.0|
|  F|  27|1200|0.718|1.462|         0.0|[0.0,27.0,1200.0,...|       [20.0]|      [1.0]|       0.0|
|  M|  28|

In [None]:
model_location = "/FileSore/to/saved/Oasis.v1"
model.write().save(model_location)
print(f"Trained model location: {model_location}")

Trained model location: /FileSore/to/saved/Oasis.v1


In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Assuming you already have a trained model called 'model' and a test data frame called 'test_data'

# Make predictions on the test data
predictions = model.transform(df_selected)

# Create an evaluator for accuracy
evaluator = MulticlassClassificationEvaluator(labelCol="Group", predictionCol="prediction", metricName="accuracy")

# Calculate the accuracy
accuracy = evaluator.evaluate(predictions)

# Print the accuracy
print(f"Accuracy: {accuracy}")

Accuracy: 1.0


In [None]:
predictions.show()

+---+----+------+-----+-----+-----+------------+--------------------+-------------+-----------+----------+
|M/F|MMSE|  eTIV| nWBV|  ASF|Group|gender_index|            features|rawPrediction|probability|prediction|
+---+----+------+-----+-----+-----+------------+--------------------+-------------+-----------+----------+
|  M|27.0|1987.0|0.696|0.883|    0|         1.0|[1.0,27.0,1987.0,...|       [20.0]|      [1.0]|       0.0|
|  M|30.0|2004.0|0.681|0.876|    0|         1.0|[1.0,30.0,2004.0,...|       [20.0]|      [1.0]|       0.0|
|  M|23.0|1678.0|0.736|1.046|    0|         1.0|[1.0,23.0,1678.0,...|       [20.0]|      [1.0]|       0.0|
|  M|28.0|1738.0|0.713| 1.01|    0|         1.0|[1.0,28.0,1738.0,...|       [20.0]|      [1.0]|       0.0|
|  M|22.0|1698.0|0.701|1.034|    0|         1.0|[1.0,22.0,1698.0,...|       [20.0]|      [1.0]|       0.0|
|  F|28.0|1215.0| 0.71|1.444|    0|         0.0|[0.0,28.0,1215.0,...|       [20.0]|      [1.0]|       0.0|
|  F|27.0|1200.0|0.718|1.462|    0|  