In [1]:
from pyspark.sql import SparkSession, Window
from pyspark.sql import functions as F
from pyspark.sql.functions import col, when, count, lit, min, max, mean, stddev
from pyspark.sql.functions import monotonically_increasing_id

from pyspark.ml.classification import LogisticRegression, DecisionTreeClassifier, RandomForestClassifier, GBTClassifier, NaiveBayes, MultilayerPerceptronClassifier
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.classification import LinearSVC

import warnings
warnings.filterwarnings("ignore")
import os
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"

from cdc_module import plot_models_results
from training_module import *
from ucimlrepo import fetch_ucirepo 
from imblearn.over_sampling import SMOTE
from cdc_module import clean_data

%matplotlib inline


# **1. Split the datasets**

In [2]:
# fetch dataset 
cdc_diabetes_health_indicators = fetch_ucirepo(id=891) 
  
# data (as pandas dataframes) 
X = cdc_diabetes_health_indicators.data.features
y = cdc_diabetes_health_indicators.data.targets
X, y = clean_data(X, y)

In [3]:
spark = SparkSession.builder \
    .appName("CDC Diabetes Health Indicators") \
    .config("spark.driver.memory", "12g")\
    .master("local[*]") \
    .getOrCreate()
spark.sparkContext.setLogLevel("ERROR")
X_pyspark = spark.createDataFrame(X)
y_pyspark = spark.createDataFrame(y)

24/11/07 14:20:53 WARN Utils: Your hostname, lyudmil-ROG-Zephyrus-M16-GU603ZX-GU603ZX resolves to a loopback address: 127.0.1.1; using 192.168.0.229 instead (on interface wlo1)
24/11/07 14:20:53 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/11/07 14:20:54 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/11/07 14:20:54 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [4]:
X_pyspark = X_pyspark.withColumn('Diet', F.col('Fruits') + F.col('Veggies'))
X_pyspark = X_pyspark.withColumn('cardiovascular', F.col('HighChol')  + F.col('HighBP'))
X_pyspark = X_pyspark.withColumn('unhealthy_behavior', F.col('Smoker') + F.col('HvyAlcoholConsump'))
X_pyspark = X_pyspark.withColumn('healthcare',
    F.when((F.col('AnyHealthcare') == 1) & (F.col('NoDocbcCost') == 0), 3)
     .when((F.col('AnyHealthcare') == 1) & (F.col('NoDocbcCost') == 1), 2)
     .when((F.col('AnyHealthcare') == 0) & (F.col('NoDocbcCost') == 0), 1)
     .when((F.col('AnyHealthcare') == 0) & (F.col('NoDocbcCost') == 1), 0)
     .otherwise(-1))

In [5]:
min_max_features = ["GenHlth", "Age", "Education", "Income", "BMI", "MentHlth", "PhysHlth", "Diet", "cardiovascular", "unhealthy_behavior", "healthcare"]

for feature in min_max_features:
    min_val = X_pyspark.agg(min(col(feature))).collect()[0][0]
    max_val = X_pyspark.agg(max(col(feature))).collect()[0][0]
    X_pyspark = X_pyspark.withColumn(
        feature,
        (col(feature) - min_val) / (max_val - min_val)
    )

                                                                                

In [6]:
# Initialize the SMOTE object
smote = SMOTE(random_state=42)

# Apply SMOTE to upsample the minority class
X_balanced, y_balanced = smote.fit_resample(X_pyspark.toPandas(), y_pyspark.toPandas())

# Check the new class distribution
y_balanced_counts = y_balanced.value_counts()
print(y_balanced_counts)

Diabetes_binary
0                  192811
1                  192811
Name: count, dtype: int64


In [9]:
X_explained_variance_components_features = ['Fruits','HighChol','HighBP','PhysActivity','Smoker','HeartDiseaseorAttack','Stroke','CholCheck','BMI','Diet','cardiovascular','unhealthy_behavior','healthcare']
X_no_compound = ["Fruits", "HighChol", "HighBP", "PhysActivity", "Smoker", "Veggies", "HeartDiseaseorAttack", "Stroke", "CholCheck", "BMI"]
X_elbow_method_components_features = ["HighChol", "Smoker", "HighBP", "Stroke", "CholCheck", "BMI", "Diet", "cardiovascular", "unhealthy_behavior", "healthcare"]

## 1. unbalanced dataset

In [10]:
X_explained = X_pyspark[X_explained_variance_components_features]

X_elbow = X_pyspark[X_elbow_method_components_features]

X_no_compounds = X_pyspark[X_no_compound]

# save X_explained_variance_components
X_explained.write.mode('overwrite').csv('./X_explained_variance_components_ub.csv', header=True)

# save X_elbow_method_components
X_elbow.write.mode('overwrite').csv('./X_elbow_method_components_ub.csv', header=True)

# save X_no_compound_features
X_no_compounds.write.mode('overwrite').csv('./X_no_compound_features_ub.csv', header=True)

y_pyspark.write.mode('overwrite').csv('./y_pyspark_ub.csv', header=True)

## 2. upsampled dataset

In [11]:
# 读取 X_explained_variance_components.csv
X_explained_variance_components = X_balanced[X_explained_variance_components_features]

# 读取 X_elbow_method_components.csv
X_elbow_method_components = X_balanced[X_elbow_method_components_features]

# 读取 X_no_compound_features.csv
X_no_compound_features = X_balanced[X_no_compound]


X_explained = spark.createDataFrame(X_explained_variance_components)
X_elbow = spark.createDataFrame(X_elbow_method_components)
X_no_compounds = spark.createDataFrame(X_no_compound_features)
y_pyspark = spark.createDataFrame(y_balanced)

# save X_explained_variance_components
X_explained.write.mode('overwrite').csv('./X_explained_variance_components_up.csv', header=True)

# save X_elbow_method_components
X_elbow.write.mode('overwrite').csv('./X_elbow_method_components_up.csv', header=True)

# save X_no_compound_features
X_no_compounds.write.mode('overwrite').csv('./X_no_compound_features_up.csv', header=True)

y_pyspark.write.mode('overwrite').csv('./y_pyspark_up.csv', header=True)