### To-Do List

- [ ] Cross-validation pipeline
- [ ] Chi-Squared Test for categorical variables
- [ ] PCA
- [ ] Data Bias


In [10]:
from ucimlrepo import fetch_ucirepo 
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.stat import ChiSquareTest
from pyspark.sql.functions import monotonically_increasing_id
import os
import logging


In [11]:
# Initialize the Spark session.
# SPARK_MASTER_URL selects the cluster to attach to. When it is unset (or
# empty), os.environ.get() yields None/"" which is not a valid master URL, so
# fall back to an in-process local master so the notebook still runs from a
# fresh kernel.
spark_master = os.environ.get("SPARK_MASTER_URL") or "local[*]"
spark = (
    SparkSession.builder
    .appName("Explore-Spark-Session")
    .master(spark_master)
    .getOrCreate()
)

# Only surface errors from the py4j logger to keep cell output readable.
logger = logging.getLogger("py4j")
logger.setLevel(logging.ERROR)


In [12]:
# Fetch the dataset (UCI ML repository id 350).
default_of_credit_card_clients = fetch_ucirepo(id=350)
features_pd = default_of_credit_card_clients.data.features
targets_pd = default_of_credit_card_clients.data.targets

# Combine features and target BEFORE handing the data to Spark.
# Tagging two separate Spark DataFrames with monotonically_increasing_id() and
# joining on that id is unsafe: the generated ids depend on how each DataFrame
# is partitioned, so they are not guaranteed to line up row-for-row and the
# inner join can mispair or drop rows. Joining on the shared row index of the
# source frames keeps every feature row with its own label.
# NOTE(review): assumes features/targets are pandas DataFrames with a common,
# unique index (as returned by ucimlrepo) — confirm.
df = spark.createDataFrame(features_pd.join(targets_pd))

# Keep X and y available as Spark DataFrames for any downstream cells; they are
# projections of the combined frame (without the former helper "id" column).
X = df.select(*list(features_pd.columns))
y = df.select(*list(targets_pd.columns))

# Split into training and testing data (70% / 30%); the fixed seed keeps the
# split reproducible for a given input DataFrame.
train, test = df.randomSplit([0.7, 0.3], seed=42)


In [5]:
# # Chi Squared Test for categorical features VS Target
# vecAssembler = VectorAssembler(inputCols=["X2","X3","X4","X5","X6","X7","X8","X9","X10","X11"] \
#                                ,outputCol="features")
# chiSqResult = ChiSquareTest.test(vecAssembler.transform(df), "features", "Y").head()
# print("p-values",chiSqResult.pValues)
# print("degrees of freedom",chiSqResult.degreesOfFreedom)
# print("statistic",chiSqResult.statistics)
# print(chiSqResult)


In [14]:
# Stop the Spark session created earlier in this notebook.
spark.stop()

# Credit Data Features

- **X1: Amount of the given credit (NT dollar):**  
  It includes both the individual consumer credit and his/her family (supplementary) credit.
  
- **X2: Gender:**  
  - 1 = male  
  - 2 = female

- **X3: Education:**  
  - 1 = graduate school  
  - 2 = university  
  - 3 = high school  
  - 4 = others

- **X4: Marital status:**  
  - 1 = married  
  - 2 = single  
  - 3 = others

- **X5: Age (year)**

- **X6 - X11: History of past payment:**  
  These represent the past monthly payment records from April to September 2005.  
  - **X6:** Repayment status in September 2005  
  - **X7:** Repayment status in August 2005  
  - **X8:** Repayment status in July 2005  
  - **X9:** Repayment status in June 2005  
  - **X10:** Repayment status in May 2005  
  - **X11:** Repayment status in April 2005  

  **Measurement Scale for Repayment Status:**  
  - -1 = paid duly (on time)  
  - 1 = payment delay for one month  
  - 2 = payment delay for two months  
  - ...  
  - 8 = payment delay for eight months  
  - 9 = payment delay for nine months and above

- **X12 - X17: Amount of bill statement (NT dollar):**  
  - **X12:** Amount of bill statement in September 2005  
  - **X13:** Amount of bill statement in August 2005  
  - **X14:** Amount of bill statement in July 2005  
  - **X15:** Amount of bill statement in June 2005  
  - **X16:** Amount of bill statement in May 2005  
  - **X17:** Amount of bill statement in April 2005

- **X18 - X23: Amount of previous payment (NT dollar):**  
  - **X18:** Amount paid in September 2005  
  - **X19:** Amount paid in August 2005  
  - **X20:** Amount paid in July 2005  
  - **X21:** Amount paid in June 2005  
  - **X22:** Amount paid in May 2005  
  - **X23:** Amount paid in April 2005
