In [1]:
from ucimlrepo import fetch_ucirepo 
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.stat import ChiSquareTest
from pyspark.sql.functions import monotonically_increasing_id
import os


In [2]:
# The Spark master URL is read from the environment. When SPARK_MASTER_URL is
# unset or empty, fall back to an in-process local master instead of handing
# None to SparkSession.builder.master() in the next cell.
spark_master = os.environ.get("SPARK_MASTER_URL") or "local[*]"

In [3]:
# Create (or reuse) the Spark session for this notebook, connecting to the
# master URL resolved in the previous cell.
spark = (
    SparkSession.builder
    .appName("Explore Spark Session")
    .master(spark_master)
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/02/21 06:18:53 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# Fetch the dataset (UCI ML repository id=350).
default_of_credit_card_clients = fetch_ucirepo(id=350)
features_pdf = default_of_credit_card_clients.data.features
targets_pdf = default_of_credit_card_clients.data.targets
feature_cols = list(features_pdf.columns)
target_cols = list(targets_pdf.columns)

# Pair each feature row with its target row by position in pandas BEFORE the
# data reaches Spark. Joining two Spark DataFrames on separately generated
# monotonically_increasing_id() columns is unsafe: the ids are unique but depend
# on how each DataFrame is partitioned, so they are not guaranteed to line up.
combined_pdf = features_pdf.reset_index(drop=True).join(
    targets_pdf.reset_index(drop=True)
)

# Single Spark DataFrame with the same column order as before: id, features, target.
X_y = (
    spark.createDataFrame(combined_pdf)
    .withColumn("id", monotonically_increasing_id())
    .select("id", *feature_cols, *target_cols)
)

# X and y keep their previous layout (data columns followed by "id"), but are
# now views of the same rows, so their ids agree by construction.
X = X_y.select(*feature_cols, "id")
y = X_y.select(*target_cols, "id")


In [5]:
# Chi-squared test of independence: candidate categorical features vs. target "Y".
# NOTE(review): X5 is age according to the feature notes in this notebook; the
# test treats every distinct value as its own category (55 degrees of freedom
# in the recorded output) -- confirm that including it here is intended.
categorical_cols = ["X2", "X3", "X4", "X5", "X6", "X7", "X8", "X9", "X10", "X11"]

# Pack the selected columns into the single vector column the test expects.
vecAssembler = VectorAssembler(inputCols=categorical_cols, outputCol="features")
assembled = vecAssembler.transform(X_y)

# The test yields a one-row DataFrame; head() pulls that Row to the driver.
chiSqResult = ChiSquareTest.test(assembled, "features", "Y").head()

# One line per result field, in the same order as categorical_cols.
for label, values in (
    ("p-values", chiSqResult.pValues),
    ("degrees of freedom", chiSqResult.degreesOfFreedom),
    ("statistic", chiSqResult.statistics),
):
    print(label, values)


[Stage 10:>                                                         (0 + 1) / 1]

p-values [4.472755499307368e-12,0.0,8.825862463091028e-08,5.643041589564746e-12,0.0,0.0,0.0,0.0,0.0,0.0]
degrees of freedom [1, 6, 3, 55, 10, 10, 10, 10, 9, 9]
statistic [47.90543311657916,163.21655786997073,35.66239583433609,158.55290013282715,5365.964977413581,3474.4667904168564,2622.462127682802,2341.469945438205,2197.694900930992,1886.8353090011867]


                                                                                

In [6]:
# Show the raw result Row (pValues, degreesOfFreedom, statistics) as the cell output.
chiSqResult

Row(pValues=DenseVector([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]), degreesOfFreedom=[1, 6, 3, 55, 10, 10, 10, 10, 9, 9], statistics=DenseVector([47.9054, 163.2166, 35.6624, 158.5529, 5365.965, 3474.4668, 2622.4621, 2341.4699, 2197.6949, 1886.8353]))

# Credit Data Features

- **X1: Amount of the given credit (NT dollar):**  
  It includes both the individual consumer credit and his/her family (supplementary) credit.
  
- **X2: Gender:**  
  - 1 = male  
  - 2 = female

- **X3: Education:**  
  - 1 = graduate school  
  - 2 = university  
  - 3 = high school  
  - 4 = others

- **X4: Marital status:**  
  - 1 = married  
  - 2 = single  
  - 3 = others

- **X5: Age (year)**

- **X6 - X11: History of past payment:**  
  These represent the past monthly payment records from April to September 2005.  
  - **X6:** Repayment status in September 2005  
  - **X7:** Repayment status in August 2005  
  - **X8:** Repayment status in July 2005  
  - **X9:** Repayment status in June 2005  
  - **X10:** Repayment status in May 2005  
  - **X11:** Repayment status in April 2005  

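  **Measurement Scale for Repayment Status:**  
  - -1 = paid duly (on time)  
  - 1 = payment delay for one month  
  - 2 = payment delay for two months  
  - ...  
  - 8 = payment delay for eight months  
  - 9 = payment delay for nine months and above

  *Note:* the chi-squared output above reports 10 degrees of freedom for X6–X9, i.e. 11 distinct values, which is more than the 10 codes listed here. The data therefore appears to contain codes not covered by this scale (TODO: confirm which values and what they mean).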

- **X12 - X17: Amount of bill statement (NT dollar):**  
  - **X12:** Amount of bill statement in September 2005  
  - **X13:** Amount of bill statement in August 2005  
  - **X14:** Amount of bill statement in July 2005  
  - **X15:** Amount of bill statement in June 2005  
  - **X16:** Amount of bill statement in May 2005  
  - **X17:** Amount of bill statement in April 2005

- **X18 - X23: Amount of previous payment (NT dollar):**  
  - **X18:** Amount paid in September 2005  
  - **X19:** Amount paid in August 2005  
  - **X20:** Amount paid in July 2005  
  - **X21:** Amount paid in June 2005  
  - **X22:** Amount paid in May 2005  
  - **X23:** Amount paid in April 2005


**Next steps:**

1. Deal with imbalanced categorical features
2. Prepare data for cross-validation