In [0]:
from pyspark.sql import SparkSession
# Reuse the active Spark session if there is one, otherwise create one
# registered under the application name 'myproj'.
spark = (
    SparkSession.builder
    .appName('myproj')
    .getOrCreate()
)

# Read the raw CSV: the first row supplies the column names and Spark infers
# each column's type from the values.
data = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('/FileStore/tables/UCI_Credit_Card.csv')
)

# Print the inferred column names and types.
data.printSchema()

Data Dictionary
1.	ID: ID of each client
2.	LIMIT_BAL: Amount of given credit in NT dollars (includes individual and family/supplementary credit)
3.	SEX: Gender (1=male, 2=female)
4.	EDUCATION: (1=graduate school, 2=university, 3=high school, 4=others, 5=unknown, 6=unknown)
5.	MARRIAGE: Marital status (1=married, 2=single, 3=others)
6.	Age: Age in years
7.	PAY_0, PAY_2 ------ PAY_6 (6 features): Repayment status from April 2005 to September 2005 (-1=paid duly, 0=no delay, any other number = number of months the payment is delayed). The raw file has no PAY_1 column; PAY_0 is renamed to PAY_1 below.
8.	BILL_AMT1 ------ BILL_AMT6 (6 features): Amount of bill statement from April, 2005 to September 2005(NT dollar)
9.	PAY_AMT1 ------ PAY_AMT6 (6 features): Amount of previous payment from April 2005 to September 2005 (NT dollar)
10.	default.payment.next.month: Default payment next month (1=yes, 0=no)

In [0]:
# Rename PAY_0 so the repayment-status columns are numbered PAY_1..PAY_6, and
# replace the dotted target column name with the shorter "Default".
column_renames = {
    "PAY_0": "PAY_1",
    "default.payment.next.month": "Default",
}
for old_name, new_name in column_renames.items():
    data = data.withColumnRenamed(old_name, new_name)

Change data types

In [0]:
from pyspark.sql.types import StructField,StringType,IntegerType,DoubleType,StructType

# Target schema for the analysis. The third argument of StructField is the
# `nullable` flag: True means the column may contain null cells.
# Identifier and categorical code columns are kept as strings; counts and
# repayment statuses are integers; monetary amounts are doubles.
df_schema = StructType([
    StructField('ID', StringType(), True),
    StructField('LIMIT_BAL', DoubleType(), True),
    StructField('SEX', StringType(), True),
    StructField('EDUCATION', StringType(), True),
    StructField('MARRIAGE', StringType(), True),
    StructField('AGE', IntegerType(), True),
    StructField('PAY_1', IntegerType(), True),
    StructField('PAY_2', IntegerType(), True),
    StructField('PAY_3', IntegerType(), True),
    StructField('PAY_4', IntegerType(), True),
    StructField('PAY_5', IntegerType(), True),
    StructField('PAY_6', IntegerType(), True),
    StructField('BILL_AMT1', DoubleType(), True),
    StructField('BILL_AMT2', DoubleType(), True),
    StructField('BILL_AMT3', DoubleType(), True),
    StructField('BILL_AMT4', DoubleType(), True),
    StructField('BILL_AMT5', DoubleType(), True),
    StructField('BILL_AMT6', DoubleType(), True),
    StructField('PAY_AMT1', DoubleType(), True),
    StructField('PAY_AMT2', DoubleType(), True),
    StructField('PAY_AMT3', DoubleType(), True),
    StructField('PAY_AMT4', DoubleType(), True),
    StructField('PAY_AMT5', DoubleType(), True),
    StructField('PAY_AMT6', DoubleType(), True),
    StructField('Default', IntegerType(), True),
    ])

# Convert each column to its target type with a distributed cast instead of
# collect() + createDataFrame(): collecting pulls every row onto the driver and
# re-serialises it, which does not scale and can exhaust driver memory.
# Columns are matched by name, so this relies on the PAY_0 -> PAY_1 and
# default.payment.next.month -> Default renames having been applied to `data`.
data = data.select(
    [
        data[field.name].cast(field.dataType).alias(field.name)
        for field in df_schema.fields
    ]
)
data.printSchema()

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.functions import when, count, col
# `df` is the working name for the analysis; it starts out as the same
# DataFrame as `data`.
df = data

# Dimensions of the DataFrame.
print("Number of Rows: ",df.count() ,"   Number of Columns: ", len(df.columns))

# One null counter per column: when() produces a non-null value only for rows
# where the column is null, and count() counts the non-null results.
null_count_exprs = [
    count(when(isnull(column_name), column_name)).alias(column_name)
    for column_name in df.columns
]
na_report = df.select(null_count_exprs)

# Missing values should be imputed unless a column is mostly empty.
# Dropping rows with na.drop() is discouraged because it can bias the data;
# the exception is rows that are missing the dependent variable.
na_report.show()

In [0]:
# Summary statistics for every column of `df`.
summary_stats = df.describe()
summary_stats.show()

Drop abnormal data in the "EDUCATION" and "MARRIAGE" columns

In [0]:
# Summary statistics of the two categorical code columns before filtering.
df.describe('MARRIAGE', 'EDUCATION').show()

In [0]:
# Keep only rows whose MARRIAGE code is not '3' or '0' and whose EDUCATION
# code is not '0', '5' or '6'. Rows with a null in either column are dropped
# as well, because the comparison evaluates to null for them.
# NOTE(review): the data dictionary lists MARRIAGE 3 as "others" — confirm
# that excluding it here is intentional.
excluded_marriage_codes = ['3', '0']
excluded_education_codes = ['0', '5', '6']
df = df.filter(
    ~df.MARRIAGE.isin(excluded_marriage_codes)
    & ~df.EDUCATION.isin(excluded_education_codes)
)

In [0]:
# Re-check the two categorical code columns after filtering.
df.describe('MARRIAGE', 'EDUCATION').show()

Visualizations

In [0]:
# Number of clients for each (EDUCATION, Default) pair.
# Sorting on both grouping keys makes the row order deterministic; sorting on
# EDUCATION alone leaves the order of the Default rows within each education
# level unspecified, so it can change between runs.
# NOTE(review): .display() is not part of the open-source PySpark DataFrame
# API; presumably this notebook runs on Databricks — confirm.
(
    df.select("EDUCATION", "Default", "ID")
    .groupBy("EDUCATION", "Default")
    .agg(count("ID"))
    .orderBy("EDUCATION", "Default")
    .display()
)

EDUCATION,Default,count(ID)
1,0,8508
1,1,2023
2,0,10577
2,1,3285
3,1,1206
3,0,3564
4,0,113
4,1,7
