# Data Formatting and Transformations

In [1]:
# First let's create our PySpark instance
# import findspark
# findspark.init()

import pyspark # only run after findspark.init()
from pyspark.sql import SparkSession
# May take a while locally
spark = SparkSession.builder.appName("Classification").getOrCreate()

# Report parallelism through the public SparkContext API. In local mode
# defaultParallelism is the number of cores Spark was given, whereas counting
# the entries of getExecutorMemoryStatus() reports executors/block managers
# (1 on a local session no matter how many cores are in use).
cores = spark.sparkContext.defaultParallelism
print("You are working with", cores, "core(s)")
spark
# Click the hyperlinked "Spark UI" link to view details about your Spark session

You are working with 1 core(s)


In [2]:
# Location of the toddler autism screening CSV, relative to this notebook
path = "../../../data/Toddler Autism dataset July 2018.csv"

# The first row holds the column names; let Spark infer each column's type
df = spark.read.csv(
    path,
    header=True,
    inferSchema=True,
)

In [3]:
# Preview the first 6 rows, converted to pandas for a readable table
df.limit(6).toPandas()

Unnamed: 0,Case_No,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,Age_Mons,Qchat-10-Score,Sex,Ethnicity,Jaundice,Family_mem_with_ASD,Who completed the test,Class/ASD Traits
0,1,0,0,0,0,0,0,1,1,0,1,28,3,f,middle eastern,yes,no,family member,No
1,2,1,1,0,0,0,1,1,0,0,0,36,4,m,White European,yes,no,family member,Yes
2,3,1,0,0,0,0,0,1,1,0,1,36,4,m,middle eastern,yes,no,family member,Yes
3,4,1,1,1,1,1,1,1,1,1,1,24,10,m,Hispanic,no,no,family member,Yes
4,5,1,1,0,1,1,1,1,1,1,1,20,9,f,White European,no,yes,family member,Yes
5,6,1,1,0,0,1,1,1,1,1,1,21,8,m,black,no,no,family member,Yes


## What we want to do is...

- Imbalance check
- Labeling
- StringIndexing


In [4]:
# Show each column's name, inferred type and nullability
df.printSchema()


root
 |-- Case_No: integer (nullable = true)
 |-- A1: integer (nullable = true)
 |-- A2: integer (nullable = true)
 |-- A3: integer (nullable = true)
 |-- A4: integer (nullable = true)
 |-- A5: integer (nullable = true)
 |-- A6: integer (nullable = true)
 |-- A7: integer (nullable = true)
 |-- A8: integer (nullable = true)
 |-- A9: integer (nullable = true)
 |-- A10: integer (nullable = true)
 |-- Age_Mons: integer (nullable = true)
 |-- Qchat-10-Score: integer (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Ethnicity: string (nullable = true)
 |-- Jaundice: string (nullable = true)
 |-- Family_mem_with_ASD: string (nullable = true)
 |-- Who completed the test: string (nullable = true)
 |-- Class/ASD Traits : string (nullable = true)



In [5]:
# Imbalance check: count the rows in each target class
# (the trailing space is part of the column name)
class_counts = df.groupBy("Class/ASD Traits ").count()
class_counts.show()

+-----------------+-----+
|Class/ASD Traits |count|
+-----------------+-----+
|               No|  326|
|              Yes|  728|
+-----------------+-----+



In [6]:
# Labeling: declare the names the following cells rely on.
# Features are every column except the first (Case_No) and the last (the target).
input_columns = df.columns[1:-1]

# Target column; note the trailing space, which is part of the column name
dependent_var = "Class/ASD Traits "

In [7]:
from pyspark.ml.feature import StringIndexer
from pyspark.sql.types import *

# Copy the target column into "label_str", explicitly cast to string
label_as_string = df[dependent_var].cast(StringType())
renamed = df.withColumn("label_str", label_as_string)

In [8]:
# Encode label_str as a numeric "label" column in two steps:
# fit() learns the string -> index mapping, transform() applies it.
# The more frequent label gets the lower index (0).
indexer = StringIndexer(inputCol="label_str", outputCol="label")
label_model = indexer.fit(renamed)
indexed = label_model.transform(renamed)


In [9]:
# Preview the first 3 rows, now including the label_str and label columns
indexed.limit(3).toPandas()

Unnamed: 0,Case_No,A1,A2,A3,A4,A5,A6,A7,A8,A9,...,Age_Mons,Qchat-10-Score,Sex,Ethnicity,Jaundice,Family_mem_with_ASD,Who completed the test,Class/ASD Traits,label_str,label
0,1,0,0,0,0,0,0,1,1,0,...,28,3,f,middle eastern,yes,no,family member,No,No,1.0
1,2,1,1,0,0,0,1,1,0,0,...,36,4,m,White European,yes,no,family member,Yes,Yes,0.0
2,3,1,0,0,0,0,0,1,1,0,...,36,4,m,middle eastern,yes,no,family member,Yes,Yes,0.0


In [10]:
# Convert every string-typed input column into a numeric index column.
# numeric_inputs collects the columns that are already numeric;
# string_inputs collects the names of the newly created "<column>_num" columns.
numeric_inputs = []
string_inputs = []

for column in input_columns:
    # Check the type object itself rather than its string form: the text
    # produced by str(dataType) differs between PySpark releases
    # ("StringType" vs "StringType()"), so a string comparison can silently
    # send every column to numeric_inputs.
    if isinstance(indexed.schema[column].dataType, StringType):
        new_col_name = column + "_num"
        indexer = StringIndexer(inputCol=column, outputCol=new_col_name)
        indexed = indexer.fit(indexed).transform(indexed)
        string_inputs.append(new_col_name)
    else:
        numeric_inputs.append(column)

In [11]:
# Confirm that the label and the *_num index columns were added
indexed.printSchema()

root
 |-- Case_No: integer (nullable = true)
 |-- A1: integer (nullable = true)
 |-- A2: integer (nullable = true)
 |-- A3: integer (nullable = true)
 |-- A4: integer (nullable = true)
 |-- A5: integer (nullable = true)
 |-- A6: integer (nullable = true)
 |-- A7: integer (nullable = true)
 |-- A8: integer (nullable = true)
 |-- A9: integer (nullable = true)
 |-- A10: integer (nullable = true)
 |-- Age_Mons: integer (nullable = true)
 |-- Qchat-10-Score: integer (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Ethnicity: string (nullable = true)
 |-- Jaundice: string (nullable = true)
 |-- Family_mem_with_ASD: string (nullable = true)
 |-- Who completed the test: string (nullable = true)
 |-- Class/ASD Traits : string (nullable = true)
 |-- label_str: string (nullable = true)
 |-- label: double (nullable = false)
 |-- Sex_num: double (nullable = false)
 |-- Ethnicity_num: double (nullable = false)
 |-- Jaundice_num: double (nullable = false)
 |-- Family_mem_with_ASD_num: double

In [12]:
# Preview the first 3 rows, including the new *_num index columns
indexed.limit(3).toPandas()

Unnamed: 0,Case_No,A1,A2,A3,A4,A5,A6,A7,A8,A9,...,Family_mem_with_ASD,Who completed the test,Class/ASD Traits,label_str,label,Sex_num,Ethnicity_num,Jaundice_num,Family_mem_with_ASD_num,Who completed the test_num
0,1,0,0,0,0,0,0,1,1,0,...,no,family member,No,No,1.0,1.0,2.0,1.0,0.0,0.0
1,2,1,1,0,0,0,1,1,0,0,...,no,family member,Yes,Yes,0.0,0.0,0.0,1.0,0.0,0.0
2,3,1,0,0,0,0,0,1,1,0,...,no,family member,Yes,Yes,0.0,0.0,2.0,1.0,0.0,0.0


# Treating for skewness and outliers

- If skewness is less than -1 or greater than 1, the distribution is highly skewed.
- If skewness is between -1 and -0.5 or between 0.5 and 1, the distribution is moderately skewed.
- If skewness is between -0.5 and 0.5, the distribution is approximately symmetric.

## Best practice for treating skewness
- Flooring and capping
- For right (positive) skew, take log(x + 1)
- For left (negative) skew, apply an exponential transformation

In [13]:
from pyspark.sql.functions import *

# skewness() is an aggregate function; agg() only builds a lazy one-row DataFrame.
# Displaying it shows just the schema (DataFrame[skewness(A3): double]);
# the value itself is only computed by an action such as .collect() or .show().
indexed.agg(skewness(indexed["A3"]))

DataFrame[skewness(A3): double]

In [14]:
# Floor/cap bounds: the 1st and 99th percentile of each numeric input,
# stored as {column name: [lower, upper]}
d = {}

# The loop variable is deliberately not named `col`: that would overwrite
# pyspark.sql.functions.col brought in by the wildcard import.
for num_col in numeric_inputs:
    # The last parameter is a relative target precision to achieve (>= 0).
    # If set to zero, the exact quantiles are computed, which could be very expensive.
    # It has to be well below the tail probability requested (0.01): with a
    # larger error the returned value may sit anywhere in a rank band wider
    # than the tail itself, so the bounds would not be real percentiles.
    d[num_col] = indexed.approxQuantile(num_col, [0.01, 0.99], 0.001)
d

{'A1': [0.0, 1.0],
 'A2': [0.0, 1.0],
 'A3': [0.0, 1.0],
 'A4': [0.0, 1.0],
 'A5': [0.0, 1.0],
 'A6': [0.0, 1.0],
 'A7': [0.0, 1.0],
 'A8': [0.0, 1.0],
 'A9': [0.0, 1.0],
 'A10': [0.0, 1.0],
 'Age_Mons': [12.0, 36.0],
 'Qchat-10-Score': [0.0, 10.0]}

In [15]:
# Treat highly skewed numeric inputs (|skew| > 1): floor/cap each value to the
# bounds stored in d, then apply log(x + 1) for right skew or exp(x) for left skew.
# The loop variable is deliberately not named `col`: that would overwrite
# pyspark.sql.functions.col brought in by the wildcard import.
for num_col in numeric_inputs:
    # agg(...).collect() returns a one-row list, for example
    # indexed.agg(skewness(indexed['A3'])).collect() returns
    # [Row(skewness(A3)=0.40260435804137373)]
    skew = indexed.agg(skewness(indexed[num_col])).collect()[0][0]
    print(skew)

    # Skip columns that need no treatment. A null skewness arrives as None,
    # and comparing None with a number would raise TypeError.
    if skew is None or not (skew > 1 or skew < -1):
        continue

    # Floor and cap: clamp values into [lower, upper]
    lower, upper = d[num_col]
    capped = when(indexed[num_col] < lower, lower)\
        .when(indexed[num_col] > upper, upper)\
        .otherwise(indexed[num_col])

    if skew > 1:  # right skew: log(x + 1)
        indexed = indexed.withColumn(num_col, log(capped + 1))
        print(num_col," has been treated for positive skewness. (skew =)",skew,")")
    else:  # left skew (skew < -1): exp(x)
        indexed = indexed.withColumn(num_col, exp(capped))
        print(num_col," has been treated for negative skewness. (skew =)",skew,")")
        

-0.2563496093947895
0.20601797730122015
0.40260435804137373
-0.04935088083712881
-0.09879203126218447
-0.3110969802658277
-0.6285337660559676
0.1637338005140266
0.041754827339823934
-0.3506177559444581
-0.6341451203248881
-0.08006349388828135


#  Negative values in dataframe

- we need to check for negative values in the dataframe.
- and produce a warning if there are negative values, because otherwise you can't use Naive Bayes.

In [16]:
# Minimum of every numeric input column, returned as a single-row DataFrame.
# `min` here is the Spark aggregate from the wildcard import, not the Python builtin.
min_exprs = [min(name).alias(name) for name in indexed.columns if name in numeric_inputs]
minimums = indexed.select(min_exprs)

# pyspark.sql.functions.array packs those per-column minimums into one array
# column named "mins", like [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12, 0]
min_array = minimums.select(array(numeric_inputs).alias("mins"))

# array_min returns the smallest element of an array value:
# [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12, 0] -> 0
smallest_rows = min_array.select(array_min(min_array["mins"])).collect()
df_minimum = smallest_rows[0][0]