In [11]:
# Must be included at the beginning of each new notebook. Remember to change the app name.
# findspark is pointed at a machine-specific absolute Spark install path;
# adjust this path when running the notebook on another host.
import findspark
findspark.init('/home/ubuntu/spark-3.2.1-bin-hadoop2.7')
# pyspark is imported only after findspark.init() has run
# (presumably required so the Spark install above can be located -- confirm).
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg
# Reuse the running Spark session if there is one, otherwise start a new one named '722'.
spark = SparkSession.builder.appName('722').getOrCreate()

In [2]:
# Load both Excel workbooks with pandas, then mirror each one as a Spark DataFrame.
import pandas as pd

# pandas copies: kept around for describe()/isna() checks later in the notebook.
crime_data_pd, education_data_pd = (
    pd.read_excel(workbook) for workbook in ("Crime.xlsx", "Education.xlsx")
)

# Spark copies: used for the distributed transformations.
crime_data, education_data = (
    spark.createDataFrame(frame) for frame in (crime_data_pd, education_data_pd)
)

In [3]:
#2 Data Understanding
#2.2 Data Description
# Print a preview of the leading rows of each Spark DataFrame.
crime_data.show()
education_data.show()


                                                                                

+--------------+----+-----------------------------------+-------------------------------------+-----------------------------------+-----------------------------------------------------+------------------------------------+
|        Region|year|Total amount of convicted juveniles|Total amount of family violence cases|Total amount of people with charges|Total amount of harmful digital communication offense|Total amount of drugs offences cases|
+--------------+----+-----------------------------------+-------------------------------------+-----------------------------------+-----------------------------------------------------+------------------------------------+
|     Northland|2014|                                123|                                  692|                               4335|                                                  NaN|                                 339|
|     Northland|2015|                                138|                                  817|             

In [4]:
#2.3 Data exploration/Display value for each column

def group_and_sum(dataframe, columns):
    """Print, for each requested column, its total per "Region".

    Args:
        dataframe: DataFrame exposing ``groupBy`` and containing a "Region" column.
        columns: Iterable of column names to total, one printed table per name.

    Returns:
        None. Each per-region table is printed via ``show()``.
    """
    for name in columns:
        totals = dataframe.groupBy("Region").sum(name)
        totals.show()


# Measurement columns of the crime dataset to total per region.
crime_columns = [
    "Total amount of convicted juveniles",
    "Total amount of family violence cases",
    "Total amount of drugs offences cases",
    "Total amount of people with charges",
    "Total amount of harmful digital communication offense"
]

# Measurement columns of the education dataset to total per region.
education_columns = [
    "Total amount of schools",
    "Total amount of Students",
    "Total amount of student attending regularly",
    "Participation in ECE(early childhood education)",
    "Mean household income"
]


# Print one per-region sum table for every column listed above.
group_and_sum(crime_data, crime_columns)
group_and_sum(education_data, education_columns)


+--------------------+----------------------------------------+
|              Region|sum(Total amount of convicted juveniles)|
+--------------------+----------------------------------------+
|          Wellington|                                     186|
|            Auckland|                                     765|
|             Waikato|                                    1221|
|      South Auckland|                                    1833|
|               Otago|                                     495|
|       Bay of Plenty|                                     846|
|          Canterbury|                                    1371|
|           Northland|                                     867|
|           Southland|                                     456|
|Nelson/Marlboroug...|                                     528|
|           Waitematā|                                    1113|
|  Taranaki/Whanganui|                                     738|
|            Waiariki|                  

+--------------------+------------------------------------------------+
|              Region|sum(Total amount of student attending regularly)|
+--------------------+------------------------------------------------+
|          Wellington|                              413768.33999999997|
|            Auckland|                              1286417.8450000002|
|             Waikato|                                      351773.806|
|      South Auckland|                               59098.83899999999|
|               Otago|                              165335.59900000002|
|       Bay of Plenty|                              255676.36899999998|
|          Canterbury|                              464899.91500000004|
|           Northland|                              114122.55500000001|
|           Southland|                                       85321.336|
|Nelson/Marlboroug...|                                       36133.513|
|           Waitematā|                                          

In [5]:
#2.4 Data Quality
#2.4.1 Calculate Missing Value
# Boolean masks built from the pandas copies: True wherever a cell is missing.
record_crime_data=crime_data_pd.isna()
record_education_data=education_data_pd.isna()

def counting_missing_value(dataframe):
    """Return the grand total of all column sums of ``dataframe``.

    When called with a boolean mask such as the result of ``DataFrame.isna()``,
    the total is the number of missing cells in the original frame.

    Args:
        dataframe: pandas DataFrame whose columns can be summed.

    Returns:
        The sum over all columns of each column's sum (0 for a frame with no columns).
    """
    return sum(dataframe[name].sum() for name in dataframe.columns)

# Total number of missing cells: crime dataset first, then education dataset.
print(counting_missing_value(record_crime_data))
print(counting_missing_value(record_education_data))

  

16
157


In [6]:
#3 Data Preparation
#3.1 Data selection
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric
# column of the crime dataset; the education dataset follows in the next cell.
crime_data_pd.describe()



Unnamed: 0,year,Total amount of convicted juveniles,Total amount of family violence cases,Total amount of people with charges,Total amount of harmful digital communication offense,Total amount of drugs offences cases
count,144.0,144.0,144.0,144.0,128.0,144.0
mean,2018.0,97.979167,822.611111,4477.715278,5.015625,364.430556
std,2.591001,50.115736,292.376975,1996.337747,4.186563,156.66126
min,2014.0,9.0,298.0,1714.0,0.0,118.0
25%,2016.0,63.0,631.25,3027.25,2.0,243.5
50%,2018.0,91.5,816.0,3938.0,4.0,337.0
75%,2020.0,123.75,962.5,5755.75,7.0,465.0
max,2022.0,297.0,1765.0,11409.0,17.0,781.0


In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric
# column of the education dataset.
education_data_pd.describe()

Unnamed: 0,year,Total amount of schools,Total amount of Students,Total amount of student attending regularly,Mean household income,Participation in ECE(early childhood education)
count,144.0,108.0,108.0,144.0,95.0,108.0
mean,2018.0,182.583333,52216.333333,24485.014646,97417.645614,16171.768519
std,2.591001,93.899442,57362.680669,34992.301305,18537.215247,12294.976168
min,2014.0,83.0,6064.0,0.0,66248.0,3354.0
25%,2016.0,126.0,16431.5,2165.319,83349.0,7733.25
50%,2018.0,149.0,30226.5,12284.0995,94207.0,13851.5
75%,2020.0,214.5,67850.5,31092.1275,106600.0,19168.25
max,2022.0,433.0,244549.0,159032.139,151179.0,54040.0


In [8]:
# Drop the column judged unnecessary for the analysis. It is also the only crime
# column whose describe() count is below the row count (128 of 144), i.e. the
# only one with missing values.
crime_data=crime_data.drop("Total amount of harmful digital communication offense")
crime_data.show()

+--------------+----+-----------------------------------+-------------------------------------+-----------------------------------+------------------------------------+
|        Region|year|Total amount of convicted juveniles|Total amount of family violence cases|Total amount of people with charges|Total amount of drugs offences cases|
+--------------+----+-----------------------------------+-------------------------------------+-----------------------------------+------------------------------------+
|     Northland|2014|                                123|                                  692|                               4335|                                 339|
|     Northland|2015|                                138|                                  817|                               4528|                                 341|
|     Northland|2016|                                117|                                  772|                               4435|                        

In [30]:
#3.2 Data cleaning
#3.2.1 Missing value
def calculate_mean_values(dataframe, exclude_columns):
    """Return the mean of every non-excluded column, ignoring missing cells.

    Missing cells (null or NaN) are left out of the average instead of being
    counted as 0, so the result is the mean of the observed values only and is
    suitable as an imputation value.

    Args:
        dataframe: Spark DataFrame to summarise.
        exclude_columns: Column names to skip (e.g. identifier columns). The
            comparison is case-sensitive.

    Returns:
        dict mapping column name -> mean of its non-missing values. Columns
        with no observed value at all are omitted, so the dict can be passed
        straight to ``DataFrame.fillna``.
    """
    # Local import keeps this cell runnable without touching the notebook's import cell.
    from pyspark.sql.functions import isnan, when

    excluded = set(exclude_columns)
    target_columns = [name for name in dataframe.columns if name not in excluded]
    if not target_columns:
        return {}

    # when(...) without otherwise() yields null for NaN cells, and avg() skips
    # nulls, so both null and NaN cells drop out of the mean.
    mean_row = dataframe.agg(
        *[avg(when(~isnan(col(name)), col(name))).alias(name) for name in target_columns]
    ).first().asDict()

    # A column with no observed values averages to None; fillna cannot use that.
    return {name: value for name, value in mean_row.items() if value is not None}

def replace_missing_values_with_mean(dataframe, exclude_columns):
    """Fill missing cells with the per-column values from ``calculate_mean_values``.

    Args:
        dataframe: Spark DataFrame to impute.
        exclude_columns: Column names passed through to ``calculate_mean_values``
            so that they are not given a replacement value.

    Returns:
        A new DataFrame produced by ``dataframe.fillna`` with the computed
        column -> replacement mapping.
    """
    return dataframe.fillna(calculate_mean_values(dataframe, exclude_columns))


# Identifier columns that must not be treated as measurements. The names are
# matched case-sensitively, so they have to be spelled exactly like the
# DataFrame headers (the year column is lower-case "year").
exclude_columns=["Region","year"]

crime_data=replace_missing_values_with_mean(crime_data, exclude_columns)
education_data=replace_missing_values_with_mean(education_data, exclude_columns)

# Convert back to pandas and re-count missing cells to confirm the imputation
# left none behind (both counts should print 0).
crime_data_2_pd=crime_data.toPandas()
education_data_2_pd=education_data.toPandas()

record_crime_data_1=crime_data_2_pd.isna()
record_education_data_1=education_data_2_pd.isna()

print(counting_missing_value(record_crime_data_1))
print(counting_missing_value(record_education_data_1))


0
0


In [None]:
#3.2.2 Outliers and extreme values: use DataFrame.approxQuantile() to obtain the quantiles that define outliers and extreme values
from pyspark.sql.functions import col, expr

def calculate_quantiles(dataframe, quantiles, exclude_columns):
    """Compute the requested quantiles for every non-excluded column.

    Args:
        dataframe: DataFrame exposing ``columns`` and ``approxQuantile``.
        quantiles: Probabilities to evaluate, passed unchanged to ``approxQuantile``.
        exclude_columns: Column names to skip.

    Returns:
        dict mapping column name -> the list returned by ``approxQuantile`` for
        that column, in ``dataframe.columns`` order.
    """
    # A relative error of 0.0 is passed so the quantiles are not approximated.
    return {
        name: dataframe.approxQuantile(name, quantiles, 0.0)
        for name in dataframe.columns
        if name not in exclude_columns
    }

def replace_outliers_with_mean(dataframe, quantile_values, exclude_columns=None):
    """Replace values outside a column's quantile bounds with that column's mean.

    For every non-excluded column, values strictly below the first quantile or
    strictly above the second quantile in ``quantile_values[column]`` are
    replaced by the column mean. The mean is taken over the whole column before
    replacement, so it still includes the outlying values.

    Args:
        dataframe: Spark DataFrame to clean.
        quantile_values: dict mapping column name -> [lower_bound, upper_bound, ...].
        exclude_columns: Optional iterable of column names to leave untouched.
            The comparison is case-sensitive.

    Returns:
        A new DataFrame with outliers replaced in each processed column.

    Raises:
        KeyError: if a processed column has no entry in ``quantile_values``.
    """
    # Local import keeps this cell runnable without touching the notebook's import cell.
    from pyspark.sql.functions import lit, when

    excluded = set(exclude_columns or ())
    for column in dataframe.columns:
        if column in excluded:
            continue

        bounds = quantile_values[column]
        # Fewer than two bounds (e.g. an empty quantile result): nothing to compare against.
        if len(bounds) < 2:
            continue
        lower_bound, upper_bound = bounds[0], bounds[1]

        mean_value = dataframe.agg(avg(col(column))).first()[0]
        # No observed values means there is no mean to substitute.
        if mean_value is None:
            continue

        # Column expressions are used instead of a SQL string so that column
        # names containing spaces or parentheses do not break parsing.
        value = col(column)
        is_outlier = (value < lower_bound) | (value > upper_bound)
        dataframe = dataframe.withColumn(
            column,
            when(is_outlier, lit(mean_value)).otherwise(value)
        )
    return dataframe

