In [2]:
from pyspark.sql.types import FloatType,StringType,IntegerType,StructType
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf,col

In [3]:
#Helper functions

def init_spark():
    """Return the SparkSession used by the helpers in this notebook.

    The session is obtained through ``SparkSession.builder.getOrCreate()``,
    configured with the application name and the single config option
    set below.

    Returns
    -------
    The object returned by ``getOrCreate()`` on the configured builder.
    """
    sessionBuilder = SparkSession.builder
    sessionBuilder = sessionBuilder.appName("Python Spark SQL crime prediction functions")
    sessionBuilder = sessionBuilder.config("spark.some.config.option", "some-value")
    return sessionBuilder.getOrCreate()

def getNullValues(data):
    """Summarise the missing values of every column of a Spark DataFrame.

    Parameters
    ----------
    data : DataFrame
        Frame to inspect; each of its columns is checked with ``isNull()``.

    Returns
    -------
    DataFrame
        Built from two tuples with one column per input column, named
        ``<column>_null``. The first tuple holds the number of null values
        per column (as floats), the second the same figure as a percentage
        of the total row count (0.0 for every column when ``data`` has no
        rows).
    """
    spark = init_spark()

    # count() is a Spark action and does not depend on the column being
    # inspected, so run it once instead of once per column.
    totalRows = data.count()

    columnNames = tuple(name + "_null" for name in data.columns)
    nullCounts = tuple(
        float(data.where(col(name).isNull()).count()) for name in data.columns
    )
    # An empty frame has no nulls; report 0% instead of dividing by zero.
    nullPercentages = tuple(
        (nulls / totalRows) * 100 if totalRows else 0.0 for nulls in nullCounts
    )

    return spark.createDataFrame([nullCounts, nullPercentages], columnNames)

In [4]:
#Function to pre-process the Wellbeing datasets

def datasetWellbeing(demographicsFilename,economicsFilename,educationFilename, variable):
    """Load the three Wellbeing CSV files, keep selected features and join them.

    Each file is read with ``header=True``, ``inferSchema=True`` and
    ``mode="DROPMALFORMED"``. The neighbourhood name is lower-cased with
    spaces replaced by ``-``, and the two education columns passed through
    ``porcentaj`` are divided by 100. The three frames are then joined on
    their neighbourhood id columns.

    Parameters
    ----------
    demographicsFilename : str
        Path of the demographics CSV (columns ``Neighbourhood``,
        ``Neighbourhood Id``, ``Total Area``, ``Total Population`` are read).
    economicsFilename : str
        Path of the economics CSV (columns ``Neighbourhood``,
        ``Neighbourhood Id``, ``Home Prices``, ``Local Employment``,
        ``Social Assistance Recipients`` are read).
    educationFilename : str
        Path of the education CSV (columns ``Neighbourhood``,
        ``Neighbourhood Id``, ``Catholic School Graduation``,
        ``Catholic School Literacy``, ``Catholic University Applicants``
        are read).
    variable : int
        When equal to 1, the schema, the null-value summary and the
        ``describe()`` statistics of the joined frame are printed.

    Returns
    -------
    DataFrame
        Columns ``neighbourhood_w``, ``neighbourhood_id``, ``total_area``,
        ``total_population``, ``home_prices``, ``local_employment``,
        ``social_assistance_recipients``, ``catholic_school_graduation``,
        ``catholic_school_literacy``, ``catholic_university_applicants``.
    """

    spark = init_spark()

    # Python UDFs receive SQL NULL as None; without these guards a single
    # missing name or percentage would raise inside the UDF and fail the job.
    def _normaliseName(name):
        if name is None:
            return None
        return name.replace(" ","-").lower()

    def _percentToFraction(value):
        if value is None:
            return None
        return value/100

    eliminateSpace = udf(_normaliseName,StringType())
    porcentaj = udf(_percentToFraction,FloatType())

    def _readCsv(filename):
        # All three files are read with the same options.
        return spark.read.csv(filename, header=True, mode="DROPMALFORMED",inferSchema=True)

    #Selecting features from the dataframe

    dataWellbeingDemographics = _readCsv(demographicsFilename)
    dataWellbeingDemographics = dataWellbeingDemographics.select(
        eliminateSpace('Neighbourhood').alias('neighbourhood_d'),
        dataWellbeingDemographics['Neighbourhood Id'].alias("neighbourhood_id_d"),
        dataWellbeingDemographics['Total Area'].alias('total_area'),
        dataWellbeingDemographics['Total Population'].alias('total_population'),
    )

    dataWellbeingEconomics = _readCsv(economicsFilename)
    dataWellbeingEconomics = dataWellbeingEconomics.select(
        eliminateSpace('Neighbourhood').alias('neighbourhood_e'),
        dataWellbeingEconomics['Neighbourhood Id'].alias("neighbourhood_id_e"),
        dataWellbeingEconomics['Home Prices'].alias('home_prices'),
        dataWellbeingEconomics['Local Employment'].alias('local_employment'),
        dataWellbeingEconomics['Social Assistance Recipients'].alias('social_assistance_recipients'),
    )

    dataWellbeingEducation = _readCsv(educationFilename)
    dataWellbeingEducation = dataWellbeingEducation.select(
        eliminateSpace('Neighbourhood').alias('neighbourhood_ed'),
        dataWellbeingEducation['Neighbourhood Id'].alias("neighbourhood_id_ed"),
        dataWellbeingEducation['Catholic School Graduation'].alias('catholic_school_graduation'),
        porcentaj(dataWellbeingEducation['Catholic School Literacy']).alias('catholic_school_literacy'),
        porcentaj(dataWellbeingEducation['Catholic University Applicants']).alias('catholic_university_applicants'),
    )

    # Join education -> economics -> demographics on the neighbourhood id.
    data = dataWellbeingEducation.join(
        dataWellbeingEconomics,
        dataWellbeingEducation['neighbourhood_id_ed']==dataWellbeingEconomics['neighbourhood_id_e'],
    )
    data = data.join(
        dataWellbeingDemographics,
        data['neighbourhood_id_ed'] == dataWellbeingDemographics['neighbourhood_id_d'],
    )

    # Keep one copy of the name/id columns (the economics ones) plus the features.
    data = data.select(
        data['neighbourhood_e'].alias('neighbourhood_w'),
        data['neighbourhood_id_e'].alias('neighbourhood_id'),
        data['total_area'],
        data['total_population'],
        data['home_prices'],
        data['local_employment'],
        data['social_assistance_recipients'],
        data['catholic_school_graduation'],
        data['catholic_school_literacy'],
        data['catholic_university_applicants'],
    )

    if(variable==1):

        # Show Schema
        data.printSchema()

        #Counting null values
        nullCount=getNullValues(data)
        nullCount.show()

        #Feature metrics
        data.describe().show()

    return data

# Run the Wellbeing pre-processing with variable=1 so that the schema, the
# null-value summary and the describe() statistics are printed; the returned
# DataFrame is the last expression of the cell and is echoed as its output.
datasetWellbeing("./data/WB-Demographics.csv","./data/WB-Economics.csv","./data/WB-Education.csv",1)

root
 |-- neighbourhood_w: string (nullable = true)
 |-- neighbourhood_id: integer (nullable = true)
 |-- total_area: double (nullable = true)
 |-- total_population: integer (nullable = true)
 |-- home_prices: integer (nullable = true)
 |-- local_employment: integer (nullable = true)
 |-- social_assistance_recipients: integer (nullable = true)
 |-- catholic_school_graduation: double (nullable = true)
 |-- catholic_school_literacy: float (nullable = true)
 |-- catholic_university_applicants: float (nullable = true)

+--------------------+---------------------+---------------+---------------------+----------------+---------------------+---------------------------------+-------------------------------+-----------------------------+-----------------------------------+
|neighbourhood_w_null|neighbourhood_id_null|total_area_null|total_population_null|home_prices_null|local_employment_null|social_assistance_recipients_null|catholic_school_graduation_null|catholic_school_literacy_null|catholic

DataFrame[neighbourhood_w: string, neighbourhood_id: int, total_area: double, total_population: int, home_prices: int, local_employment: int, social_assistance_recipients: int, catholic_school_graduation: double, catholic_school_literacy: float, catholic_university_applicants: float]

In [5]:
#Function to pre-process the MCI dataset

def dataSetCreactionCrime(crimeFilename,variable):
    """Load the MCI crime CSV and drop rows lacking occurrence-date fields.

    The file is read with ``header=True``, ``inferSchema=True`` and
    ``mode="DROPMALFORMED"``.

    Parameters
    ----------
    crimeFilename : str
        Path of the crime CSV.
    variable : int
        When equal to 1, the schema, the null-value summary and the
        ``describe()`` statistics of the unfiltered frame are printed.

    Returns
    -------
    DataFrame
        The rows whose ``occurrenceyear``, ``occurrencemonth``,
        ``occurrenceday``, ``occurrencedayofyear`` and
        ``occurrencedayofweek`` are all non-null.
    """
    spark = init_spark()
    dataCrimes = spark.read.csv(crimeFilename, header=True, mode="DROPMALFORMED",inferSchema=True)

    #Eliminate null values
    # Built for every value of `variable`: the result is the function's
    # return value, so it must not live inside the diagnostics branch
    # (doing so raised UnboundLocalError whenever variable != 1).
    requiredColumns = (
        "occurrenceyear",
        "occurrencemonth",
        "occurrenceday",
        "occurrencedayofyear",
        "occurrencedayofweek",
    )
    allPresent = dataCrimes[requiredColumns[0]].isNotNull()
    for name in requiredColumns[1:]:
        allPresent = allPresent & dataCrimes[name].isNotNull()
    crimesWithDates = dataCrimes.where(allPresent)

    if(variable==1):

        #Show Schema
        dataCrimes.printSchema()

        # Counting null values
        nullCount = getNullValues(dataCrimes)
        nullCount.show()

        # Feature metrics
        dataCrimes.describe().show()

    return crimesWithDates

# Run the MCI pre-processing with variable=1 so that the schema, the
# null-value summary and the describe() statistics are printed; the returned
# DataFrame is the last expression of the cell and is echoed as its output.
dataSetCreactionCrime("./data/MCI_2014_to_2017.csv",1)

root
 |-- X: double (nullable = true)
 |-- Y: double (nullable = true)
 |-- Index_: integer (nullable = true)
 |-- event_unique_id: string (nullable = true)
 |-- occurrencedate: timestamp (nullable = true)
 |-- reporteddate: timestamp (nullable = true)
 |-- premisetype: string (nullable = true)
 |-- ucr_code: integer (nullable = true)
 |-- ucr_ext: integer (nullable = true)
 |-- offence: string (nullable = true)
 |-- reportedyear: integer (nullable = true)
 |-- reportedmonth: string (nullable = true)
 |-- reportedday: integer (nullable = true)
 |-- reporteddayofyear: integer (nullable = true)
 |-- reporteddayofweek: string (nullable = true)
 |-- reportedhour: integer (nullable = true)
 |-- occurrenceyear: integer (nullable = true)
 |-- occurrencemonth: string (nullable = true)
 |-- occurrenceday: integer (nullable = true)
 |-- occurrencedayofyear: integer (nullable = true)
 |-- occurrencedayofweek: string (nullable = true)
 |-- occurrencehour: integer (nullable = true)
 |-- MCI: string

DataFrame[X: double, Y: double, Index_: int, event_unique_id: string, occurrencedate: timestamp, reporteddate: timestamp, premisetype: string, ucr_code: int, ucr_ext: int, offence: string, reportedyear: int, reportedmonth: string, reportedday: int, reporteddayofyear: int, reporteddayofweek: string, reportedhour: int, occurrenceyear: int, occurrencemonth: string, occurrenceday: int, occurrencedayofyear: int, occurrencedayofweek: string, occurrencehour: int, MCI: string, Division: string, Hood_ID: int, Neighbourhood: string, Lat: double, Long: double, FID: int]