In [1]:
# Must be included at the beginning of each new notebook. Remember to change the app name.
import findspark
findspark.init('/home/ubuntu/spark-3.2.1-bin-hadoop2.7')
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg
spark = SparkSession.builder.appName('722').getOrCreate()

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/05/19 06:49:29 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# Import dataset by using pandas
import pandas as pd

crime_data_pd=pd.read_excel("Crime.xlsx")
education_data_pd=pd.read_excel("Education.xlsx")

#Transfer pandas dataframe to spark dataframe
crime_data= spark.createDataFrame(crime_data_pd)
education_data= spark.createDataFrame(education_data_pd)

In [3]:
#2 Data Understanding
#2.2 Data Decription
crime_data.show()
education_data.show()


                                                                                

+--------------+----+-----------------------------------+-------------------------------------+-----------------------------------+-----------------------------------------------------+------------------------------------+
|        Region|year|Total amount of convicted juveniles|Total amount of family violence cases|Total amount of people with charges|Total amount of harmful digital communication offense|Total amount of drugs offences cases|
+--------------+----+-----------------------------------+-------------------------------------+-----------------------------------+-----------------------------------------------------+------------------------------------+
|     Northland|2014|                                123|                                  692|                               4335|                                                  NaN|                                 339|
|     Northland|2015|                                138|                                  817|             

In [4]:
#2.3 Data exploration/Display value for each column

def group_and_sum(dataframe, columns):
    for column in columns:
        dataframe.groupBy("Region").sum(column).show()


crime_columns = [
    "Total amount of convicted juveniles",
    "Total amount of family violence cases",
    "Total amount of drugs offences cases",
    "Total amount of people with charges",
    "Total amount of harmful digital communication offense"
]

education_columns = [
    "Total amount of schools",
    "Total amount of Students",
    "Total amount of student attending regularly",
    "Participation in ECE(early childhood education)",
    "Mean household income"
]


group_and_sum(crime_data, crime_columns)
group_and_sum(education_data, education_columns)


+--------------------+----------------------------------------+
|              Region|sum(Total amount of convicted juveniles)|
+--------------------+----------------------------------------+
|          Wellington|                                     186|
|            Auckland|                                     765|
|             Waikato|                                    1221|
|      South Auckland|                                    1833|
|               Otago|                                     495|
|       Bay of Plenty|                                     846|
|          Canterbury|                                    1371|
|           Northland|                                     867|
|           Southland|                                     456|
|Nelson/Marlboroug...|                                     528|
|           Waitematā|                                    1113|
|  Taranaki/Whanganui|                                     738|
|            Waiariki|                  

+--------------------+------------------------------------------------+
|              Region|sum(Total amount of student attending regularly)|
+--------------------+------------------------------------------------+
|          Wellington|                              413768.33999999997|
|            Auckland|                              1286417.8450000002|
|             Waikato|                                      351773.806|
|      South Auckland|                               59098.83899999999|
|               Otago|                              165335.59900000002|
|       Bay of Plenty|                              255676.36899999998|
|          Canterbury|                              464899.91500000004|
|           Northland|                              114122.55500000001|
|           Southland|                                       85321.336|
|Nelson/Marlboroug...|                                       36133.513|
|           Waitematā|                                          

In [5]:
#2.4 Data Quality
#2.4.1 Calculate Missing Value

from pyspark.sql.functions import col, sum as spark_sum, isnan

def calculate_total_missing_values(dataframe):

    missing_counts = dataframe.select([
        (spark_sum(col(column).isNull().cast("int")) + spark_sum(isnan(col(column)).cast("int"))).alias(column)
        for column in dataframe.columns
    ])
    
    total_missing_values = missing_counts.select(
        [spark_sum(col(column)) for column in missing_counts.columns]
    ).first()

    return sum(total_missing_values)


total_missing_crime_data = calculate_total_missing_values(crime_data)
total_missing_education_data = calculate_total_missing_values(education_data)


print(f"Total missing values in crime_data: {total_missing_crime_data}")
print(f"Total missing values in education_data: {total_missing_education_data}")


Total missing values in crime_data: 16
Total missing values in education_data: 157


In [6]:
#3 Data Preparation
#3.1 Data selection
#checking basic info for dataset and each column, crime dataset first
crime_data.show()


+--------------+----+-----------------------------------+-------------------------------------+-----------------------------------+-----------------------------------------------------+------------------------------------+
|        Region|year|Total amount of convicted juveniles|Total amount of family violence cases|Total amount of people with charges|Total amount of harmful digital communication offense|Total amount of drugs offences cases|
+--------------+----+-----------------------------------+-------------------------------------+-----------------------------------+-----------------------------------------------------+------------------------------------+
|     Northland|2014|                                123|                                  692|                               4335|                                                  NaN|                                 339|
|     Northland|2015|                                138|                                  817|             

In [7]:
#checking basic info for dataset and each column, education dataset first
education_data.show()

+--------------+----+-----------------------+------------------------+-------------------------------------------+---------------------+-----------------------------------------------+
|        Region|year|Total amount of schools|Total amount of Students|Total amount of student attending regularly|Mean household income|Participation in ECE(early childhood education)|
+--------------+----+-----------------------+------------------------+-------------------------------------------+---------------------+-----------------------------------------------+
|     Northland|2014|                  151.0|                 22169.0|                                  12946.696|              66248.0|                                         7028.0|
|     Northland|2015|                  152.0|                 20037.0|                         12162.458999999999|              68880.0|                                         6939.0|
|     Northland|2016|                  152.0|                 22336.0|     

In [8]:
#drop the unnecessary column
crime_data=crime_data.drop("Total amount of harmful digital communication offense")
crime_data.show()

+--------------+----+-----------------------------------+-------------------------------------+-----------------------------------+------------------------------------+
|        Region|year|Total amount of convicted juveniles|Total amount of family violence cases|Total amount of people with charges|Total amount of drugs offences cases|
+--------------+----+-----------------------------------+-------------------------------------+-----------------------------------+------------------------------------+
|     Northland|2014|                                123|                                  692|                               4335|                                 339|
|     Northland|2015|                                138|                                  817|                               4528|                                 341|
|     Northland|2016|                                117|                                  772|                               4435|                        

In [9]:
#3.2 Data cleaning
#3.2.1 Missing value

def calculate_mean_values(dataframe, exclude_columns):
    dataframe = dataframe.fillna(0)
    mean_values = dataframe.agg(
        *[avg(col(column)).alias(column) for column in dataframe.columns if column not in exclude_columns]
    ).first().asDict()
    return mean_values

def replace_missing_values_with_mean(dataframe, exclude_columns):
    mean_values = calculate_mean_values(dataframe, exclude_columns)
    dataframe = dataframe.fillna(mean_values)
    return dataframe


exclude_columns=["Region","Year"]

crime_data=replace_missing_values_with_mean(crime_data, exclude_columns)
education_data=replace_missing_values_with_mean(education_data, exclude_columns)


total_missing_crime_data_update=calculate_total_missing_values(crime_data)
total_missing_education_data_update=calculate_total_missing_values(education_data)

print(f"Total missing values in crime_data after dealing: {total_missing_crime_data_update}")
print(f"Total missing values in education_data after dealing: {total_missing_education_data_update}")

Total missing values in crime_data after dealing: 0
Total missing values in education_data after dealing: 0


In [10]:
#3.2.1 Outliers and Extreme values，use quantile(0.1/0.9) to define outliners and extreme values
from pyspark.sql.functions import col, mean, when

spark = SparkSession.builder.appName("OutlierReplacement").getOrCreate()

crime_data_columns = crime_data.columns
print(crime_data_columns)

crime_data_needed_columns = [
    'Total amount of convicted juveniles', 
    'Total amount of family violence cases', 
    'Total amount of people with charges', 
    'Total amount of drugs offences cases'
]

education_data_columns = education_data.columns
print(education_data_columns)

education_data_needed_columns = [
    'Total amount of schools', 
    'Total amount of Students', 
    'Total amount of student attending regularly', 
    'Mean household income', 
    'Participation in ECE(early childhood education)'
]

def replace_outliers(dataframe, columns, lower_quantile=0.1, upper_quantile=0.9):
    for column in columns:
        Q1 = dataframe.approxQuantile(column, [lower_quantile], 0.05)[0]
        Q3 = dataframe.approxQuantile(column, [upper_quantile], 0.05)[0]
        IQR = Q3 - Q1
        
        lower_bound = Q1 - 3 * IQR
        upper_bound = Q3 + 3 * IQR
        
        mean_value = dataframe.select(mean(col(column))).collect()[0][0]
        
        dataframe = dataframe.withColumn(
            column,
            when(col(column) < lower_bound, mean_value).when(col(column) > upper_bound, mean_value).otherwise(col(column))
        )
        
    return dataframe


crime_data = replace_outliers(crime_data, crime_data_needed_columns)
education_data = replace_outliers(education_data, education_data_needed_columns)


crime_data.show()
education_data.show()


['Region', 'year', 'Total amount of convicted juveniles', 'Total amount of family violence cases', 'Total amount of people with charges', 'Total amount of drugs offences cases']
['Region', 'year', 'Total amount of schools', 'Total amount of Students', 'Total amount of student attending regularly', 'Mean household income', 'Participation in ECE(early childhood education)']
+--------------+----+-----------------------------------+-------------------------------------+-----------------------------------+------------------------------------+
|        Region|year|Total amount of convicted juveniles|Total amount of family violence cases|Total amount of people with charges|Total amount of drugs offences cases|
+--------------+----+-----------------------------------+-------------------------------------+-----------------------------------+------------------------------------+
|     Northland|2014|                              123.0|                                692.0|                       

In [11]:
#Check if outliers still exist

def check_outliers(dataframe, columns, lower_quantile=0.1, upper_quantile=0.9):
    outlier_counts = {}
    for column in columns:
        Q1 = dataframe.approxQuantile(column, [lower_quantile], 0.05)[0]
        Q3 = dataframe.approxQuantile(column, [upper_quantile], 0.05)[0]
        IQR = Q3 - Q1
        
        lower_bound = Q1 - 3 * IQR
        upper_bound = Q3 + 3 * IQR
        
        outliers = dataframe.filter((col(column) < lower_bound) | (col(column) > upper_bound))
        count_of_outliers = outliers.count()
        
        outlier_counts[column] = count_of_outliers
    
    return outlier_counts

print(check_outliers(crime_data,crime_data_needed_columns))
print(check_outliers(education_data,education_data_needed_columns))


{'Total amount of convicted juveniles': 0, 'Total amount of family violence cases': 0, 'Total amount of people with charges': 0, 'Total amount of drugs offences cases': 0}
{'Total amount of schools': 0, 'Total amount of Students': 0, 'Total amount of student attending regularly': 0, 'Mean household income': 0, 'Participation in ECE(early childhood education)': 0}


In [12]:
#3.3 Constructing new features
from pyspark.sql.functions import round

education_data = education_data.withColumn(
    "Student Attendance",
    (col("Total amount of student attending regularly") / col("Total amount of Students") * 100).cast("double")
)

education_data = education_data.withColumn("Student Attendance", round(col("Student Attendance"), 1))

education_data = education_data.drop("Total amount of student attending regularly", "Total amount of Students")

education_data.show(20)

+--------------+----+-----------------------+---------------------+-----------------------------------------------+------------------+
|        Region|year|Total amount of schools|Mean household income|Participation in ECE(early childhood education)|Student Attendance|
+--------------+----+-----------------------+---------------------+-----------------------------------------------+------------------+
|     Northland|2014|                  151.0|              66248.0|                                         7028.0|              58.4|
|     Northland|2015|                  152.0|              68880.0|                                         6939.0|              60.7|
|     Northland|2016|                  152.0|              73469.0|                                         7200.0|              55.9|
|     Northland|2017|                  151.0|              75326.0|                                         7422.0|              50.9|
|     Northland|2018|                  151.0|          

In [13]:
#3.4 Data Integrating, using join to combine two datasets (sql)

Integrated_data = crime_data.join(education_data, on=["Region", "Year"], how="left")
Integrated_data.show()

+--------------+----+-----------------------------------+-------------------------------------+-----------------------------------+------------------------------------+-----------------------+---------------------+-----------------------------------------------+------------------+
|        Region|year|Total amount of convicted juveniles|Total amount of family violence cases|Total amount of people with charges|Total amount of drugs offences cases|Total amount of schools|Mean household income|Participation in ECE(early childhood education)|Student Attendance|
+--------------+----+-----------------------------------+-------------------------------------+-----------------------------------+------------------------------------+-----------------------+---------------------+-----------------------------------------------+------------------+
|South Auckland|2015|                              297.0|                               1484.0|                            10484.0|                       

In [14]:
#4.1.1 Feature Selection, choose four of most significant fields for target variables
from pyspark.ml.feature import VectorAssembler, UnivariateFeatureSelector
from pyspark.sql.functions import col


Copy_data = Integrated_data


useless_columns = ['Region', 'year', 'Total amount of convicted juveniles']
needed_columns = []

for column in Copy_data.columns:
    if column not in useless_columns:
        needed_columns.append(column)


features_combination = VectorAssembler(
    inputCols=needed_columns,
    outputCol='useful_features_list'
)


data_with_features = features_combination.transform(Copy_data)


features_selection = UnivariateFeatureSelector(
    featuresCol='useful_features_list',
    labelCol='Total amount of convicted juveniles',
    selectionMode='numTopFeatures'
)

features_selection.setFeatureType('continuous').setLabelType('continuous').setSelectionThreshold(4)


model = features_selection.fit(data_with_features)


selected_indices = model.selectedFeatures

for indices in selected_indices:
    print(needed_columns[indices])

    
selected_feature_names = [needed_columns[i] for i in selected_indices]
    
Final_data = Integrated_data.select(['Region', 'year', 'Total amount of convicted juveniles'] + selected_feature_names)

Final_data.show()

Total amount of family violence cases
Total amount of people with charges
Total amount of drugs offences cases
Mean household income
+--------------+----+-----------------------------------+-------------------------------------+-----------------------------------+------------------------------------+---------------------+
|        Region|year|Total amount of convicted juveniles|Total amount of family violence cases|Total amount of people with charges|Total amount of drugs offences cases|Mean household income|
+--------------+----+-----------------------------------+-------------------------------------+-----------------------------------+------------------------------------+---------------------+
|South Auckland|2015|                              297.0|                               1484.0|                            10484.0|                               565.0|    64268.58564814815|
|      Auckland|2016|                               87.0|                                961.0|        

In [15]:
#4.2 Data reduction, delete those useless regions

useless_regions=['Northern Wellington','East Coast','Waiariki','Waitematā']
Final_data=Final_data.filter(~Final_data['Region'].isin(useless_regions))


Final_data.count()
Final_data.show()

+--------------+----+-----------------------------------+-------------------------------------+-----------------------------------+------------------------------------+---------------------+
|        Region|year|Total amount of convicted juveniles|Total amount of family violence cases|Total amount of people with charges|Total amount of drugs offences cases|Mean household income|
+--------------+----+-----------------------------------+-------------------------------------+-----------------------------------+------------------------------------+---------------------+
|South Auckland|2015|                              297.0|                               1484.0|                            10484.0|                               565.0|    64268.58564814815|
|      Auckland|2016|                               87.0|                                961.0|                             7472.0|                               669.0|             115091.0|
|      Auckland|2020|                        

In [16]:
#6/7 Linear Regression, find out coefficient for each field

from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler


columns_list = Final_data.columns
useless_columns = ['Region', 'year', 'Total amount of convicted juveniles']
needed_features = []

for column in columns_list:
    if column not in useless_columns:
        needed_features.append(column)

print(needed_features)


features_combination = VectorAssembler(
    inputCols=needed_features,
    outputCol='features'
)


data_with_features = features_combination.transform(Final_data)

data_with_features.printSchema()

research_data_1=data_with_features.select('features','Total amount of convicted juveniles')
research_data_1.show()

train_data_1,test_data_1 = research_data_1.randomSplit([0.1,0.9])


r1 = LinearRegression(labelCol='Total amount of convicted juveniles')
r1_training = r1.fit(train_data_1)
print("Coefficients: {} Intercept: {}".format(r1_training.coefficients,r1_training.intercept))
train_results =r1_training.evaluate(train_data_1)

r1_testing=r1.fit(test_data_1)
print("Coefficients: {} Intercept: {}".format(r1_testing.coefficients,r1_testing.intercept))
test_results =r1_testing.evaluate(test_data_1)


#evalute result by using RSME and R2 (in shohil-kishore repo)
test_results.residuals.show()
print("RSME: {}".format(test_results.rootMeanSquaredError))

print("R2: {}".format(test_results.r2))

['Total amount of family violence cases', 'Total amount of people with charges', 'Total amount of drugs offences cases', 'Mean household income']
root
 |-- Region: string (nullable = true)
 |-- year: long (nullable = true)
 |-- Total amount of convicted juveniles: double (nullable = true)
 |-- Total amount of family violence cases: double (nullable = true)
 |-- Total amount of people with charges: double (nullable = true)
 |-- Total amount of drugs offences cases: double (nullable = true)
 |-- Mean household income: double (nullable = true)
 |-- features: vector (nullable = true)

+--------------------+-----------------------------------+
|            features|Total amount of convicted juveniles|
+--------------------+-----------------------------------+
|[1104.0,5758.0,45...|                              162.0|
|[1142.0,7797.0,48...|                              153.0|
|[713.0,3030.0,234...|                               57.0|
|[1584.0,9366.0,58...|                              201.0|

24/05/19 06:50:06 WARN Instrumentation: [fe7c53cd] regParam is zero, which might cause numerical instability and overfitting.
24/05/19 06:50:06 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
24/05/19 06:50:06 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS
24/05/19 06:50:06 WARN InstanceBuilder$NativeLAPACK: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK


Coefficients: [-0.09813496899794395,0.04750424992050634,-0.2232592549852723,-0.00016046583123797864] Intercept: 58.48419221110032


24/05/19 06:50:08 WARN Instrumentation: [86fae73b] regParam is zero, which might cause numerical instability and overfitting.


Coefficients: [0.03486583100106817,0.02305024618977872,-0.08693432763016144,-0.00023642609609127147] Intercept: 17.358086064833564




+-------------------+
|          residuals|
+-------------------+
|  12.30549992372637|
|  21.82893879618746|
| -7.696368902082014|
|   6.25056453179473|
| -6.546302723768939|
|  4.617236234324828|
| 3.4877843983987376|
| -18.31948355510187|
| 15.190517754450326|
|  7.182038814437242|
|-2.2630945383423864|
| -15.97475573328342|
|-13.213647009461475|
| -18.35178097779719|
| 7.0351271991298105|
|-11.427027243707279|
|-16.847158073092885|
|  12.64576972331767|
| -25.67200124482885|
|-12.073052166347935|
+-------------------+
only showing top 20 rows

RSME: 23.055956440734132
R2: 0.8132341842853512


In [22]:
#6/7 Decision Tree models
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler


columns_list = Final_data.columns
useless_columns = ['Region', 'year', 'Total amount of convicted juveniles']
needed_features = []

for column in columns_list:
    if column not in useless_columns:
        needed_features.append(column)

print(needed_features)


features_combination = VectorAssembler(
    inputCols=needed_features,
    outputCol='features'
)


data_with_features = features_combination.transform(Final_data)

data_with_features.printSchema()

research_data_2=data_with_features.select('features','Total amount of convicted juveniles')
research_data_2.show()

train_data_2,test_data_2 = research_data_2.randomSplit([0.1,0.9],seed=42)

d1 = DecisionTreeRegressor(labelCol='Total amount of convicted juveniles', featuresCol='features',maxDepth=5)

d1_model = d1.fit(train_data_2)

#Using train dataset to test model‘s ablity, and use test dataset to check if the predict's ability
predictions_test= d1_model.transform(test_data_2)

evaluator = RegressionEvaluator(
    labelCol='Total amount of convicted juveniles', predictionCol='prediction', metricName='rmse'
)

test_rmse_1 = evaluator.evaluate(predictions_test)
print(f"Root Mean Squared Error (RMSE) on test data = {test_rmse_1}")

print(f"Learned regression tree model:\n{d1_model.toDebugString}")

['Total amount of family violence cases', 'Total amount of people with charges', 'Total amount of drugs offences cases', 'Mean household income']
root
 |-- Region: string (nullable = true)
 |-- year: long (nullable = true)
 |-- Total amount of convicted juveniles: double (nullable = true)
 |-- Total amount of family violence cases: double (nullable = true)
 |-- Total amount of people with charges: double (nullable = true)
 |-- Total amount of drugs offences cases: double (nullable = true)
 |-- Mean household income: double (nullable = true)
 |-- features: vector (nullable = true)

+--------------------+-----------------------------------+
|            features|Total amount of convicted juveniles|
+--------------------+-----------------------------------+
|[1104.0,5758.0,45...|                              162.0|
|[1142.0,7797.0,48...|                              153.0|
|[713.0,3030.0,234...|                               57.0|
|[1584.0,9366.0,58...|                              201.0|

24/05/19 06:57:59 WARN DecisionTreeMetadata: DecisionTree reducing maxBins from 32 to 14 (= number of training instances)


Root Mean Squared Error (RMSE) on test data = 36.21412384633165
Learned regression tree model:
DecisionTreeRegressionModel: uid=DecisionTreeRegressor_6c6caf91dbe0, depth=5, numNodes=23, numFeatures=4
  If (feature 1 <= 8744.5)
   If (feature 1 <= 6093.5)
    If (feature 0 <= 477.5)
     If (feature 0 <= 444.5)
      Predict: 9.0
     Else (feature 0 > 444.5)
      Predict: 48.0
    Else (feature 0 > 477.5)
     If (feature 0 <= 615.5)
      Predict: 66.0
     Else (feature 0 > 615.5)
      If (feature 0 <= 820.0)
       Predict: 99.0
      Else (feature 0 > 820.0)
       Predict: 82.5
   Else (feature 1 > 6093.5)
    If (feature 0 <= 923.0)
     If (feature 0 <= 853.0)
      Predict: 123.0
     Else (feature 0 > 853.0)
      Predict: 120.0
    Else (feature 0 > 923.0)
     If (feature 0 <= 1085.0)
      Predict: 189.0
     Else (feature 0 > 1085.0)
      If (feature 0 <= 1337.5)
       Predict: 138.0
      Else (feature 0 > 1337.5)
       Predict: 120.0
  Else (feature 1 > 8744.5)
   I

In [21]:
#6/7 Random Forest
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler


columns_list = Final_data.columns
useless_columns = ['Region', 'year', 'Total amount of convicted juveniles']
needed_features = []

for column in columns_list:
    if column not in useless_columns:
        needed_features.append(column)

print(needed_features)


features_combination = VectorAssembler(
    inputCols=needed_features,
    outputCol='features'
)


data_with_features = features_combination.transform(Final_data)

data_with_features.printSchema()

research_data_3=data_with_features.select('features','Total amount of convicted juveniles')
research_data_3.show()

train_data_3,test_data_3 = research_data_3.randomSplit([0.1,0.9])

rf=RandomForestRegressor(labelCol='Total amount of convicted juveniles', featuresCol='features', numTrees=5, maxDepth=5)

rf_model = rf.fit(train_data_3)

predictions_test_2=rf_model.transform(test_data_3)

evaluator = RegressionEvaluator(
    labelCol='Total amount of convicted juveniles', predictionCol='prediction', metricName='rmse'
)


test_rmse_2 = evaluator.evaluate(predictions_test_2)
print(f"Root Mean Squared Error (RMSE) on test data = {test_rmse_2}")

print(f"Learned random forest model:\n{rf_model.toDebugString}")



['Total amount of family violence cases', 'Total amount of people with charges', 'Total amount of drugs offences cases', 'Mean household income']
root
 |-- Region: string (nullable = true)
 |-- year: long (nullable = true)
 |-- Total amount of convicted juveniles: double (nullable = true)
 |-- Total amount of family violence cases: double (nullable = true)
 |-- Total amount of people with charges: double (nullable = true)
 |-- Total amount of drugs offences cases: double (nullable = true)
 |-- Mean household income: double (nullable = true)
 |-- features: vector (nullable = true)

+--------------------+-----------------------------------+
|            features|Total amount of convicted juveniles|
+--------------------+-----------------------------------+
|[1104.0,5758.0,45...|                              162.0|
|[1142.0,7797.0,48...|                              153.0|
|[713.0,3030.0,234...|                               57.0|
|[1584.0,9366.0,58...|                              201.0|

24/05/19 06:57:23 WARN DecisionTreeMetadata: DecisionTree reducing maxBins from 32 to 13 (= number of training instances)


Root Mean Squared Error (RMSE) on test data = 36.80865458871838
Learned random forest model:
RandomForestRegressionModel: uid=RandomForestRegressor_f4bad80ff34c, numTrees=5, numFeatures=4
  Tree 0 (weight 1.0):
    If (feature 1 <= 8744.5)
     If (feature 0 <= 637.5)
      If (feature 1 <= 2476.5)
       If (feature 0 <= 474.5)
        Predict: 18.0
       Else (feature 0 > 474.5)
        Predict: 15.0
      Else (feature 1 > 2476.5)
       If (feature 3 <= 68868.79282407407)
        Predict: 51.0
       Else (feature 3 > 68868.79282407407)
        Predict: 60.0
     Else (feature 0 > 637.5)
      If (feature 3 <= 68868.79282407407)
       Predict: 120.0
      Else (feature 3 > 68868.79282407407)
       If (feature 2 <= 390.5)
        Predict: 93.0
       Else (feature 2 > 390.5)
        Predict: 72.0
    Else (feature 1 > 8744.5)
     Predict: 201.0
  Tree 1 (weight 1.0):
    If (feature 0 <= 868.0)
     If (feature 0 <= 474.5)
      Predict: 18.0
     Else (feature 0 > 474.5)
      