In [2]:
import os
# Find the latest version of spark 3.0 from http://www.apache.org/dist/spark/ and enter as the spark version
# For example:
# spark_version = 'spark-3.0.3'
spark_version = 'spark-3.1.3'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop2.7.tgz
!tar xf $SPARK_VERSION-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()

Get:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
Hit:2 http://archive.ubuntu.com/ubuntu bionic InRelease
Get:3 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
Get:4 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
Hit:5 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease
Ign:6 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Get:7 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease [1,581 B]
Hit:8 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Get:9 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Packages [913 kB]
Hit:10 http://ppa.launchpad.net/cran/libgit2/ubuntu bionic InRelease
Get:11 http://archive.ubuntu.com/ubuntu bionic-backports InRelease [83.3 kB]
Get:12 http://security.ubuntu.com/ubuntu bionic-security/universe amd64 Packag

In [3]:
# Download the Postgres driver that will allow Spark to interact with Postgres.
!wget https://jdbc.postgresql.org/download/postgresql-42.2.16.jar

--2022-09-29 22:25:52--  https://jdbc.postgresql.org/download/postgresql-42.2.16.jar
Resolving jdbc.postgresql.org (jdbc.postgresql.org)... 72.32.157.228, 2001:4800:3e1:1::228
Connecting to jdbc.postgresql.org (jdbc.postgresql.org)|72.32.157.228|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1002883 (979K) [application/java-archive]
Saving to: ‘postgresql-42.2.16.jar’


2022-09-29 22:25:54 (1.17 MB/s) - ‘postgresql-42.2.16.jar’ saved [1002883/1002883]



In [4]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Recidivism_Prediction_Cloud").config("spark.driver.extraClassPath","/content/postgresql-42.2.16.jar").getOrCreate()

In [5]:
from pyspark import SparkFiles
url = "https://vasudha-bucket.s3.ca-central-1.amazonaws.com/recidivism_data.csv"
spark.sparkContext.addFile(url)
df = spark.read.csv(SparkFiles.get(""), sep=",", header=True, inferSchema=True)
df.show()

+--------------------+-------------------------+-------------------------+--------------------+--------------------+---------------+------+----------------------+------------+---------------+----------------+--------------+---------------+--------------------------+----------------+--------------------+-----------------+
|Fiscal Year Released|Recidivism Reporting Year|Main Supervising District|        Release Type|    Race - Ethnicity|Age At Release |   Sex|Offense Classification|Offense Type|Offense Subtype|Return to Prison|Days to Return|Recidivism Type|New Offense Classification|New Offense Type|New Offense Sub Type|Target Population|
+--------------------+-------------------------+-------------------------+--------------------+--------------------+---------------+------+----------------------+------------+---------------+----------------+--------------+---------------+--------------------------+----------------+--------------------+-----------------+
|                2010|         

In [6]:
df.columns

['Fiscal Year Released',
 'Recidivism Reporting Year',
 'Main Supervising District',
 'Release Type',
 'Race - Ethnicity',
 'Age At Release ',
 'Sex',
 'Offense Classification',
 'Offense Type',
 'Offense Subtype',
 'Return to Prison',
 'Days to Return',
 'Recidivism Type',
 'New Offense Classification',
 'New Offense Type',
 'New Offense Sub Type',
 'Target Population']

In [7]:
df.dtypes

[('Fiscal Year Released', 'int'),
 ('Recidivism Reporting Year', 'int'),
 ('Main Supervising District', 'string'),
 ('Release Type', 'string'),
 ('Race - Ethnicity', 'string'),
 ('Age At Release ', 'string'),
 ('Sex', 'string'),
 ('Offense Classification', 'string'),
 ('Offense Type', 'string'),
 ('Offense Subtype', 'string'),
 ('Return to Prison', 'string'),
 ('Days to Return', 'string'),
 ('Recidivism Type', 'string'),
 ('New Offense Classification', 'string'),
 ('New Offense Type', 'string'),
 ('New Offense Sub Type', 'string'),
 ('Target Population', 'string')]

In [8]:
import pandas as pd
import numpy as np
from pyspark.sql.functions import col

In [9]:
# Drop columns
df=df.drop('Recidivism Reporting Year', 'Convicting Offense Subtype', 'Main Supervising District', 'Release type: Paroled to Detainder united', 'Part of Target Population')


In [10]:
df.show()

+--------------------+--------------------+--------------------+---------------+------+----------------------+------------+---------------+----------------+--------------+---------------+--------------------------+----------------+--------------------+-----------------+
|Fiscal Year Released|        Release Type|    Race - Ethnicity|Age At Release |   Sex|Offense Classification|Offense Type|Offense Subtype|Return to Prison|Days to Return|Recidivism Type|New Offense Classification|New Offense Type|New Offense Sub Type|Target Population|
+--------------------+--------------------+--------------------+---------------+------+----------------------+------------+---------------+----------------+--------------+---------------+--------------------------+----------------+--------------------+-----------------+
|                2010|              Parole|Black - Non-Hispanic|          25-34|  Male|              C Felony|     Violent|        Robbery|             Yes|           433|            New|

In [11]:
df.where(col("Race - Ethnicity")  == "N/A -").show()


+--------------------+--------------------+----------------+---------------+----+----------------------+------------+---------------+----------------+--------------+---------------+--------------------------+----------------+--------------------+-----------------+
|Fiscal Year Released|        Release Type|Race - Ethnicity|Age At Release | Sex|Offense Classification|Offense Type|Offense Subtype|Return to Prison|Days to Return|Recidivism Type|New Offense Classification|New Offense Type|New Offense Sub Type|Target Population|
+--------------------+--------------------+----------------+---------------+----+----------------------+------------+---------------+----------------+--------------+---------------+--------------------------+----------------+--------------------+-----------------+
|                2011|Discharged – End ...|           N/A -|          25-34|Male|              D Felony|       Other| Other Criminal|              No|          null|  No Recidivism|                      nu

In [12]:
df.where(col("Race - Ethnicity").isNotNull())

DataFrame[Fiscal Year Released: int, Release Type: string, Race - Ethnicity: string, Age At Release : string, Sex: string, Offense Classification: string, Offense Type: string, Offense Subtype: string, Return to Prison: string, Days to Return: string, Recidivism Type: string, New Offense Classification: string, New Offense Type: string, New Offense Sub Type: string, Target Population: string]

In [13]:
from pyspark.sql.functions import regexp_replace
# df.withColumn("Race - Ethnicity", regexp_replace("Race - Ethnicity", "White - Hispanic", "Hispanic")) \
#   .show()
from pyspark.sql.functions import when
df = df.withColumn("Race - Ethnicity", when(col("Race - Ethnicity") == "White - Hispanic","Hispanic") \
      .when(col("Race - Ethnicity") == "White - Non-Hispanic","White") \
      .when(col("Race - Ethnicity") == "Black - Hispanic","Hispanic") \
      .when(col("Race - Ethnicity") == "Black - Non-Hispanic","Black") \
      .when(col("Race - Ethnicity") == "Asian or Pacific Islander - Hispanic","Hispanic") \
      .when(col("Race - Ethnicity") == "Asian or Pacific Islander - Non-Hispanic","Asian") \
      .when(col("Race - Ethnicity") == "American Indian or Alaska Native - Hispanic","Hispanic") \
      .when(col("Race - Ethnicity") == "American Indian or Alaska Native - Non-Hispanic","Native") \
      .when(col("Race - Ethnicity") == "White -","White") \
      .when(col("Race - Ethnicity") == "Black -","Black") )

df.show()


+--------------------+--------------------+----------------+---------------+------+----------------------+------------+---------------+----------------+--------------+---------------+--------------------------+----------------+--------------------+-----------------+
|Fiscal Year Released|        Release Type|Race - Ethnicity|Age At Release |   Sex|Offense Classification|Offense Type|Offense Subtype|Return to Prison|Days to Return|Recidivism Type|New Offense Classification|New Offense Type|New Offense Sub Type|Target Population|
+--------------------+--------------------+----------------+---------------+------+----------------------+------------+---------------+----------------+--------------+---------------+--------------------------+----------------+--------------------+-----------------+
|                2010|              Parole|           Black|          25-34|  Male|              C Felony|     Violent|        Robbery|             Yes|           433|            New|                

In [14]:
df = df.withColumn("Release Type", when(col("Release Type") == "Paroled to Detainer - INS","Paroled to Detainer") \
      .when(col("Release Type") == "Paroled to Detainer - Iowa","Paroled to Detainer") \
      .when(col("Release Type") == "Paroled to Detainer - Out of State","Paroled to Detainer") \
      .when(col("Release Type") == "Paroled to Detainer - U.S. Marshall","Paroled to Detainer") )

df.show()


+--------------------+------------+----------------+---------------+------+----------------------+------------+---------------+----------------+--------------+---------------+--------------------------+----------------+--------------------+-----------------+
|Fiscal Year Released|Release Type|Race - Ethnicity|Age At Release |   Sex|Offense Classification|Offense Type|Offense Subtype|Return to Prison|Days to Return|Recidivism Type|New Offense Classification|New Offense Type|New Offense Sub Type|Target Population|
+--------------------+------------+----------------+---------------+------+----------------------+------------+---------------+----------------+--------------+---------------+--------------------------+----------------+--------------------+-----------------+
|                2010|        null|           Black|          25-34|  Male|              C Felony|     Violent|        Robbery|             Yes|           433|            New|                  C Felony|            Drug|    

In [15]:
df.fillna(value='Other',subset=["Release Type"])

DataFrame[Fiscal Year Released: int, Release Type: string, Race - Ethnicity: string, Age At Release : string, Sex: string, Offense Classification: string, Offense Type: string, Offense Subtype: string, Return to Prison: string, Days to Return: string, Recidivism Type: string, New Offense Classification: string, New Offense Type: string, New Offense Sub Type: string, Target Population: string]

In [16]:
df = df.withColumnRenamed("Fiscal Year Released","Year Released")\
  .withColumnRenamed( "Race - Ethnicity","Race")\
  .withColumnRenamed( "Age At Release ","Age")\
  .withColumnRenamed( "Convicting Offense Classification","Offense Classification")\
  .withColumnRenamed( "Convicting Offense Type","Offense Type")\
  .withColumnRenamed( "Recidivism - Return to Prison numeric","Recidivism")
df.printSchema()

  

root
 |-- Year Released: integer (nullable = true)
 |-- Release Type: string (nullable = true)
 |-- Race: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Offense Classification: string (nullable = true)
 |-- Offense Type: string (nullable = true)
 |-- Offense Subtype: string (nullable = true)
 |-- Return to Prison: string (nullable = true)
 |-- Days to Return: string (nullable = true)
 |-- Recidivism Type: string (nullable = true)
 |-- New Offense Classification: string (nullable = true)
 |-- New Offense Type: string (nullable = true)
 |-- New Offense Sub Type: string (nullable = true)
 |-- Target Population: string (nullable = true)



In [17]:
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number
from pyspark.sql.functions import monotonically_increasing_id
df1 = df.withColumn("ID", row_number().over(Window.orderBy(monotonically_increasing_id())))


In [18]:
df1.show()

+-------------+------------+-----+--------+------+----------------------+------------+---------------+----------------+--------------+---------------+--------------------------+----------------+--------------------+-----------------+---+
|Year Released|Release Type| Race|     Age|   Sex|Offense Classification|Offense Type|Offense Subtype|Return to Prison|Days to Return|Recidivism Type|New Offense Classification|New Offense Type|New Offense Sub Type|Target Population| ID|
+-------------+------------+-----+--------+------+----------------------+------------+---------------+----------------+--------------+---------------+--------------------------+----------------+--------------------+-----------------+---+
|         2010|        null|Black|   25-34|  Male|              C Felony|     Violent|        Robbery|             Yes|           433|            New|                  C Felony|            Drug|         Trafficking|              Yes|  1|
|         2010|        null|White|   25-34|  Mal

In [19]:
# prisoner_df = df.select(['Year Released','Release Type', 'Age','Race','Sex'])
# prisoner_df.show()

+-------------+------------+--------+-----+------+
|Year Released|Release Type|     Age| Race|   Sex|
+-------------+------------+--------+-----+------+
|         2010|        null|   25-34|Black|  Male|
|         2010|        null|   25-34|White|  Male|
|         2010|        null|   35-44|White|  Male|
|         2010|        null|   25-34|White|  Male|
|         2010|        null|   35-44|Black|  Male|
|         2010|        null|   25-34|White|  Male|
|         2010|        null|   25-34|White|  Male|
|         2010|        null|   35-44|Black|  Male|
|         2010|        null|   25-34|White|  Male|
|         2010|        null|   25-34|Black|  Male|
|         2010|        null|   25-34|White|  Male|
|         2010|        null|   25-34|White|  Male|
|         2010|        null|   35-44|Black|  Male|
|         2010|        null|   45-54|White|  Male|
|         2010|        null|   45-54|White|  Male|
|         2010|        null|Under 25|Black|  Male|
|         2010|        null|Und

In [28]:
prisoner_df.fillna(value='Other',subset=["Release Type"]).show()

+---+-------------+------------+--------+-----+------+
| ID|Year Released|Release Type|     Age| Race|   Sex|
+---+-------------+------------+--------+-----+------+
|  1|         2010|       Other|   25-34|Black|  Male|
|  2|         2010|       Other|   25-34|White|  Male|
|  3|         2010|       Other|   35-44|White|  Male|
|  4|         2010|       Other|   25-34|White|  Male|
|  5|         2010|       Other|   35-44|Black|  Male|
|  6|         2010|       Other|   25-34|White|  Male|
|  7|         2010|       Other|   25-34|White|  Male|
|  8|         2010|       Other|   35-44|Black|  Male|
|  9|         2010|       Other|   25-34|White|  Male|
| 10|         2010|       Other|   25-34|Black|  Male|
| 11|         2010|       Other|   25-34|White|  Male|
| 12|         2010|       Other|   25-34|White|  Male|
| 13|         2010|       Other|   35-44|Black|  Male|
| 14|         2010|       Other|   45-54|White|  Male|
| 15|         2010|       Other|   45-54|White|  Male|
| 16|     

In [27]:
prisoner_df = df1.select(['ID','Year Released','Release Type', 'Age','Race','Sex'])
prisoner_df.show()

+---+-------------+------------+--------+-----+------+
| ID|Year Released|Release Type|     Age| Race|   Sex|
+---+-------------+------------+--------+-----+------+
|  1|         2010|        null|   25-34|Black|  Male|
|  2|         2010|        null|   25-34|White|  Male|
|  3|         2010|        null|   35-44|White|  Male|
|  4|         2010|        null|   25-34|White|  Male|
|  5|         2010|        null|   35-44|Black|  Male|
|  6|         2010|        null|   25-34|White|  Male|
|  7|         2010|        null|   25-34|White|  Male|
|  8|         2010|        null|   35-44|Black|  Male|
|  9|         2010|        null|   25-34|White|  Male|
| 10|         2010|        null|   25-34|Black|  Male|
| 11|         2010|        null|   25-34|White|  Male|
| 12|         2010|        null|   25-34|White|  Male|
| 13|         2010|        null|   35-44|Black|  Male|
| 14|         2010|        null|   45-54|White|  Male|
| 15|         2010|        null|   45-54|White|  Male|
| 16|     

In [22]:
offense_type_df = df1[['ID','Offense Classification','Offense Type','Offense Subtype']]
offense_type_df.show()

+---+----------------------+------------+---------------+
| ID|Offense Classification|Offense Type|Offense Subtype|
+---+----------------------+------------+---------------+
|  1|              C Felony|     Violent|        Robbery|
|  2|              D Felony|    Property|          Theft|
|  3|              B Felony|        Drug|    Trafficking|
|  4|              B Felony|       Other| Other Criminal|
|  5|              D Felony|     Violent|        Assault|
|  6|              C Felony|        Drug|    Trafficking|
|  7|              C Felony|        Drug|    Trafficking|
|  8|              D Felony|Public Order|            OWI|
|  9|     Felony - Enhanced|    Property|  Forgery/Fraud|
| 10|              C Felony|     Violent|  Other Violent|
| 11|              C Felony|        Drug|    Trafficking|
| 12|              C Felony|        Drug|    Trafficking|
| 13|              D Felony|        Drug|Drug Possession|
| 14|              D Felony|        Drug|     Other Drug|
| 15|         

In [23]:
Recidivism_details_df = df1[['ID','Days to Return','Recidivism Type','New Offense Classification','New Offense Type','New Offense Sub Type','Target Population','Return to Prison']]
Recidivism_details_df.show()

+---+--------------+---------------+--------------------------+----------------+--------------------+-----------------+----------------+
| ID|Days to Return|Recidivism Type|New Offense Classification|New Offense Type|New Offense Sub Type|Target Population|Return to Prison|
+---+--------------+---------------+--------------------------+----------------+--------------------+-----------------+----------------+
|  1|           433|            New|                  C Felony|            Drug|         Trafficking|              Yes|             Yes|
|  2|           453|           Tech|                      null|            null|                null|               No|             Yes|
|  3|           832|           Tech|                      null|            null|                null|              Yes|             Yes|
|  4|          null|  No Recidivism|                      null|            null|                null|              Yes|              No|
|  5|           116|           Tech|     

In [24]:
prisoner_df.dtypes

[('Year Released', 'int'),
 ('Release Type', 'string'),
 ('Age', 'string'),
 ('Race', 'string'),
 ('Sex', 'string')]

In [25]:
# Configure settings for RDS
mode = "append"
jdbc_url="jdbc:postgresql://database-1.clf82frcjuur.ca-central-1.rds.amazonaws.com:5432/postgres"
config = {"user":"postgres", 
          "password": "Database_1", 
          "driver":"org.postgresql.Driver"}          

In [29]:
# Write df to prisoners table in postgres server
prisoner_df.write.jdbc(url=jdbc_url, table='PRISONERS', mode=mode, properties=config)

In [30]:
Recidivism_details_df.write.jdbc(url=jdbc_url, table='RECIDIVISM', mode=mode, properties=config)

In [31]:
offense_type_df.write.jdbc(url=jdbc_url, table='OFFENSE', mode=mode, properties=config)