# Parsing
### We parse the Stanford dataset because it has more information and characteristics than the Kaggle one. The Kaggle dataset is a subset of the Stanford dataset, so we use the original.

##### Load the dataset

raw_row_number                          0
date                                    0
time                                    0
location                               91
lat                               8152359
lng                               8152288
county_name                            99
district                             9161
precinct                         12569006
region                                  0
subject_race                          236
subject_sex                           251
officer_id_hash                       250
type                                    1
violation                            1426
citation_issued                         1
warning_issued                          1
outcome                              1426
contraband_found                 19295939
contraband_drugs                 19295941
contraband_weapons               19295946
search_conducted                     1673
search_vehicle                     442621
search_basis                     19295951
vehicle_color                    14872995
...
raw_HA_SEARCH_PC_boolean             1674
raw_HA_SEARCH_CONCENT_boolean        1674
raw_HA_INCIDTO_ARREST_boolean        1674
raw_HA_VEHICLE_INVENT_boolean        1674

In [1]:
from pyspark.sql import SparkSession

# Start the Spark session for this notebook (getOrCreate reuses one if it already exists)
spark = (
    SparkSession.builder
    .appName('DataFrame Optimization')
    .getOrCreate()
)

# Read the raw CSV; the first line of the file supplies the column names
df = (
    spark.read
    .option('header', 'true')
    .csv('../data/tx_statewide_2020_04_01-002.csv')
)

# Preview the first rows of the loaded DataFrame
df.show()

24/04/30 09:45:21 WARN Utils: Your hostname, luxpc resolves to a loopback address: 127.0.1.1; using 10.92.4.119 instead (on interface wlan0)
24/04/30 09:45:22 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/04/30 09:45:22 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/04/30 09:45:30 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


+--------------+----------+--------+--------------------+---+---+-----------------+--------+--------+------+------------+-----------+---------------+---------+--------------------+---------------+--------------+--------+----------------+----------------+------------------+----------------+--------------+--------------+-------------+------------+-------------+------------+------------+---------------+------------------------+-----------------------------+-----------------------------+-----------------------------+
+--------------+----------+--------+--------------------+---+---+-----------------+--------+--------+------+------------+-----------+---------------+---------+--------------------+---------------+--------------+--------+----------------+----------------+------------------+----------------+--------------+--------------+-------------+------------+-------------+------------+------------+---------------+------------------------+-----------------------------+-----------------------

### Cleaning the dataset

##### Remove columns whose names start with "raw_", as they are not useful for our analysis

In [2]:
# Discard every column whose name carries the 'raw_' prefix
raw_columns = [name for name in df.columns if name.startswith('raw_')]
df = df.drop(*raw_columns)

##### Remove columns with more than 50% missing values

In [3]:
from pyspark.sql.functions import col, count, when, isnan, isnull

# Fraction of missing values above which a column is dropped
missing_threshold = 0.5

# Total number of rows; denominator for the per-column missing ratio
total_records = df.count()

# Single-row DataFrame holding, for each column, how many values are missing.
# A value counts as missing when it is the literal string 'NA' / 'na', NaN, or NULL.
null_counts = df.select([
    count(when((col(c) == 'NA') | (col(c) == 'na') | isnan(c) | isnull(c), c)).alias(c)
    for c in df.columns
])

# Collect the aggregate row exactly once: every .first() call launches a full
# Spark job over the dataset, so calling it per column repeats the whole scan.
null_counts_dict = null_counts.first().asDict()

# Drop columns whose missing ratio exceeds the threshold.
# The guard avoids a ZeroDivisionError when the DataFrame is empty.
if total_records > 0:
    mostly_missing = [
        name
        for name, missing in null_counts_dict.items()
        if missing / total_records > missing_threshold
    ]
    df = df.drop(*mostly_missing)

24/04/30 09:45:35 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors
                                                                                

In [4]:
# Print the per-column missing-value counts (a single aggregate row)
null_counts.show()



+----+----+--------+-------+-------+-----------+--------+--------+------+------------+-----------+---------------+----+---------+---------------+--------------+-------+----------------+----------------+------------------+----------------+--------------+------------+-------------+------------+-------------+------------+------------+
+----+----+--------+-------+-------+-----------+--------+--------+------+------------+-----------+---------------+----+---------+---------------+--------------+-------+----------------+----------------+------------------+----------------+--------------+------------+-------------+------------+-------------+------------+------------+
|   0|   0|      91|8152359|8152288|         99|    9161|12569006|     0|         236|        251|            250|   1|     1426|              1|             1|   1426|        19293219|        19293864|          19295854|            4385|        444674|    19293327|     14873559|     7671434|      8966745|        1451|     8921042

                                                                                

In [None]:
# Remove the officer_id_hash, district and region columns in a single call
df = df.drop('officer_id_hash', 'district', 'region')


In [5]:
# Preview the first rows of the cleaned DataFrame
df.show()

+----------+--------+--------------------+---+---+-----------------+--------+------+------------+-----------+---------------+---------+--------------------+---------------+--------------+--------+----------------+--------------+------------+-------------+------------+------------+
+----------+--------+--------------------+---+---+-----------------+--------+------+------------+-----------+---------------+---------+--------------------+---------------+--------------+--------+----------------+--------------+------------+-------------+------------+------------+
|2006-01-01|00:00:00|route: 0207, mile...| NA| NA|  Hansford County|       B|     5|       white|       male|     7621d63a65|vehicular|Speeding-10% or M...|           TRUE|         FALSE|citation|           FALSE|         FALSE|          NA|           NA|          PA|          NA|
|2006-01-01|00:00:00|route: 0105, mile...| NA| NA|Montgomery County|       C|     2|    hispanic|       male|     2c0d24dbbd|vehicular|Open Container in..

In [6]:
# Summary statistics for every column; this runs a job over the whole dataset
df.describe().show()

[Stage 93:>                                                        (0 + 8) / 40]

In [None]:
# List the distinct values present in the 'type' column
df.select('type').distinct().show()

In [None]:
# Encode categorical columns as integer codes.
# NOTE: this cell rewrites columns of `df` in place, so run it once per freshly
# loaded DataFrame; re-running it on already encoded columns yields NULLs.

# OneHotEncoderEstimator was renamed to OneHotEncoder in Spark 3.0
from pyspark.ml.feature import OneHotEncoder, StringIndexer
from pyspark.sql.functions import col, lower, when


def encode_labels(column_name, label_codes):
    """Build a Column mapping the string labels of `column_name` to integer codes.

    Labels are compared case-insensitively, so 'TRUE', 'True' and 'true' all match.
    Any value that is not a key of `label_codes` (including 'NA' and NULL) becomes
    NULL, so missing data is not silently folded into one of the real categories.

    Args:
        column_name: name of the column to encode.
        label_codes: dict of label -> integer code; several labels may share a code.

    Returns:
        A pyspark Column expression suitable for `DataFrame.withColumn`.

    Raises:
        ValueError: if `label_codes` is empty.
    """
    if not label_codes:
        raise ValueError('label_codes must contain at least one label')

    normalized = lower(col(column_name))
    encoded = None
    for label, code in label_codes.items():
        matches = normalized == label.lower()
        encoded = when(matches, code) if encoded is None else encoded.when(matches, code)
    return encoded


# subject_sex -> 1 (male) / 0 (female); anything else stays NULL
# assumes the only other label is 'female' - TODO confirm with df.select('subject_sex').distinct()
df = df.withColumn('subject_sex', encode_labels('subject_sex', {'male': 1, 'female': 0}))

# subject_race -> numerical codes (the data stores these labels in lowercase)
# NOTE(review): the Stanford data presumably spells the Asian category
# 'asian/pacific islander'; both spellings map to 3 - confirm with distinct()
df = df.withColumn('subject_race', encode_labels('subject_race', {
    'white': 0,
    'black': 1,
    'hispanic': 2,
    'asian': 3,
    'asian/pacific islander': 3,
    'other': 4,
}))

# type -> numerical codes
df = df.withColumn('type', encode_labels('type', {
    'vehicular': 0,
    'pedestrian': 1,
    'bicycle': 2,
    'other': 3,
}))

# outcome -> numerical codes
df = df.withColumn('outcome', encode_labels('outcome', {
    'warning': 0,
    'citation': 1,
    'arrest': 2,
    'no action': 3,
    'verbal warning': 4,
}))

# Boolean flag columns ('TRUE' / 'FALSE' strings in the CSV) -> 1 / 0.
# Each column is listed exactly once: encoding an already encoded column a second
# time would compare integers against the string labels and wipe the values.
# NOTE(review): the original list named 'search_conducted' twice; the second entry
# may have been meant to be another flag column (e.g. 'search_vehicle') - confirm.
columns_to_encode = ['citation_issued', 'warning_issued', 'search_conducted']

for column in columns_to_encode:
    df = df.withColumn(column, encode_labels(column, {'true': 1, 'false': 0}))

In [None]:
# Enforce an explicit Spark SQL type on selected columns
# NOTE(review): subject_race is cast to 'string' here although the encoding cell
# above maps it to numeric codes - confirm which representation is intended.
target_types = {
    'county_name': 'string',
    'subject_race': 'string',
    'subject_sex': 'int',
    'type': 'int',
    'violation': 'string',
    'citation_issued': 'int',
    'outcome': 'int',
    'search_conducted': 'int',
}

# Replace each listed column with a copy cast to its target type
for name, spark_type in target_types.items():
    df = df.withColumn(name, df[name].astype(spark_type))

In [None]:
# Export df to CSV (disabled; uncomment the line below to write the cleaned data)

# df.write.csv('../data/df.csv', header=True)