# Parsing
### We chose to parse the dataset from Stanford as it has more information and characteristics than the one from Kaggle. The dataset from Kaggle is a subset of the one from Stanford, so we decided to use the original one.

##### load dataset

In [18]:
from pyspark.sql import SparkSession

# Start (or reuse) the Spark session used by the following cells
spark = (
    SparkSession.builder
    .appName('DataFrame Optimization')
    .getOrCreate()
)

# Read the raw CSV; the first line of the file supplies the column names
df = spark.read.csv('../data/tx_statewide_2020_04_01.csv', header=True)

# Preview the first rows
df.show()

+--------------+----------+--------+--------------------+---+---+-----------------+--------+--------+------+------------+-----------+---------------+---------+--------------------+---------------+--------------+--------+----------------+----------------+------------------+----------------+--------------+--------------+-------------+------------+-------------+------------+------------+---------------+------------------------+-----------------------------+-----------------------------+-----------------------------+
+--------------+----------+--------+--------------------+---+---+-----------------+--------+--------+------+------------+-----------+---------------+---------+--------------------+---------------+--------------+--------+----------------+----------------+------------------+----------------+--------------+--------------+-------------+------------+-------------+------------+------------+---------------+------------------------+-----------------------------+-----------------------

##### how many rows are there?

In [19]:
# Total number of rows loaded from the CSV
df.count()

19752786

##### how many null values are there in the `violation` column?

In [20]:
# Count the rows whose 'violation' value is null
df.where(df.violation.isNull()).count()

1426

# Cleaning the dataset

##### Remove columns with name starting with "raw_" as they are not useful for our analysis

In [21]:
# Collect every column whose name begins with 'raw_' and drop them in one call
raw_columns = [name for name in df.columns if name.startswith('raw_')]
df = df.drop(*raw_columns)

df.show()

+----------+--------+--------------------+---+---+-----------------+--------+--------+------+------------+-----------+---------------+---------+--------------------+---------------+--------------+--------+----------------+----------------+------------------+----------------+--------------+--------------+-------------+------------+-------------+------------+------------+
+----------+--------+--------------------+---+---+-----------------+--------+--------+------+------------+-----------+---------------+---------+--------------------+---------------+--------------+--------+----------------+----------------+------------------+----------------+--------------+--------------+-------------+------------+-------------+------------+------------+
|2006-01-01|00:00:00|route: 0207, mile...| NA| NA|  Hansford County|       B|      11|     5|       white|       male|     7621d63a65|vehicular|Speeding-10% or M...|           TRUE|         FALSE|citation|              NA|              NA|               

##### Remove columns with more than 50% of missing values

In [23]:
from pyspark.sql.functions import col, count, when, isnan, isnull

# Calculate the number of records in the DataFrame
total_records = df.count()

# Single-row aggregation: for each column, the number of values that are missing,
# i.e. the literal strings 'NA' / 'na', NaN, or null.
null_counts = df.select([
    count(when((col(c) == 'NA') | (col(c) == 'na') | isnan(c) | isnull(c), c)).alias(c)
    for c in df.columns
])

# Materialise the aggregation exactly once. Calling null_counts.first() inside a
# per-column comprehension launches a new full-table Spark job for every column.
null_counts_dict = null_counts.first().asDict()

# Drop columns where more than 50% of the values are missing.
# The `total_records and ...` guard avoids a ZeroDivisionError on an empty DataFrame.
df = df.drop(*[
    c for c, null_count in null_counts_dict.items()
    if total_records and null_count / total_records > 0.5
])

In [None]:
# Display the per-column missing-value counts held in null_counts
null_counts.show()

+----+----+--------+-------+-------+-----------+--------+--------+------+------------+-----------+---------------+----+---------+---------------+--------------+-------+----------------+----------------+------------------+----------------+--------------+------------+-------------+------------+-------------+------------+------------+
+----+----+--------+-------+-------+-----------+--------+--------+------+------------+-----------+---------------+----+---------+---------------+--------------+-------+----------------+----------------+------------------+----------------+--------------+------------+-------------+------------+-------------+------------+------------+
|   0|   0|      91|8152359|8152288|         99|    9161|12569006|     0|         236|        251|            250|   1|     1426|              1|             1|   1426|        19293219|        19293864|          19295854|            4385|        444674|    19293327|     14873559|     7671434|      8966745|        1451|     8921042

### remove useless columns

In [1]:
# Columns that are not useful for the analysis
useless_columns = (
    'officer_id_hash',
    'district',
    'region',
    'type',
    'citation_issued',  # meaningless
    'warning_issued',   # meaningless
)

# Columns dropped for now as an optimization
deferred_columns = (
    'outcome',
    'vehicle_make',
    'vehicle_model',
    'vehicle_type',
)

# Remove both groups in a single call
df = df.drop(*useless_columns, *deferred_columns)

df.show()

NameError: name 'df' is not defined

In [None]:
# Checkpoint the reduced DataFrame as parquet, replacing any previous checkpoint
df.write.mode('overwrite').parquet('../data/tx_statewide_2020_04_01-002.parquet')

### Cleaning the dataset for Data Visualization

In [14]:
from pyspark.sql import SparkSession

# Start (or reuse) the Spark session
spark = (
    SparkSession.builder
    .appName('DataFrame Optimization')
    .getOrCreate()
)

# Reload the intermediate parquet checkpoint
df = spark.read.parquet('../data/tx_statewide_2020_04_01-002.parquet')

# Drop the columns that are not needed for visualization:
#   - type, citation_issued, warning_issued: meaningless for this analysis
#   - outcome, vehicle_make, vehicle_model, vehicle_type: dropped for now as an optimization
# 'violation' is intentionally kept.
df = df.drop(
    'type',
    'citation_issued',
    'warning_issued',
    'outcome',
    'vehicle_make',
    'vehicle_model',
    'vehicle_type',
)

24/05/10 16:13:59 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.
                                                                                

##### different values for violation and their counts

In [15]:
# Distinct values of the 'violation' column together with how often each occurs
df.groupBy('violation').count().show()



+--------------------+------+
|           violation| count|
+--------------------+------+
|Following Too Clo...|176546|
|Drive On Improved...|  1427|
|Fail To Signal La...|  1164|
|Cut In After Pass...|   354|
|Following Too Clo...|   987|
|Fail To Display D...|  2295|
|Drive in Left Lan...|    26|
|Damaged/discolore...|     2|
|Drive/Permit to D...|    29|
|Disregard RR Cros...|    39|
|Wrong Side Road-n...|   413|
|Fail To Report Ch...|   511|
|Minor Possesses A...|   688|
|No Valid Inspecti...|     1|
|Speeding Over Lim...|   235|
|Ride, Not Secured...|   304|
|"No/Improper Mud ...|    69|
|Damaged/discolore...|     1|
|Speeding Over Lim...|   150|
|Speeding Over Lim...|    20|
+--------------------+------+
only showing top 20 rows



                                                                                

##### parse dataset to open in pandas

In [16]:
from pyspark.sql import functions as F
from pyspark.sql.functions import col, concat_ws, count, isnan, isnull, to_timestamp, when

# Vehicle years outside this inclusive range are treated as unknown and stored as 0
MIN_VEHICLE_YEAR = 1900
MAX_VEHICLE_YEAR = 2022

# violation -> 0 when the text contains "speed" (case-insensitive), 1 otherwise.
# A null violation does not match the pattern and therefore also becomes 1.
df = df.withColumn(
    'violation',
    F.when(F.lower(F.col('violation')).like('%speed%'), 0).otherwise(1),
)

# subject_sex -> 1 for 'male', 0 for 'female', null for anything else.
# A plain `.otherwise(0)` would silently count missing/unknown values as female.
df = df.withColumn(
    "subject_sex",
    when(col("subject_sex") == "male", 1)
    .when(col("subject_sex") == "female", 0)
    .otherwise(None)
    .cast("integer"),
)

# lat / lng -> float
df = df.withColumn("lat", col("lat").cast("float"))
df = df.withColumn("lng", col("lng").cast("float"))

# subject_race -> 0 white, 1 black, 2 hispanic, 3 asian, 4 for every other value
# (this includes missing values), stored as an integer column
df = df.withColumn(
    "subject_race",
    when(col("subject_race") == "white", 0)
    .when(col("subject_race") == "black", 1)
    .when(col("subject_race") == "hispanic", 2)
    .when(col("subject_race") == "asian", 3)
    .otherwise(4)
    .cast("integer"),
)

# search_vehicle -> 1 if TRUE, 0 if FALSE, null otherwise
df = df.withColumn(
    "search_vehicle",
    when(col("search_vehicle") == "TRUE", 1)
    .when(col("search_vehicle") == "FALSE", 0)
    .otherwise(None)
    .cast("integer"),
)

# vehicle_year -> integer; values that are null, unparsable, or outside
# [MIN_VEHICLE_YEAR, MAX_VEHICLE_YEAR] become 0 (a null never satisfies `between`,
# so it falls through to the `otherwise` branch)
vehicle_year = col("vehicle_year").cast("integer")
df = df.withColumn(
    "vehicle_year",
    when(vehicle_year.between(MIN_VEHICLE_YEAR, MAX_VEHICLE_YEAR), vehicle_year).otherwise(0),
)

# date is 'yyyy-MM-dd' and time is 'HH:mm:ss': combine them into one timestamp column
df = df.withColumn(
    "timestamp",
    to_timestamp(concat_ws(" ", col("date"), col("time")), "yyyy-MM-dd HH:mm:ss"),
)

# drop date and time columns now that they are merged
df = df.drop("date", "time")

# search_conducted -> 1 if TRUE or 'citation', 0 if FALSE, null otherwise.
# NOTE(review): 'citation' is not a boolean value; presumably it appears in rows whose
# fields were shifted while parsing the CSV - confirm before relying on this mapping.
df = df.withColumn(
    "search_conducted",
    when((col("search_conducted") == "TRUE") | (col("search_conducted") == "citation"), 1)
    .when(col("search_conducted") == "FALSE", 0)
    .otherwise(None)
    .cast("integer"),
)

In [17]:
# Preview the DataFrame after the encoding step
df.show()

+--------------------+---------+----------+----------------+------------+-----------+---------+----------------+--------------+------------+-------------------+
|            location|      lat|       lng|     county_name|subject_race|subject_sex|violation|search_conducted|search_vehicle|vehicle_year|          timestamp|
+--------------------+---------+----------+----------------+------------+-----------+---------+----------------+--------------+------------+-------------------+
|route: 0059, mile...|     NULL|      NULL|     Cass County|           0|          1|        0|               0|             0|        1997|2012-03-27 22:10:00|
|route: 0020, mile...|     NULL|      NULL|   Parker County|           0|          1|        1|               0|             0|        2002|2012-03-27 22:11:00|
|route: 0044, mile...|33.958683|-98.529686|  Wichita County|           0|          1|        1|               0|             0|        2000|2012-03-27 22:11:00|
|route: 1788, mile...|     NULL|  

In [18]:
# List each column name with its Spark data type
df.dtypes

[('location', 'string'),
 ('lat', 'float'),
 ('lng', 'float'),
 ('county_name', 'string'),
 ('subject_race', 'int'),
 ('subject_sex', 'int'),
 ('violation', 'int'),
 ('search_conducted', 'int'),
 ('search_vehicle', 'int'),
 ('vehicle_year', 'int'),
 ('timestamp', 'timestamp')]

In [21]:
# Checkpoint the encoded DataFrame as parquet, replacing any previous checkpoint
df.write.mode('overwrite').parquet('../data/tx_statewide_2020_04_01-002_clean.parquet')

24/05/10 16:14:02 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
24/05/10 16:14:02 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 84.44% for 9 writers
24/05/10 16:14:02 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 76.00% for 10 writers
24/05/10 16:14:02 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 69.09% for 11 writers
24/05/10 16:14:02 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 63.33% for 12 writers
24/05/10 16:14:02 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 58.46% for 13 writers
24/05/10 16:14:02 WARN MemoryManager: Total allocation exceeds 95.

Py4JError: py4j does not exist in the JVM

ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/home/lux/.pyenv/versions/3.11.6/lib/python3.11/site-packages/py4j/clientserver.py", line 516, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/lux/.pyenv/versions/3.11.6/lib/python3.11/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/lux/.pyenv/versions/3.11.6/lib/python3.11/site-packages/py4j/clientserver.py", line 539, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving


_____

In [None]:
pip install pyarrow>=4.0.0

zsh:1: 4.0.0 not found
Note: you may need to restart the kernel to use updated packages.


# Save for Pandas VIS

In [None]:
from pyspark.sql import SparkSession
import os
import pandas as pd
from functools import partial

# Start (or reuse) the Spark session
spark = (
    SparkSession.builder
    .appName("parsing")
    .getOrCreate()
)

# Turn on Arrow-based columnar data transfers
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

# Load the cleaned parquet checkpoint
df = spark.read.parquet('../data/tx_statewide_2020_04_01-002_clean.parquet')

# Delete a CSV left over from an earlier run so the export starts from an empty file
stale_csv = '../data/tx_statewide_2020_04_01-002_clean.csv'
if os.path.exists(stale_csv):
    os.remove(stale_csv)

# Function to append a Pandas DataFrame to a CSV file
def append_to_csv(pandas_df, filename, header=True, index=False):
    """Append the rows of ``pandas_df`` to the CSV file at ``filename``.

    The file is opened in append mode, so existing content is kept.

    Args:
        pandas_df: pandas DataFrame to write.
        filename: path of the CSV file to append to.
        header: whether to write the column names before the rows.
        index: whether to write the DataFrame index as a column.
    """
    csv_options = {'mode': 'a', 'header': header, 'index': index}
    pandas_df.to_csv(filename, **csv_options)

# Adjusted function to accept column names
def write_partition_to_csv(column_names, iterator, filename='../data/tx_statewide_2020_04_01-002_clean.csv'):
    """Append one batch of rows to a CSV file.

    Args:
        column_names: column labels for the rows.
        iterator: iterable of rows; it is fully consumed into a list.
        filename: CSV file to append to. Defaults to the visualization export path.

    Nothing is written when the batch is empty. The header line is written only
    when ``filename`` does not exist yet at the time of the call.
    """
    pandas_df = pd.DataFrame(list(iterator), columns=column_names)
    if pandas_df.empty:
        return
    # The first batch creates the file and therefore carries the header
    write_header = not os.path.exists(filename)
    append_to_csv(pandas_df, filename, header=write_header, index=False)

# Capture column names outside the function
column_names = df.columns

# Write the CSV sequentially from the driver. With foreachPartition, several tasks can
# append to the same file at the same time: each may see the file as missing and write
# its own header, and their writes can interleave.
ROWS_PER_CHUNK = 100_000  # rows buffered on the driver before each append

chunk = []
for row in df.toLocalIterator():
    chunk.append(row)
    if len(chunk) >= ROWS_PER_CHUNK:
        write_partition_to_csv(column_names, chunk)
        chunk = []

# Flush the rows left over after the last full chunk
if chunk:
    write_partition_to_csv(column_names, chunk)

24/05/10 16:09:25 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.
                                                                                

### Cleaning the dataset for ML

In [1]:
from pyspark.sql import SparkSession
import os
import pandas as pd
from functools import partial

# Start (or reuse) the Spark session
spark = (
    SparkSession.builder
    .appName("parsing")
    .getOrCreate()
)

# Turn on Arrow-based columnar data transfers
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

# Load the cleaned CSV; the first line of the file supplies the column names
df = spark.read.csv('../data/tx_statewide_2020_04_01-002_clean.csv', header=True)

# Preview the first rows
df.show()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/05/10 16:48:29 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


+--------------------+------------------+-------------------+----------------+------------+-----------+---------+----------------+--------------+------------+-------------------+
|            location|               lat|                lng|     county_name|subject_race|subject_sex|violation|search_conducted|search_vehicle|vehicle_year|          timestamp|
+--------------------+------------------+-------------------+----------------+------------+-----------+---------+----------------+--------------+------------+-------------------+
|route: 0010, mile...|29.650999069213867| -97.51920318603516| Gonzales County|           4|          1|        0|               0|           0.0|           0|2008-08-12 13:46:00|
|route: 0010, mile...| 29.65060043334961| -97.50606536865234| Gonzales County|           4|          1|        1|               0|           0.0|           0|2008-08-12 13:46:00|
|route: 0271, mile...|33.733150482177734| -95.54741668701172|    Lamar County|           0|          1|  

# Save for Pandas ML

In [2]:
from pyspark.sql import SparkSession
import os
import pandas as pd
from functools import partial

# Delete a CSV left over from an earlier run so the export starts from an empty file
stale_ml_csv = '../data/tx_statewide_2020_04_01-002_clean_ml.csv'
if os.path.exists(stale_ml_csv):
    os.remove(stale_ml_csv)

# Function to append a Pandas DataFrame to a CSV file
def append_to_csv(pandas_df, filename, header=True, index=False):
    """Append the rows of ``pandas_df`` to the CSV file at ``filename``.

    The file is opened in append mode, so existing content is kept.

    Args:
        pandas_df: pandas DataFrame to write.
        filename: path of the CSV file to append to.
        header: whether to write the column names before the rows.
        index: whether to write the DataFrame index as a column.
    """
    csv_options = {'mode': 'a', 'header': header, 'index': index}
    pandas_df.to_csv(filename, **csv_options)

# Adjusted function to accept column names
def write_partition_to_csv(column_names, iterator, filename='../data/tx_statewide_2020_04_01-002_clean_ml.csv'):
    """Append one batch of rows to a CSV file.

    Args:
        column_names: column labels for the rows.
        iterator: iterable of rows; it is fully consumed into a list.
        filename: CSV file to append to. Defaults to the ML export path.

    Nothing is written when the batch is empty. The header line is written only
    when ``filename`` does not exist yet at the time of the call.
    """
    pandas_df = pd.DataFrame(list(iterator), columns=column_names)
    if pandas_df.empty:
        return
    # The first batch creates the file and therefore carries the header
    write_header = not os.path.exists(filename)
    append_to_csv(pandas_df, filename, header=write_header, index=False)

# Capture column names outside the function
column_names = df.columns

# Write the CSV sequentially from the driver. With foreachPartition, several tasks can
# append to the same file at the same time: each may see the file as missing and write
# its own header, and their writes can interleave.
ROWS_PER_CHUNK = 100_000  # rows buffered on the driver before each append

chunk = []
for row in df.toLocalIterator():
    chunk.append(row)
    if len(chunk) >= ROWS_PER_CHUNK:
        write_partition_to_csv(column_names, chunk)
        chunk = []

# Flush the rows left over after the last full chunk
if chunk:
    write_partition_to_csv(column_names, chunk)

24/05/10 16:48:47 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors
                                                                                