!pip install pyspark

In [33]:
import os
import shutil
from pathlib import Path
from pyspark import SparkContext
from pyspark.sql import functions
from pyspark.sql.window import Window
from pyspark.sql.session import SparkSession
from concurrent.futures import ThreadPoolExecutor

In [2]:
SPARK_CONTEXT =SparkContext()
SPARK_SESSION =SparkSession(SPARK_CONTEXT)
DATA_DIR =Path(__name__).resolve().parent /'data'

**1. Read Dataset**

In [3]:
def read_csv(path:str)->tuple:
    df =SPARK_SESSION.read.csv(path, inferSchema =True, header =True)
    return path.split('_')[3], df

In [4]:
def read_data(DIR =DATA_DIR):
    dfs, paths ={}, []
    for root, _, files in os.walk(DIR):
        for file in files:
            path =os.path.join(root, file)
            if os.path.exists(path): paths.append(path)
    with ThreadPoolExecutor() as executor:
        jobs = [executor.submit(read_csv, path) for path in paths]
        for job in jobs:
            year, df =job.result()
            dfs[year] =df
    return dfs

In [100]:
datasets =read_data()

**2. Data Exploration and Summary**

In [101]:
def explore_dfs(dfs:dict):
    NULL_FIELDS, dfc =[], []
    title ="Year\t\tColumns\t\tRows\t\tNull Columns\t\tNull Fields"
    print(title)
    print("=="*len(title))
    for year, df in dfs.items():
        cols =df.columns
        dfc =dict(df.dtypes)
        col_nulls_ct =df.select([functions.sum(functions.col(col).isNull().cast("int")).alias(col) for col in df.columns])
        cols_with_nulls = col_nulls_ct.columns
        cols_with_nulls = [col for col in cols_with_nulls if col_nulls_ct.select(functions.col(col)).head()[0] > 0]
        cols_with_nulls= col_nulls_ct.select(cols_with_nulls)
        null_counts_dict = cols_with_nulls.first().asDict()
        total_cols =len(cols)
        null_fields =sum([val for val in null_counts_dict.values()])
        total_fields =df.count() * total_cols
        percantage_nulls =(null_fields/total_fields) *100
        print(f"{year}\t\t{total_cols}\t\t{total_fields}\t\t{len(null_counts_dict)}\t\t\t{null_fields}/{total_fields} - ({percantage_nulls:.2f}%)")
    
    print("\nColumns")
    print("==="*len(title))
    print(dfc)

In [102]:
explore_dfs(datasets)

Year		Columns		Rows		Null Columns		Null Fields
2009		23		9063817		22			5720513/9063817 - (63.11%)
2010		23		9279810		22			5857853/9279810 - (63.12%)
2011		23		8726131		22			5510868/8726131 - (63.15%)
2012		23		8894629		22			5622606/8894629 - (63.21%)
2013		23		8635419		21			5746671/8635419 - (66.55%)
2014		23		8398519		22			5412128/8398519 - (64.44%)
2015		23		8294053		22			5371371/8294053 - (64.76%)
2016		23		8707271		22			5672925/8707271 - (65.15%)
2017		23		8230481		22			5139314/8230481 - (62.44%)
2018		23		8070631		22			5049805/8070631 - (62.57%)
2019		23		6011096		22			3702927/6011096 - (61.60%)
2020		23		5843817		22			3645348/5843817 - (62.38%)
2021		23		2118921		15			226414/2118921 - (10.69%)
2022		23		2342504		9			166395/2342504 - (7.10%)
2023		23		2038375		13			199552/2038375 - (9.79%)

Columns
{'Report_No': 'string', 'Reported_Date': 'string', 'Reported_Time': 'timestamp', 'From_Date': 'string', 'From_Time': 'timestamp', 'To_Date': 'string', 'To_Time': 'timestamp', 'Offense':

**3. Handling missing Values**

In [123]:
df_clean ={}

In [124]:
def clean_dataset(dfs:dict):
    fill_with_mean =['Age']
    fill_with_o =['Race', 'Sex', 'Firearm_Used_Flag']
    fill_with_others =['City', 'Involvement' ,'Description', 'Area']
    for year, df in dfs.items(): 
        df_clean[year] =df.select([*fill_with_others, *fill_with_mean, *fill_with_o])
        df_clean[year] =df_clean[year].na.fill('Other', subset =fill_with_others)
        df_clean[year] =df_clean[year].withColumn('Firearm_Used_Flag',
            functions.when(
            (functions.lower(functions.col("Firearm_Used_Flag")) == "f") | 
            (functions.lower(functions.col("Firearm_Used_Flag")) == "false") | 
            (functions.lower(functions.col("Firearm_Used_Flag")) == "n") | 
            (functions.lower(functions.col("Firearm_Used_Flag")) == "no"), False)
            .when(
            (functions.lower(functions.col("Firearm_Used_Flag")) == "t") | 
            (functions.lower(functions.col("Firearm_Used_Flag")) == "true") | 
            (functions.lower(functions.col("Firearm_Used_Flag")) == "y") | 
            (functions.lower(functions.col("Firearm_Used_Flag")) == "yes"), True)
            .otherwise(False)                                     
        )
        df_clean[year] =df_clean[year].withColumn('Sex',
            functions.when(
                (functions.lower(functions.col("Sex")) =='f') |
                (functions.lower(functions.col("Sex")) =='female'), 'Female')
            .when(
                (functions.lower(functions.col("Sex")) == 'm')|
                (functions.lower(functions.col("Sex")) == 'male'), 'Male')
            .otherwise('Other')                                     
        )

        df_clean[year] =df_clean[year].withColumn('Race',
            functions.when( (functions.lower(functions.col("Race")) =='w'), 'White')
            .when((functions.lower(functions.col("Race")) == 'b'), 'Black')
            .otherwise('Other')                                     
        )
        
        for col in fill_with_mean: 
            df_clean[year] =df_clean[year].withColumn(col, functions.when(df_clean[year][col].isNull(), df_clean[year].select(functions.mean(col)).collect()[0][0]).otherwise(df_clean[year][col]))
        df_clean[year] =df_clean[year].na.drop(how ='any')


In [125]:
clean_dataset(datasets)

**4. Data Transformation**

In [126]:
def transform_dataset(dfs:dict):
    for year, df in dfs.items():
        df_clean[year] =df_clean[year].withColumn("Age", functions.col("Age").cast("int"))

In [127]:
transform_dataset(df_clean)

In [128]:
explore_dfs(df_clean)

Year		Columns		Rows		Null Columns		Null Fields
2009		8		3152632		0			0/3152632 - (0.00%)
2010		8		3227760		0			0/3227760 - (0.00%)
2011		8		3035176		0			0/3035176 - (0.00%)
2012		8		3093784		0			0/3093784 - (0.00%)
2013		8		3003624		0			0/3003624 - (0.00%)
2014		8		2921224		0			0/2921224 - (0.00%)
2015		8		2884888		0			0/2884888 - (0.00%)
2016		8		3028616		0			0/3028616 - (0.00%)
2017		8		2862776		0			0/2862776 - (0.00%)
2018		8		2807176		0			0/2807176 - (0.00%)
2019		8		2090816		0			0/2090816 - (0.00%)
2020		8		2032632		0			0/2032632 - (0.00%)
2021		8		737016		0			0/737016 - (0.00%)
2022		8		814784		0			0/814784 - (0.00%)
2023		8		709000		0			0/709000 - (0.00%)

Columns
{'City': 'string', 'Involvement': 'string', 'Description': 'string', 'Area': 'string', 'Age': 'int', 'Race': 'string', 'Sex': 'string', 'Firearm_Used_Flag': 'boolean'}


In [131]:
def write_csv(dfs:dict):
    try:
        if os.path.exists('out'): shutil.rmtree('out')
        else: os.mkdir('out')
        for year, df in dfs.items(): 
            print(f"Saving {year} dataset")
            df.write.options(header='True', delimiter=',').csv(f"out/{year}")
    except Exception: pass

In [132]:
write_csv(df_clean)

Saving 2009 dataset
Saving 2010 dataset
Saving 2011 dataset
Saving 2012 dataset
Saving 2013 dataset
Saving 2014 dataset
Saving 2015 dataset
Saving 2016 dataset
Saving 2017 dataset
Saving 2018 dataset
Saving 2019 dataset
Saving 2020 dataset
Saving 2021 dataset
Saving 2022 dataset
Saving 2023 dataset
