In [1]:
# Import necessary libraries
import pandas as pd
from pyproj import Proj, Transformer
from pathlib import Path
import warnings

In [2]:
# Ignore FutureWarnings from pyproj
warnings.simplefilter(action='ignore', category=FutureWarning)

In [3]:
# Read the full 2020 racial-profiling dataset into a DataFrame
arrest_2020 = pd.read_csv('data/2020_Racial_Profiling__RP__dataset_20231109.csv')

# Keep only the rows whose 'Type' column equals 'Arrests'
arrests_2020 = arrest_2020[arrest_2020['Type'] == 'Arrests']

# Save the arrests-only subset under data/, which is where the yearly loader
# looks for it (file_paths[2020] = "data/arrests_ONLY_2020_.csv"); writing it
# to the working directory instead breaks a fresh Restart & Run All
arrests_2020.to_csv('data/arrests_ONLY_2020_.csv', index=False)


In [4]:
# Canonical column order shared by every cleaned yearly DataFrame
desired_column_order = ['arrest_date', 'arrest_time', 'sex', 'race', 'searched', 'reason_stopped', 'search_based_on',
                        'search_found', 'race_known', 'XC', 'yC']

# Label columns: missing values become 'not_listed' and the dtype becomes 'category'
_CATEGORICAL_COLUMNS = ['sex', 'race', 'reason_stopped', 'search_based_on', 'search_found', 'race_known', 'searched']

# Raw columns discarded per year. 2015/2016 and 2018/2019 share a schema, so
# each pair shares one list instead of a copy-pasted duplicate.
_DROP_2015_2016 = ['PRIMARY_KEY', 'LOCATION', 'SECTOR', 'LOCAL_FIELD1']
_DROP_2018_2019 = ['PRIMARY_KEY', 'LOCATION', 'APD_sector', 'CouncilDistrict', 'county_description', 'ZIP',
                   'CENSUS_TRACT']
_DROP_COLUMNS = {
    2015: _DROP_2015_2016,
    2016: _DROP_2015_2016,
    2018: _DROP_2018_2019,
    2019: _DROP_2018_2019,
    2020: ['Stop Key', 'Type', 'TCOLE RACE ETHNICITY', 'Street_Type', 'TCOLE Result of Stop',
           'TCOLE Arrest Based On', 'CENSUS_TRACT', 'Council District', 'COUNTY', 'Custody', 'Location', 'Sector',
           'Zip Code'],
}

# Raw column name -> canonical column name, per year (each raw name listed once)
_RENAME_2015_2016 = {
    'REP_DATE': 'arrest_date', 'REP_TIME': 'arrest_time', 'SEX': 'sex', 'APD_RACE_DESC': 'race',
    'PERSON_SEARCHED_DESC': 'searched', 'REASON_FOR_STOP_DESC': 'reason_stopped',
    'SEARCH_BASED_ON_DESC': 'search_based_on', 'SEARCH_DISC_DESC': 'search_found', 'RACE_KNOWN': 'race_known',
    'X_COORDINATE': 'XC', 'Y_COORDINATE': 'yC',
}
_RENAME_2018_2019 = {
    'REP_DATE': 'arrest_date', 'REP_TIME': 'arrest_time', 'RACE_KNOWN': 'race_known',
    'Reason for Stop – TCOLE form': 'reason_stopped', 'SEX': 'sex', 'APD_RACE_DESC': 'race',
    'Person Search YN': 'searched', 'Search Based On': 'search_based_on', 'Search Found': 'search_found',
    'X_COORDINATE': 'XC', 'Y_COORDINATE': 'yC',
}
_RENAME_COLUMNS = {
    2015: _RENAME_2015_2016,
    2016: _RENAME_2015_2016,
    2018: _RENAME_2018_2019,
    2019: _RENAME_2018_2019,
    2020: {'TCOLE Sex': 'sex', 'Standardized Race Known': 'race_known',
           'Reason for Stop': 'reason_stopped', 'Search Yes or No': 'searched',
           'TCOLE Search Based On': 'search_based_on', 'TCOLE Search Found': 'search_found',
           'Standardized Race': 'race', 'Stop Date': 'arrest_date', 'Stop Time': 'arrest_time',
           'X_COORDINATE': 'XC', 'Y_COORDINATE': 'yC'},
}


def process_dataframe(df, year):
    """Return a cleaned copy of one year's raw arrest data in the canonical schema.

    Steps, in order:
      1. drop the year's unneeded raw columns (names that are absent are ignored);
      2. rename the remaining raw columns to the canonical names;
      3. coerce 'XC'/'yC' (when present) to float64, turning unparseable values
         into NaN and then replacing NaN with the column mean;
      4. fill missing label values with 'not_listed' and cast them to 'category';
      5. parse 'arrest_date' with ``pd.to_datetime``;
      6. reorder to ``desired_column_order`` when every one of those columns exists,
         otherwise leave the column set and order as they are.

    'arrest_time' is passed through unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data for a single year. It is not modified.
    year : int
        Selects the drop and rename tables. A year without an entry gets no
        drops and no renames, so its columns must already use canonical names.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame.

    Raises
    ------
    KeyError
        If 'arrest_date' or any label column is missing after renaming.
    """
    cleaned = (
        df.copy()
        .drop(columns=_DROP_COLUMNS.get(year, []), errors='ignore')
        .rename(columns=_RENAME_COLUMNS.get(year, {}))
    )

    # Fail early with a message that names the gap, rather than letting pandas
    # raise a bare "not in index" from the middle of the cleaning steps.
    required = ['arrest_date', *_CATEGORICAL_COLUMNS]
    missing = [col for col in required if col not in cleaned.columns]
    if missing:
        raise KeyError(f"{year} data is missing required column(s) after renaming: {missing}")

    for col in ('XC', 'yC'):
        if col in cleaned.columns:
            numeric = pd.to_numeric(cleaned[col], errors='coerce').astype('float64')
            # NOTE: rows without a usable coordinate are placed at the column
            # mean, i.e. a synthetic location rather than a recorded one.
            cleaned[col] = numeric.fillna(numeric.mean())

    # Missing labels are recorded as 'not_listed' before the categorical cast
    cleaned[_CATEGORICAL_COLUMNS] = cleaned[_CATEGORICAL_COLUMNS].fillna('not_listed')
    cleaned['arrest_date'] = pd.to_datetime(cleaned['arrest_date'])
    cleaned = cleaned.astype({col: 'category' for col in _CATEGORICAL_COLUMNS})

    # Reorder only when the full canonical column set is available
    if all(col in cleaned.columns for col in desired_column_order):
        cleaned = cleaned[desired_column_order]

    return cleaned



# Source CSV for each year of arrest data
file_paths = {
    2015: "data/2015_Racial_Profiling_Arrests_20231113.csv",
    2016: "data/2016_RP_Arrests_20231113.csv",
    2018: "data/2018_RP_Arrests.csv",
    2019: "data/2019_Racial_Profiling__RP__Arrests_20231113.csv",
    2020: "data/arrests_ONLY_2020_.csv",
}

# Cleaned DataFrames, keyed by year
dfs = {}

for year, file_path in file_paths.items():
    # Load the raw CSV, clean it for its year, and keep the result
    profiling_df = pd.read_csv(Path(file_path))
    dfs[year] = processed_df = process_dataframe(profiling_df, year)

    # Report the resulting dtypes so schema drift between years is visible
    print(f"\nDTYPES FOR {year} DF:\n", processed_df.dtypes, sep="\n")



DTYPES FOR 2015 DF:

arrest_date        datetime64[ns]
arrest_time                 int64
sex                      category
race                     category
searched                 category
reason_stopped           category
search_based_on          category
search_found             category
race_known               category
XC                        float64
yC                        float64
dtype: object

DTYPES FOR 2016 DF:

arrest_date        datetime64[ns]
arrest_time                 int64
sex                      category
race                     category
searched                 category
reason_stopped           category
search_based_on          category
search_found             category
race_known               category
XC                        float64
yC                        float64
dtype: object

DTYPES FOR 2018 DF:

arrest_date        datetime64[ns]
arrest_time                 int64
sex                      category
race                     category
searched              

In [5]:
import pyproj

def convert_coordinates_to_latlon(df, source_crs="EPSG:6578"):
    """Replace projected 'XC'/'yC' coordinates with 'lng'/'lat' columns.

    'XC' and 'yC' are treated as easting/northing in ``source_crs`` and are
    inverse-projected to longitude/latitude. The input frame is left
    untouched; a new frame is returned with 'XC'/'yC' removed and 'lng'/'lat'
    appended as the last two columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to convert. If it lacks either 'XC' or 'yC' it is returned as-is.
    source_crs : str, optional
        CRS of the projected coordinates, in any form ``pyproj.Proj`` accepts.
        The default, EPSG:6578, is the notebook's assumption for the Austin
        data; adjust it if the source data uses a different projection.

    Returns
    -------
    pandas.DataFrame
        The converted frame, or ``df`` itself when there is nothing to convert.
    """
    if 'XC' not in df.columns or 'yC' not in df.columns:
        return df

    # Pass the CRS as an authority string: the Proj(init='epsg:...') form is
    # deprecated in pyproj and emits a FutureWarning.
    projection = pyproj.Proj(source_crs)

    # inverse=True maps projected x/y back to longitude/latitude
    lng, lat = projection(df['XC'].values, df['yC'].values, inverse=True)

    # drop() returns a new frame, so the caller's DataFrame is never mutated
    return df.drop(columns=['XC', 'yC']).assign(lng=lng, lat=lat)

# Swap each year's frame for its longitude/latitude version
for year in list(dfs):
    dfs[year] = convert_coordinates_to_latlon(dfs[year])

# Preview the first three rows of every converted frame
for year, df in dfs.items():
    print(f"\nDF for the year {year}:\n")
    display(df.head(3))


DF for the year 2015:



Unnamed: 0,arrest_date,arrest_time,sex,race,searched,reason_stopped,search_based_on,search_found,race_known,lng,lat
0,2015-01-01,2,M,HISPANIC OR LATINO,YES = 1,CALL FOR SERVICE,INCIDENTAL TO ARREST,NOTHING,NO - RACE OR ETHNICITY WAS NOT KNOWN BEFORE STOP,-97.734192,30.266469
1,2015-01-01,317,M,WHITE,YES = 1,VIOLATION OF TRANSPORTATION CODE/VEHICLE LAWS,INCIDENTAL TO ARREST,DRUGS,NO - RACE OR ETHNICITY WAS NOT KNOWN BEFORE STOP,-97.777689,30.227663
2,2015-01-01,317,F,WHITE,YES = 1,VIOLATION OF TRANSPORTATION CODE/VEHICLE LAWS,INCIDENTAL TO ARREST,DRUGS,NO - RACE OR ETHNICITY WAS NOT KNOWN BEFORE STOP,-97.777689,30.227663



DF for the year 2016:



Unnamed: 0,arrest_date,arrest_time,sex,race,searched,reason_stopped,search_based_on,search_found,race_known,lng,lat
0,2016-01-01,2355,F,WHITE,YES = 1,CALL FOR SERVICE,INCIDENTAL TO ARREST,NOTHING,NO - RACE OR ETHNICITY WAS NOT KNOWN BEFORE STOP,-97.721807,30.391166
1,2016-01-02,123,M,HISPANIC OR LATINO,YES = 1,CALL FOR SERVICE,PROBABLE CAUSE,OTHER,NO - RACE OR ETHNICITY WAS NOT KNOWN BEFORE STOP,-97.787087,30.210685
2,2016-01-02,123,M,HISPANIC OR LATINO,NO = 2,not_listed,not_listed,not_listed,not_listed,-97.787087,30.210685



DF for the year 2018:



Unnamed: 0,arrest_date,arrest_time,sex,race,searched,reason_stopped,search_based_on,search_found,race_known,lng,lat
0,2018-03-26,2010,F,WHITE,YES,Violation of law other than traffic,INCIDENTAL TO ARREST,NOTHING,NO - RACE OR ETHNICITY WAS NOT KNOWN BEFORE STOP,-97.753825,30.26024
1,2018-12-04,1032,M,BLACK,NO,Moving Traffic Violation,not_listed,not_listed,not_listed,-97.742273,30.276663
2,2018-01-22,930,F,BLACK,NO,Moving Traffic Violation,not_listed,not_listed,not_listed,-97.694703,30.348283



DF for the year 2019:



Unnamed: 0,arrest_date,arrest_time,sex,race,searched,reason_stopped,search_based_on,search_found,race_known,lng,lat
0,2019-07-17,338,M,WHITE,YES = 1,Moving Traffic Violation,INCIDENTAL TO ARREST,NOTHING,NO - RACE OR ETHNICITY WAS NOT KNOWN BEFORE STOP,-97.790452,30.227928
1,2019-07-27,2319,M,WHITE,YES = 1,Moving Traffic Violation,INCIDENTAL TO ARREST,NOTHING,NO - RACE OR ETHNICITY WAS NOT KNOWN BEFORE STOP,-97.766423,30.224811
2,2019-03-17,303,F,BLACK,YES = 1,Violation of law other than traffic,INCIDENTAL TO ARREST,CASH,NO - RACE OR ETHNICITY WAS NOT KNOWN BEFORE STOP,-97.723585,30.288333



DF for the year 2020:



Unnamed: 0,arrest_date,arrest_time,sex,race,searched,reason_stopped,search_based_on,search_found,race_known,lng,lat
0,2020-03-08,2242,F,ASIAN,YES = 1,Moving Traffic Violation,INCIDENTAL TO ARREST,DRUGS,NO - RACE OR ETHNICITY WAS NOT KNOWN BEFORE STOP,-97.709511,30.281767
1,2020-07-12,118,F,ASIAN,YES = 1,Moving Traffic Violation,INCIDENTAL TO ARREST,NOTHING,NO - RACE OR ETHNICITY WAS NOT KNOWN BEFORE STOP,-97.774612,30.205007
2,2020-08-22,53,F,ASIAN,YES = 1,Moving Traffic Violation,INCIDENTAL TO ARREST,OTHER,NO - RACE OR ETHNICITY WAS NOT KNOWN BEFORE STOP,-97.750519,30.216603


In [6]:
# Directory (relative to the notebook's working directory) for the cleaned CSVs
output_directory = "data/cleaned_data"

# Create the directory up front: DataFrame.to_csv does not create missing
# parent folders, so a fresh checkout would otherwise fail on the first write
Path(output_directory).mkdir(parents=True, exist_ok=True)

# Write one cleaned CSV per year
for year, df in dfs.items():
    output_file_path = Path(output_directory, f"{year}_cleaned_data.csv")
    df.to_csv(output_file_path, index=False)

    print(f"DataFrame for the year {year} saved to: {output_file_path}")


DataFrame for the year 2015 saved to: data\cleaned_data\2015_cleaned_data.csv
DataFrame for the year 2016 saved to: data\cleaned_data\2016_cleaned_data.csv
DataFrame for the year 2018 saved to: data\cleaned_data\2018_cleaned_data.csv
DataFrame for the year 2019 saved to: data\cleaned_data\2019_cleaned_data.csv
DataFrame for the year 2020 saved to: data\cleaned_data\2020_cleaned_data.csv


In [7]:
# Stack every yearly frame into one DataFrame with a fresh 0..n-1 index
combined_df = pd.concat(list(dfs.values()), ignore_index=True)

# Preview the first three rows of the stacked frame
print("\nCombined DataFrame:\n")
display(combined_df.head(3))

# Write the stacked frame into the same output directory as the per-year files
combined_output_path = Path(output_directory) / "combined_cleaned_data.csv"
combined_df.to_csv(combined_output_path, index=False)

print(f"\nCombined DataFrame saved to: {combined_output_path}")


Combined DataFrame:



Unnamed: 0,arrest_date,arrest_time,sex,race,searched,reason_stopped,search_based_on,search_found,race_known,lng,lat
0,2015-01-01,2,M,HISPANIC OR LATINO,YES = 1,CALL FOR SERVICE,INCIDENTAL TO ARREST,NOTHING,NO - RACE OR ETHNICITY WAS NOT KNOWN BEFORE STOP,-97.734192,30.266469
1,2015-01-01,317,M,WHITE,YES = 1,VIOLATION OF TRANSPORTATION CODE/VEHICLE LAWS,INCIDENTAL TO ARREST,DRUGS,NO - RACE OR ETHNICITY WAS NOT KNOWN BEFORE STOP,-97.777689,30.227663
2,2015-01-01,317,F,WHITE,YES = 1,VIOLATION OF TRANSPORTATION CODE/VEHICLE LAWS,INCIDENTAL TO ARREST,DRUGS,NO - RACE OR ETHNICITY WAS NOT KNOWN BEFORE STOP,-97.777689,30.227663



Combined DataFrame saved to: data\cleaned_data\combined_cleaned_data.csv
