In [1]:
# Import necessary libraries
import pandas as pd
from pyproj import Proj, Transformer
from pathlib import Path
import warnings
import os

In [2]:
# Ignore FutureWarnings from pyproj
warnings.simplefilter(action='ignore', category=FutureWarning)

In [3]:
# Read the CSV file into a Pandas DataFrame
arrest_2020 = pd.read_csv('data/2020_Racial_Profiling__RP__dataset_20231109.csv')

# Filter records for 'Arrests' in the 'Type' column
arrests_2020 = arrest_2020[arrest_2020['Type'] == 'Arrests']

# Display the resulting DataFram
arrests_2020.to_csv('data/arrests_ONLY_2020_.csv', index=False)


In [13]:
import pandas as pd
import pyproj
from pathlib import Path
from IPython.display import display

# Define the desired column order
desired_column_order = ['arrest_date', 'arrest_time', 'sex', 'race', 'searched', 'reason_stopped', 'search_based_on',
                        'search_found', 'race_known', 'lng', 'lat']

# Function to process the DataFrame for a specific year
def process_dataframe(df, year):
    # Create a copy of the DataFrame
    df_copy = df.copy()

    # Drop specified columns based on the year
    drop_columns = {
        2015: ['PRIMARY_KEY', 'LOCATION', 'SECTOR', 'LOCAL_FIELD1'],
        2016: ['PRIMARY_KEY', 'LOCATION', 'SECTOR', 'LOCAL_FIELD1'],
        2018: ['PRIMARY_KEY', 'LOCATION', 'APD_sector', 'CouncilDistrict', 'county_description', 'ZIP', 'CENSUS_TRACT'],
        2019: ['PRIMARY_KEY', 'LOCATION', 'APD_sector', 'CouncilDistrict', 'county_description', 'ZIP', 'CENSUS_TRACT'],
        2020: ['Stop Key', 'Type', 'TCOLE RACE ETHNICITY', 'Street_Type', 'TCOLE Result of Stop',
               'TCOLE Arrest Based On', 'CENSUS_TRACT', 'Council District', 'COUNTY', 'Custody', 'Location', 'Sector', 'Zip Code']
    }
    df_copy = df_copy.drop(columns=drop_columns.get(year, []), errors='ignore')

    # Rename columns based on the year
    rename_columns = {
        2015: {'REP_DATE': 'arrest_date', 'REP_TIME': 'arrest_time', 'SEX': 'sex', 'APD_RACE_DESC': 'race',
               'PERSON_SEARCHED_DESC': 'searched', 'REASON_FOR_STOP_DESC': 'reason_stopped',
               'SEARCH_BASED_ON_DESC': 'search_based_on', 'SEARCH_DISC_DESC': 'search_found', 'RACE_KNOWN': 'race_known',
               'X_COORDINATE': 'XC', 'Y_COORDINATE': 'yC'},
        2016: {'REP_DATE': 'arrest_date', 'REP_TIME': 'arrest_time', 'SEX': 'sex', 'APD_RACE_DESC': 'race',
               'PERSON_SEARCHED_DESC': 'searched', 'REASON_FOR_STOP_DESC': 'reason_stopped',
               'SEARCH_BASED_ON_DESC': 'search_based_on', 'SEARCH_DISC_DESC': 'search_found', 'RACE_KNOWN': 'race_known',
              'X_COORDINATE': 'XC', 'Y_COORDINATE': 'yC'},
        2018: {'REP_DATE': 'arrest_date', 'REP_TIME': 'arrest_time','X_COORDINATE': 'XC', 'Y_COORDINATE': 'yC', 'RACE_KNOWN': 'race_known', 'Reason for Stop – TCOLE form': 'reason_stopped',
               'SEX': 'sex', 'APD_RACE_DESC': 'race', 'Person Search YN': 'searched', 'Search Based On': 'search_based_on',
               'Search Found': 'search_found','X_COORDINATE': 'XC', 'Y_COORDINATE': 'yC'},
        2019: {'REP_DATE': 'arrest_date', 'REP_TIME': 'arrest_time', 'X_COORDINATE': 'XC', 'Y_COORDINATE':'yC','RACE_KNOWN': 'race_known', 'Reason for Stop – TCOLE form': 'reason_stopped',
               'SEX': 'sex', 'APD_RACE_DESC': 'race', 'Person Search YN': 'searched', 'Search Based On': 'search_based_on',
               'Search Found': 'search_found','X_COORDINATE': 'XC', 'Y_COORDINATE': 'yC'},
        2020: {'TCOLE Sex': 'sex', 'Standardized Race Known': 'race_known',
               'Reason for Stop': 'reason_stopped', 'Search Yes or No': 'searched',
               'TCOLE Search Based On': 'search_based_on', 'TCOLE Search Found': 'search_found',
               'Standardized Race': 'race', 'Stop Date': 'arrest_date', 'Stop Time': 'arrest_time','X_COORDINATE': 'XC', 'Y_COORDINATE': 'yC'},
    }
    df_copy = df_copy.rename(columns=rename_columns.get(year, {}))
    

    for col in ['XC', 'yC']:
        if col in df_copy.columns:
            df_copy[col] = pd.to_numeric(df_copy[col], errors='coerce')
            # Convert columns to the appropriate data type
            df_copy[col] = df_copy[col].astype('float64')
            # Fill NaN values with the mean
            df_copy[col] = df_copy[col].fillna(df_copy[col].mean())

    


    # Replace missing or null values with "unknown"
    columns_to_replace = ['sex', 'race', 'reason_stopped', 'search_based_on', 'search_found', 'race_known', 'searched']
    df_copy[columns_to_replace] = df_copy[columns_to_replace].fillna('not_listed')

    # Convert columns to appropriate data types
    df_copy['arrest_date'] = pd.to_datetime(df_copy['arrest_date'])
    df_copy['arrest_time'] = df_copy['arrest_time'].astype(str).str.zfill(4)
    df_copy['arrest_time'] = pd.to_timedelta(df_copy['arrest_time'].str.zfill(4).str[:2] + ':' + df_copy['arrest_time'].str[2:] + ':00')
    df_copy['arrest_time'] = df_copy['arrest_date'] + df_copy['arrest_time']
    df_copy['sex'] = df_copy['sex'].astype('category')
    df_copy['race'] = df_copy['race'].astype('category')
    df_copy['searched'] = df_copy['searched'].astype('category')
    df_copy['reason_stopped'] = df_copy['reason_stopped'].astype('category')
    df_copy['search_based_on'] = df_copy['search_based_on'].astype('category')
    df_copy['search_found'] = df_copy['search_found'].astype('category')
    df_copy['race_known'] = df_copy['race_known'].astype('category')

    # Ensure that desired columns are present in the DataFrame
    df_copy = df_copy[desired_column_order] if all(col in df_copy.columns for col in desired_column_order) else df_copy

    return df_copy

# Function to convert XC (longitude) and yC (latitude) to actual longitude and latitude
def convert_coordinates_to_latlon(df):
    if 'XC' in df.columns and 'yC' in df.columns:
        # Define the projection for Austin, Texas (you may need to adjust this based on your data)
        austin_proj = pyproj.Proj(init='epsg:6578')  # Assuming EPSG 6578 for Austin

        # Convert XC and yC to longitude and latitude
        df['lng'], df['lat'] = austin_proj(df['XC'].values, df['yC'].values, inverse=True)

        # Drop the original XC and yC columns
        df = df.drop(['XC', 'yC'], axis=1)

    return df

def replace_null_with_mean(df):
    df['lat'] = df['lat'].fillna(df['lat'].mean())
    df['lng'] = df['lng'].fillna(df['lng'].mean())
    return df

# Loop through each DataFrame and replace null values in 'lat' and 'lng' with the mean
for year, processed_df in dfs.items():
    dfs[year] = replace_null_with_mean(processed_df)

# Dictionary to store DataFrames
dfs = {}

# List of CSV file paths
file_paths = {
    2015: "data/2015_Racial_Profiling_Arrests_20231113.csv",
    2016: "data/2016_RP_Arrests_20231113.csv",
    2018: "data/2018_RP_Arrests.csv",
    2019: "data/2019_Racial_Profiling__RP__Arrests_20231113.csv",
    2020: "data/arrests_ONLY_2020_.csv",
}

# Loop through each file, read data, and add DataFrame to the dictionary
for year, file_path in file_paths.items():
    # Read the CSV file into a Pandas DataFrame
    profiling_df = pd.read_csv(Path(file_path))
    
    # Process the DataFrame for the specific year
    processed_df = process_dataframe(profiling_df, year)

    # Convert coordinates to latitude and longitude
    processed_df = convert_coordinates_to_latlon(processed_df)

    # Add the processed DataFrame to the dictionary
    dfs[year] = processed_df

    # Check data types
    print(f"\nDTYPES FOR {year} DF:\n")
    print(processed_df.dtypes)

# Display the first few rows of each DataFrame after the conversion
for year, processed_df in dfs.items():
    print(f"\nDF for the year {year}:\n")
    display(processed_df.head(3))



DTYPES FOR 2015 DF:

arrest_date        datetime64[ns]
arrest_time        datetime64[ns]
sex                      category
race                     category
searched                 category
reason_stopped           category
search_based_on          category
search_found             category
race_known               category
lng                       float64
lat                       float64
dtype: object

DTYPES FOR 2016 DF:

arrest_date        datetime64[ns]
arrest_time        datetime64[ns]
sex                      category
race                     category
searched                 category
reason_stopped           category
search_based_on          category
search_found             category
race_known               category
lng                       float64
lat                       float64
dtype: object

DTYPES FOR 2018 DF:

arrest_date        datetime64[ns]
arrest_time        datetime64[ns]
race_known               category
reason_stopped           category
sex                   

Unnamed: 0,arrest_date,arrest_time,sex,race,searched,reason_stopped,search_based_on,search_found,race_known,lng,lat
0,2015-01-01,2015-01-01 00:02:00,M,HISPANIC OR LATINO,YES = 1,CALL FOR SERVICE,INCIDENTAL TO ARREST,NOTHING,NO - RACE OR ETHNICITY WAS NOT KNOWN BEFORE STOP,-97.734192,30.266469
1,2015-01-01,2015-01-01 03:17:00,M,WHITE,YES = 1,VIOLATION OF TRANSPORTATION CODE/VEHICLE LAWS,INCIDENTAL TO ARREST,DRUGS,NO - RACE OR ETHNICITY WAS NOT KNOWN BEFORE STOP,-97.777689,30.227663
2,2015-01-01,2015-01-01 03:17:00,F,WHITE,YES = 1,VIOLATION OF TRANSPORTATION CODE/VEHICLE LAWS,INCIDENTAL TO ARREST,DRUGS,NO - RACE OR ETHNICITY WAS NOT KNOWN BEFORE STOP,-97.777689,30.227663



DF for the year 2016:



Unnamed: 0,arrest_date,arrest_time,sex,race,searched,reason_stopped,search_based_on,search_found,race_known,lng,lat
0,2016-01-01,2016-01-01 23:55:00,F,WHITE,YES = 1,CALL FOR SERVICE,INCIDENTAL TO ARREST,NOTHING,NO - RACE OR ETHNICITY WAS NOT KNOWN BEFORE STOP,-97.721807,30.391166
1,2016-01-02,2016-01-02 01:23:00,M,HISPANIC OR LATINO,YES = 1,CALL FOR SERVICE,PROBABLE CAUSE,OTHER,NO - RACE OR ETHNICITY WAS NOT KNOWN BEFORE STOP,-97.787087,30.210685
2,2016-01-02,2016-01-02 01:23:00,M,HISPANIC OR LATINO,NO = 2,not_listed,not_listed,not_listed,not_listed,-97.787087,30.210685



DF for the year 2018:



Unnamed: 0,arrest_date,arrest_time,race_known,reason_stopped,sex,race,searched,search_based_on,search_found,lng,lat
0,2018-03-26,2018-03-26 20:10:00,NO - RACE OR ETHNICITY WAS NOT KNOWN BEFORE STOP,Violation of law other than traffic,F,WHITE,YES,INCIDENTAL TO ARREST,NOTHING,-97.753825,30.26024
1,2018-12-04,2018-12-04 10:32:00,not_listed,Moving Traffic Violation,M,BLACK,NO,not_listed,not_listed,-97.742273,30.276663
2,2018-01-22,2018-01-22 09:30:00,not_listed,Moving Traffic Violation,F,BLACK,NO,not_listed,not_listed,-97.694703,30.348283



DF for the year 2019:



Unnamed: 0,arrest_date,arrest_time,race_known,reason_stopped,sex,race,searched,search_based_on,search_found,lng,lat
0,2019-07-17,2019-07-17 03:38:00,NO - RACE OR ETHNICITY WAS NOT KNOWN BEFORE STOP,Moving Traffic Violation,M,WHITE,YES = 1,INCIDENTAL TO ARREST,NOTHING,-97.790452,30.227928
1,2019-07-27,2019-07-27 23:19:00,NO - RACE OR ETHNICITY WAS NOT KNOWN BEFORE STOP,Moving Traffic Violation,M,WHITE,YES = 1,INCIDENTAL TO ARREST,NOTHING,-97.766423,30.224811
2,2019-03-17,2019-03-17 03:03:00,NO - RACE OR ETHNICITY WAS NOT KNOWN BEFORE STOP,Violation of law other than traffic,F,BLACK,YES = 1,INCIDENTAL TO ARREST,CASH,-97.723585,30.288333



DF for the year 2020:



Unnamed: 0,sex,race_known,reason_stopped,searched,search_based_on,search_found,race,arrest_date,arrest_time,lng,lat
0,F,NO - RACE OR ETHNICITY WAS NOT KNOWN BEFORE STOP,Moving Traffic Violation,YES = 1,INCIDENTAL TO ARREST,DRUGS,ASIAN,2020-03-08,2020-03-08 22:42:00,-97.709511,30.281767
1,F,NO - RACE OR ETHNICITY WAS NOT KNOWN BEFORE STOP,Moving Traffic Violation,YES = 1,INCIDENTAL TO ARREST,NOTHING,ASIAN,2020-07-12,2020-07-12 01:18:00,-97.774612,30.205007
2,F,NO - RACE OR ETHNICITY WAS NOT KNOWN BEFORE STOP,Moving Traffic Violation,YES = 1,INCIDENTAL TO ARREST,OTHER,ASIAN,2020-08-22,2020-08-22 00:53:00,-97.750519,30.216603


In [14]:
# Display null values for each DataFrame
for year, processed_df in dfs.items():
    print(f"\nDF for the year {year}:\n")
    display(processed_df.head(3))  # Display the first few rows
    print(f"\nNull values in the DataFrame for the year {year}:\n")
    display(processed_df.isnull().sum())


DF for the year 2015:



Unnamed: 0,arrest_date,arrest_time,sex,race,searched,reason_stopped,search_based_on,search_found,race_known,lng,lat
0,2015-01-01,2015-01-01 00:02:00,M,HISPANIC OR LATINO,YES = 1,CALL FOR SERVICE,INCIDENTAL TO ARREST,NOTHING,NO - RACE OR ETHNICITY WAS NOT KNOWN BEFORE STOP,-97.734192,30.266469
1,2015-01-01,2015-01-01 03:17:00,M,WHITE,YES = 1,VIOLATION OF TRANSPORTATION CODE/VEHICLE LAWS,INCIDENTAL TO ARREST,DRUGS,NO - RACE OR ETHNICITY WAS NOT KNOWN BEFORE STOP,-97.777689,30.227663
2,2015-01-01,2015-01-01 03:17:00,F,WHITE,YES = 1,VIOLATION OF TRANSPORTATION CODE/VEHICLE LAWS,INCIDENTAL TO ARREST,DRUGS,NO - RACE OR ETHNICITY WAS NOT KNOWN BEFORE STOP,-97.777689,30.227663



Null values in the DataFrame for the year 2015:



arrest_date        0
arrest_time        0
sex                0
race               0
searched           0
reason_stopped     0
search_based_on    0
search_found       0
race_known         0
lng                0
lat                0
dtype: int64


DF for the year 2016:



Unnamed: 0,arrest_date,arrest_time,sex,race,searched,reason_stopped,search_based_on,search_found,race_known,lng,lat
0,2016-01-01,2016-01-01 23:55:00,F,WHITE,YES = 1,CALL FOR SERVICE,INCIDENTAL TO ARREST,NOTHING,NO - RACE OR ETHNICITY WAS NOT KNOWN BEFORE STOP,-97.721807,30.391166
1,2016-01-02,2016-01-02 01:23:00,M,HISPANIC OR LATINO,YES = 1,CALL FOR SERVICE,PROBABLE CAUSE,OTHER,NO - RACE OR ETHNICITY WAS NOT KNOWN BEFORE STOP,-97.787087,30.210685
2,2016-01-02,2016-01-02 01:23:00,M,HISPANIC OR LATINO,NO = 2,not_listed,not_listed,not_listed,not_listed,-97.787087,30.210685



Null values in the DataFrame for the year 2016:



arrest_date        0
arrest_time        0
sex                0
race               0
searched           0
reason_stopped     0
search_based_on    0
search_found       0
race_known         0
lng                0
lat                0
dtype: int64


DF for the year 2018:



Unnamed: 0,arrest_date,arrest_time,race_known,reason_stopped,sex,race,searched,search_based_on,search_found,lng,lat
0,2018-03-26,2018-03-26 20:10:00,NO - RACE OR ETHNICITY WAS NOT KNOWN BEFORE STOP,Violation of law other than traffic,F,WHITE,YES,INCIDENTAL TO ARREST,NOTHING,-97.753825,30.26024
1,2018-12-04,2018-12-04 10:32:00,not_listed,Moving Traffic Violation,M,BLACK,NO,not_listed,not_listed,-97.742273,30.276663
2,2018-01-22,2018-01-22 09:30:00,not_listed,Moving Traffic Violation,F,BLACK,NO,not_listed,not_listed,-97.694703,30.348283



Null values in the DataFrame for the year 2018:



arrest_date        0
arrest_time        0
race_known         0
reason_stopped     0
sex                0
race               0
searched           0
search_based_on    0
search_found       0
lng                0
lat                0
dtype: int64


DF for the year 2019:



Unnamed: 0,arrest_date,arrest_time,race_known,reason_stopped,sex,race,searched,search_based_on,search_found,lng,lat
0,2019-07-17,2019-07-17 03:38:00,NO - RACE OR ETHNICITY WAS NOT KNOWN BEFORE STOP,Moving Traffic Violation,M,WHITE,YES = 1,INCIDENTAL TO ARREST,NOTHING,-97.790452,30.227928
1,2019-07-27,2019-07-27 23:19:00,NO - RACE OR ETHNICITY WAS NOT KNOWN BEFORE STOP,Moving Traffic Violation,M,WHITE,YES = 1,INCIDENTAL TO ARREST,NOTHING,-97.766423,30.224811
2,2019-03-17,2019-03-17 03:03:00,NO - RACE OR ETHNICITY WAS NOT KNOWN BEFORE STOP,Violation of law other than traffic,F,BLACK,YES = 1,INCIDENTAL TO ARREST,CASH,-97.723585,30.288333



Null values in the DataFrame for the year 2019:



arrest_date        0
arrest_time        0
race_known         0
reason_stopped     0
sex                0
race               0
searched           0
search_based_on    0
search_found       0
lng                0
lat                0
dtype: int64


DF for the year 2020:



Unnamed: 0,sex,race_known,reason_stopped,searched,search_based_on,search_found,race,arrest_date,arrest_time,lng,lat
0,F,NO - RACE OR ETHNICITY WAS NOT KNOWN BEFORE STOP,Moving Traffic Violation,YES = 1,INCIDENTAL TO ARREST,DRUGS,ASIAN,2020-03-08,2020-03-08 22:42:00,-97.709511,30.281767
1,F,NO - RACE OR ETHNICITY WAS NOT KNOWN BEFORE STOP,Moving Traffic Violation,YES = 1,INCIDENTAL TO ARREST,NOTHING,ASIAN,2020-07-12,2020-07-12 01:18:00,-97.774612,30.205007
2,F,NO - RACE OR ETHNICITY WAS NOT KNOWN BEFORE STOP,Moving Traffic Violation,YES = 1,INCIDENTAL TO ARREST,OTHER,ASIAN,2020-08-22,2020-08-22 00:53:00,-97.750519,30.216603



Null values in the DataFrame for the year 2020:



sex                0
race_known         0
reason_stopped     0
searched           0
search_based_on    0
search_found       0
race               0
arrest_date        0
arrest_time        0
lng                0
lat                0
dtype: int64

In [16]:
# Iterate over the dictionary
for year, processed_df in dfs.items():
    # Print the year
    print(f"Unique values for {year}:")

    # Iterate over columns in the data frame
    for column in processed_df.columns:
        # Use unique() to get unique values in each column
        unique_values = processed_df[column].unique()
        
        # Print the column name and unique values
        print(f"{column}: {unique_values}")

    # Add a separator for clarity between years
    print("-" * 40)

Unique values for 2015:
arrest_date: <DatetimeArray>
['2015-01-01 00:00:00', '2015-01-02 00:00:00', '2015-01-03 00:00:00',
 '2015-01-04 00:00:00', '2015-01-06 00:00:00', '2015-01-05 00:00:00',
 '2015-04-11 00:00:00', '2015-01-07 00:00:00', '2015-04-24 00:00:00',
 '2015-01-08 00:00:00',
 ...
 '2015-12-22 00:00:00', '2015-12-23 00:00:00', '2015-12-24 00:00:00',
 '2015-12-25 00:00:00', '2015-12-26 00:00:00', '2015-12-27 00:00:00',
 '2015-12-28 00:00:00', '2015-12-29 00:00:00', '2015-12-30 00:00:00',
 '2015-12-31 00:00:00']
Length: 363, dtype: datetime64[ns]
arrest_time: <DatetimeArray>
['2015-01-01 00:02:00', '2015-01-01 03:17:00', '2015-01-01 04:26:00',
 '2015-01-01 01:06:00', '2015-01-01 05:26:00', '2015-01-01 02:01:00',
 '2015-01-01 17:45:00', '2015-01-01 02:21:00', '2015-01-01 02:29:00',
 '2015-01-01 02:36:00',
 ...
 '2015-12-31 21:51:00', '2015-12-31 21:38:00', '2015-12-31 21:45:00',
 '2015-12-31 21:57:00', '2015-12-31 23:02:00', '2015-12-31 23:00:00',
 '2015-12-31 22:28:00', '2015-1

In [15]:
# Save each DataFrame to a CSV file
for year, processed_df in dfs.items():
    file_path = f'data/cleaned_data/{year}_cleaned_data.csv'
    processed_df.to_csv(file_path, index=False)
    print(f'DataFrame for the year {year} saved to: {file_path}')

# Combine DataFrames into one DataFrame
combined_df = pd.concat(dfs.values(), ignore_index=True)

# Save the combined DataFrame to a CSV file
combined_file_path = 'data/cleaned_data/combined_data.csv'
combined_df.to_csv(combined_file_path, index=False)
print(f'Combined DataFrame saved to: {combined_file_path}')


DataFrame for the year 2015 saved to: data/cleaned_data/2015_cleaned_data.csv
DataFrame for the year 2016 saved to: data/cleaned_data/2016_cleaned_data.csv
DataFrame for the year 2018 saved to: data/cleaned_data/2018_cleaned_data.csv
DataFrame for the year 2019 saved to: data/cleaned_data/2019_cleaned_data.csv
DataFrame for the year 2020 saved to: data/cleaned_data/2020_cleaned_data.csv
Combined DataFrame saved to: data/cleaned_data/combined_data.csv
