In [1]:
# Import necessary libraries
import pandas as pd
from pyproj import Proj, Transformer
from pathlib import Path
import warnings

In [2]:
# Ignore FutureWarnings from pyproj
warnings.simplefilter(action='ignore', category=FutureWarning)

In [3]:
# Read the CSV file into a Pandas DataFrame
arrest_2020 = pd.read_csv('data/2020_Racial_Profiling__RP__dataset_20231109.csv')

# Filter records for 'Arrests' in the 'Type' column
arrests_2020 = arrest_2020[arrest_2020['Type'] == 'Arrests']

# Display the resulting DataFram
arrests_2020.to_csv('arrests_ONLY_2020_.csv', index=False)

In [4]:
# Define the desired column order
desired_column_order = ['arrest_date', 'arrest_time', 'sex', 'race', 'searched', 'reason_stopped', 'search_based_on',
                        'search_found', 'race_known', 'lng', 'lat']

# Function to process the DataFrame for a specific year
def process_dataframe(df, year):
    # Create a copy of the DataFrame
    df_copy = df.copy()

    # Convert 'X_COORDINATE' (lng) and 'Y_COORDINATE' (lat) columns to actual latitude and longitude
    if 'X_COORDINATE' in df_copy.columns and 'Y_COORDINATE' in df_copy.columns:
        # Replace non-numeric values with NaN
        df_copy[['X_COORDINATE', 'Y_COORDINATE']] = df_copy[['X_COORDINATE', 'Y_COORDINATE']].apply(pd.to_numeric, errors='coerce')

        # Define the projection (assuming the original coordinates are in EPSG:3081)
        in_proj = Proj(init='epsg:3081')
        out_proj = Proj(init='epsg:4326')  # WGS 84

        # Convert coordinates using the updated Pyproj method
        transformer = Transformer.from_proj(in_proj, out_proj)
        df_copy['lng'], df_copy['lat'] = transformer.transform(df_copy['X_COORDINATE'].values, df_copy['Y_COORDINATE'].values)

        # Drop the original 'X_COORDINATE' and 'Y_COORDINATE' columns
        df_copy = df_copy.drop(['X_COORDINATE', 'Y_COORDINATE'], axis=1)

    # Drop specified columns based on the year
    drop_columns = {
        2015: ['PRIMARY_KEY', 'LOCATION', 'SECTOR', 'LOCAL_FIELD1'],
        2016: ['PRIMARY_KEY', 'LOCATION', 'SECTOR', 'LOCAL_FIELD1'],
        2018: ['PRIMARY_KEY', 'LOCATION', 'APD_sector', 'CouncilDistrict', 'county_description', 'ZIP', 'CENSUS_TRACT'],
        2019: ['PRIMARY_KEY', 'LOCATION', 'APD_sector', 'CouncilDistrict', 'county_description', 'ZIP', 'CENSUS_TRACT'],
        2020: ['Stop Key', 'Type', 'TCOLE RACE ETHNICITY', 'Street_Type', 'TCOLE Result of Stop',
               'TCOLE Arrest Based On', 'CENSUS_TRACT', 'Council District', 'COUNTY', 'Custody', 'Location', 'Sector', 'Zip Code']
    }
    df_copy = df_copy.drop(columns=drop_columns.get(year, []), errors='ignore')

    # Rename columns based on the year
    rename_columns = {
        2015: {'REP_DATE': 'arrest_date', 'REP_TIME': 'arrest_time', 'SEX': 'sex', 'APD_RACE_DESC': 'race',
               'PERSON_SEARCHED_DESC': 'searched', 'REASON_FOR_STOP_DESC': 'reason_stopped',
               'SEARCH_BASED_ON_DESC': 'search_based_on', 'SEARCH_DISC_DESC': 'search_found', 'RACE_KNOWN': 'race_known',
               'X_COORDINATE': 'lng', 'Y_COORDINATE': 'lat'},
        2016: {'REP_DATE': 'arrest_date', 'REP_TIME': 'arrest_time', 'SEX': 'sex', 'APD_RACE_DESC': 'race',
               'PERSON_SEARCHED_DESC': 'searched', 'REASON_FOR_STOP_DESC': 'reason_stopped',
               'SEARCH_BASED_ON_DESC': 'search_based_on', 'SEARCH_DISC_DESC': 'search_found', 'RACE_KNOWN': 'race_known',
               'X_COORDINATE': 'lng', 'Y_COORDINATE': 'lat'},
        2018: {'REP_DATE': 'arrest_date', 'REP_TIME': 'arrest_time', 'X_COORDINATE': 'lng',
               'Y_COORDINATE': 'lat', 'RACE_KNOWN': 'race_known', 'Reason for Stop – TCOLE form': 'reason_stopped',
               'SEX': 'sex', 'APD_RACE_DESC': 'race', 'Person Search YN': 'searched', 'Search Based On': 'search_based_on',
               'Search Found': 'search_found'},
        2019: {'REP_DATE': 'arrest_date', 'REP_TIME': 'arrest_time', 'X_COORDINATE': 'lng',
               'Y_COORDINATE': 'lat', 'RACE_KNOWN': 'race_known', 'Reason for Stop – TCOLE form': 'reason_stopped',
               'SEX': 'sex', 'APD_RACE_DESC': 'race', 'Person Search YN': 'searched', 'Search Based On': 'search_based_on',
               'Search Found': 'search_found'},
        2020: {'TCOLE Sex': 'sex', 'Standardized Race Known': 'race_known',
               'Reason for Stop': 'reason_stopped', 'Search Yes or No': 'searched',
               'TCOLE Search Based On': 'search_based_on', 'TCOLE Search Found': 'search_found',
               'Standardized Race': 'race', 'Stop Date': 'arrest_date', 'Stop Time': 'arrest_time', 'X_COORDINATE': 'lng',
               'Y_COORDINATE': 'lat'},
    }
    df_copy = df_copy.rename(columns=rename_columns.get(year, {}))

    # Replace missing or null values with "unknown"
    columns_to_replace = ['sex', 'race', 'reason_stopped', 'search_based_on', 'search_found', 'race_known', 'searched']
    df_copy[columns_to_replace] = df_copy[columns_to_replace].fillna('not_listed')

    # Convert columns to appropriate data types
    df_copy['arrest_date'] = pd.to_datetime(df_copy['arrest_date'])
    df_copy['arrest_time'] = pd.to_datetime(df_copy['arrest_time'], errors='coerce')
    df_copy['sex'] = df_copy['sex'].astype('category')
    df_copy['race'] = df_copy['race'].astype('category')
    df_copy['searched'] = df_copy['searched'].astype('category')
    df_copy['reason_stopped'] = df_copy['reason_stopped'].astype('category')
    df_copy['search_based_on'] = df_copy['search_based_on'].astype('category')
    df_copy['search_found'] = df_copy['search_found'].astype('category')
    df_copy['race_known'] = df_copy['race_known'].astype('category')

    # Convert 'lng' and 'lat' columns to numeric, handling errors and coercion to NaN
    for col in ['lng', 'lat']:
        df_copy[col] = pd.to_numeric(df_copy[col], errors='coerce')
        # Fill NaN values with the mean
        df_copy[col] = df_copy[col].fillna(df_copy[col].mean())
        # Convert columns to the appropriate data type
        df_copy[col] = df_copy[col].astype('float64')

    df_copy = df_copy[desired_column_order]
    return df_copy


# Dictionary to store DataFrames
dfs = {}

# List of CSV file paths
file_paths = {
    2015: "data/2015_Racial_Profiling_Arrests_20231113.csv",
    2016: "data/2016_RP_Arrests_20231113.csv",
    2018: "data/2018_RP_Arrests.csv",
    2019: "data/2019_Racial_Profiling__RP__Arrests_20231113.csv",
    2020: "data/arrests_ONLY_2020_.csv",
}

# Loop through each file, read data, and add DataFrame to the dictionary
for year, file_path in file_paths.items():
    # Read the CSV file into a Pandas DataFrame
    profiling_df = pd.read_csv(Path(file_path))
    
    # Process the DataFrame for the specific year
    processed_df = process_dataframe(profiling_df, year)

    # Add the processed DataFrame to the dictionary
    dfs[year] = processed_df

    # Check data types
    print(f"\nDTYPES FOR {year} DF:\n")
    print(processed_df.dtypes)

# Display the first few rows of each DataFrame
for year, df in dfs.items():
    print(f"\nDF for the year {year}:\n")
    display(df.head(3))



DTYPES FOR 2015 DF:

arrest_date        datetime64[ns]
arrest_time        datetime64[ns]
sex                      category
race                     category
searched                 category
reason_stopped           category
search_based_on          category
search_found             category
race_known               category
lng                       float64
lat                       float64
dtype: object

DTYPES FOR 2016 DF:

arrest_date        datetime64[ns]
arrest_time        datetime64[ns]
sex                      category
race                     category
searched                 category
reason_stopped           category
search_based_on          category
search_found             category
race_known               category
lng                       float64
lat                       float64
dtype: object

DTYPES FOR 2018 DF:

arrest_date        datetime64[ns]
arrest_time        datetime64[ns]
sex                      category
race                     category
searched              

Unnamed: 0,arrest_date,arrest_time,sex,race,searched,reason_stopped,search_based_on,search_found,race_known,lng,lat
0,2015-01-01,1970-01-01 00:00:00.000000002,M,HISPANIC OR LATINO,YES = 1,CALL FOR SERVICE,INCIDENTAL TO ARREST,NOTHING,NO - RACE OR ETHNICITY WAS NOT KNOWN BEFORE STOP,7.181465,85.773044
1,2015-01-01,1970-01-01 00:00:00.000000317,M,WHITE,YES = 1,VIOLATION OF TRANSPORTATION CODE/VEHICLE LAWS,INCIDENTAL TO ARREST,DRUGS,NO - RACE OR ETHNICITY WAS NOT KNOWN BEFORE STOP,6.341423,85.781942
2,2015-01-01,1970-01-01 00:00:00.000000317,F,WHITE,YES = 1,VIOLATION OF TRANSPORTATION CODE/VEHICLE LAWS,INCIDENTAL TO ARREST,DRUGS,NO - RACE OR ETHNICITY WAS NOT KNOWN BEFORE STOP,6.341423,85.781942



DF for the year 2016:



Unnamed: 0,arrest_date,arrest_time,sex,race,searched,reason_stopped,search_based_on,search_found,race_known,lng,lat
0,2016-01-01,1970-01-01 00:00:00.000002355,F,WHITE,YES = 1,CALL FOR SERVICE,INCIDENTAL TO ARREST,NOTHING,NO - RACE OR ETHNICITY WAS NOT KNOWN BEFORE STOP,8.878931,85.846019
1,2016-01-02,1970-01-01 00:00:00.000000123,M,HISPANIC OR LATINO,YES = 1,CALL FOR SERVICE,PROBABLE CAUSE,OTHER,NO - RACE OR ETHNICITY WAS NOT KNOWN BEFORE STOP,6.051168,85.777929
2,2016-01-02,1970-01-01 00:00:00.000000123,M,HISPANIC OR LATINO,NO = 2,not_listed,not_listed,not_listed,not_listed,6.051168,85.777929



DF for the year 2018:



Unnamed: 0,arrest_date,arrest_time,sex,race,searched,reason_stopped,search_based_on,search_found,race_known,lng,lat
0,2018-03-26,1970-01-01 00:00:00.000002010,F,WHITE,YES,Violation of law other than traffic,INCIDENTAL TO ARREST,NOTHING,NO - RACE OR ETHNICITY WAS NOT KNOWN BEFORE STOP,6.945756,85.784765
1,2018-12-04,1970-01-01 00:00:00.000001032,M,BLACK,NO,Moving Traffic Violation,not_listed,not_listed,not_listed,7.246727,85.78649
2,2018-01-22,1970-01-01 00:00:00.000000930,F,BLACK,NO,Moving Traffic Violation,not_listed,not_listed,not_listed,8.538444,85.795643



DF for the year 2019:



Unnamed: 0,arrest_date,arrest_time,sex,race,searched,reason_stopped,search_based_on,search_found,race_known,lng,lat
0,2019-07-17,1970-01-01 00:00:00.000000338,M,WHITE,YES = 1,Moving Traffic Violation,INCIDENTAL TO ARREST,NOTHING,NO - RACE OR ETHNICITY WAS NOT KNOWN BEFORE STOP,6.241904,85.792424
1,2019-07-27,1970-01-01 00:00:00.000002319,M,WHITE,YES = 1,Moving Traffic Violation,INCIDENTAL TO ARREST,NOTHING,NO - RACE OR ETHNICITY WAS NOT KNOWN BEFORE STOP,6.395985,85.770888
2,2019-03-17,1970-01-01 00:00:00.000000303,F,BLACK,YES = 1,Violation of law other than traffic,INCIDENTAL TO ARREST,CASH,NO - RACE OR ETHNICITY WAS NOT KNOWN BEFORE STOP,7.54383,85.77916



DF for the year 2020:



Unnamed: 0,arrest_date,arrest_time,sex,race,searched,reason_stopped,search_based_on,search_found,race_known,lng,lat
0,2020-03-08,1970-01-01 00:00:00.000002242,F,ASIAN,YES = 1,Moving Traffic Violation,INCIDENTAL TO ARREST,DRUGS,NO - RACE OR ETHNICITY WAS NOT KNOWN BEFORE STOP,7.571724,85.763277
1,2020-07-12,1970-01-01 00:00:00.000000118,F,ASIAN,YES = 1,Moving Traffic Violation,INCIDENTAL TO ARREST,NOTHING,NO - RACE OR ETHNICITY WAS NOT KNOWN BEFORE STOP,6.080236,85.763965
2,2020-08-22,1970-01-01 00:00:00.000000053,F,ASIAN,YES = 1,Moving Traffic Violation,INCIDENTAL TO ARREST,OTHER,NO - RACE OR ETHNICITY WAS NOT KNOWN BEFORE STOP,6.420056,85.752398


In [5]:
# Define the directory to save the CSV files
output_directory = "data/cleaned_data"
# "C:\github_Repos\Predictive-Racial-Profiling\data\cleaned_data"
# Loop through each DataFrame, save it to a CSV file
for year, df in dfs.items():
    # Construct the file path
    output_file_path = Path(output_directory, f"{year}_cleaned_data.csv")

    # Save the DataFrame to a CSV file
    df.to_csv(output_file_path, index=False)

    print(f"DataFrame for the year {year} saved to: {output_file_path}")


DataFrame for the year 2015 saved to: data\cleaned_data\2015_cleaned_data.csv
DataFrame for the year 2016 saved to: data\cleaned_data\2016_cleaned_data.csv
DataFrame for the year 2018 saved to: data\cleaned_data\2018_cleaned_data.csv
DataFrame for the year 2019 saved to: data\cleaned_data\2019_cleaned_data.csv
DataFrame for the year 2020 saved to: data\cleaned_data\2020_cleaned_data.csv


In [6]:
# Combine all DataFrames into a single DataFrame
combined_df = pd.concat(dfs.values(), ignore_index=True)

# Display the first few rows of the combined DataFrame
print("\nCombined DataFrame:\n")
display(combined_df.head(3))

# Save the combined DataFrame to a CSV file
combined_output_path = Path(output_directory, "combined_cleaned_data.csv")
combined_df.to_csv(combined_output_path, index=False)

print(f"\nCombined DataFrame saved to: {combined_output_path}")


Combined DataFrame:



Unnamed: 0,arrest_date,arrest_time,sex,race,searched,reason_stopped,search_based_on,search_found,race_known,lng,lat
0,2015-01-01,1970-01-01 00:00:00.000000002,M,HISPANIC OR LATINO,YES = 1,CALL FOR SERVICE,INCIDENTAL TO ARREST,NOTHING,NO - RACE OR ETHNICITY WAS NOT KNOWN BEFORE STOP,7.181465,85.773044
1,2015-01-01,1970-01-01 00:00:00.000000317,M,WHITE,YES = 1,VIOLATION OF TRANSPORTATION CODE/VEHICLE LAWS,INCIDENTAL TO ARREST,DRUGS,NO - RACE OR ETHNICITY WAS NOT KNOWN BEFORE STOP,6.341423,85.781942
2,2015-01-01,1970-01-01 00:00:00.000000317,F,WHITE,YES = 1,VIOLATION OF TRANSPORTATION CODE/VEHICLE LAWS,INCIDENTAL TO ARREST,DRUGS,NO - RACE OR ETHNICITY WAS NOT KNOWN BEFORE STOP,6.341423,85.781942



Combined DataFrame saved to: data\cleaned_data\combined_cleaned_data.csv


In [7]:
# Initialize an empty DataFrame for combination
df_combined = pd.DataFrame()

# Concatenate DataFrames into a combined DataFrame
selected_columns = ['arrest_date', 'arrest_time', 'sex', 'race', 'reason_stopped', 'search_based_on', 'search_found', 'race_known', 'lng', 'lat', 'searched']

df_combined = pd.concat([dfs[year][selected_columns] for year in dfs.keys()], ignore_index=True)


In [8]:
# Find null values in the DataFrame
null_values = df_combined.isnull().sum()

# Display the columns with null values and their counts
print("Columns with Null Values:")
print(null_values[null_values > 0])

Columns with Null Values:
Series([], dtype: int64)


In [9]:
# List Categorical variables
categorical_columns = ['sex', 'race', 'reason_stopped', 'search_based_on', 'search_found', 'race_known']

# Drop unnecessary columns
df_combined = df_combined.drop(['arrest_date', 'arrest_time'], axis=1)

# Convert 'searched' column to binary (1 for YES, 0 for NO)
df_combined['searched'] = df_combined['searched'].apply(lambda x: 1 if x == 'YES' else 0)

# Convert categorical variables to numerical using one-hot encoding
df_combined = pd.get_dummies(df_combined, columns=categorical_columns, drop_first=False)

# Convert boolean columns to integers (0 or 1) for the entire DataFrame
df_combined = df_combined.astype(int)

# Manually remove the 'sex_F' column
df_combined = df_combined.drop('sex_F', axis=1)

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

X = df_combined.drop('searched', axis=1)
y = df_combined['searched']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
X

Unnamed: 0,lng,lat,sex_M,sex_U,sex_not_listed,race_AMERICAN INDIAN/ALASKAN NATIVE,race_ASIAN,race_BLACK,race_HAWAIIAN/PACIFIC ISLANDER,race_HISPANIC OR LATINO,...,search_found_CASH,search_found_DRUGS,search_found_NOTHING,search_found_No Search,search_found_OTHER,search_found_WEAPONS,search_found_not_listed,race_known_NO - RACE OR ETHNICITY WAS NOT KNOWN BEFORE STOP,race_known_YES - RACE OR ETHNICITY WAS KNOWN BEFORE STOP,race_known_not_listed
0,7,85,1,0,0,0,0,0,0,1,...,0,0,1,0,0,0,0,1,0,0
1,6,85,1,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
2,6,85,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
3,7,85,1,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
4,7,85,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41418,7,85,1,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
41419,7,85,1,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
41420,8,85,1,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
41421,8,85,1,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0


In [12]:
# Initialize the logistic regression model
model = LogisticRegression(max_iter=1000, random_state=42)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the standard scaler
scaler = StandardScaler()

# Fit the scaler on the training data and transform both training and test data
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the model on the scaled data
model.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test_scaled)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 0.8170187085093542

Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.84      0.88      6450
           1       0.57      0.74      0.64      1835

    accuracy                           0.82      8285
   macro avg       0.74      0.79      0.76      8285
weighted avg       0.84      0.82      0.83      8285


Confusion Matrix:
 [[5409 1041]
 [ 475 1360]]
