In [1]:
import pandas as pd
import numpy as np

In [3]:

# Read the raw Chicago crimes CSV into a DataFrame.
# NOTE: this is an absolute, machine-specific Windows path -- replace it with the
# location of your own copy of the file before running.
file_path = 'C:\\Users\\rloke\\OneDrive\\Desktop\\Datasets\\Chicago_Crimes.csv'
chicago_crimes = pd.read_csv(filepath_or_buffer=file_path)

In [4]:
# Tally the null entries in every column of the raw frame and print the result.
missing_values = chicago_crimes.isna().sum()
print(missing_values)

ID                                0
Case Number                       0
Date                              0
Block                             0
IUCR                              0
Primary Type                      0
Description                       0
Location Description           3299
Arrest                            0
Domestic                          0
Beat                              0
District                          0
Ward                             25
Community Area                    1
FBI Code                          0
X Coordinate                  14348
Y Coordinate                  14348
Year                              0
Updated On                        0
Latitude                      14348
Longitude                     14348
Location                      14348
Historical Wards 2003-2015    16791
Zip Codes                     14348
Community Areas               16468
Census Tracts                 16458
Wards                         16459
Boundaries - ZIP Codes      

In [9]:
# Columns that must be populated: a row missing a value in ANY of them is discarded.
essential_columns = [
    'IUCR',
    'Location Description',
    'Beat',
    'District',
    'Ward',
    'Community Area',
    'FBI Code',
    'X Coordinate',
    'Y Coordinate',
    'Location',
    'Historical Wards 2003-2015',
    'Community Areas',
    'Census Tracts',
    'Wards',
    'Boundaries - ZIP Codes',
    'Police Districts',
    'Police Beats',
    'Wards 2023-',
]
chicago_crimes_cleaned = chicago_crimes.dropna(axis=0, subset=essential_columns)

In [11]:
# Columns to discard from the cleaned frame. This is the null-check list with
# 'Location Description' left out, so that column is kept.
essential_columns_to_drop = [
    'IUCR',
    'Beat',
    'District',
    'Ward',
    'Community Area',
    'FBI Code',
    'X Coordinate',
    'Y Coordinate',
    'Location',
    'Historical Wards 2003-2015',
    'Community Areas',
    'Census Tracts',
    'Wards',
    'Boundaries - ZIP Codes',
    'Police Districts',
    'Police Beats',
    'Wards 2023-',
]

# NOTE: this rebinds chicago_crimes_cleaned to the narrowed frame, so running the
# cell a second time raises KeyError (the columns are already gone).
chicago_crimes_cleaned = chicago_crimes_cleaned.drop(
    labels=essential_columns_to_drop, axis='columns'
)


In [12]:
# Sanity check: per-column null counts of the cleaned frame.
print(chicago_crimes_cleaned.isna().sum())

ID                      0
Case Number             0
Date                    0
Block                   0
Primary Type            0
Description             0
Location Description    0
Arrest                  0
Domestic                0
Year                    0
Updated On              0
Latitude                0
Longitude               0
Zip Codes               0
dtype: int64


In [13]:
# Normalise the capitalisation of the descriptive text columns to Title Case.
text_columns = ['Primary Type', 'Description', 'Location Description']
chicago_crimes_cleaned[text_columns] = chicago_crimes_cleaned[text_columns].apply(
    lambda series: series.str.title()
)



In [14]:
# Force the two flag columns to the plain bool dtype in a single cast.
chicago_crimes_cleaned = chicago_crimes_cleaned.astype(
    {'Arrest': bool, 'Domestic': bool}
)



In [16]:
# Discard rows that are exact repeats of an earlier row (all columns compared,
# first occurrence kept).
chicago_crimes_cleaned = chicago_crimes_cleaned.drop_duplicates(subset=None, keep='first')

def remove_outliers(df: pd.DataFrame, column: str, factor: float = 1.5) -> pd.DataFrame:
    """Return the rows of ``df`` whose ``column`` value lies within the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where Q1/Q3 are
    the 25th/75th percentiles of ``df[column]``. Both fences are inclusive.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to filter; it is not modified.
    column : str
        Name of the numeric column the fences are computed on.
    factor : float, default 1.5
        Multiplier applied to the IQR to set the fence distance. The default
        reproduces the conventional 1.5 * IQR rule.

    Returns
    -------
    pd.DataFrame
        The subset of ``df`` inside the fences. Rows where ``column`` is NaN are
        excluded, because a NaN never compares as inside the range.

    Raises
    ------
    KeyError
        If ``column`` is not a column of ``df``.
    """
    values = df[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    spread = factor * (q3 - q1)
    # Series.between is inclusive on both ends by default, matching >= / <=.
    return df[values.between(q1 - spread, q3 + spread)]

# Apply the IQR filter to each coordinate column in turn. The frame is rebound on
# every pass, so the Longitude quartiles are computed on the rows that already
# survived the Latitude filter.
numerical_columns = ['Latitude', 'Longitude']
for col in numerical_columns:
    chicago_crimes_cleaned = remove_outliers(df=chicago_crimes_cleaned, column=col)


In [17]:
# Keep only rows whose coordinates fall inside a bounding box of reasonable
# Latitude/Longitude values for Chicago (bounds are inclusive).
chicago_lat_min, chicago_lat_max = 41.6, 42.1
chicago_long_min, chicago_long_max = -87.9, -87.5
inside_chicago = (
    chicago_crimes_cleaned['Latitude'].between(chicago_lat_min, chicago_lat_max)
    & chicago_crimes_cleaned['Longitude'].between(chicago_long_min, chicago_long_max)
)
# Take an explicit copy: assigning columns onto the result of a boolean-mask
# selection otherwise triggers pandas' SettingWithCopyWarning in the loop below.
chicago_crimes_cleaned = chicago_crimes_cleaned[inside_chicago].copy()

# Convert the descriptive text columns to the categorical dtype.
categorical_columns = ['Primary Type', 'Description', 'Location Description']
for col in categorical_columns:
    chicago_crimes_cleaned[col] = chicago_crimes_cleaned[col].astype('category')

# The cleaned dataset is now available as chicago_crimes_cleaned.


In [18]:
# Destination for the cleaned data; a bare file name resolves against the
# current working directory. Change it to write somewhere else.
output_file_path = 'chicago_crimes_cleaned.csv'

# Persist the cleaned frame as CSV, leaving out the row index.
chicago_crimes_cleaned.to_csv(path_or_buf=output_file_path, index=False)
