In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
file = r"../crime_data_chicago.csv"
crime_chic = pd.read_csv(file)

In [3]:
crime_chic.shape

(2278726, 23)

In [4]:
# Starting the indexing from 1 so it can suffice for serial number
crime_chic.index += 1 

In [5]:
crime_chic.columns

Index(['Unnamed: 0', 'ID', 'Case Number', 'Date', 'Block', 'IUCR',
       'Primary Type', 'Description', 'Location Description', 'Arrest',
       'Domestic', 'Beat', 'District', 'Ward', 'Community Area', 'FBI Code',
       'X Coordinate', 'Y Coordinate', 'Year', 'Updated On', 'Latitude',
       'Longitude', 'Location'],
      dtype='object')

In [6]:
crime_chic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2278726 entries, 1 to 2278726
Data columns (total 23 columns):
 #   Column                Dtype  
---  ------                -----  
 0   Unnamed: 0            int64  
 1   ID                    int64  
 2   Case Number           object 
 3   Date                  object 
 4   Block                 object 
 5   IUCR                  object 
 6   Primary Type          object 
 7   Description           object 
 8   Location Description  object 
 9   Arrest                bool   
 10  Domestic              bool   
 11  Beat                  int64  
 12  District              float64
 13  Ward                  float64
 14  Community Area        float64
 15  FBI Code              object 
 16  X Coordinate          float64
 17  Y Coordinate          float64
 18  Year                  int64  
 19  Updated On            object 
 20  Latitude              float64
 21  Longitude             float64
 22  Location              object 
dtypes: bool

In [7]:
#dropping the 'Unnamed:0' column
crime_chic = crime_chic.drop(columns=['Unnamed: 0']) #, axis = 1, inplace = True)

In [8]:
crime_chic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2278726 entries, 1 to 2278726
Data columns (total 22 columns):
 #   Column                Dtype  
---  ------                -----  
 0   ID                    int64  
 1   Case Number           object 
 2   Date                  object 
 3   Block                 object 
 4   IUCR                  object 
 5   Primary Type          object 
 6   Description           object 
 7   Location Description  object 
 8   Arrest                bool   
 9   Domestic              bool   
 10  Beat                  int64  
 11  District              float64
 12  Ward                  float64
 13  Community Area        float64
 14  FBI Code              object 
 15  X Coordinate          float64
 16  Y Coordinate          float64
 17  Year                  int64  
 18  Updated On            object 
 19  Latitude              float64
 20  Longitude             float64
 21  Location              object 
dtypes: bool(2), float64(7), int64(3), object(1

In [9]:
crime_chic.isna().sum()

ID                           0
Case Number                  1
Date                         0
Block                        0
IUCR                         0
Primary Type                 0
Description                  0
Location Description      2877
Arrest                       0
Domestic                     0
Beat                         0
District                    12
Ward                    184695
Community Area          184267
FBI Code                     0
X Coordinate             23985
Y Coordinate             23985
Year                         0
Updated On                   0
Latitude                 23985
Longitude                23985
Location                 23985
dtype: int64

In [10]:
crime_chic = crime_chic.dropna()

In [11]:
#Checking for duplicates in the 'Case Number' column
crime_chic['Case Number'].duplicated().sum()

46

OBSERVATION: The ID has no duplicates and will be used as a reference.

#Viewing the list of Case Number duplicates
CaseNumber_duplicate

In [12]:
#Checking for duplicates in the 'ID' column
crime_chic['ID'].duplicated().sum()

0

OBSERVATION: There incidents with the same 'Case number'. These incident are all homicidal in nature, described as first degree murder, have the same 'IUCR' - 0110 and 'FBI code' - 01A

To this effect the column 'Case Number' will be dropped so each incident can be referenced with its 'ID'

In [20]:
#Dropping the 'Case number' column
crime_chic = crime_chic.drop(['Case Number'])

AttributeError: 'NoneType' object has no attribute 'drop'

In [17]:
crime_chic.info()

AttributeError: 'NoneType' object has no attribute 'info'

In [None]:
def missing_value_percentage(df):
    # Calculate the percentage of missing values for each column
    missing_percentage = df.isnull().mean().reset_index()
    missing_percentage.columns = ['Column', 'Missing Percentage']

#Converting the proportion to percentage
    missing_percentage['Missing Percentage'] *= 100

    return missing_percentage


In [None]:
missing_value_percentage(crime_chic)

OBSERVATION: Going by the percentage of missing values, it is safe to say that there negligible as it is very minimal.

Further columns will be dropped as they aren't neccesarilly needed in our analysis. They are:
1. 'Ward' and 'Community Area' : these two columns plays a less significant role in our analysis since have the 'Location'. if such information is needed, it can easily be traced to the 'District' and from there the we can locate the 'Ward' and 'Community Area' via a map or the district data. In order words, the 'District' mothered the the Ward and 'Community Area'.
2. 'X and Y Coordinates' : these two column will give us location of the crime, same as what the 'Location' column will give us.
3. 'Latitute' and 'Longitude' : these two columns makes up the 'Location' column with the format - (Latitude, longitude).


In [None]:
#Dropping further columns
crime_chic.drop(['Ward','Community Area','X Coordinate', 'Y Coordinate','Latitude', 'Longitude'], axis = 1, inplace = True)

In [None]:
#Changing the column 'ID' header to 'Case ID'
crime_chic.rename(columns={'ID': 'Case ID'}, inplace=True, errors='raise')

In [None]:
crime_chic.head(5)

In [104]:
crime_chic.columns

AttributeError: 'NoneType' object has no attribute 'columns'

In [105]:
crime_chic.shape

AttributeError: 'NoneType' object has no attribute 'shape'

In [106]:
crime_chic.isna().sum()

AttributeError: 'NoneType' object has no attribute 'isna'

In [107]:
crime_chic.District.unique()

AttributeError: 'NoneType' object has no attribute 'District'

In [108]:
# district 4 location values
crime_chic.groupby('District').get_group(4)[['Location']]

AttributeError: 'NoneType' object has no attribute 'groupby'

In [109]:
# district 10 location values
crime_chic.groupby('District').get_group(10)[['Location']]

AttributeError: 'NoneType' object has no attribute 'groupby'

In [110]:
#crime_chic.tail(2278659:2278668)
crime_chic.iloc[2278659:2278664]

AttributeError: 'NoneType' object has no attribute 'iloc'

In [111]:
missing_rows = crime_chic[crime_chic['District'].isnull()]

TypeError: 'NoneType' object is not subscriptable

In [112]:
missing_rows

Unnamed: 0,Case ID,Date,Block,IUCR,Primary Type,Description,Location Description,Arrest,Domestic,Beat,District,FBI Code,Year,Updated On,Location


In [113]:
#Filling the 'nan' value in the 'District' column with '0'
crime_chic['District'] = crime_chic['District'].fillna(0)

TypeError: 'NoneType' object is not subscriptable

In [114]:
crime_chic.District.unique()

AttributeError: 'NoneType' object has no attribute 'District'

In [115]:
#Filling the 'nan' value in the 'Location Description' column with 'Unknown'
crime_chic['Location Description'] = crime_chic['Location Description'].fillna('Unknown')

TypeError: 'NoneType' object is not subscriptable

In [116]:
#first_missing_index = df['column_name'].index[df['column_name'].isnull()].tolist()[0]

In [117]:
crime_chic['Location'] = crime_chic.groupby('District')['Location'].transform(lambda x: x.fillna(x.mode()))

AttributeError: 'NoneType' object has no attribute 'groupby'

In [118]:
crime_chic.isna().sum()

AttributeError: 'NoneType' object has no attribute 'isna'

In [119]:
crime_chic.shape

AttributeError: 'NoneType' object has no attribute 'shape'

In [120]:
crime_chic.dropna(subset=['Location'], inplace=True)

AttributeError: 'NoneType' object has no attribute 'dropna'

In [121]:
crime_chic.shape

AttributeError: 'NoneType' object has no attribute 'shape'

In [122]:
crime_chic.isna().sum()

AttributeError: 'NoneType' object has no attribute 'isna'

In [123]:
# creating a column for months

from datetime import datetime

def extract_and_map_month(date):
    # ensure the input is in datetime format
    # 07/26/2008 02:30:00 PM
    date_column = pd.to_datetime(date, format='%m/%d/%Y %I:%M:%S %p')

    month_names = date_column.dt.month_name()

    return month_names

In [124]:
crime_chic['Month'] = extract_and_map_month(crime_chic['Date'])

TypeError: 'NoneType' object is not subscriptable

In [125]:
crime_chic.head()

AttributeError: 'NoneType' object has no attribute 'head'

In [126]:
# creating a column for seasons

def create_seasons_column(date_column):
    """
    Creates a new column representing seasons based on the months in a datetime column.

    Parameters:
    - date_column: Pandas Series, representing a datetime feature

    Returns:
    - Pandas Series containing the corresponding season names
    """
    # Ensure the input is in datetime format
    date_column = pd.to_datetime(date_column, format='%m/%d/%Y %I:%M:%S %p', errors='coerce')

    # Define a mapping of month to season
    month_to_season = {
        1: 'Winter', 2: 'Winter', 3: 'Spring',
        4: 'Spring', 5: 'Spring', 6: 'Summer',
        7: 'Summer', 8: 'Summer', 9: 'Autumn',
        10: 'Autumn', 11: 'Autumn', 12: 'Winter'
    }

    # Extract the month and map to season
    seasons_column = date_column.dt.month.map(month_to_season)

    return seasons_column

In [127]:
crime_chic['Seasons'] = create_seasons_column(crime_chic['Date'])

TypeError: 'NoneType' object is not subscriptable

In [128]:
crime_chic.head()

AttributeError: 'NoneType' object has no attribute 'head'

In [129]:
crime_chic.isna().sum()

AttributeError: 'NoneType' object has no attribute 'isna'

In [130]:
crime_chic.shape

AttributeError: 'NoneType' object has no attribute 'shape'

In [131]:
crime_chic['Date'] = pd.to_datetime(df['Date'])

TypeError: 'NoneType' object does not support item assignment

In [None]:
crime_chic.info()

In [None]:
crime_chic.isnull().sum()

In [None]:
crime_chic.isna().sum()

In [None]:
# Extract day, month, year, and time into new columns
crime_chic['day'] = crime_chic['Date'].dt.day.astype(int)
#df['month'] = df['Date'].dt.strftime('%B') #extract full month name
crime_chic['year'] = crime_chic['Date'].dt.year.astype(int)
crime_chic['time'] = crime_chic['Date'].dt.strftime('%H:%M:%S')

In [None]:
# Extract the day of the week
crime_chic['day_of_week'] = crime_chic['Date'].dt.day_name()

In [None]:
crime_chic.head()

In [None]:
crime_chic.info()