# Exploratory Data Analysis

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# Set plotting styles
sns.set_theme(style="whitegrid")
plt.rcParams["figure.figsize"] = (12, 8)

In [2]:
df = pd.read_csv('Competition_Dataset.csv')

# Check the first few rows
df.head()

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,Latitude (Y),Longitude (X)
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541


In [None]:
# Check the structure of the dataset
print("Dataset Shape:", df.shape)
print("\nData Types:")
print(df.dtypes)

# A summary of missing values per column
print("\nMissing Values per Column:")
print(df.isnull().sum())

In [None]:
# Get basic info about columns and data types
df.info()

In [None]:
# Look at basic statistics for numerical columns
df.describe()

## Preprocessing

In [3]:
# Convert 'Dates' column to datetime type
df['Dates'] = pd.to_datetime(df['Dates'], errors='coerce')

# Check if conversion was successful
print(df['Dates'].head())

# Remove any rows where date conversion failed
df = df.dropna(subset=['Dates'])

# Reset index after cleaning
df.reset_index(drop=True, inplace=True)

print("Cleaned dataset shape:", df.shape)

0   2015-05-13 23:53:00
1   2015-05-13 23:53:00
2   2015-05-13 23:33:00
3   2015-05-13 23:30:00
4   2015-05-13 23:30:00
Name: Dates, dtype: datetime64[ns]
Cleaned dataset shape: (680826, 9)


In [4]:
# Remove duplicates if any exist
df = df.drop_duplicates()

# Reset index after cleaning
df.reset_index(drop=True, inplace=True)

print("Cleaned dataset shape:", df.shape)

Cleaned dataset shape: (669742, 9)


## Feature Engineering for Temporal Features

In [5]:
# Extract temporal features from the Dates column
df['Hour'] = df['Dates'].dt.hour
df['Day'] = df['Dates'].dt.day
df['Month'] = df['Dates'].dt.month
df['Year'] = df['Dates'].dt.year
df['DayOfWeek_Num'] = df['Dates'].dt.weekday  # Monday=0, Sunday=6

# Create time period bins (morning, afternoon, evening, night)
def assign_time_period(hour):
    if 5 <= hour < 12:
        return 'Morning'
    elif 12 <= hour < 17:
        return 'Afternoon'
    elif 17 <= hour < 21:
        return 'Evening'
    else:
        return 'Night'

df['TimePeriod'] = df['Hour'].apply(assign_time_period)

# Check the newly created features
df[['Dates', 'Hour', 'Month', 'Year', 'DayOfWeek_Num', 'TimePeriod']].head()

Unnamed: 0,Dates,Hour,Month,Year,DayOfWeek_Num,TimePeriod
0,2015-05-13 23:53:00,23,5,2015,2,Night
1,2015-05-13 23:53:00,23,5,2015,2,Night
2,2015-05-13 23:33:00,23,5,2015,2,Night
3,2015-05-13 23:30:00,23,5,2015,2,Night
4,2015-05-13 23:30:00,23,5,2015,2,Night


In [None]:
df['TimePeriod'].value_counts().plot(kind='bar', color='skyblue')
plt.title('Distribution of Incidents by Time Period')
plt.xlabel('Time Period')
plt.ylabel('Number of Incidents')
plt.xticks(rotation=45)
plt.show()

## Feature Engineering for Geographical Data

In [6]:
# Verify coordinate ranges and create a column for geohash clusters if needed
# For simplicity, we use rounding to group nearby coordinates (e.g., to 3 decimal places)
df['Lat_round'] = df['Latitude (Y)'].round(3)
df['Lon_round'] = df['Longitude (X)'].round(3)

# Create a combined geolocation cluster identifier
df['GeoCluster'] = df['Lat_round'].astype(str) + "_" + df['Lon_round'].astype(str)

# Check a few examples of the cluster IDs
df[['Latitude (Y)', 'Longitude (X)', 'GeoCluster']].head()

Unnamed: 0,Latitude (Y),Longitude (X),GeoCluster
0,-122.425892,37.774599,-122.426_37.775
1,-122.425892,37.774599,-122.426_37.775
2,-122.424363,37.800414,-122.424_37.8
3,-122.426995,37.800873,-122.427_37.801
4,-122.438738,37.771541,-122.439_37.772


## Analysis

### Basic Descriptive Statistics and Categorical Analysis

In [None]:
# Summary statistics for numerical features
print("Numerical Summary:")
df.describe()

In [9]:
# Frequency counts for categorical features
print("Crime Category Counts:")
print(df['Category'].value_counts())

Crime Category Counts:
Category
LARCENY/THEFT             156764
OTHER OFFENSES            110678
NON-CRIMINAL               82880
VEHICLE THEFT              48274
VANDALISM                  39838
WARRANTS                   37951
DRUG/NARCOTIC              35487
BURGLARY                   32955
SUSPICIOUS OCC             28107
MISSING PERSON             23074
ROBBERY                    20689
FRAUD                      15006
FORGERY/COUNTERFEITING      8790
WEAPON LAWS                 6884
TRESPASS                    6580
DISORDERLY CONDUCT          3893
RECOVERED VEHICLE           2819
KIDNAPPING                  2092
STOLEN PROPERTY             2006
RUNAWAY                     1703
ARSON                       1390
EMBEZZLEMENT                1032
BAD CHECKS                   365
BRIBERY                      258
EXTORTION                    227
Name: count, dtype: int64


In [None]:
# Sample of descriptions for "OTHER OFFENSES" category
print("Sample Descriptions for 'OTHER OFFENSES':")
print(df[df['Category'] == 'OTHER OFFENSES']['Descript'].sample(5).to_string(index=False))

In [None]:
print("\nPolice District Counts:")
print(df['PdDistrict'].value_counts())

In [None]:
# Display resolution counts
print("Resolution Counts:")
print(df['Resolution'].value_counts())

### Visualizing Crime Distribution by Category and District

In [None]:
# Bar plot for crime category distribution
plt.figure(figsize=(14,6))
sns.countplot(data=df, y='Category', order=df['Category'].value_counts().index, palette='viridis', hue='Category')
plt.title('Crime Frequency by Category')
plt.xlabel('Count')
plt.ylabel('Crime Category')
plt.show()

# Bar plot for police district distribution
plt.figure(figsize=(10,6))
sns.countplot(data=df, x='PdDistrict', order=df['PdDistrict'].value_counts().index, palette='magma', hue='PdDistrict')
plt.title('Crime Frequency by Police District')
plt.xlabel('Police District')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.show()

### Temporal Analysis

In [None]:
# Plot number of crimes per hour of the day
plt.figure(figsize=(10,6))
sns.countplot(data=df, x='Hour', palette='coolwarm', hue='Hour')
plt.title('Crime Count by Hour of Day')
plt.xlabel('Hour')
plt.ylabel('Number of Crimes')
plt.legend().remove() # Remove legend for clarity
plt.show()

# Plot number of crimes by day of week (using our DayOfWeek_Num)
plt.figure(figsize=(10,6))
sns.countplot(data=df, x='DayOfWeek_Num', palette='Set2', hue='DayOfWeek_Num')
plt.title('Crime Count by Day of Week')
plt.xlabel('Day of Week')
plt.ylabel('Number of Crimes')
# Add day names to x-axis
day_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
plt.xticks(range(7), day_names)
plt.legend().remove() # Remove legend for clarity
plt.show()

# Plot crimes by time period
plt.figure(figsize=(10,6))
sns.countplot(data=df, x='TimePeriod', order=['Morning','Afternoon','Evening','Night'], palette='Set1', hue='TimePeriod')
plt.title('Crime Count by Time Period')
plt.xlabel('Time Period')
plt.ylabel('Number of Crimes')
plt.show()

# Plot crimes by month
plt.figure(figsize=(10,6))
sns.countplot(data=df, x='Month', order=sorted(df['Month'].unique()), palette='Set3', hue='Month')
plt.title('Crime Count by Month')
plt.xlabel('Month')
plt.ylabel('Number of Crimes')
plt.xticks(range(1, 13), ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'])
plt.legend().remove() # Remove legend for clarity
plt.show()

# Plot crimes by year
plt.figure(figsize=(10,6))
sns.countplot(data=df, x='Year', order=sorted(df['Year'].unique()), palette='Set3', hue='Year')
plt.title('Crime Count by Year')
plt.xlabel('Year')
plt.ylabel('Number of Crimes')
plt.xticks(rotation=45)
plt.legend().remove() # Remove legend for clarity
plt.show()

## Text Data Feature Engineering

In [7]:
# Check a few examples of crime descriptions
print("Sample Crime Descriptions:")
print(df['Descript'].dropna().sample(5, random_state=42))

# Optional: Simple keyword extraction using string methods (more advanced analysis can be done with NLP libraries)
# For example, count common words in descriptions:
from collections import Counter
import re

# Combine all descriptions into a single string and extract words
descriptions = " ".join(df['Descript'].dropna().tolist()).lower()
words = re.findall(r'\w+', descriptions)
word_counts = Counter(words)

# Display the 10 most common words
print("\nTop 10 common words in descriptions:")
print(word_counts.most_common(10))

Sample Crime Descriptions:
580095    GRAND THEFT FROM LOCKED AUTO
387324             PROBATION VIOLATION
340214                  WARRANT ARREST
616727                TRAFFIC ACCIDENT
560065                    FOUND PERSON
Name: Descript, dtype: object

Top 10 common words in descriptions:
[('theft', 167304), ('from', 104257), ('grand', 95196), ('of', 94162), ('auto', 86025), ('locked', 73504), ('property', 73257), ('petty', 59564), ('stolen', 48432), ('violation', 42864)]


### Text Data Exploration

In [None]:
# List unique districts
unique_districts = df['PdDistrict'].unique()
unique_districts

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.dates as mdates
from matplotlib.colors import to_rgba

def analyze_crime_trends(df, district=None, rolling_windows=[30, 90, 365], 
                         yearly_view='subplots', selected_window=30):
    """
    Analyze crime trends with flexible visualization options
    
    Parameters:
    - df: DataFrame with crime data
    - district: Optional district to filter by
    - rolling_windows: List of rolling window sizes (in days)
    - yearly_view: 'subplots' for separate charts or 'single' for color-coded single chart
    - selected_window: Which rolling window to use for yearly view (default: 30)
    """
    # Filter by district if specified
    if district:
        data = df[df['PdDistrict'] == district].copy()
        title_suffix = f" in {district} District"
    else:
        data = df.copy()
        title_suffix = " Across All Districts"
    
    # Daily count aggregation
    daily_counts = data.groupby(pd.Grouper(freq='D')).size()
    
    # Fill any missing days with zeros
    idx = pd.date_range(daily_counts.index.min(), daily_counts.index.max())
    daily_counts = daily_counts.reindex(idx, fill_value=0)
    
    # Create rolling windows 
    rolling_df = pd.DataFrame({'Daily': daily_counts})
    for window in rolling_windows:
        rolling_df[f'{window}-Day'] = daily_counts.rolling(window=window).sum()
    
    # Add year column for easier filtering
    rolling_df['Year'] = rolling_df.index.year
    
    # Plot main rolling window chart with dual axis
    fig, ax1 = plt.subplots(figsize=(14, 7))
    
    # Short-term windows on left axis
    short_windows = [w for w in rolling_windows if w < 100]
    for window in short_windows:
        ax1.plot(rolling_df.index, rolling_df[f'{window}-Day'], 
                 label=f'{window}-Day', linewidth=1.5)
    
    ax1.set_xlabel('Date')
    ax1.set_ylabel('Short-term Crime Count', color='#1f77b4')
    ax1.tick_params(axis='y', labelcolor='#1f77b4')
    ax1.grid(True, alpha=0.3)
    
    # Long-term windows on right axis if present
    long_windows = [w for w in rolling_windows if w >= 100]
    if long_windows:
        ax2 = ax1.twinx()
        for window in long_windows:
            ax2.plot(rolling_df.index, rolling_df[f'{window}-Day'], 
                    label=f'{window}-Day', color='green', linewidth=2)
        ax2.set_ylabel('Long-term Crime Count', color='green')
        ax2.tick_params(axis='y', labelcolor='green')
    
    # Add a single legend combining both axes
    lines1, labels1 = ax1.get_legend_handles_labels()
    if long_windows:
        lines2, labels2 = ax2.get_legend_handles_labels()
        ax1.legend(lines1 + lines2, labels1 + labels2, loc='upper left')
    else:
        ax1.legend(loc='upper left')
    
    # Mark when windows start having complete data
    for window in rolling_windows:
        if window > 30:
            start_date = rolling_df.index[0] + pd.Timedelta(days=window-1)
            ax1.axvline(x=start_date, color='gray', linestyle='--', alpha=0.3)
            y_pos = 0.02 if window in short_windows else 0.98
            ax1.annotate(f'{window}-day window starts', 
                        xy=(start_date, y_pos),
                        xycoords=('data', 'axes fraction'),
                        rotation=90,
                        verticalalignment='bottom' if y_pos < 0.5 else 'top',
                        fontsize=8,
                        color='gray')
    
    # Improve x-axis formatting
    ax1.xaxis.set_major_locator(mdates.YearLocator())
    ax1.xaxis.set_major_formatter(mdates.DateFormatter('%Y'))
    ax1.xaxis.set_minor_locator(mdates.MonthLocator())
    
    plt.title(f'Rolling Crime Count{title_suffix}')
    plt.tight_layout()
    plt.show()
    
    # Create yearly visualization based on selected option
    years = sorted(rolling_df['Year'].unique())
    
    if yearly_view == 'subplots':
        # Multiple subplots approach
        num_cols = 3
        num_rows = (len(years) + num_cols - 1) // num_cols
        
        fig, axes = plt.subplots(num_rows, num_cols, figsize=(16, 4 * num_rows), 
                                sharex=False, sharey=True)
        axes = axes.flatten() if hasattr(axes, 'flatten') else [axes]
        
        # Find global y-limits for consistent scaling
        y_min, y_max = float('inf'), float('-inf')
        
        for year in years:
            year_data = rolling_df[rolling_df['Year'] == year]
            if not year_data.empty:
                curr_min = year_data[f'{selected_window}-Day'].min()
                curr_max = year_data[f'{selected_window}-Day'].max()
                y_min = min(y_min, curr_min)
                y_max = max(y_max, curr_max)
        
        # Add padding to y-limits
        y_range = y_max - y_min
        y_min = max(0, y_min - 0.05 * y_range)
        y_max = y_max + 0.05 * y_range
        
        # Plot each year
        for i, year in enumerate(years):
            year_data = rolling_df[rolling_df['Year'] == year]
            if year_data.empty:
                continue
                
            ax = axes[i]
            ax.plot(year_data.index, year_data[f'{selected_window}-Day'], 
                   linewidth=1.5)
            
            # Set consistent y-limits and grid
            ax.set_ylim(y_min, y_max)
            ax.grid(True, alpha=0.3)
            
            # Format dates to show months
            ax.xaxis.set_major_locator(mdates.MonthLocator())
            ax.xaxis.set_major_formatter(mdates.DateFormatter('%b'))
            
            # Add year as title
            ax.set_title(f'{year}', fontsize=10)
        
        # Remove unused subplots
        for j in range(len(years), len(axes)):
            fig.delaxes(axes[j])
            
        plt.suptitle(f'Yearly Crime Trends ({selected_window}-Day Window){title_suffix}', fontsize=16)
        plt.tight_layout(rect=[0, 0, 1, 0.95])
        plt.show()
        
    else:  # 'single' chart with color-coded years
        plt.figure(figsize=(14, 7))
        
        # Generate a color palette
        cmap = plt.get_cmap('tab10')
        colors = [cmap(i % 10) for i in range(len(years))]
        
        # Plot each year with a different color
        for i, year in enumerate(years):
            year_data = rolling_df[rolling_df['Year'] == year]
            if year_data.empty:
                continue
                
            # Normalize dates to a common year (2000 is leap year) for overlay
            normalized_dates = []
            values = []
            
            for date, value in zip(year_data.index, year_data[f'{selected_window}-Day']):
                # Create a normalized date in year 2000 (leap year)
                norm_date = pd.Timestamp(2000, date.month, date.day)
                normalized_dates.append(norm_date)
                values.append(value)
            
            # Get a lighter version of the color for the fill
            color = colors[i]
            light_color = to_rgba(color, 0.2)
            
            plt.plot(normalized_dates, values, 
                    label=f'{year}', 
                    color=color,
                    linewidth=1.5)
        
        # Format x-axis to show months
        plt.gca().xaxis.set_major_locator(mdates.MonthLocator())
        plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%b'))
        
        plt.grid(True, alpha=0.3)
        plt.title(f'Crime Trends by Year ({selected_window}-Day Window){title_suffix}')
        plt.xlabel('Month')
        plt.ylabel('Crime Count')
        plt.legend(loc='best')
        plt.tight_layout()
        plt.show()
    
    return rolling_df

In [None]:
# Use the function with your dataset
result_df = analyze_crime_trends(df, district=unique_districts[0], yearly_view="single", rolling_windows=[30, 90, 365])

## Saving the Cleaned Dataset

In [None]:
# Summary of the dataset with new features
print("Dataframe Columns After Feature Engineering:")
print(df.columns.tolist())

In [None]:
df.head()

In [None]:
df.info()

In [None]:
# Save the cleaned dataset for future use
df.to_csv('Cleaned_Competition_Dataset.csv', index=False)

In [None]:
# display unique descripts
unique_descripts = df['Descript'].unique()
print(f"Unique Descriptions ({len(unique_descripts)}):")
print("\n".join(unique_descripts))

In [None]:
# Display unique resolutions
unique_resolutions = df['Resolution'].unique()
print(f"\nUnique Resolutions ({len(unique_resolutions)}):")
print("\n".join(unique_resolutions)) 

In [None]:
# Example using VADER for sentiment analysis:
from nltk.sentiment import SentimentIntensityAnalyzer
import nltk
nltk.download('vader_lexicon')

In [None]:
sia = SentimentIntensityAnalyzer()

def get_sentiment_score(text):
    return sia.polarity_scores(text)['compound']

# Assume 'rephrased_text' is the output from your rephrasing pipeline:
rephrased_text = df["Descript"].sample(1).values[0]  # Sample a description for demonstration
print("Rephrased Text:", rephrased_text)
sentiment_score = get_sentiment_score(rephrased_text)
print("Sentiment Score:", sentiment_score)

# Now, when creating your feature vector, you can include:
# - Text embeddings from your rephrased text
# - Sentiment score as an additional feature
# - Other features (e.g., time, location, categorical variables)


In [10]:
import pandas as pd
import geopandas
import folium
import matplotlib.pyplot as plt

In [13]:
geometry = geopandas.points_from_xy(df["Longitude (X)"], df["Latitude (Y)"])
geo_df = geopandas.GeoDataFrame(
    df[["Year", "Category", "PdDistrict", "Latitude (Y)", "Longitude (X)", "TimePeriod"]], geometry=geometry
)

geo_df.head()

Unnamed: 0,Year,Category,PdDistrict,Latitude (Y),Longitude (X),TimePeriod,geometry
0,2015,WARRANTS,NORTHERN,-122.425892,37.774599,Night,POINT (37.775 -122.426)
1,2015,OTHER OFFENSES,NORTHERN,-122.425892,37.774599,Night,POINT (37.775 -122.426)
2,2015,OTHER OFFENSES,NORTHERN,-122.424363,37.800414,Night,POINT (37.8 -122.424)
3,2015,LARCENY/THEFT,NORTHERN,-122.426995,37.800873,Night,POINT (37.801 -122.427)
4,2015,LARCENY/THEFT,PARK,-122.438738,37.771541,Night,POINT (37.772 -122.439)


In [21]:
fmap = folium.Map(location=[37.775, -122.426], tiles="CartoDB Positron", zoom_start=12)
fmap

In [23]:
# Create a geometry list from the GeoDataFrame
geo_df_list = [[point.xy[1][0], point.xy[0][0]] for point in geo_df.geometry]

# Iterate through list and add a marker for each volcano, color-coded by its type.
i = 0
# Limit the number of points for performance - using sample
sample_size = 1000  # Adjust based on performance needs
sampled_df = geo_df.sample(sample_size, random_state=42)
sampled_points = [[point.xy[1][0], point.xy[0][0]] for point in sampled_df.geometry]

# Define a color mapping for common crime categories
color_mapping = {
    'LARCENY/THEFT': 'blue',
    'OTHER OFFENSES': 'gray',
    'VANDALISM': 'orange',
    'ASSAULT': 'red',
    'ROBBERY': 'purple',
    'BURGLARY': 'darkred',
    'DRUG/NARCOTIC': 'green',
    'WARRANTS': 'lightred',
    'FORGERY/COUNTERFEITING': 'lightgreen',
    'NON-CRIMINAL': 'lightblue'
}

# Iterate through the sampled points
for i, coordinates in enumerate(sampled_points):
    crime_category = sampled_df.iloc[i]['Category']
    
    # Assign a color based on crime category
    if crime_category in color_mapping:
        marker_color = color_mapping[crime_category]
    else:
        marker_color = 'cadetblue'  # Default color for other categories

    # Create popup content
    popup_content = (
        f"Year: {sampled_df.iloc[i]['Year']}<br>"
        f"Category: {crime_category}<br>"
        f"District: {sampled_df.iloc[i]['PdDistrict']}<br>"
        f"Time: {sampled_df.iloc[i]['TimePeriod']}<br>"
        f"Coordinates: {coordinates}"
    )
    
    # Place the marker on the map
    fmap.add_child(
        folium.Marker(
            location=coordinates,
            popup=popup_content,
            icon=folium.Icon(color=marker_color),
        )
    )

In [24]:
fmap