In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

In [None]:
# Read the cleaned stop-and-search CSV into a pandas DataFrame.
# pandas opens and closes the file itself when given a path, and decodes it as
# UTF-8 rather than the platform's default text encoding that a bare open()
# would use, so the load behaves the same on every machine.
df = pd.read_csv('cleandata/CleanSearchData.csv')

df.head()

In [None]:
# df.head(1000).to_csv('streetsample.csv')

## Stop and search data plots

In [None]:
# Load the dataset
file_path = 'cleandata/CleanSearchData.csv'
data = pd.read_csv(file_path)

# Print the column/dtype summary, then show the first rows as the cell output.
# info() prints its report and returns None, so it is called on its own line;
# bundling it into a tuple with head() would display "(<frame>, None)" as plain
# text instead of the rendered table.
data.info()
data.head()


In [None]:
# Distinct values that occur in the 'Outcome' column
pd.unique(data['Outcome'])

In [None]:
# Convert 'Date' to datetime (repeating this on an already-converted column is harmless)
data['Date'] = pd.to_datetime(data['Date'])

# Drop the coordinate and legislation columns.
# Reassigning with errors='ignore' (instead of inplace=True) keeps this cell
# re-runnable: on a second run the columns are already gone and the drop is a
# no-op rather than a KeyError.
data = data.drop(columns=['Longitude', 'Latitude', 'Legislation'], errors='ignore')

# Drop rows with null values in columns of interest
columns_of_interest = ['Gender', 'Age range', 'Self-defined ethnicity',
                       'Officer-defined ethnicity', 'Object of search']
# .copy() makes data_clean an independent frame, so later cells can add
# columns to it without pandas raising SettingWithCopyWarning.
data_clean = data.dropna(subset=columns_of_interest).copy()

# Print the summary, then display the first rows as the cell output
# (info() returns None, so it must not be part of the displayed expression).
data_clean.info()
data_clean.head()


In [None]:
# Set the plotting style
sns.set(style="whitegrid")

# Search density over time: number of searches per calendar month.
# The month key is built as a separate Series instead of being written into
# data_clean, so this cell leaves the cleaned frame untouched and cannot
# trigger SettingWithCopyWarning when data_clean is the result of a dropna/filter.
month_start = (
    data_clean['Date']
    .dt.to_period('M')
    .dt.to_timestamp()
    .rename('Month-Year')
)
time_density = data_clean.groupby(month_start).size()

# Explicit figure/axes so the plot is bound to this figure only
fig, ax = plt.subplots(figsize=(12, 6))
time_density.plot(ax=ax, title='Search Operations Density Over Time', color='blue', marker='o')
ax.set_xlabel('Date')
ax.set_ylabel('Number of Searches')
ax.grid(True)
plt.show()

# Prepare the per-category search counts used by the bar plots below.
# Each result is a Series indexed by category label, holding how many rows of
# data_clean carry that label (value_counts orders it from most to least frequent).
(age_counts,
 gender_counts,
 self_ethnicity_counts,
 officer_ethnicity_counts,
 object_search_counts) = (
    data_clean[column].value_counts()
    for column in ('Age range',
                   'Gender',
                   'Self-defined ethnicity',
                   'Officer-defined ethnicity',
                   'Object of search')
)

# Plotting function for the bar plots
def plot_bar(data, title, xlabel, ylabel, color, filename, save=False):
    """Draw a horizontal bar chart from a Series of counts.

    Parameters
    ----------
    data : pandas.Series
        Values are plotted along the x-axis, index labels along the y-axis
        (e.g. the output of ``Series.value_counts()``).
    title : str
        Figure title.
    xlabel, ylabel : str
        Axis labels.
    color : str or sequence
        Passed to seaborn as ``palette``.
    filename : str
        Path the figure is written to when ``save`` is true; ignored otherwise.
    save : bool, default False
        When true, save the figure to ``filename`` before showing it. The
        default keeps the existing behaviour of only displaying the plot.
    """
    fig, ax = plt.subplots(figsize=(12, 6))
    # NOTE(review): newer seaborn releases emit a FutureWarning when `palette`
    # is given without `hue` — confirm against the installed version before
    # changing this call.
    sns.barplot(x=data.values, y=data.index, palette=color, ax=ax)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    fig.tight_layout()
    if save:
        # Save before show(): some backends release the figure once it is shown.
        fig.savefig(filename)
    plt.show()

# Plots for Age, Gender, Ethnicities, and Objects of search.
# One row per chart: (counts, title, y-axis label, output file name).
bar_chart_specs = [
    (age_counts, 'Search Operations per Age', 'Age Range', 'search1a.png'),
    (gender_counts, 'Search Operations per Gender', 'Gender', 'search1b.png'),
    (self_ethnicity_counts, 'Search Operations per Self-Defined Ethnicity', 'Self-Defined Ethnicity', 'search1c.png'),
    (officer_ethnicity_counts, 'Search Operations per Officer-Defined Ethnicity', 'Officer-Defined Ethnicity', 'search1d.png'),
    (object_search_counts, 'Search Operations per Object of Search', 'Object of Search', 'search1e.png'),
]
for counts, chart_title, category_label, png_name in bar_chart_specs:
    # Every chart shares the same x-axis label and palette
    plot_bar(counts, chart_title, 'Number of Searches', category_label, 'Blues_r', png_name)



## 2nd try with Search data

In [None]:
# Load the dataset
file_path = 'cleandata/CleanSearchData.csv'
data = pd.read_csv(file_path)

# Parse dates and drop the coordinate and legislation columns
# (reassignment instead of inplace=True so each step yields a clearly new frame)
data['Date'] = pd.to_datetime(data['Date'])
data = data.drop(columns=['Longitude', 'Latitude', 'Legislation'])

# Drop rows with null values in the relevant columns ('Outcome' included here)
data_clean = data.dropna(subset=['Gender', 'Age range', 'Self-defined ethnicity',
                                 'Officer-defined ethnicity', 'Object of search', 'Outcome'])

# Check the cleaned data: print the summary, then display the first rows.
# info() returns None, so it is kept out of the displayed expression; a
# "(info(), head())" tuple would render as "(None, <frame>)" in plain text.
data_clean.info()
data_clean.head()


In [None]:
# Reload the CSV and rebuild the cleaned frame.
# Relies on `file_path` and `columns_of_interest` defined in earlier cells.
data = pd.read_csv(file_path)

# Parse 'Date', then remove the coordinate and legislation columns
data = (
    data
    .assign(Date=pd.to_datetime(data['Date']))
    .drop(columns=['Longitude', 'Latitude', 'Legislation'])
)

# Keep only rows that have a value in every column of interest
data_clean = data.dropna(subset=columns_of_interest)

# Confirm changes
data_clean.info()


In [None]:
# Reload and preprocess the search data in a single pass.
# Relies on `file_path` and `columns_of_interest` defined in earlier cells.
# The load/preprocess sequence runs once; repeating it within the same cell
# only re-reads the file and produces the same frames.
try:
    data = pd.read_csv(file_path)
except Exception as e:
    # `from e` chains the original exception so its type and traceback stay
    # visible underneath the friendlier message.
    raise RuntimeError(f"Failed to load data: {e}") from e

# Convert 'Date' to datetime, drop unwanted columns and drop rows with null values in relevant columns
try:
    data['Date'] = pd.to_datetime(data['Date'])
    data = data.drop(columns=['Longitude', 'Latitude', 'Legislation'])
    data_clean = data.dropna(subset=columns_of_interest)
except Exception as e:
    raise RuntimeError(f"Failed to preprocess data: {e}") from e

# Confirm the data is ready
data_clean.info()


In [None]:
# Load the data
data = pd.read_csv('cleandata/CleanSearchData.csv')  # Replace with the path to your dataset

# Preprocess: parse dates, remove unused columns, require the plotted fields
data['Date'] = pd.to_datetime(data['Date'])
data = data.drop(columns=['Longitude', 'Latitude', 'Legislation'])
required_columns = ['Gender', 'Age range', 'Self-defined ethnicity',
                    'Officer-defined ethnicity', 'Object of search', 'Outcome']
data = data.dropna(subset=required_columns)

# Set up the plotting environment
sns.set(style="whitegrid")

# Define a color palette
palette = sns.color_palette("Blues_r")

# One grouped count plot per category column, bars split by 'Outcome'.
# Each row: (column, title, x tick-label rotation or None to leave the default,
# file name for an optional savefig).
outcome_plots = [
    ('Age range', 'Search Outcomes by Age', 45, 'search2a.png'),
    ('Gender', 'Search Outcomes by Gender', None, 'search2b.png'),
    ('Officer-defined ethnicity', 'Search Outcomes by Officer-Defined Ethnicity', 90, 'search2d.png'),
    ('Object of search', 'Search Outcomes by Object of Search', 45, 'search2e.png'),
]
for column, plot_title, rotation, png_name in outcome_plots:
    plt.figure(figsize=(12, 6))
    sns.countplot(x=column, hue='Outcome', data=data, palette=palette)
    plt.title(plot_title)
    if rotation is not None:
        plt.xticks(rotation=rotation)
    plt.tight_layout()
    # plt.savefig(png_name)
    plt.show()



In [None]:
# Short display names for the long 'Self-defined ethnicity' category labels,
# grouped by the broad ethnic group each label starts with.
short_labels = {
    # White
    'White - English/Welsh/Scottish/Northern Irish/British': 'White - British',
    'White - Irish': 'White - Irish',
    'White - Any other White background': 'White - Other',

    # Asian / Asian British
    'Asian/Asian British - Bangladeshi': 'Asian - Bangladeshi',
    'Asian/Asian British - Chinese': 'Asian - Chinese',
    'Asian/Asian British - Indian': 'Asian - Indian',
    'Asian/Asian British - Pakistani': 'Asian - Pakistani',
    'Asian/Asian British - Any other Asian background': 'Asian - Other',

    # Black / African / Caribbean / Black British
    'Black/African/Caribbean/Black British - African': 'Black - African',
    'Black/African/Caribbean/Black British - Caribbean': 'Black - Caribbean',
    'Black/African/Caribbean/Black British - Any other Black/African/Caribbean background': 'Black - Other',

    # Mixed / multiple ethnic groups
    'Mixed/Multiple ethnic groups - White and Asian': 'Mixed - White and Asian',
    'Mixed/Multiple ethnic groups - White and Black African': 'Mixed - White and Black African',
    'Mixed/Multiple ethnic groups - White and Black Caribbean': 'Mixed - White and Black Caribbean',
    'Mixed/Multiple ethnic groups - Any other Mixed/Multiple ethnic background': 'Mixed - Other',

    # Other ethnic group
    'Other ethnic group - Any other ethnic group': 'Other - Any other',
    'Other ethnic group - Not stated': 'Other - Not stated',
}


# Apply the short labels to the 'Self-defined ethnicity' column.
# Series.map() yields NaN for any label missing from short_labels, and the
# pivot below drops NaN index values — so an unmapped category would vanish
# from the chart without warning. Falling back to the original label keeps it.
self_defined = data['Self-defined ethnicity']
short_ethnicity = self_defined.map(short_labels).fillna(self_defined)

# assign() returns a new frame, so re-running the cell is idempotent and no
# column is written into a frame that was itself derived by dropna().
data = data.assign(**{'Short Self-defined ethnicity': short_ethnicity})

# Generate a pivot table where the columns are the outcomes and the index is the shortened ethnicity
pivot_data = data.pivot_table(index='Short Self-defined ethnicity', columns='Outcome', aggfunc='size', fill_value=0)

# Normalize the data by row to get proportions (each row sums to 1)
pivot_normalized = pivot_data.div(pivot_data.sum(axis=1), axis=0)

# Sort the y-axis labels
def custom_sort(label):
    """Return a ``(group, label)`` sort key for a shortened ethnicity label.

    Groups, in ascending sort order:
      0 - every label not covered below
      1 - labels ending in '- Other'
      2 - labels starting with 'Other -' (checked first, so it wins when both match)

    Labels in the same group are ordered alphabetically by the label itself.
    """
    in_other_group = label.startswith('Other -')
    is_group_remainder = label.endswith('- Other')
    group = 2 if in_other_group else (1 if is_group_remainder else 0)
    return (group, label)

# Order the ethnicity labels with the custom key, then put the rows of the
# normalized table into that order.
sorted_index = list(pivot_normalized.index)
sorted_index.sort(key=custom_sort)

pivot_normalized = pivot_normalized.reindex(index=sorted_index)

# Stacked horizontal bars: one bar per ethnicity, one segment per outcome
label_font = {'fontsize': 14}
fig, ax = plt.subplots(figsize=(14, 10))
pivot_normalized.plot.barh(stacked=True, colormap='Blues', ax=ax)
ax.set_title('Proportional Search Outcomes by Self-Defined Ethnicity', **label_font)
ax.set_xlabel('Proportion of Search Outcomes', **label_font)
ax.set_ylabel('Self-Defined Ethnicity', **label_font)
ax.tick_params(axis='both', which='major', labelsize=14)

# Swap the x-axis limits so the axis runs from 1.0 on the left to 0.0 on the right
x_low, x_high = ax.get_xlim()
ax.set_xlim(x_high, x_low)

# Rebuild the legend from the plotted artists, adding a title and pinning it
# to the upper-right corner of the axes
legend_handles, legend_labels = ax.get_legend_handles_labels()
ax.legend(legend_handles, legend_labels, title='Outcome', loc='upper right', bbox_to_anchor=(1, 1))

plt.tight_layout()
# plt.savefig('normalized_outcomes_ordered_y_axis.png')
plt.show()


## Street dataset


In [None]:
# Load the street dataset and list the distinct 'Last outcome category' values
file_path = 'cleandata/CleanStreetData.csv'
data = pd.read_csv(file_path)
pd.unique(data['Last outcome category'])

In [None]:
# Read the street dataset from disk into `data`
file_path = 'cleandata/CleanStreetData.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# Drop the location-related columns.
# Reassigning with errors='ignore' (instead of inplace=True) keeps this cell
# re-runnable: a second run finds the columns already removed and does nothing,
# where the in-place version would raise KeyError.
data = data.drop(columns=['Longitude', 'Latitude', 'LSOA code', 'Location'], errors='ignore')

In [None]:
# Number of rows per 'Month' value, ordered by the month label
crime_density = data['Month'].value_counts().sort_index()

fig, ax = plt.subplots(figsize=(12, 6))
crime_density.plot(ax=ax, title='Crime Density Over Time')
ax.set_xlabel('Month')
ax.set_ylabel('Number of Crimes')
fig.tight_layout()
# fig.savefig('street1a.png')
plt.show()


In [None]:
# Number of rows per 'Crime type', most frequent first
crime_type_count = data['Crime type'].value_counts()

fig, ax = plt.subplots(figsize=(12, 6))
crime_type_count.plot.bar(ax=ax, title='Crime Type Count')
ax.set_xlabel('Crime Type')
ax.set_ylabel('Count')
plt.xticks(rotation=45)
fig.tight_layout()
# fig.savefig('street1b.png')
plt.show()


In [None]:
variables = ['Crime type']  # Adjust column names as needed
for var in variables:
    # Rows: values of `var`; columns: 'Last outcome category'; cells: row counts
    outcome_table = pd.crosstab(data[var], data['Last outcome category'])
    # DataFrame.plot creates its own figure here and returns the Axes it drew on
    ax = outcome_table.plot.bar(stacked=True, figsize=(24, 12))
    ax.set_title(f'Outcome by {var}')
    ax.set_xlabel(var)
    ax.set_ylabel('Count')
    plt.xticks(rotation=45)
    plt.tight_layout()
    # plt.savefig('street1c.png')
    plt.show()


## PAS data borough


In [None]:
# Read the borough-level PAS file and preview its first five rows
data = pd.read_csv('cleandata/pas_data_borough.csv')
data.head(n=5)


In [None]:
# Parse the 'Date' column into datetime values
data = data.assign(Date=pd.to_datetime(data['Date']))

In [None]:
# Keep only the rows whose 'Measure' is 'Trust MPS'
is_trust_mps = data['Measure'].eq('Trust MPS')
trust_mps_data = data[is_trust_mps]

# Boroughs shown in the time series plot
selected_boroughs = ['Barking and Dagenham', 'Barnet', 'Bexley', 'Brent', 'Bromley']

# Restrict the 'Trust MPS' rows to those boroughs
in_selected_borough = trust_mps_data['Borough'].isin(selected_boroughs)
trust_mps_filtered = trust_mps_data[in_selected_borough]

# Time series of 'Proportion', one line per borough
fig, ax = plt.subplots(figsize=(14, 7))
sns.lineplot(data=trust_mps_filtered, x='Date', y='Proportion', hue='Borough', marker='o', ax=ax)
ax.set_title('Time Series of "Trust MPS" by Borough')
ax.set_xlabel('Date')
ax.set_ylabel('Proportion')
ax.legend(title='Borough')
ax.grid(True)
fig.tight_layout()
# fig.savefig('trust1.png')
plt.show()


In [None]:
# Mean 'Proportion' per borough over the 'Trust MPS' rows, lowest first
proportion_by_borough = trust_mps_data.groupby('Borough')['Proportion']
average_trust_mps = proportion_by_borough.mean().sort_values()

# Bar chart of the per-borough averages
fig, ax = plt.subplots(figsize=(14, 7))
average_trust_mps.plot.bar(ax=ax, color='lightblue')
ax.set_title('Average "Trust MPS" by Borough')
ax.set_xlabel('Borough')
ax.set_ylabel('Average Proportion')
plt.xticks(rotation=90)
ax.grid(axis='y')
fig.tight_layout()
# fig.savefig('trust2.png')
plt.show()


In [None]:
# Histogram (20 bins, with a KDE curve) of the 'Trust MPS' proportions
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(trust_mps_data['Proportion'], bins=20, kde=True, color='purple', ax=ax)
ax.set_title('Distribution of "Trust MPS" Values')
ax.set_xlabel('Proportion')
ax.set_ylabel('Frequency')
ax.grid(True)
fig.tight_layout()
# fig.savefig('trust3.png')
plt.show()


## PAS data MPS - trust MPS

In [None]:
# Read the MPS-wide PAS file and preview its first five rows
new_data = pd.read_csv('cleandata/pas_data_MPS.csv')
new_data.head(n=5)


In [None]:
# Parse the 'Date' column into datetime values
new_data = new_data.assign(Date=pd.to_datetime(new_data['Date']))

# Keep only the rows whose 'Measure' is 'Trust MPS'
is_trust_measure = new_data['Measure'].eq('Trust MPS')
trust_mps_only = new_data[is_trust_measure]

# Preview the filtered rows as a sanity check
trust_mps_only.head()


In [None]:
# Line plot of the 'Trust MPS' proportion against date
fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(data=trust_mps_only, x='Date', y='Proportion', marker='o', color='blue', ax=ax)
ax.set_title('Time Series of "Trust MPS"')
ax.set_xlabel('Date')
ax.set_ylabel('Trust MPS Proportion')
ax.grid(True)
fig.tight_layout()
# fig.savefig('trust1(MPS).png')
plt.show()


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the 'Trust MPS' proportions
trust_mps_statistics = trust_mps_only.loc[:, 'Proportion'].describe()
trust_mps_statistics
