In [1]:
import pandas as pd
import os

# Yearly crime CSV exports that get stacked into a single dataset
file_names = [
    'crime_data_2015.csv',
    'crime_data_2016.csv',
    'crime_data_2017.csv',
    'crime_data_2018.csv',
    'crime_data_2019.csv',
    'crime_data_2020.csv',
    'crime_data_2021.csv',
    'crime_data_2022.csv',
    'crime_data_2023.csv',
]

# Folder that holds the yearly CSV files
directory_path = r'C:\Users\Kayode\Desktop\MSc Data Science UWE\Second Semester\CSCT Masters Project\Dataset'

# Load every yearly file into its own DataFrame, keeping the order of file_names
dataframes_list = []
for csv_name in file_names:
    csv_path = os.path.join(directory_path, csv_name)
    dataframes_list.append(pd.read_csv(csv_path, low_memory=False))

# Stack the yearly frames under a fresh 0..n-1 index and write the result
# (without the index column) to the current working directory
(
    pd.concat(dataframes_list, ignore_index=True)
    .to_csv('crime_data_merged.csv', index=False)
)

In [None]:
import pandas as pd

# Merged dataset written by the CSV-merge cell
file_path = 'crime_data_merged.csv'

# Load the whole file; low_memory=False is passed through to the pandas parser
df = pd.read_csv(file_path, low_memory=False)

# Preview the top five rows
df.head(n=5)

In [None]:
# Fill missing values column by column: numeric columns receive the column
# mean, every other column receives its most frequent value.
#
# The filled Series is assigned back to df[col] rather than calling
# df[col].fillna(..., inplace=True). The in-place form operates on a temporary
# column selection (chained assignment); pandas warns about it and, with
# Copy-on-Write enabled, it leaves df unchanged.
numeric_columns = set(df.select_dtypes(include='number').columns)

for col in df.columns:
    if not df[col].isna().any():
        continue  # nothing to fill in this column

    if col in numeric_columns:
        fill_value = df[col].mean()
    else:
        most_frequent = df[col].mode()
        if most_frequent.empty:
            # Column is entirely missing, so there is no mode to fill with
            continue
        fill_value = most_frequent[0]

    df[col] = df[col].fillna(fill_value)

# Save the processed DataFrame, then read it back so df reflects exactly
# what is stored on disk
df.to_csv('crime_data_model.csv', index=False)
df = pd.read_csv('crime_data_model.csv', low_memory=False)

# Display the first few rows of the new DataFrame
df.head()

In [None]:
# seaborn and matplotlib are imported here because this is the first cell that
# plots; without these imports `sns` and `plt` are undefined on a fresh kernel.
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Path to the file
file_path = 'crime_data_model.csv'

# Read the file
df = pd.read_csv(file_path, low_memory=False)

# Distribution of the target variable (number of rows per HOUR value)
sns.countplot(x='HOUR', data=df)
plt.title('Distribution of Crimes Over Hours')
plt.show()

In [None]:
# Count of rows per DAY_OF_WEEK value, drawn on a 10x6 inch figure
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(x='DAY_OF_WEEK', data=df, ax=ax)
ax.set_title('Distribution of Crimes Over Weekdays')
plt.show()

In [None]:
# Correlation matrix
# Only numeric columns are correlated. Calling df.corr() on a frame that still
# holds text columns raises in pandas >= 2.0 (the notebook later label-encodes
# columns such as DAY_OF_WEEK, so they are presumably text at this point).
numeric_df = df.select_dtypes(include='number')

plt.figure(figsize=(12, 8))
sns.heatmap(numeric_df.corr(), annot=True, fmt='.2f')
plt.title('Correlation Matrix')
plt.show()

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Hard-coded evaluation scores (in percent), one entry per model in `models`.
# NOTE(review): presumably transcribed from the classification reports printed
# by the model cells - confirm the figures against those outputs.
models = ['DT', 'SVM', 'RF', 'XGBoost']
accuracy = [77, 86, 94, 97]
precision = [77, 90.75, 95.75, 97.25]
recall = [82, 85.25, 95.5, 97]
f1_score = [77, 85.25, 95.25, 97.25]

# One row per model, one column per metric.
# Stored as `metrics_df` rather than `df` so the crime dataset held in `df`
# is not overwritten; otherwise re-running an earlier plotting cell would
# operate on this four-row table instead of the crime data.
metrics_df = pd.DataFrame({
    'Accuracy': accuracy,
    'Precision': precision,
    'Recall': recall,
    'F1 Score': f1_score
}, index=models)

# Grouped bar chart: one group per model, one bar per metric
ax = metrics_df.plot(kind='bar', figsize=(10, 7))

# Titles and axis labels
ax.set_title('Performance Metrics of Machine Learning Models')
ax.set_xlabel('Models')
ax.set_ylabel('Percentage')
plt.xticks(rotation=0)  # keep the model names horizontal

# Showing the plot
plt.show()

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

# Columns that are imputed and then label-encoded
categorical_columns = ['OFFENSE_CODE_GROUP', 'OFFENSE_DESCRIPTION', 'DISTRICT', 'DAY_OF_WEEK', 'UCR_PART', 'STREET']

# Load the dataset to encode. `crime_data_model` is not defined by any earlier
# cell, so it is read here from the file written by the missing-value step.
crime_data_model = pd.read_csv('crime_data_model.csv', low_memory=False)

# Imputer that replaces missing entries with the column's most frequent value
imputer = SimpleImputer(strategy='most_frequent')

# Single LabelEncoder, refitted for each column
label_encoder = LabelEncoder()

# The encoder is refitted on every column, so the classes it learned for each
# one are saved here; position i in the array is the label encoded as i.
encoded_classes = {}

for column in categorical_columns:
    # fit_transform returns an (n_rows, 1) array; flatten it to one dimension
    # before using it as a column
    imputed_values = imputer.fit_transform(crime_data_model[[column]]).ravel()

    # Replace each category with its integer code
    crime_data_model[column] = label_encoder.fit_transform(imputed_values)
    encoded_classes[column] = label_encoder.classes_

# Check the dataframe to ensure the changes
print(crime_data_model[categorical_columns].head())

# Write the encoded data back to crime_data_model.csv (row index not saved).
# This overwrites the file that was loaded above.
crime_data_model.to_csv('crime_data_model.csv', index=False)



In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the modelling dataset.
# The path is a raw string: in a normal string literal the "\U" in "C:\Users"
# starts a \UXXXXXXXX unicode escape and the cell fails with a SyntaxError.
df = pd.read_csv(r'C:\Users\Kayode\Desktop\MSc Data Science UWE\Second Semester\CSCT Masters Project\Dataset\crime_data_model.csv')

X = df.drop('HOUR', axis=1)  # Features (all columns except the target)
y = df['HOUR']               # Target variable

# Hold out 20% of the rows for testing; random_state fixes the shuffle
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

# Read the modelling dataset from the working directory
crime_data_model9 = pd.read_csv('crime_data_model9.csv')

# HOUR is the target; every remaining column is used as a feature
feature_columns = crime_data_model9.columns.drop('HOUR')
X = crime_data_model9[feature_columns]
y = crime_data_model9['HOUR']

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Seeded decision tree, fitted on the training portion only
dtree = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Predictions for both splits
train_preds = dtree.predict(X_train)
test_preds = dtree.predict(X_test)

# Print the same report for each split: training first, then testing
for heading, y_true, y_pred in (
    ("Training Data Evaluation:", y_train, train_preds),
    ("Testing Data Evaluation:", y_test, test_preds),
):
    print(heading)
    print("Classification Report:")
    print(classification_report(y_true, y_pred))
    print("Confusion Matrix:")
    print(confusion_matrix(y_true, y_pred))

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

# Read the modelling dataset from the working directory.
# NOTE(review): the variable is called crime_data_model9 but the file read is
# crime_data_model.csv - confirm which dataset this model is meant to use.
crime_data_model9 = pd.read_csv('crime_data_model.csv')

# HOUR is the target; all other columns are features
feature_columns = crime_data_model9.columns.drop('HOUR')
X = crime_data_model9[feature_columns]
y = crime_data_model9['HOUR']

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Seeded random forest, fitted on the training portion only
rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Predictions for both splits
train_preds = rf.predict(X_train)
test_preds = rf.predict(X_test)

# Report on the training split first, then on the held-out split
split_results = [
    ("Training Data Evaluation:", y_train, train_preds),
    ("Testing Data Evaluation:", y_test, test_preds),
]
for heading, actual, predicted in split_results:
    print(heading)
    print("Classification Report:")
    print(classification_report(actual, predicted))
    print("Confusion Matrix:")
    print(confusion_matrix(actual, predicted))

In [None]:
import pandas as pd
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler

# Load the data.
# The path is a raw string: in a normal string literal the "\U" in "C:\Users"
# starts a \UXXXXXXXX unicode escape and the cell fails with a SyntaxError.
crime_data_model = pd.read_csv(r'C:\Users\Kayode\Desktop\MSc Data Science UWE\Second Semester\CSCT Masters Project\Dataset\crime_data_model.csv')

# HOUR is the target; every other column is used as a feature
feature_columns = crime_data_model.columns.drop('HOUR')
X = crime_data_model[feature_columns]
y = crime_data_model['HOUR']

# Hold out 20% of the rows for testing; random_state fixes the shuffle
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardise the features. The scaler is fitted on the training split only
# and then applied unchanged to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize the SVM Classifier
svm_model = SVC(random_state=42)

# Fit the model on the training data
svm_model.fit(X_train_scaled, y_train)

# 5-fold cross-validation on the training split.
# NOTE(review): the folds reuse features scaled with statistics from the whole
# training split, so each validation fold has influenced its own scaling.
cv_scores = cross_val_score(svm_model, X_train_scaled, y_train, cv=5)
print("Cross-validation scores:", cv_scores)
print("Average cross-validation score:", cv_scores.mean())

# Predict on training data
train_preds = svm_model.predict(X_train_scaled)

# Evaluate the model on the training data
print("Training Data Evaluation:")
print("Classification Report:")
print(classification_report(y_train, train_preds))
print("Confusion Matrix:")
print(confusion_matrix(y_train, train_preds))

# Predict on testing data
test_preds = svm_model.predict(X_test_scaled)

# Evaluate the model on the testing data
print("Testing Data Evaluation:")
print("Classification Report:")
print(classification_report(y_test, test_preds))
print("Confusion Matrix:")
print(confusion_matrix(y_test, test_preds))

In [None]:
import pandas as pd
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler

# Load the data (crime_data_model9.csv variant of the dataset).
# The path is a raw string: in a normal string literal the "\U" in "C:\Users"
# starts a \UXXXXXXXX unicode escape and the cell fails with a SyntaxError.
crime_data_model = pd.read_csv(r'C:\Users\Kayode\Desktop\MSc Data Science UWE\Second Semester\CSCT Masters Project\Dataset\crime_data_model9.csv')

# HOUR is the target; every other column is used as a feature
feature_columns = crime_data_model.columns.drop('HOUR')
X = crime_data_model[feature_columns]
y = crime_data_model['HOUR']

# Hold out 20% of the rows for testing; random_state fixes the shuffle
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardise the features. The scaler is fitted on the training split only
# and then applied unchanged to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize the SVM Classifier
svm_model = SVC(random_state=42)

# Fit the model on the training data
svm_model.fit(X_train_scaled, y_train)

# 5-fold cross-validation on the training split.
# NOTE(review): the folds reuse features scaled with statistics from the whole
# training split, so each validation fold has influenced its own scaling.
cv_scores = cross_val_score(svm_model, X_train_scaled, y_train, cv=5)
print("Cross-validation scores:", cv_scores)
print("Average cross-validation score:", cv_scores.mean())

# Predict on training data
train_preds = svm_model.predict(X_train_scaled)

# Evaluate the model on the training data
print("Training Data Evaluation:")
print("Classification Report:")
print(classification_report(y_train, train_preds))
print("Confusion Matrix:")
print(confusion_matrix(y_train, train_preds))

# Predict on testing data
test_preds = svm_model.predict(X_test_scaled)

# Evaluate the model on the testing data
print("Testing Data Evaluation:")
print("Classification Report:")
print(classification_report(y_test, test_preds))
print("Confusion Matrix:")
print(confusion_matrix(y_test, test_preds))

In [None]:
import pandas as pd
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score

# Load the data.
# The path is a raw string: in a normal string literal the "\U" in "C:\Users"
# starts a \UXXXXXXXX unicode escape and the cell fails with a SyntaxError.
crime_data_model = pd.read_csv(r'C:\Users\Kayode\Desktop\MSc Data Science UWE\Second Semester\CSCT Masters Project\Dataset\crime_data_model9.csv')

# HOUR is the target; every other column is used as a feature
feature_columns = crime_data_model.columns.drop('HOUR')
X = crime_data_model[feature_columns]
y = crime_data_model['HOUR']

# Hold out 20% of the rows for testing; random_state fixes the shuffle
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the XGBoost Classifier with multiclass log-loss as the eval metric.
# NOTE(review): use_label_encoder is deprecated in recent XGBoost releases -
# confirm it is still accepted by the installed version.
xgb_model = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss')

# Fit the model on the training data
xgb_model.fit(X_train, y_train)

# 5-fold cross-validation on the training split
cv_scores = cross_val_score(xgb_model, X_train, y_train, cv=5)
print("Cross-validation scores:", cv_scores)
print("Average cross-validation score:", cv_scores.mean())

# Predict on training data
train_preds = xgb_model.predict(X_train)

# Evaluate the model on the training data
print("Training Data Evaluation:")
print("Classification Report:")
print(classification_report(y_train, train_preds))
print("Confusion Matrix:")
print(confusion_matrix(y_train, train_preds))

# Predict on testing data
test_preds = xgb_model.predict(X_test)

# Evaluate the model on the testing data
print("Testing Data Evaluation:")
print("Classification Report:")
print(classification_report(y_test, test_preds))
print("Confusion Matrix:")
print(confusion_matrix(y_test, test_preds))