In [35]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset from Excel
file_path = r"C:\Users\ksjav\Downloads\SUITS\Data Analytics and ML\The_Case_of_the_Predictive_Crime_Solver.csv"  # Replace with your file path
df = pd.read_csv(file_path)

# Select relevant features and target variable
selected_features = ['Vict Age', 'Vict Sex', 'Vict Descent', 'Premis Desc', 'Weapon Used Cd', 'LAT', 'LON', 'Crm Cd Desc']
target_variable = 'Crm Cd'  # Assuming Crm Cd is the crime code or type you want to predict

# Filter the dataset to include only selected features and target variable
df_selected = df[selected_features + [target_variable]].dropna()  # Drop rows with NaN values if any

# Separate features (X) and target variable (y)
X = df_selected.drop(columns=[target_variable, 'Crm Cd Desc'])  # Exclude non-numeric features and target labels
y = df_selected[target_variable]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [36]:
# Define preprocessing for numeric features (scaling and imputing missing values)
numeric_features = ['Vict Age', 'Weapon Used Cd', 'LAT', 'LON']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Define preprocessing for categorical features (encoding)
categorical_features = ['Vict Sex', 'Vict Descent', 'Premis Desc']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Append classifier to preprocessing pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', DecisionTreeClassifier())])


In [37]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Define parameter distributions for RandomizedSearchCV
param_dist = {
    'classifier__max_depth': [None, 10, 20, 30],
    'classifier__min_samples_split': randint(2, 11),  # Random integer values between 2 and 10
    'classifier__min_samples_leaf': randint(1, 5)     # Random integer values between 1 and 4
}

# Perform Randomized Search CV
random_search = RandomizedSearchCV(pipeline, param_distributions=param_dist, n_iter=20, cv=5, scoring='accuracy', random_state=42)
random_search.fit(X_train, y_train)

# Best parameters and best score
print("Best parameters found: ", random_search.best_params_)
print("Best cross-validation score: {:.2f}".format(random_search.best_score_))

# Get the best model
best_model = random_search.best_estimator_

# Predictions on the test set
y_pred = best_model.predict(X_test)

# Evaluate model performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))




Best parameters found:  {'classifier__max_depth': 10, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 6}
Best cross-validation score: 0.45
Accuracy: 0.47
Classification Report:
              precision    recall  f1-score   support

         110       0.00      0.00      0.00        21
         113       0.00      0.00      0.00         1
         121       0.38      0.07      0.12        67
         122       0.00      0.00      0.00         6
         210       0.46      0.21      0.29       444
         220       0.00      0.00      0.00        78
         230       0.51      0.67      0.58       749
         231       0.60      0.39      0.47        23
         235       0.00      0.00      0.00         9
         236       0.00      0.00      0.00       183
         237       0.00      0.00      0.00         5
         250       0.00      0.00      0.00         7
         251       0.67      0.43      0.52        37
         310       0.51      0.33      0.40   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [39]:
# Example test instance (simulated based on your dataset structure)
test_instance = {
    'Vict Age': 34,
    'Vict Sex': 'Male',
    'Vict Descent': 'Hispanic',
    'Premis Desc': 'Residence',
    'Weapon Used Cd': 400,
    'LAT': 34.0522,
    'LON': -118.2437
}

# Prepare the test instance for prediction
test_df = pd.DataFrame([test_instance])

# Make predictions
predicted_crime_code = best_model.predict(test_df)

# Print the predicted crime code
print(f"Predicted crime code: {predicted_crime_code[0]}")

# Interpret the predicted crime code using your dataset's definitions
crime_description = df.loc[df['Crm Cd'] == predicted_crime_code[0], 'Crm Cd Desc'].iloc[0]
print(f"Crime description: {crime_description}")

Predicted crime code: 626
Crime description: INTIMATE PARTNER - SIMPLE ASSAULT
