In [1]:
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE, RandomOverSampler

# Load dataset (Handle bad lines)
df_crime = pd.read_csv("/kaggle/input/police2007/Chicago_Crimes_2005_to_2007.csv", on_bad_lines="skip")

# Drop missing values for required columns
df_crime = df_crime.dropna(subset=['Latitude', 'Longitude', 'Primary Type', 'District', 'FBI Code'])

# Encode Crime Types
encoder = LabelEncoder()
df_crime['Crime Type Encoded'] = encoder.fit_transform(df_crime['Primary Type'])

# Convert categorical features
df_crime['FBI Code'] = LabelEncoder().fit_transform(df_crime['FBI Code'])

# Feature Engineering: Use More Predictive Features
features = df_crime[['Latitude', 'Longitude', 'District', 'Arrest', 'FBI Code']]

# Convert to correct data types
features['Latitude'] = features['Latitude'].astype(float)
features['Longitude'] = features['Longitude'].astype(float)
features['District'] = features['District'].astype(int)
features['Arrest'] = features['Arrest'].astype(int)
features['FBI Code'] = features['FBI Code'].astype(int)

# Fill any remaining NaN values
features = features.fillna(0)

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(features, df_crime['Crime Type Encoded'], test_size=0.2, random_state=42, stratify=df_crime['Crime Type Encoded'])

print("✅ Data Loaded & Preprocessed Successfully!")

# Check if SMOTE will work (at least 6 samples per class needed)
min_samples_per_class = y_train.value_counts().min()

if min_samples_per_class >= 6:
    print("🔄 Using SMOTE for balancing dataset...")
    smote = SMOTE(sampling_strategy='auto', k_neighbors=2, random_state=42)
    X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)
else:
    print("🔄 Using Random Over Sampling (SMOTE not possible)...")
    ros = RandomOverSampler(random_state=42)
    X_train_balanced, y_train_balanced = ros.fit_resample(X_train, y_train)

print("✅ Dataset Balanced Successfully!")

# Convert to correct numeric format
X_train_balanced = X_train_balanced.astype(float)
X_test = X_test.astype(float)

# Convert to XGBoost DMatrix
dtrain = xgb.DMatrix(X_train_balanced, label=y_train_balanced, enable_categorical=True)
dtest = xgb.DMatrix(X_test, label=y_test, enable_categorical=True)

# Optimized XGBoost GPU Parameters
params = {
    'objective': 'multi:softmax',
    'num_class': len(df_crime['Crime Type Encoded'].unique()),
    'tree_method': 'gpu_hist',  # Enables GPU
    'predictor': 'gpu_predictor',  # Forces GPU usage
    'gpu_id': 0,  # Assign GPU
    'eval_metric': 'mlogloss',
    'learning_rate': 0.1,
    'max_depth': 12,
    'subsample': 0.85,
    'colsample_bytree': 0.85,
    'min_child_weight': 3,
    'lambda': 2.0,  # L2 regularization
    'alpha': 1.0   # L1 regularization
}

# Train XGBoost Model on GPU
model_xgb = xgb.train(params, dtrain, num_boost_round=300)

# Evaluate Model
y_pred_xgb = model_xgb.predict(dtest)
xgb_accuracy = accuracy_score(y_test, y_pred_xgb)

print(f"🚀 Optimized XGBoost GPU Accuracy: {xgb_accuracy:.4f}")

# Sample Location (Latitude, Longitude, District, Arrest Status, FBI Code)
sample_location = pd.DataFrame([[41.87, -87.62, 12, 0, 14]], columns=['Latitude', 'Longitude', 'District', 'Arrest', 'FBI Code'])

# Convert to XGBoost DMatrix
dsample = xgb.DMatrix(sample_location.astype(float), enable_categorical=True)  # Ensure correct type

# Predict Crime Type using XGBoost GPU Model
predicted_crime_xgb = model_xgb.predict(dsample)

# Convert Prediction Back to Crime Name
predicted_crime_name = encoder.inverse_transform([int(predicted_crime_xgb[0])])

print(f"🔍 Predicted Crime Type for Location (41.87, -87.62): {predicted_crime_name[0]}")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features['Latitude'] = features['Latitude'].astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features['Longitude'] = features['Longitude'].astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features['District'] = features['District'].astype(int)
A value is trying to be set on 

✅ Data Loaded & Preprocessed Successfully!
🔄 Using SMOTE for balancing dataset...
✅ Dataset Balanced Successfully!



    E.g. tree_method = "hist", device = "cuda"

Parameters: { "predictor" } are not used.


    E.g. tree_method = "hist", device = "cuda"



🚀 Optimized XGBoost GPU Accuracy: 0.9504
🔍 Predicted Crime Type for Location (41.87, -87.62): DECEPTIVE PRACTICE
