# Importing

In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Gui
import joblib
import tkinter as tk
from tkinter import ttk

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
# Bonus
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
# Metrics
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,classification_report, confusion_matrix, roc_auc_score


# Data Inspection

In [None]:
df = pd.read_csv("/kaggle/input/crime-prediction-in-chicago-in-2022/Crime Prediction in Chicago_Dataset.csv")
df.head(10).T

In [None]:
df.info()

# Data Cleaning

In [None]:
df.isnull().sum()

- Missing locations

### Fill Location Description with the most frequent value, as it's a large dataset

In [None]:
df['Location Description'].value_counts().head(10)

In [None]:
df['Location Description'] = df['Location Description'].fillna(df['Location Description'].mode()[0])

In [None]:
df.isnull().sum()

In [None]:
df.duplicated().sum()

In [None]:
df_null_ward = df[df.Ward.isnull()]

In [None]:
df_null_ward.T

In [None]:
df_null_ward['District']

In [None]:
df2 = df[df['District']==24]
df2['Ward'].describe()

In [None]:
df2 = df[df['District']==16]
df2['Ward'].describe()

### Set the missing Ward based on the District, as there is a strong relationship between them

In [None]:
# Fill only the rows whose Ward is actually missing. Assigning by District alone
# would overwrite every already-known Ward in districts 24 and 16 with one value.
# 49 and 41 are the fill values picked after inspecting the per-district Ward
# summaries in the preceding cells.
ward_missing = df['Ward'].isnull()
df.loc[ward_missing & (df['District'] == 24), 'Ward'] = 49
df.loc[ward_missing & (df['District'] == 16), 'Ward'] = 41

In [None]:
df.isnull().sum()

In [None]:
df.dropna(subset=['X Coordinate'], inplace=True)

In [None]:
df.isnull().sum()

- No nulls

# EDA

## Univariate Analysis

In [None]:
df.info()

In [None]:
df.Date

In [None]:
df.Block.value_counts().head(10)

In [None]:
df.info()

In [None]:
df['IUCR'].value_counts().head(10)

In [None]:
df.info()

In [None]:
count_prim = df['Primary Type'].value_counts().head(15)
count_prim

In [None]:
plt.style.use("fivethirtyeight")
plt.figure(figsize=(8,5))
sns.barplot(y=count_prim.index,x=count_prim.values,palette="viridis")
plt.title("TOP 15 Primary Types")
plt.ylabel("Types")
plt.xlabel("count")
plt.show()

In [None]:
count_Des = df['Description'].value_counts().head(10)
count_Des

In [None]:
# Horizontal bar chart of the most frequent values of the Description column
# (count_Des is built with value_counts().head(10)).
plt.style.use("fivethirtyeight")
plt.figure(figsize=(6,4))
sns.barplot(y=count_Des.index,x=count_Des.values,palette="viridis")
# Take the number in the title from the series itself so it cannot drift from
# the head(n) used to build count_Des (the old title said 5 while 10 were shown).
plt.title(f"TOP {len(count_Des)} Description")
plt.ylabel("Description")
plt.xlabel("count")
plt.xticks(rotation=45)
plt.show()

In [None]:
# Frequency table for the Arrest column: absolute counts next to percentages.
arrest_col = df['Arrest']
count = arrest_col.value_counts()
percentage = arrest_col.value_counts(normalize=True) * 100
freqTable = pd.DataFrame({
    'Frequency': count,
    'Percentage': percentage,
})
freqTable

Imbalance in the Arrest column (False class > True class)
- This can lead to models that are biased toward the False class, resulting in poor predictive performance for the True class. 

In [None]:
plt.figure(figsize=(8, 4))
sns.heatmap(freqTable.T, annot=True, fmt=".1f", cmap="YlGnBu", cbar=False)
plt.title("Arrest")
plt.xlabel('')
plt.show()

In [None]:
count_dom = df['Domestic'].value_counts()
count_dom

In [None]:
# Pie Chart
plt.style.use("ggplot")
plt.figure(figsize=(6, 6))
plt.pie(count_dom, labels=count_dom.index, autopct='%1.1f%%', startangle=90,textprops={"fontweight":"black"},explode=[0, 0.1])
plt.title("Domestic")
plt.legend()
plt.show()

In [None]:
df['Beat'].value_counts()

In [None]:
df['District'].value_counts()

In [None]:
df['Ward'].value_counts().head(20)

In [None]:
df.info()

In [None]:
df['Community Area'].value_counts().head(10)

In [None]:
df['FBI Code'].value_counts()

In [None]:
df = df.drop(columns=['Year','Location'])

- Year has value 2022 only 
- Location only repeats the values of the Latitude and Longitude columns

## Bivariate Analysis

In [None]:
df.groupby(['IUCR', 'Primary Type', 'FBI Code']).size().head(20)

In [None]:
df.groupby(['Primary Type'])['Description'].value_counts().head(50)

In [None]:
df.groupby(['Primary Type'])['Arrest'].value_counts(normalize=True).head(10)*100

In [None]:
df.groupby(['Description'])['Arrest'].value_counts(normalize=True).head(10)*100

In [None]:
df.groupby(['Primary Type'])['Arrest'].value_counts(normalize=True).tail(10)*100


In [None]:
df.groupby(['Location Description'])['Arrest'].value_counts(normalize=True).head(20)*100

In [None]:
df.info()

# Preprocessing

## Feature Engineering

### Split Date & Time 

In [None]:
def splitTime(x):
    """Return the second space-separated token of ``x``.

    Used on the combined date/time strings of the Date column to pull out the
    time part. Raises IndexError when ``x`` contains no space.
    """
    return x.split(" ")[1]
# Derive Time from the still-combined Date string. This has to run before Date
# is overwritten with its date-only part further down in this cell.
df['Time'] = df['Date'].apply(splitTime)


def splitDate(x):
    """Return everything in ``x`` before its first space.

    Used on the combined date/time strings of the Date column to keep only the
    date part. A string without a space is returned unchanged.
    """
    date_part, _, _ = x.partition(" ")
    return date_part
# Overwrite Date in place with only its date part (the time part was already
# copied into the Time column earlier in this cell).
df.Date = df.Date.apply(splitDate)


In [None]:
df

### Split Block column

In [None]:
def splitBlock(x):
    """Drop the first space-separated token of ``x`` and return the remainder.

    Returns an empty string when ``x`` contains no space.
    """
    _, _, remainder = x.partition(" ")
    return remainder
# Replace Block in place with everything after its first space-separated token.
df.Block = df.Block.apply(splitBlock)

## Encoding

In [None]:
# Label-encode the text columns in place and keep every fitted encoder, keyed by
# column name, so the same value-to-integer mapping can be reused later.
columns = df[['Description','Primary Type','Location Description','Block','Date','Time']]
encoders = {}
for col in columns:  # iterating a DataFrame yields its column names
    encoder = encoders[col] = LabelEncoder()
    df[col] = encoder.fit_transform(df[col])


#joblib.dump(encoders, 'encoders.pkl')

In [None]:
df.info()

In [None]:
correlation_matrix = df.corr(numeric_only=True)
correlation_arrest = correlation_matrix['Arrest'].sort_values(ascending=False)
plt.figure(figsize=(8, 5))
sns.heatmap(correlation_arrest.to_frame(), annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
plt.title('Correlation with Arrest')
plt.show()

# Features Selection

In [None]:
# Feature matrix: every column of df except the target (Arrest) and the other
# columns listed in the drop call below.
X = df.drop(columns=['ID','Arrest','Case Number','X Coordinate','Y Coordinate','Latitude','Longitude','IUCR','FBI Code','Community Area','Updated On']) 
# Target vector: the Arrest column.
y = df['Arrest']

In [None]:
X

## Over-Sampling of data using SMOTE

### Handling Data Imbalance


In [None]:
# Oversample the minority class until it reaches 75% of the majority class size.
# NOTE(review): this resamples the full dataset, i.e. before any train/test
# split; synthetic rows generated here must not end up in an evaluation set.
smote = SMOTE(sampling_strategy=0.75,random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

In [None]:
y_resampled.value_counts()

# Data splitting

In [None]:
# Split the ORIGINAL (un-resampled) data first, so the test set holds only real
# observations at the real class balance.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=True, random_state=57)

# Oversample the training portion only. Splitting data that was already
# SMOTE-resampled puts synthetic rows into the test set and lets rows
# interpolated from test observations into training, which inflates every
# metric computed on X_test. Same SMOTE settings as used earlier in the notebook.
X_train, y_train = SMOTE(sampling_strategy=0.75, random_state=42).fit_resample(X_train, y_train)

# ML Models

### Best Parameters

In [None]:
# import optuna
# def objective(trial):
#     params = {
#         'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
#         'depth': trial.suggest_int('depth', 4, 10),
#         'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1, 10),
#         'iterations': trial.suggest_int('iterations', 100, 500),
#         'subsample': trial.suggest_float('subsample', 0.8, 1.0),
#         'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.8, 1.0),
#         'random_state': 57,
#         'silent': True
#     }

   
#     model = CatBoostClassifier(**params)
#     model.fit(X_train, y_train, eval_set=(X_train, y_train), early_stopping_rounds=50, verbose=False)
#     preds = model.predict(X_train)
#     return accuracy_score(y_train, preds)


# study = optuna.create_study(direction='maximize')
# study.optimize(objective, n_trials=50)


# print("Best Parameters:", study.best_params)
# print("Best Score:", study.best_value)

In [None]:
# Candidate classifiers, keyed by the name shown in the printed report and in
# the summary table.
models = {
    'LogisticRegression': LogisticRegression(max_iter=500),
    'Decision Tree Classifier': DecisionTreeClassifier(splitter='best',random_state=42),
    'K-Nearest Neighbors': KNeighborsClassifier(n_neighbors=5),
    'Random Forest': RandomForestClassifier(n_estimators=200, random_state=57),
    'XGBoost': XGBClassifier(n_estimators=100,learning_rate=0.01),
    'CatBoost': CatBoostClassifier(learning_rate=0.08,depth=10,l2_leaf_reg=1.7,iterations=500, verbose=0),
    'LightGBM': LGBMClassifier(n_estimators=100, learning_rate=0.01,verbose=0)
}

# One summary row per model, collected for the final table.
results = []

for model_name, model in models.items():
    # Fit on the training split, then predict labels and probabilities for the test split.
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_pred_prob = model.predict_proba(X_test)[:, 1]  # column 1 of the probability matrix

    print(model_name)
    print("-" * 10)

    confusion = confusion_matrix(y_test, y_pred)
    # Label-based metrics, each expressed as a percentage rounded to two decimals.
    accuracy, precision, recall, f1 = (
        round(score_fn(y_test, y_pred) * 100, 2)
        for score_fn in (accuracy_score, precision_score, recall_score, f1_score)
    )
    # ROC-AUC is computed from the probabilities rather than the hard labels.
    roc = round(roc_auc_score(y_test, y_pred_prob) * 100, 2)

    plt.figure(figsize=(6, 4))
    sns.heatmap(confusion, annot=True, fmt='d', cmap='Blues', linewidths=.5, cbar=False)
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.show()

    print(classification_report(y_test, y_pred))

    row = {'Model': model_name}
    row.update(
        (column, f"{value}%")
        for column, value in zip(
            ('Accuracy Score', 'Precision', 'Recall', 'F1 Score', 'ROC-AUC'),
            (accuracy, precision, recall, f1, roc),
        )
    )
    results.append(row)

results_df = pd.DataFrame(results)
display(results_df)

In [None]:
# Train a random forest on its own and report its metrics on the test split.
model = RandomForestClassifier(n_estimators=200, random_state=57)
model.fit(X_train, y_train)

# Hard labels plus column 1 of the probability matrix (the latter feeds ROC-AUC).
y_pred = model.predict(X_test)
y_pred_prob = model.predict_proba(X_test)[:, 1]

acc = accuracy_score(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)
roc = roc_auc_score(y_test, y_pred_prob)

# Printed in order: accuracy (whole percent), confusion matrix, per-class
# report, ROC-AUC (whole percent).
for report_item in (
    f'{round(acc*100)}%',
    confusion,
    classification_rep,
    f'{round(roc*100)}%',
):
    print(report_item)

# Same confusion matrix, drawn as an annotated heatmap.
plt.figure(figsize=(6, 4))
sns.heatmap(
    confusion,
    annot=True,
    fmt='d',
    cmap='Blues',
    linewidths=.5,
    cbar=False,
)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()


# Save the model to a file
#joblib.dump(model, 'Randomforest.pkl')
#print("Model saved successfully!")

# GUI

In [None]:
# # Load encoders and model
# encoders = joblib.load('encoders.pkl')
# model = joblib.load('Randomforest.pkl')
# y_pred_prob = model.predict_proba(X_test)[:, 1]
# def predict():
#     try:
#         # Collect inputs
#         description = description_var.get()
#         primary_type = primary_type_var.get()
#         location_description = location_description_var.get()
#         block = block_var.get()
#         domestic = 1 if domestic_var.get().lower() == "true" else 0
#         ward = float(ward_var.get())
#         date = date_var.get()
#         district = float(district_var.get())
#         beat = float(beat_var.get())
#         time = time_var.get()

#         # Handle date and time conversion
        
#         try:
#             time_parsed = datetime.strptime(time, "%H:%M")
#         except ValueError as e:
#             result_label.config(text=f"Invalid time format. Please use HH:MM. Error: {e}")
#             return
        
#         # Encode categorical inputs using the loaded encoders
#         date_encoded = encoders['Date'].transform([date])[0]
#         description_encoded = encoders['Description'].transform([description])[0]# if description in encoders['Description'].classes_ else -1
#         primary_type_encoded = encoders['Primary Type'].transform([primary_type])[0]# if primary_type in encoders['Primary Type'].classes_ else -1
#         location_description_encoded = encoders['Location Description'].transform([location_description])[0]# if location_description in encoders['Location Description'].classes_ else -1
#         block_encoded = encoders['Block'].transform([block])[0]# if block in encoders['Block'].classes_ else -1
#         time_encoded = encoders['Time'].transform([time])[0]
#         # Prepare input for the model
#         encoded_inputs = [
#             date_encoded,
#             block_encoded,
#             primary_type_encoded,
#             description_encoded,
#             location_description_encoded,
#             domestic,
#             beat,
#             district,
#             ward,
#             time_encoded
#         ]
#         prediction = model.predict([encoded_inputs])[0]
#         probabilities = model.predict_proba([encoded_inputs])[0][1] 
#         probabilities = round(probabilities, 2)  

        
#         result_label.config(text=f'Prediction: {prediction}\nProbability: {probabilities}')

       

#     except Exception as e:
#         result_label.config(text=f"Error: {e}")

# # Create GUI window
# root = tk.Tk()
# root.title("Crime Prediction System")

# # Input fields
# description_var = tk.StringVar()
# primary_type_var = tk.StringVar()
# location_description_var = tk.StringVar()
# block_var = tk.StringVar()
# domestic_var = tk.StringVar()
# ward_var = tk.StringVar()
# date_var = tk.StringVar()
# district_var = tk.StringVar()
# beat_var = tk.StringVar()
# time_var = tk.StringVar()

# fields = [
#     ('Description', description_var),
#     ('Primary Type', primary_type_var),
#     ('Location Description', location_description_var),
#     ('Block', block_var),
#     ('Domestic (Yes/No)', domestic_var),
#     ('Ward', ward_var),
#     ('Date', date_var),
#     ('District', district_var),
#     ('Beat', beat_var),
#     ('Time (HH:MM)', time_var)
# ]

# for i, (label, var) in enumerate(fields):
#     ttk.Label(root, text=label).grid(row=i, column=0, padx=10, pady=5, sticky='w')
#     ttk.Entry(root, textvariable=var).grid(row=i, column=1, padx=10, pady=5)

# # Predict button
# ttk.Button(root, text="Predict", command=predict).grid(row=len(fields), column=0, columnspan=2, pady=10)

# # Result label
# result_label = ttk.Label(root, text="Prediction will appear here.")
# result_label.grid(row=len(fields) + 1, column=0, columnspan=2)

# root.mainloop()