In [None]:
#loading data
import pandas as pd
import numpy as np

# Location of the training split on the author's machine.
TRAIN_CSV_PATH = "/Users/satviksingh/Documents/manas_projects/logistic regression/crime_train.csv"

# Raw training frame; later cells overwrite `df` with the preprocessed version.
df = pd.read_csv(TRAIN_CSV_PATH)

In [2]:
#preprocessing data

# Parse the filing timestamp once; values that cannot be parsed become NaT.
case_filed = pd.to_datetime(df["case_filed"], errors="coerce")

# Drop the index-like columns and the raw timestamp, encode the Yes/No target
# as 1/0, and append the calendar parts derived from the timestamp.
df = (
    df.drop(columns=["Unnamed: 0", "Num", "case_filed"])
    .assign(
        closed=lambda frame: frame["closed"].map({"Yes": 1, "No": 0}).astype(int),
        year=case_filed.dt.year,
        month=case_filed.dt.month,
        hour=case_filed.dt.hour,
    )
)

# One-hot encode categorical features
categorical_columns = ["city", "crime_description", "sex", "weapon", "domain"]
df = pd.get_dummies(df, columns=categorical_columns)

# Convert to numpy: every column except the target is a feature,
# and the target is kept as a column vector.
feature_frame = df.drop(columns=["closed"])
X = feature_frame.to_numpy().astype(float)
y = df["closed"].to_numpy().reshape(-1, 1)


In [3]:
# Feature standardisation and intercept column

# Column-wise statistics of the training features. The small constant keeps
# the division finite for columns whose standard deviation is zero.
X_mean = np.mean(X, axis=0)
X_std = np.std(X, axis=0) + 1e-8
X = (X - X_mean) / X_std

# Add bias term: a leading column of ones.
m = len(X)
X = np.concatenate((np.ones((m, 1)), X), axis=1)

In [None]:
# 3. SIGMOID + log loss

def sigmoid(z):
    """Return the logistic function 1 / (1 + exp(-z)), element-wise.

    The input is clipped to [-500, 500] before exponentiation so that
    ``np.exp`` does not overflow for extreme values.
    """
    bounded = np.clip(z, -500, 500)
    denominator = 1 + np.exp(-bounded)
    return 1 / denominator

def predict_proba(X, weights):
    """Return the sigmoid of the linear scores ``X . weights``.

    The result has the shape of ``np.dot(X, weights)``.
    """
    linear_scores = np.dot(X, weights)
    return sigmoid(linear_scores)

def compute_loss(y, y_pred):
    """Return the mean binary cross-entropy between labels and probabilities.

    ``y`` holds the 0/1 labels and ``y_pred`` the predicted probabilities.
    A small epsilon is added inside each logarithm so that probabilities of
    exactly 0 or 1 do not produce ``log(0)``.
    """
    eps = 1e-9
    n_samples = len(y)
    positive_term = y * np.log(y_pred + eps)
    negative_term = (1 - y) * np.log(1 - y_pred + eps)
    return -(1 / n_samples) * np.sum(positive_term + negative_term)


In [5]:
# 4. TRAINING FUNCTION

def train_logistic_regression(X, y, lr=0.0002, epochs=100000, log_every=1000):
    """Fit logistic-regression weights with full-batch gradient descent.

    Parameters
    ----------
    X : array-like of shape (m, n)
        Feature matrix. No intercept is added here; callers that want a bias
        term must include a column of ones themselves.
    y : array-like with m binary labels
        Accepted as shape (m,) or (m, 1). It is reshaped to a column so that
        ``y_pred - y`` stays (m, 1) instead of broadcasting to (m, m).
    lr : float
        Step size of each gradient update.
    epochs : int
        Number of full passes over the data.
    log_every : int or None
        Print the loss every ``log_every`` epochs, starting with epoch 0.
        Pass 0 or None to silence the progress output.

    Returns
    -------
    numpy.ndarray of shape (n, 1)
        The learned weights.

    Raises
    ------
    ValueError
        If the number of labels differs from the number of rows in ``X``.
    """
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float).reshape(-1, 1)

    m, n = X.shape
    if y.shape[0] != m:
        raise ValueError(
            f"X has {m} rows but y has {y.shape[0]} labels"
        )

    weights = np.zeros((n, 1))

    for epoch in range(epochs):
        y_pred = predict_proba(X, weights)
        # Gradient of the mean log loss with respect to the weights.
        gradient = (1/m) * np.dot(X.T, (y_pred - y))
        weights -= lr * gradient

        # The reported loss uses the predictions made before this epoch's update.
        if log_every and epoch % log_every == 0:
            loss = compute_loss(y, y_pred)
            print(f"Epoch {epoch:6d} | Loss: {loss:.6f}")

    return weights


In [6]:
# 5. TRAIN MODEL

# Fit on the full training matrix using the default lr and epochs of
# train_logistic_regression; the function prints the loss periodically.
print("Starting training...")
# `weights` is reused by the evaluation cells further down.
weights = train_logistic_regression(X, y)
print("Training complete.")

Starting training...
Epoch      0 | Loss: 0.693147
Epoch   1000 | Loss: 0.693046
Epoch   2000 | Loss: 0.692958
Epoch   3000 | Loss: 0.692880
Epoch   4000 | Loss: 0.692811
Epoch   5000 | Loss: 0.692750
Epoch   6000 | Loss: 0.692696
Epoch   7000 | Loss: 0.692649
Epoch   8000 | Loss: 0.692606
Epoch   9000 | Loss: 0.692568
Epoch  10000 | Loss: 0.692535
Epoch  11000 | Loss: 0.692505
Epoch  12000 | Loss: 0.692478
Epoch  13000 | Loss: 0.692454
Epoch  14000 | Loss: 0.692433
Epoch  15000 | Loss: 0.692414
Epoch  16000 | Loss: 0.692396
Epoch  17000 | Loss: 0.692381
Epoch  18000 | Loss: 0.692367
Epoch  19000 | Loss: 0.692355
Epoch  20000 | Loss: 0.692344
Epoch  21000 | Loss: 0.692334
Epoch  22000 | Loss: 0.692325
Epoch  23000 | Loss: 0.692317
Epoch  24000 | Loss: 0.692310
Epoch  25000 | Loss: 0.692303
Epoch  26000 | Loss: 0.692297
Epoch  27000 | Loss: 0.692292
Epoch  28000 | Loss: 0.692287
Epoch  29000 | Loss: 0.692283
Epoch  30000 | Loss: 0.692279
Epoch  31000 | Loss: 0.692276
Epoch  32000 | Loss

In [14]:
def predict(X, weights, threshold=0.5):
    """Return hard 0/1 predictions for the rows of ``X``.

    A row is labelled 1 when its predicted probability is greater than or
    equal to ``threshold`` and 0 otherwise.
    """
    probabilities = predict_proba(X, weights)
    is_positive = probabilities >= threshold
    return is_positive.astype(int)

# Score the fitted model on the same matrix it was trained on.
train_probabilities = predict_proba(X, weights)
final_loss = compute_loss(y, train_probabilities)

y_pred = predict(X, weights)
accuracy = (y_pred == y).mean()  # fraction of rows labelled correctly

report_lines = [
    "\n======================= RESULT =======================",
    f"Final Loss:      {final_loss:.6f}",
    f"Final Accuracy:  {accuracy:.6f}",
    "======================================================",
]
print(*report_lines, sep="\n")



Final Loss:      0.692240
Final Accuracy:  0.514474


The evaluation above was done on the training data (crime_train.csv).
Next: run another evaluation on the separate test file (crime_test.csv).

In [10]:

# Load new dataset
df_new = pd.read_csv("/Users/satviksingh/Documents/manas_projects/logistic regression/crime_test.csv")

# Preprocessing: same steps as for the training frame.
df_new = df_new.drop(columns=["Unnamed: 0", "Num"])
df_new["closed"] = df_new["closed"].map({"Yes": 1, "No": 0}).astype(int)

# Convert datetime
df_new["case_filed"] = pd.to_datetime(df_new["case_filed"], errors="coerce")
df_new["year"] = df_new["case_filed"].dt.year
df_new["month"] = df_new["case_filed"].dt.month
df_new["hour"]  = df_new["case_filed"].dt.hour
df_new = df_new.drop(columns=["case_filed"])

# One-hot encode
df_new = pd.get_dummies(df_new, columns=["city", "crime_description", "sex", "weapon", "domain"])

# Align columns: the test matrix must have the training features in the
# training order. A single reindex does all of it at once:
#   - categories seen in training but absent here become all-zero columns,
#   - categories that only occur in the test file are dropped,
#   - the column order follows the training frame.
# This avoids inserting columns one by one, which fragments a wide frame.
train_features = [col for col in df.columns if col != "closed"]
X_new = df_new.reindex(columns=train_features, fill_value=0).values.astype(float)
y_new = df_new["closed"].values.reshape(-1, 1)

# The training statistics are per-feature, so the widths must agree.
assert X_new.shape[1] == X_mean.shape[0], "test features do not match training features"

# 3. Feature Scaling using TRAINING statistics (X_mean, X_std)
X_new = (X_new - X_mean) / X_std

# Add bias term
m_new = X_new.shape[0]
X_new = np.hstack([np.ones((m_new, 1)), X_new])

# 4. Final Predictions and Comparison
y_pred_new = predict(X_new, weights)
accuracy_new = np.mean(y_pred_new == y_new)

print("\n==================== TEST SET RESULT ===================")
print(f"Test Set Accuracy:   {accuracy_new:.6f}")
print(f"Training Accuracy:   {accuracy:.6f}")
print(f"Accuracy Difference: {abs(accuracy - accuracy_new):.6f}")
print("==========================================================")


Test Set Accuracy:   0.492582
Training Accuracy:   0.514474
Accuracy Difference: 0.021892


Poor accuracy:
    almost equivalent to a coin toss

    Maybe because the data is
        highly categorical
        non-linear

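Or maybe I made a mistake (for example, the learning rate of 0.0002 may be too small: the training loss only moved from 0.693147 to 0.692240).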

Proposed solution:
    use decision trees
    they might work better on such categorical data
    
