In [12]:
#import packages
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
import numpy as np
from sklearn.ensemble import StackingClassifier
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier


In [2]:
# read the data
df = pd.read_csv('first_model.csv')

In [3]:
# Clean and process

#drop residuels
df = df.drop(df[df['value_of_property'] >= 99998].index)
# Drop missing
df = df.dropna(how='any')
# Filter the rows where propertyCrime is equal to 1
propertyCrime_1 = df[df['propertyCrime'] == 1]
# Create dummy variables for the 'type', 'income', 'race', and 'gender' columns
dummy_cols = pd.get_dummies(propertyCrime_1[['type', 'income', 'race', 'gender']])
# Add the dummy variables to the original DataFrame
propertyCrime_1 = pd.concat([propertyCrime_1, dummy_cols], axis=1)
# Drop the original 'type', 'income', 'race', and 'gender' columns
propertyCrime_1 = propertyCrime_1.drop(['type', 'income', 'race', 'gender'], axis=1)
# drop ids
propertyCrime_1 = propertyCrime_1.drop(['IDHH', 'IDPER'], axis=1)

In [4]:
# Get the mean of the 'reported'
reported_mean = propertyCrime_1['reported'].mean()

print(reported_mean)

0.33249489074044963


In [5]:
# Model fitting
X = propertyCrime_1.drop(['reported'], axis=1)
y = propertyCrime_1['reported']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Select the model
base_model = GradientBoostingClassifier()

# Fit the model to the training data
base_model.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = base_model.predict(X_test)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.73


In [6]:
# Try different parameter

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define a range of hyperparameters to try
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.1, 0.01, 0.001],
    'max_depth': [3, 5, 7]
}
# Create a gradient boosting classifier model
model = GradientBoostingClassifier()

# Use grid search with cross-validation to find the best hyperparameters
grid_search = GridSearchCV(model, param_grid, cv=5, n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_

# Create a new model with the best hyperparameters
best_model = GradientBoostingClassifier(**best_params)

# Fit the best model to the training data
best_model.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = best_model.predict(X_test)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)

print(f"Best Hyperparameters: {best_params}")
print(f"Accuracy: {accuracy:.2f}")

Fitting 5 folds for each of 27 candidates, totalling 135 fits
Best Hyperparameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100}
Accuracy: 0.73


In [7]:
# Bagging model
Bagging_model = BaggingClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=10)

# Fit the model to the training data
Bagging_model.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = Bagging_model.predict(X_test)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.72




In [8]:
# Stacking model

estimators = [
    ('lr', LogisticRegression()),
    ('svc', SVC()),
    ('rf', RandomForestClassifier())
]

stack_model = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())

# Fit the model to the training data
stack_model.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = stack_model.predict(X_test)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Accuracy: 0.73


In [9]:
# Ada model
ada_model = AdaBoostClassifier(n_estimators=100)

ada_model.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = ada_model.predict(X_test)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.73


In [10]:
# Random Forest
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 4, 6],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None]
}

# Create a Random Forest Classifier object
rf = RandomForestClassifier(random_state=42)

# Create a GridSearchCV object with the parameter grid and number of cross-validation folds
grid_search = GridSearchCV(rf, param_grid, cv=5)

# Fit the GridSearchCV object to the data
grid_search.fit(X_train, y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_

# Create a new Random Forest Classifier object with the best hyperparameters
rf_best = RandomForestClassifier(**best_params, random_state=42)

# Fit the new model to the training data
rf_best.fit(X_train, y_train)

# Use the model to make predictions on the test data
y_pred = rf_best.predict(X_test)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)

# Print the best hyperparameters and corresponding score
print(f"Best hyperparameters: {best_params}")
print(f"Accuracy: {accuracy:.3f}")

Best hyperparameters: {'max_depth': 7, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 6, 'n_estimators': 50}
Accuracy: 0.724


In [26]:
logi_model = LogisticRegression()

# Fit the model to the training data
logi_model.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = logi_model.predict(X_test)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.72


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [14]:
# Define the parameter grid to search over
param_grid = {
    'hidden_layer_sizes': [(10,), (20,), (10, 10), (20, 20)],
    'alpha': [0.0001, 0.001, 0.01],
    'learning_rate_init': [0.001, 0.01, 0.1]
}

# Create an MLPClassifier object
nw = MLPClassifier(max_iter=1000, random_state=42)

# Create a GridSearchCV object with the parameter grid and number of cross-validation folds
grid_search = GridSearchCV(nw, param_grid, cv=5)

# Fit the GridSearchCV object to the data
grid_search.fit(X_train, y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_

# Create a new MLPClassifier object with the best hyperparameters
model_best = MLPClassifier(**best_params, max_iter=1000, random_state=42)

# Fit the new model to the training data
model_best.fit(X_train, y_train)

# Use the model to make predictions on the test data
y_pred = model_best.predict(X_test)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)

# Print the best hyperparameters and corresponding score
print(f"Best hyperparameters: {best_params}")
print(f"Accuracy: {accuracy:.3f}")

Best hyperparameters: {'alpha': 0.01, 'hidden_layer_sizes': (20, 20), 'learning_rate_init': 0.01}
Accuracy: 0.720
