In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Step 1: Prepare the Data
crime_data_block = pd.read_csv('/Users/jonathanpoulsen/Documents/Repos/Big-Data-Project-Crimes-in-Chicargo/Crimes_-_2001_to_Present.csv')

In [14]:
# List of columns to keep
columns_to_keep = ['Date', 'Block', 'IUCR', 'Primary Type', 'Description', 'Location Description', 
                   'Arrest', 'Domestic', 'Community Area', 'FBI Code', 
                   'X Coordinate', 'Y Coordinate', 'Year', 'Latitude', 'Longitude']

# Create a new DataFrame with only the columns you want to keep
crime_data_block = crime_data_block[columns_to_keep]


In [15]:
# Check for null values in each column as a percentage of total rows
null_values_percentage = (crime_data_block.isnull().sum() / len(crime_data_block)) * 100

# Print columns with null values
print(null_values_percentage[null_values_percentage > 0])


Location Description    0.131375
Community Area          7.910835
X Coordinate            1.102725
Y Coordinate            1.102725
Latitude                1.102725
Longitude               1.102725
dtype: float64


In [16]:
# Drop rows with any null values
crime_data_block = crime_data_block.dropna()


In [17]:
# Convert columns to 'category' data type
crime_data_block['IUCR'] = crime_data_block['IUCR'].astype('category')
crime_data_block['Community Area'] = crime_data_block['Community Area'].astype('category')
crime_data_block['FBI Code'] = crime_data_block['FBI Code'].astype('category')
crime_data_block['Primary Type'] = crime_data_block['Primary Type'].astype('category')


In [18]:
# Check data types of each column
print(crime_data_block.dtypes)


Date                      object
Block                     object
IUCR                    category
Primary Type            category
Description               object
Location Description      object
Arrest                      bool
Domestic                    bool
Community Area          category
FBI Code                category
X Coordinate             float64
Y Coordinate             float64
Year                       int64
Latitude                 float64
Longitude                float64
dtype: object


In [19]:
# Check for duplicate rows
duplicate_rows = crime_data_block.duplicated()

# Print number of duplicate rows
print(f"Number of duplicate rows: {duplicate_rows.sum()}")


Number of duplicate rows: 17562


In [20]:
# Remove duplicate rows
crime_data_block = crime_data_block.drop_duplicates()


In [21]:
# Reduce the size of the dataset by sampling a subset of the data
subset_size = 100000  # Adjust the subset size as needed
subset_data = crime_data_block.sample(n=subset_size, random_state=0)

# Use the subset data for training
X = subset_data.drop('Block', axis=1)
y = subset_data['Block']


In [22]:
# Step 2: Select Features
selected_features = ['Location Description', 'Primary Type', 'Year', 'Community Area']
X_selected = X[selected_features]

In [23]:
# Step 3: Preprocess the Data
encoder = LabelEncoder()
X_encoded = X_selected.apply(encoder.fit_transform)


In [None]:
# Step 4: Build and Train the Model
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=0)
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

In [None]:
# Step 5: Evaluate the Model
y_pred_train = model.predict(X_train)
train_accuracy = accuracy_score(y_train, y_pred_train)
print("Train Accuracy:", train_accuracy)

In [None]:
# Step 6: Test the Model
y_pred_test = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred_test)
print("Test Accuracy:", test_accuracy)

In [11]:
# Step 7: Deploy and Use the Model
new_crime_data = pd.read_csv('/Users/jonathanpoulsen/Documents/Repos/Big-Data-Project-Crimes-in-Chicargo/crimes_2022.csv')
X_new = new_crime_data[selected_features]
X_new_encoded = X_new.apply(encoder.transform)
predicted_blocks = model.predict(X_new_encoded)
print("Predicted Blocks:", predicted_blocks)

KeyError: "None of [Index(['Location Description', 'Primary Type', 'Year', 'Community Area'], dtype='object')] are in the [columns]"