In [1]:
# Import needed libraries
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load the training and test splits from their CSV files
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ('crime_train.csv', 'crime_test.csv')
)

In [3]:
def feature_engineering(df):
    """
    Replace the raw timestamp column with calendar features, in place.

    Parameters:
    - df: Pandas DataFrame containing 'Dates' column

    The 'Dates' column is parsed with pd.to_datetime, and its day, month and
    year components are stored in new 'day', 'month' and 'year' columns.
    'Dates' itself is then removed. The frame is modified in place and
    nothing is returned.
    """
    # Parse once and reuse the parsed Series for every derived column
    parsed = pd.to_datetime(df['Dates'])
    df['day'] = parsed.dt.day
    df['month'] = parsed.dt.month
    df['year'] = parsed.dt.year
    # The raw timestamp is no longer needed once its parts are extracted
    df.drop(columns=['Dates'], inplace=True)

def features_pre_processing(df):
    """
    Integer-encode the categorical feature columns, in place.

    Parameters:
    - df: Pandas DataFrame containing 'DayOfWeek' and 'PdDistrict' columns

    'DayOfWeek' is encoded into a new 'day_of_week' column and 'PdDistrict'
    into a new 'district_no' column using LabelEncoder; the two source
    columns are then dropped. Nothing is returned.

    Note: the encoders are fitted on the frame passed in, so codes produced
    for two different frames only agree when both frames contain the same
    set of values in these columns.
    """
    # Source column -> name of the encoded column that replaces it
    encoded_names = {'DayOfWeek': 'day_of_week', 'PdDistrict': 'district_no'}
    for source_col, encoded_col in encoded_names.items():
        df[encoded_col] = LabelEncoder().fit_transform(df[source_col])
    df.drop(columns=list(encoded_names), inplace=True)

def target_pre_processing(df):
    """
    Integer-encode the target variable in the DataFrame, in place.

    Parameters:
    - df: Pandas DataFrame containing 'Category' column

    Returns:
    - The LabelEncoder fitted on 'Category'. Keeping it lets callers map the
      integer codes back to the original category names (see scikit-learn's
      LabelEncoder `classes_` / `inverse_transform`) instead of hard-coding
      the names. Callers that only need the in-place encoding may ignore
      the return value.
    """
    le = LabelEncoder()
    df['Category'] = le.fit_transform(df['Category'])
    # Hand the fitted encoder back so the code -> label mapping is not lost
    return le

In [4]:
# Derive the day/month/year features on both splits
# (feature_engineering modifies each frame in place)
for split in (train_data, test_data):
    feature_engineering(split)

In [5]:
# Encode the target column; only the training split has 'Category'
target_pre_processing(train_data)

# Encode the categorical feature columns of the training and test splits
for split in (train_data, test_data):
    features_pre_processing(split)

In [6]:
# Target vector for training: the encoded 'Category' column
y_train = train_data['Category']

# Training features: everything except the target itself and the
# 'Descript', 'Resolution' and 'Address' columns
non_feature_columns = ['Descript', 'Resolution', 'Address', 'Category']
X_train = train_data.drop(columns=non_feature_columns)

# Test features: everything except 'Address' and the row identifier 'Id'
X_test = test_data.drop(columns=['Address', 'Id'])

In [7]:
# Train an XGBoost classifier with the library's default settings;
# fit() returns the estimator itself, so construction and fitting are chained
model = xgb.XGBClassifier().fit(X_train, y_train)
# Echo the fitted estimator as the cell output
model

In [8]:
# Predict class probabilities for the test features with the trained model.
# The columns of y_pred are labelled with crime category names further down,
# when the submission DataFrame is assembled.
y_pred = model.predict_proba(X_test)

In [9]:
# Pull the row identifiers of the test split; they form the first submission column
sub_column = test_data.loc[:, 'Id']

In [10]:
# Start the submission as a one-column DataFrame holding the test ids.
# Mapping the column name explicitly avoids depending on the Series being
# named 'Id' (a differently named Series combined with columns=['Id'] would
# silently yield an all-NaN column).
submission = pd.DataFrame({'Id': sub_column})

In [11]:
# Crime category names used as headers for the probability columns, listed
# alphabetically. NOTE(review): this order must match the integer codes
# assigned to 'Category' in target_pre_processing — presumably LabelEncoder's
# sorted classes; verify against the fitted encoder.
category_names = [
    "ARSON", "ASSAULT", "BAD CHECKS", "BRIBERY", "BURGLARY",
    "DISORDERLY CONDUCT", "DRIVING UNDER THE INFLUENCE", "DRUG/NARCOTIC",
    "DRUNKENNESS", "EMBEZZLEMENT", "EXTORTION", "FAMILY OFFENSES",
    "FORGERY/COUNTERFEITING", "FRAUD", "GAMBLING", "KIDNAPPING",
    "LARCENY/THEFT", "LIQUOR LAWS", "LOITERING", "MISSING PERSON",
    "NON-CRIMINAL", "OTHER OFFENSES", "PORNOGRAPHY/OBSCENE MAT",
    "PROSTITUTION", "RECOVERED VEHICLE", "ROBBERY", "RUNAWAY",
    "SECONDARY CODES", "SEX OFFENSES FORCIBLE", "SEX OFFENSES NON FORCIBLE",
    "STOLEN PROPERTY", "SUICIDE", "SUSPICIOUS OCC", "TREA", "TRESPASS",
    "VANDALISM", "VEHICLE THEFT", "WARRANTS", "WEAPON LAWS",
]

# Give the probabilities the same index as the Id column: concat(axis=1)
# aligns rows on index labels, so this keeps each prediction next to its Id
# even if the test frame does not carry a default 0..n-1 index.
probabilities = pd.DataFrame(y_pred, columns=category_names, index=submission.index)

# Rebuild from the 'Id' column only, so re-running this cell does not append
# a second copy of the probability columns.
submission = pd.concat([submission[['Id']], probabilities], axis=1)

In [12]:
# Write the finished submission to disk for upload to Kaggle; the DataFrame
# index is left out so the file only contains 'Id' and the probability columns
submission.to_csv(path_or_buf='kaggle_crime.csv', index=False)