# Required Packages

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import f_oneway

# Importing and basic information about the dataset

In [None]:
# Load the dataset from the CSV file (expected in the current working directory)
# and print a quick overview: column names, schema/non-null counts, and summary
# statistics for every column.
df = pd.read_csv("Crime_Incidents_in_2024.csv")

print("Columns names in the dataset:\n")
print(df.columns)
print("\n--------------------------------------------------------------------------\n")

print("Schema Info and non-null counts:\n")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
df.info()
print("\n--------------------------------------------------------------------------\n")

print("Data Description of each column:\n")
# include="all" also summarises non-numeric columns (count/unique/top/freq).
print(df.describe(include="all"))
print("\n--------------------------------------------------------------------------\n")

# Missing values information

In [None]:
# Per-column summary of missing data: absolute count of nulls and the share of
# rows (in percent) in which the column is null.
null_mask = df.isnull()
missing_summary = pd.DataFrame({
    'Missing Count': null_mask.sum(),
    'Missing Percentage': null_mask.mean() * 100,
})

print("Missing values count and percentages:\n")
print(missing_summary)
print("\n--------------------------------------------------------------------------\n")

# Duplicate rows information

In [None]:
# Report how many rows are exact duplicates of an earlier row, both as a count
# and as a percentage of all rows, then drop them from the working frame.
duplicate_count = df.duplicated().sum()
total_rows = len(df)
# Guard against an empty frame so the percentage never divides by zero.
duplicate_percentage = (duplicate_count / total_rows * 100) if total_rows else 0.0

print("Duplicate rows count and percentages:\n")
print(f"Duplicate Count: {duplicate_count}")
print(f"Duplicate Percentage: {duplicate_percentage:.2f}%")
print("\n--------------------------------------------------------------------------\n")

df = df.drop_duplicates()

# Variance/Data Distribution of the dataset

In [None]:
# Show the variance of every numeric column, then draw one 100-bin histogram
# per numeric column on a single large figure.
print("Data variance of each column:\n")
temp = df.select_dtypes(include="number")
column_variances = temp.var()
print(column_variances)
print("\n--------------------------------------------------------------------------\n")

temp.hist(bins=100, figsize=(30, 15))
plt.show()


# Relationship Between Variables using Analysis of Variance

In [None]:
# One-way ANOVA between every (categorical column, numerical column) pair:
# for each pair, the numerical values are split into groups by category value
# and scipy's f_oneway is run across those groups. Results are collected into
# `anova_df` with the F statistic and p-value per pair.
categorical_columns = ['SHIFT', 'METHOD', 'OFFENSE', 'WARD', 'DISTRICT', 'PSA', 'NEIGHBORHOOD_CLUSTER', 'BLOCK_GROUP', 'VOTING_PRECINCT', 'BID']
numerical_columns = ['X', 'Y', 'XBLOCK', 'YBLOCK', 'LATITUDE', 'LONGITUDE']

# Keep only rows where every analysed column is present.
df_clean = df.dropna(subset=categorical_columns + numerical_columns)

anova_results = []

for cat_col in categorical_columns:
    # The test needs at least two groups; this depends only on the categorical
    # column, so check it once instead of once per numerical column.
    if df_clean[cat_col].nunique() <= 1:
        continue

    # Split once per categorical column. sort=False keeps groups in order of
    # first appearance, matching the order of Series.unique().
    grouped = df_clean.groupby(cat_col, sort=False)

    for num_col in numerical_columns:
        groups = [values for _, values in grouped[num_col]]
        try:
            stat, p_value = f_oneway(*groups)
        except Exception as exc:
            # Best-effort: a failing pair must not abort the whole analysis,
            # but report it instead of silently dropping it. Unlike a bare
            # `except:`, this does not swallow KeyboardInterrupt/SystemExit.
            print(f"Skipping ANOVA for {cat_col} vs {num_col}: {exc}")
            continue
        anova_results.append({'Categorical Variable': cat_col, 'Numerical Variable': num_col,
                              'F-value': stat, 'P-Value': p_value})

anova_df = pd.DataFrame(anova_results)

# Temporarily print floats with 5 decimals; option_context restores the
# previous display setting afterwards, even if printing raises.
with pd.option_context('display.float_format', '{:.5f}'.format):
    print(anova_df)


# Class Imbalance Check

In [None]:
# Draw one count plot per selected categorical column so the relative
# frequency of each category (class imbalance) can be inspected visually.
columns = ['SHIFT', 'METHOD', 'OFFENSE']

for col in columns:
    # Each column gets its own 10x5 figure; the new axes become the current
    # axes, so plt.xticks below applies to this plot.
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.countplot(x=df[col], ax=ax)
    plt.xticks(rotation=45)
    ax.set_title(f'Count Plot of {col}')
    plt.show()