# Chicago Crime

## Daten importieren

In [None]:
from pathlib import Path

# Building blocks of the data-file location. Other cells concatenate them
# as PARENT_PATH + PATH + SUBPATH + FILE + FORMAT.
# PARENT_PATH is the parent of the current working directory, as a string
# with a trailing slash.
PARENT_PATH = f"{Path().resolve().parent}/"
PATH, SUBPATH = "data/", "raw/"
FILE, FORMAT = "chicago_crimes", ".csv"

In [None]:
# Show the resolved parent-directory prefix so it can be checked before loading.
PARENT_PATH

In [None]:
import pandas as pd

# Load the raw CSV; its location is the concatenation of the path constants.
df = pd.read_csv("".join((PARENT_PATH, PATH, SUBPATH, FILE, FORMAT)))

## Datenüberblick

In [None]:
# Display the loaded DataFrame.
df

In [None]:
# Print the column overview produced by DataFrame.info().
df.info()

In [None]:
# Show the dtype of every column.
df.dtypes

In [None]:
# Print the column overview once more before the columns are renamed.
df.info()

## Daten anpassen

### Spaltennamen

In [None]:
# Rename the listed columns to snake_case: each name is lower-cased and its
# spaces are replaced by underscores. errors="raise" makes the call fail if
# any of the listed columns is missing.
df = df.rename(
    columns={
        name: name.lower().replace(" ", "_")
        for name in (
            "ID",
            "Case Number",
            "Date",
            "Block",
            "IUCR",
            "Primary Type",
            "Description",
            "Location Description",
            "Arrest",
            "Domestic",
            "Beat",
            "District",
            "Ward",
            "Community Area",
            "Year",
            "Latitude",
            "Longitude",
        )
    },
    errors="raise",
)

In [None]:
# Check the column overview after renaming.
df.info()

## Variablen anpassen

### Nicht benötigte Variablen löschen

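"case_number" und "iucr" werden nicht benötigt: Jeder Fall ist bereits über die "id" eindeutig identifiziert, und der IUCR-Code verschlüsselt lediglich die Art der Straftat, die schon in "primary_type" und "description" enthalten ist.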

In [None]:
# Remove both unneeded columns in a single call.
df = df.drop(columns=["case_number", "iucr"])

### Datum und Zeit anpassen / hinzufügen

In [None]:
# Parse the "date" column into pandas datetime values.
df["date"] = pd.to_datetime(df["date"])

In [None]:
# Store the month component of the timestamp in its own column.
df["month"] = df["date"].dt.month

In [None]:
# Store the day-of-month component of the timestamp in its own column.
df["day"] = df["date"].dt.day

In [None]:
# Store the hour component of the timestamp in its own column.
df["hour"] = df["date"].dt.hour

In [None]:
# Preview the first rows to check the new date-derived columns.
df.head()

In [None]:
# Show the dtypes after the datetime conversion.
df.dtypes

### Nominale Variablen

In [None]:
# Names of the nominal columns that are converted to category dtype and
# dummy-encoded in the following cells.

list_nominal = ["arrest", "domestic"]


In [None]:
# Cast every nominal column to the "category" dtype in one astype() call.
df = df.astype({column: "category" for column in list_nominal})

In [None]:
# Print the column overview for the nominal columns only.
df[list_nominal].info()

In [None]:
# Preview the first rows of the nominal columns.
df[list_nominal].head()

In [None]:
# Dummy-encode the nominal columns. "__" separates the source column name
# from the level in each dummy column name, and the resulting indicator
# columns are stored with category dtype.
dummy_nominal = (
    pd.get_dummies(data=df[list_nominal], prefix_sep="__")
    .astype("category")
)
dummy_nominal

In [None]:
# Append the dummy columns to df; DataFrame.join aligns them on the index.
df = df.join(dummy_nominal)

In [None]:
# Preview the first rows including the joined dummy columns.
df.head()

In [None]:
# Check the column overview after adding the dummy columns.
df.info()

### Kategoriale Variablen (allgemein)

In [None]:
# Columns to be converted to category dtype.
# TODO: take a separate, closer look at primary_type and description.
cat_list = [
    "id",
    "block",
    "primary_type",
    "description",
    "location_description",
    "beat",
    "district",
    "ward",
    "community_area",
    "year",
    "month",
    "day",
    "hour",
]

In [None]:
# Cast every column named in cat_list to the "category" dtype in one call.
df = df.astype({column: "category" for column in cat_list})

In [None]:
# Check the column overview after the category conversion.
df.info()

In [None]:
# Print the distinct values of primary_type.
print(df["primary_type"].unique())

In [None]:
# Print how many rows each primary_type value has.
print(df["primary_type"].value_counts())

In [None]:
# Index labels of all rows whose primary_type is NOT one of the frequent
# offence types kept for the analysis. Rows with a missing primary_type match
# none of the names and are therefore included as well. The resulting labels
# are meant to be passed to df.drop().
#
# A single isin() membership test replaces the chain of 17 negated equality
# comparisons; it selects the same rows while scanning the column only once.
df_primary_type = df.loc[
    ~df["primary_type"].isin(
        [
            "THEFT",
            "BATTERY",
            "CRIMINAL DAMAGE",
            "ASSAULT",
            "DECEPTIVE PRACTICE",
            "OTHER OFFENSE",
            "NARCOTICS",
            "BURGLARY",
            "MOTOR VEHICLE THEFT",
            "ROBBERY",
            "CRIMINAL TRESPASS",
            "WEAPONS VIOLATION",
            "SEX OFFENSE",
            "CRIM SEXUAL ASSAULT",
            "PROSTITUTION",
            "CRIMINAL SEXUAL ASSAULT",
            "HOMICIDE",
        ]
    )
].index

In [None]:
# Drop the rows whose index labels were collected in df_primary_type.
df = df.drop(df_primary_type)

In [None]:
# Keep only rows that have a crime type, then prune category labels that no
# longer occur. A categorical column keeps the labels of rows that have been
# dropped, so without pruning value_counts() still lists those types with a
# count of 0.
df = df[df["primary_type"].notna()].copy()
# Guard: .cat is only available on categorical columns.
if isinstance(df["primary_type"].dtype, pd.CategoricalDtype):
    df["primary_type"] = df["primary_type"].cat.remove_unused_categories()

In [None]:
# Check the column overview after filtering the rows.
df.info()

In [None]:
# Print the row count per primary_type after filtering.
print(df["primary_type"].value_counts())

In [None]:
#df.drop(df["primary_type"].value_counts() <= 0)



In [None]:

#value_counts = df["primary_type"].value_counts()
#to_remove = value_counts[value_counts <= 0].index
#df = df[~df["primary_type"].isin(to_remove)]
#df = df[~df.primary_type.isin(to_remove)]

In [None]:
#df = df[~df['primary_type'].isin(counts[counts < 0].index)]

In [None]:
#counts = df['primary_type'].value_counts()

#new_df = df.loc[df['primary_type'].isin(counts.index[counts >= 300])]

In [None]:
#print(new_df["primary_type"].value_counts())

In [None]:
#new_df.head(2)

In [None]:
# Rename and summarise "primary_type": collapse the raw offence names into
# broader groups.
#
# The mapping is written as one explicit dict so that every raw name sits
# next to its group. It is applied to the plain (object) values and the
# column is converted back to "category" afterwards: calling replace()
# directly on a categorical column is deprecated since pandas 2.2, and
# rename_categories() cannot merge several labels into one. Re-casting also
# means that only groups which actually occur become categories.
primary_type_groups = {
    "THEFT": "theft",
    "MOTOR VEHICLE THEFT": "theft",
    "BATTERY": "assault_and_battery",
    "ASSAULT": "assault_and_battery",
    "DECEPTIVE PRACTICE": "deceptive_practice",
    "NARCOTICS": "narcotics",
    "ROBBERY": "robbery_and_weapons",
    "WEAPONS VIOLATION": "robbery_and_weapons",
    "CRIMINAL DAMAGE": "criminal_damage",
    "SEX OFFENSE": "sexual_crime",
    "CRIM SEXUAL ASSAULT": "sexual_crime",
    "CRIMINAL SEXUAL ASSAULT": "sexual_crime",
    "PROSTITUTION": "sexual_crime",
    "BURGLARY": "burglary",
    "CRIMINAL TRESPASS": "burglary",
    "OTHER OFFENSE": "other_offense",
    "HOMICIDE": "homicide",
}
df["primary_type"] = (
    df["primary_type"]
    .astype(object)
    .replace(primary_type_groups)
    .astype("category")
)

In [None]:
# rename and summarise "primary_type"
# NOTE: disabled earlier attempt, kept only inside a string literal so that
# it does not run. Series.rename() relabels the index rather than the values,
# so this would not have changed any crime type; "HOMICIDE" is also missing
# from this mapping.
"""
df_1 = df["primary_type"].rename({
    "THEFT" : "theft",
    "MOTOR VEHICLE THEFT" : "theft",
    "BATTERY" : "assault_and_battery",
    "ASSAULT" : "assault_and_battery",
    "DECEPTIVE PRACTICE" : "deceptive_practice",
    "NARCOTICS" : "narcotics",
    "ROBBERY" : "robbery_and_weapons",
    "WEAPONS VIOLATION" : "robbery_and_weapons",
    "CRIMINAL DAMAGE" : "criminal_damage",
    "SEX OFFENSE" : "sexual_crime",
    "CRIM SEXUAL ASSAULT" : "sexual_crime",
    "PROSTITUTION" : "sexual_crime",
    "CRIMINAL SEXUAL ASSAULT" : "sexual_crime",
    "BURGLARY" : "burglary",
    "CRIMINAL TRESPASS" : "burglary",
    "OTHER OFFENSE" : "other_offense"
})
"""

In [None]:
# Print the row count per primary_type group after summarising.
print(df["primary_type"].value_counts())

In [None]:
# NOTE: disabled earlier attempt, kept only inside a string literal so that
# it does not run. It passes all names as positional arguments to replace(),
# would overwrite df with a single column, and contains misspelled keys
# ("DECEPTIVE PRACTICE " with a trailing space, "NARCOTIVS").
"""
df = df["primary_type"].replace(
    "THEFT" , "theft",
    "MOTOR VEHICLE THEFT" , "theft",
    "BATTERY" , "assault_and_battery",
    "ASSAULT" , "assault_and_battery",
    "DECEPTIVE PRACTICE " , "deceptive_practice",
    "NARCOTIVS" , "narcotics",
    "ROBBERY" , "robbery_and_weapons",
    "WEAPONS VIOLATION" , "robbery_and_weapons",
    "CRIMINAL DAMAGE" , "criminal_damage",
    "SEX OFFENSE" , "sexual_crime",
    "CRIM SEXUAL ASSAULT" , "sexual_crime",
    "PROSTITUTION" , "sexual_crime",
    "CRIMINAL SEXUAL ASSAULT" , "sexual_crime",
    "BURGLARY" , "burglary",
    "CRIMINAL TRESPASS" , "burglary",
    "OTHER OFFENSE" , "other_offense"
)
"""

In [None]:
#df.drop(df["primary_type"].value_counts() >= 1000, inplace=True)

In [None]:
#df.drop(df[df.primary_type.value_counts() <= 100].index, inplace=True)

In [None]:
# Print the distinct values of description.
print(df["description"].unique())

In [None]:
# Print how many rows each description value has.
print(df["description"].value_counts())

In [None]:
# Print the distinct values of location_description.
print(df["location_description"].unique())

In [None]:
# Print how many rows each location_description value has.
print(df["location_description"].value_counts())

### "Description" und "Location Description" entfernen

Da die ausführlichen Beschreibungen der Tat und des Tatorts für die Beantwortung unserer Fragestellung nicht notwendig sind, entfernen wir sie aus dem Datensatz.

In [None]:
# Remove both free-text description columns in a single call.
df = df.drop(columns=["description", "location_description"])

In [None]:
# Check the final column overview.
df.info()

## Datensatz temporär speichern

In [None]:
import time

# File-name suffix holding the current local time, e.g. "-20240131-0930".
# The leading dash is part of the format string.
TIME = time.strftime("-%Y%m%d-%H%M")

In [None]:
# Point SUBPATH at the "interim/" sub-directory; the (currently commented-out)
# CSV export builds its target path from it.
SUBPATH = "interim/"

In [None]:
#df.to_csv(PARENT_PATH + PATH + SUBPATH + FILE + TIME + FORMAT, index=False)