### Import packages

In [121]:
import numpy as np
import pandas as pd
import requests as requests
from io import StringIO
import csv
import os
from datetime import datetime
import pytz
from time import time
import matplotlib.pyplot as plt

### Preparing the raw dataset for analysis
#### All information about the dataset can be found in the [README.MD](https://github.com/MSI17819/Berlin_bike_theft_prediction/blob/main/README.md) file

In [122]:
# Create the empty module-level dataframe that data_pulling_function() rebinds through `global df`
df = pd.DataFrame()

In [123]:
# Download the Berlin bike-theft records (the file covers thefts recorded from 01/01/2021 up to today).
# The source is refreshed every one to three days, so a later run can return more rows than an earlier one.

def data_pulling_function(url="https://www.internetwache-polizei-berlin.de/vdb/Fahrraddiebstahl.csv", timeout=60):
    """Download the bike-theft CSV and add its rows to the module-level ``df``.

    Parameters
    ----------
    url : str, optional
        Address of the CSV file; defaults to the Berlin police bike-theft export.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` gives up.

    Returns
    -------
    pandas.DataFrame
        The updated module-level ``df``.

    Raises
    ------
    requests.HTTPError
        If the server answers with an HTTP error status.

    Notes
    -----
    Every call downloads the complete file, so calling the function again while
    ``df`` already holds data adds every row a second time.
    The function also sets the pandas option ``display.max_columns`` to ``None``.
    """
    global df
    # Without a timeout the request can block forever on an unresponsive server.
    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of trying to parse an HTML error page as CSV.
    response.raise_for_status()
    df_raw = pd.read_csv(StringIO(response.text))
    local_time = datetime.now(pytz.timezone('Europe/Warsaw'))
    # DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
    # The empty starting frame is skipped so it cannot influence the resulting dtypes.
    df = df_raw if df.empty else pd.concat([df, df_raw])
    pd.set_option('display.max_columns', None)
    print('Data update - {}'.format(local_time.strftime("%Y-%m-%d %H:%M:%S")))
    return df

data_pulling_function()

Data update - 2022-07-26 13:20:49


  df = df.append(df_raw)


Unnamed: 0,ANGELEGT_AM,TATZEIT_ANFANG_DATUM,TATZEIT_ANFANG_STUNDE,TATZEIT_ENDE_DATUM,TATZEIT_ENDE_STUNDE,LOR,SCHADENSHOEHE,VERSUCH,ART_DES_FAHRRADS,DELIKT,ERFASSUNGSGRUND
0,25.07.2022,24.07.2022,8,24.07.2022,15,2500832,677,Nein,Herrenfahrrad,Fahrraddiebstahl,Sonstiger schwerer Diebstahl von Fahrrädern
1,25.07.2022,25.07.2022,10,25.07.2022,10,5300737,352,Nein,Damenfahrrad,Fahrraddiebstahl,Sonstiger schwerer Diebstahl von Fahrrädern
2,25.07.2022,25.07.2022,9,25.07.2022,16,1100206,1249,Nein,Herrenfahrrad,Fahrraddiebstahl,Sonstiger schwerer Diebstahl von Fahrrädern
3,25.07.2022,22.07.2022,20,25.07.2022,5,7200413,899,Nein,Herrenfahrrad,Fahrraddiebstahl,Sonstiger schwerer Diebstahl von Fahrrädern
4,25.07.2022,24.07.2022,20,25.07.2022,7,6100209,1378,Nein,Herrenfahrrad,Fahrraddiebstahl,Sonstiger schwerer Diebstahl von Fahrrädern
...,...,...,...,...,...,...,...,...,...,...,...
31534,02.01.2021,02.01.2021,9,02.01.2021,15,12500930,200,Nein,Damenfahrrad,Fahrraddiebstahl,Sonstiger schwerer Diebstahl von Fahrrädern
31535,01.01.2021,01.01.2021,20,01.01.2021,20,2100104,2800,Nein,Herrenfahrrad,Fahrraddiebstahl,Sonstiger schwerer Diebstahl von Fahrrädern
31536,01.01.2021,01.01.2021,9,01.01.2021,11,4200206,290,Nein,Damenfahrrad,Fahrraddiebstahl,Sonstiger schwerer Diebstahl von Fahrrädern
31537,01.01.2021,01.01.2021,5,01.01.2021,8,10400941,2527,Nein,diverse Fahrräder,Keller- und Bodeneinbruch,Sonstiger schwerer Diebstahl in/aus Keller/Bod...


### Preprocessing raw data
##### Rename the columns
##### Merge data categories

In [124]:
# Rename the columns to English names.
# The assignment is positional, so the order below must match the column order of the source CSV.
# NOTE(review): the 4th and 5th source columns are TATZEIT_ENDE_DATUM / TATZEIT_ENDE_STUNDE, which look like
# the end of the offence time window rather than a report date/hour - confirm the English names.

df.columns = ["Record_date", "Stealing_date", "Stealing_hour", "Report_stealing_date",
              "Report_stealing_hour", "Berlin_code_area", "Bike_value", "Unsuccesful_attempt",
              "Bike_type", "Crime_type", "Crime_reason"]

# Every translation below assigns the result back to the column. Calling
# df[col].replace(..., inplace=True) modifies a temporary Series under pandas Copy-on-Write,
# which leaves df unchanged (and raises a FutureWarning in pandas 2.x).

# Translate the three values of 'Unsuccesful_attempt' to English

df["Unsuccesful_attempt"] = df["Unsuccesful_attempt"].replace(
    {'Nein' : 'No', 'Ja' : 'Yes', 'Unbekannt' : 'Unknown'})

# Translate 'Bike_type' to English; "Fahrrad" and "diverse Fahrräder" are merged into the single category "Bike"

df["Bike_type"] = df["Bike_type"].replace(
    {"Herrenfahrrad" : "Men", "Damenfahrrad" : "Women", "Fahrrad" : "Bike", "Mountainbike" : "Mountain",
     "Kinderfahrrad" : "Child", "diverse Fahrräder" : "Bike", "Rennrad" : "Racing", "Lastenfahrrad" : "Cargo"})

# Translate the 'Crime_type' values to English

df["Crime_type"] = df["Crime_type"].replace(
    {"Fahrraddiebstahl" : "Theft", "Keller- und Bodeneinbruch" : "Break-in"})

# Translate the 'Crime_reason' categories to English
# Category "Aggravated theft" = theft with break-in or theft with robbery (the aggressor has to use force)

df["Crime_reason"] = df["Crime_reason"].replace(
    {"Sonstiger schwerer Diebstahl von Fahrrädern" : "Aggravated theft",
     "Einfacher Diebstahl von Fahrrädern" : "Theft",
     "Sonstiger schwerer Diebstahl in/aus Keller/Boden von Fahrrädern" : "Aggravated theft from basement/floor",
     "Einfacher Diebstahl aus Keller/Boden von Fahrrädern" : "Theft from basement/floor"})

#### Convert the date columns to datetime and create separate columns for day, month and year

In [125]:
# Convert 'Record_date' from text (DD.MM.YYYY) to datetime.
# An explicit format is used because dayfirst=True is only a parsing preference, not a strict rule;
# with the format given, a value that does not match DD.MM.YYYY raises instead of being guessed.

df['Record_date'] = pd.to_datetime(df['Record_date'], format='%d.%m.%Y')

In [126]:
# Add the helper column 'date2' (datetime version of 'Record_date') that the day/month/year split reads from

df = df.assign(date2=pd.to_datetime(df['Record_date'], dayfirst= True))

In [127]:
# Derive the 'Record_day', 'Record_month' and 'Record_year' columns from the helper column 'date2'

for date_part in ('day', 'month', 'year'):
    df['Record_' + date_part] = getattr(df['date2'].dt, date_part)

In [128]:
# Move the three new columns to the second, third and fourth position of df

for target_position, column_name in enumerate(['Record_day', 'Record_month', 'Record_year'], start=1):
    # pop() removes the column from its current place, insert() puts it back at the wanted position
    df.insert(target_position, column_name, df.pop(column_name))

In [129]:
# Convert 'Stealing_date' from text (DD.MM.YYYY) to datetime.
# An explicit format is used because dayfirst=True is only a parsing preference, not a strict rule;
# with the format given, a value that does not match DD.MM.YYYY raises instead of being guessed.

df['Stealing_date'] = pd.to_datetime(df['Stealing_date'], format='%d.%m.%Y')

In [130]:
# Convert 'Report_stealing_date' from text (DD.MM.YYYY) to datetime.
# An explicit format is used because dayfirst=True is only a parsing preference, not a strict rule;
# with the format given, a value that does not match DD.MM.YYYY raises instead of being guessed.

df['Report_stealing_date'] = pd.to_datetime(df['Report_stealing_date'], format='%d.%m.%Y')

In [131]:
# Parse 'Stealing_hour' and 'Report_stealing_hour' as an hour of the day (%H) and keep the integer hour

for hour_column in ('Stealing_hour', 'Report_stealing_hour'):
    parsed_hours = pd.to_datetime(df[hour_column], format='%H')
    df[hour_column] = parsed_hours.dt.hour

In [137]:
# Remove the helper column 'date2' now that the day/month/year columns exist

df = df.drop(columns=['date2'])

In [138]:
# Display the dataframe after renaming, translating and converting the date/hour columns

df

Unnamed: 0,Record_date,Record_day,Record_month,Record_year,Stealing_date,Stealing_hour,Report_stealing_date,Report_stealing_hour,Berlin_code_area,Bike_value,Unsuccesful_attempt,Bike_type,Crime_type,Crime_reason
0,2022-07-25,25,7,2022,2022-07-24,8,2022-07-24,15,2500832,677,No,Men,Theft,Aggravated theft
1,2022-07-25,25,7,2022,2022-07-25,10,2022-07-25,10,5300737,352,No,Women,Theft,Aggravated theft
2,2022-07-25,25,7,2022,2022-07-25,9,2022-07-25,16,1100206,1249,No,Men,Theft,Aggravated theft
3,2022-07-25,25,7,2022,2022-07-22,20,2022-07-25,5,7200413,899,No,Men,Theft,Aggravated theft
4,2022-07-25,25,7,2022,2022-07-24,20,2022-07-25,7,6100209,1378,No,Men,Theft,Aggravated theft
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31534,2021-01-02,2,1,2021,2021-01-02,9,2021-01-02,15,12500930,200,No,Women,Theft,Aggravated theft
31535,2021-01-01,1,1,2021,2021-01-01,20,2021-01-01,20,2100104,2800,No,Men,Theft,Aggravated theft
31536,2021-01-01,1,1,2021,2021-01-01,9,2021-01-01,11,4200206,290,No,Women,Theft,Aggravated theft
31537,2021-01-01,1,1,2021,2021-01-01,5,2021-01-01,8,10400941,2527,No,Bike,Break-in,Aggravated theft from basement/floor


#### Load the CSV file with LOR codes (Lebensweltlich orientierte Räume (LOR) = spatial planning units for Berlin)
#### The dataset was prepared in advance. The file contains the columns:
- Berlin_code_area
- Administrative_district
- District
- Quarter
- Street_name

In [139]:
# Read the LOR lookup table (Lebensweltlich orientierte Räume (LOR) = spatial planning units) with the columns:
# Berlin_code_area, Administrative_district, District, Quarter, Street_name

url = r"https://raw.githubusercontent.com/MSI17819/Berlin_bike_theft_prediction/main/Berlin_LOR_code.csv"

# The file is semicolon-separated and is decoded as ISO-8859-2
lor_read_options = {'sep': ';', 'encoding': "ISO-8859-2"}
df_region = pd.read_csv(url, **lor_read_options)

# Print the column names, dtypes and non-null counts of the lookup table
df_region.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 540 entries, 0 to 539
Data columns (total 5 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Berlin_code_area         540 non-null    int64 
 1   Administrative_district  540 non-null    object
 2   District                 540 non-null    object
 3   Quarter                  540 non-null    object
 4   Street_name              540 non-null    object
dtypes: int64(1), object(4)
memory usage: 21.2+ KB


In [141]:
# Left join: keep every theft record and attach the LOR columns that share its 'Berlin_code_area'
# NOTE(review): key uniqueness in df_region is not checked here; a repeated code would multiply theft rows - confirm
df_merge = df.merge(df_region, on='Berlin_code_area', how='left')

In [143]:
# Display the theft records together with the joined LOR columns
df_merge

Unnamed: 0,Record_date,Record_day,Record_month,Record_year,Stealing_date,Stealing_hour,Report_stealing_date,Report_stealing_hour,Berlin_code_area,Bike_value,Unsuccesful_attempt,Bike_type,Crime_type,Crime_reason,Administrative_district,District,Quarter,Street_name
0,2022-07-25,25,7,2022,2022-07-24,8,2022-07-24,15,2500832,677,No,Men,Theft,Aggravated theft,Friedrichshain-Kreuzberg,Friedrichshain Ost,Frankfurter Allee Süd FK,Traveplatz
1,2022-07-25,25,7,2022,2022-07-25,10,2022-07-25,10,5300737,352,No,Women,Theft,Aggravated theft,Spandau,Haselhorst / Siemensstadt,Haselhorst,Gartenfelder Straße
2,2022-07-25,25,7,2022,2022-07-25,9,2022-07-25,16,1100206,1249,No,Men,Theft,Aggravated theft,Mitte,Zentrum,Regierungsviertel,Unter den Linden
3,2022-07-25,25,7,2022,2022-07-22,20,2022-07-25,5,7200413,899,No,Men,Theft,Aggravated theft,Tempelhof-Schöneberg,Schöneberg Süd,Schöneberg Südost,Cheruskerstraße
4,2022-07-25,25,7,2022,2022-07-24,20,2022-07-25,7,6100209,1378,No,Men,Theft,Aggravated theft,Steglitz-Zehlendorf,Steglitz,Albrechtstraße,Feuerbachstraße
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31534,2021-01-02,2,1,2021,2021-01-02,9,2021-01-02,15,12500930,200,No,Women,Theft,Aggravated theft,Reinickendorf,Wittenau/Borsigwalde,Nord 2 - Waidmannslust/Wittenau/Lübars,Wittenau Mitte
31535,2021-01-01,1,1,2021,2021-01-01,20,2021-01-01,20,2100104,2800,No,Men,Theft,Aggravated theft,Friedrichshain-Kreuzberg,Kreuzberg Nord,Südliche Friedrichstadt,Moritzplatz
31536,2021-01-01,1,1,2021,2021-01-01,9,2021-01-01,11,4200206,290,No,Women,Theft,Aggravated theft,Charlottenburg-Wilmersdorf,Charlottenburg West,Heerstraße,Kranzallee
31537,2021-01-01,1,1,2021,2021-01-01,5,2021-01-01,8,10400941,2527,No,Bike,Break-in,Aggravated theft from basement/floor,Marzahn-Hellersdorf,Kaulsdorf/Mahlsdorf,Mahlsdorf,Pilgramer Straße


### Cleaning data

In [145]:
# Count fully duplicated rows (True) against rows seen for the first time (False)
duplicate_flags = df_merge.duplicated()
duplicate_flags.value_counts()

False    31483
True        56
dtype: int64

In [146]:
# Remove fully duplicated rows, keeping the first occurrence of each
df_merge = df_merge.drop_duplicates(keep='first')

In [147]:
# Re-check for duplicated rows after the removal
df_merge.duplicated(keep='first').value_counts()

False    31483
dtype: int64

In [148]:
# Count the missing values in every column
missing_per_column = df_merge.isna().sum()
missing_per_column

Record_date                0
Record_day                 0
Record_month               0
Record_year                0
Stealing_date              0
Stealing_hour              0
Report_stealing_date       0
Report_stealing_hour       0
Berlin_code_area           0
Bike_value                 0
Unsuccesful_attempt        0
Bike_type                  0
Crime_type                 0
Crime_reason               0
Administrative_district    0
District                   0
Quarter                    0
Street_name                0
dtype: int64

In [149]:
# Count the records in each 'Unsuccesful_attempt' category
attempt_counts = df_merge['Unsuccesful_attempt'].value_counts()
attempt_counts

No         31372
Yes          107
Unknown        4
Name: Unsuccesful_attempt, dtype: int64

In [150]:
# Keep only the records where the theft actually happened (Unsuccesful_attempt == 'No');
# the 'Yes' and 'Unknown' categories are excluded. The index is then renumbered from 0.

theft_happened = df_merge['Unsuccesful_attempt'].eq('No')
df_merge = df_merge.loc[theft_happened].reset_index(drop=True)

In [151]:
# Display df_merge after removing duplicates and filtering out unsuccessful/unknown attempts

df_merge

Unnamed: 0,Record_date,Record_day,Record_month,Record_year,Stealing_date,Stealing_hour,Report_stealing_date,Report_stealing_hour,Berlin_code_area,Bike_value,Unsuccesful_attempt,Bike_type,Crime_type,Crime_reason,Administrative_district,District,Quarter,Street_name
0,2022-07-25,25,7,2022,2022-07-24,8,2022-07-24,15,2500832,677,No,Men,Theft,Aggravated theft,Friedrichshain-Kreuzberg,Friedrichshain Ost,Frankfurter Allee Süd FK,Traveplatz
1,2022-07-25,25,7,2022,2022-07-25,10,2022-07-25,10,5300737,352,No,Women,Theft,Aggravated theft,Spandau,Haselhorst / Siemensstadt,Haselhorst,Gartenfelder Straße
2,2022-07-25,25,7,2022,2022-07-25,9,2022-07-25,16,1100206,1249,No,Men,Theft,Aggravated theft,Mitte,Zentrum,Regierungsviertel,Unter den Linden
3,2022-07-25,25,7,2022,2022-07-22,20,2022-07-25,5,7200413,899,No,Men,Theft,Aggravated theft,Tempelhof-Schöneberg,Schöneberg Süd,Schöneberg Südost,Cheruskerstraße
4,2022-07-25,25,7,2022,2022-07-24,20,2022-07-25,7,6100209,1378,No,Men,Theft,Aggravated theft,Steglitz-Zehlendorf,Steglitz,Albrechtstraße,Feuerbachstraße
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31367,2021-01-02,2,1,2021,2021-01-02,9,2021-01-02,15,12500930,200,No,Women,Theft,Aggravated theft,Reinickendorf,Wittenau/Borsigwalde,Nord 2 - Waidmannslust/Wittenau/Lübars,Wittenau Mitte
31368,2021-01-01,1,1,2021,2021-01-01,20,2021-01-01,20,2100104,2800,No,Men,Theft,Aggravated theft,Friedrichshain-Kreuzberg,Kreuzberg Nord,Südliche Friedrichstadt,Moritzplatz
31369,2021-01-01,1,1,2021,2021-01-01,9,2021-01-01,11,4200206,290,No,Women,Theft,Aggravated theft,Charlottenburg-Wilmersdorf,Charlottenburg West,Heerstraße,Kranzallee
31370,2021-01-01,1,1,2021,2021-01-01,5,2021-01-01,8,10400941,2527,No,Bike,Break-in,Aggravated theft from basement/floor,Marzahn-Hellersdorf,Kaulsdorf/Mahlsdorf,Mahlsdorf,Pilgramer Straße


In [152]:
# Print the column names, dtypes and non-null counts of the cleaned dataframe
df_merge.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31372 entries, 0 to 31371
Data columns (total 18 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   Record_date              31372 non-null  datetime64[ns]
 1   Record_day               31372 non-null  int64         
 2   Record_month             31372 non-null  int64         
 3   Record_year              31372 non-null  int64         
 4   Stealing_date            31372 non-null  datetime64[ns]
 5   Stealing_hour            31372 non-null  int64         
 6   Report_stealing_date     31372 non-null  datetime64[ns]
 7   Report_stealing_hour     31372 non-null  int64         
 8   Berlin_code_area         31372 non-null  int64         
 9   Bike_value               31372 non-null  int64         
 10  Unsuccesful_attempt      31372 non-null  object        
 11  Bike_type                31372 non-null  object        
 12  Crime_type               31372 n

In [153]:
# Export df_merge to a CSV file for the prediction analysis.
# The file is written relative to the current working directory instead of a hard-coded
# absolute Windows path, so the notebook also runs on other machines.

output_csv = 'Berlin_bike_thief_file_for_analysis.csv'

df_merge.to_csv(output_csv, sep=',', encoding='utf-8', header=True, index=False)