### Import packages

In [2]:
import numpy as np
import pandas as pd
import requests as requests
from io import StringIO
import csv
import os
from datetime import datetime
import pytz
from time import time
import matplotlib.pyplot as plt

### Prepering raw dataset for analisis
#### All information about dataset you can find in [README.MD](https://github.com/MSI17819/Berlin_bike_theft_prediction/blob/main/README.md) file

In [3]:
# Create empty dataframe
df = pd.DataFrame()

In [4]:
# Data pulling function from url, where we can find data about Berlin bikes thief record from 01/01/2021-up to today.
# The data are updated everyday or at least once a two/three day, so when we run function the date are expand on new records.

def data_pulling_function():
    global df
    url = "https://www.internetwache-polizei-berlin.de/vdb/Fahrraddiebstahl.csv"
    response = requests.get(url)
    df_raw = pd.read_csv(StringIO(response.text))
    local_time = datetime.now(pytz.timezone('Europe/Warsaw'))
    df = df.append(df_raw)
    pd.set_option('display.max_columns', None)
    print('Data update - {}'.format(local_time.strftime("%Y-%m-%d %H:%M:%S")))
    return df

data_pulling_function()

Data update - 2022-07-18 22:54:26


  df = df.append(df_raw)


Unnamed: 0,ANGELEGT_AM,TATZEIT_ANFANG_DATUM,TATZEIT_ANFANG_STUNDE,TATZEIT_ENDE_DATUM,TATZEIT_ENDE_STUNDE,LOR,SCHADENSHOEHE,VERSUCH,ART_DES_FAHRRADS,DELIKT,ERFASSUNGSGRUND
0,17.07.2022,17.07.2022,0,17.07.2022,0,11400929,100,Nein,Lastenfahrrad,Fahrraddiebstahl,Einfacher Diebstahl von Fahrrädern
1,17.07.2022,15.07.2022,19,16.07.2022,18,10200524,604,Nein,Damenfahrrad,Keller- und Bodeneinbruch,Sonstiger schwerer Diebstahl in/aus Keller/Bod...
2,17.07.2022,16.07.2022,18,17.07.2022,4,12601134,4319,Nein,Herrenfahrrad,Keller- und Bodeneinbruch,Sonstiger schwerer Diebstahl in/aus Keller/Bod...
3,17.07.2022,15.07.2022,7,15.07.2022,16,3100103,500,Nein,Herrenfahrrad,Fahrraddiebstahl,Sonstiger schwerer Diebstahl von Fahrrädern
4,17.07.2022,17.07.2022,12,17.07.2022,12,8100314,1500,Nein,Herrenfahrrad,Fahrraddiebstahl,Sonstiger schwerer Diebstahl von Fahrrädern
...,...,...,...,...,...,...,...,...,...,...,...
30943,02.01.2021,02.01.2021,9,02.01.2021,15,12500930,200,Nein,Damenfahrrad,Fahrraddiebstahl,Sonstiger schwerer Diebstahl von Fahrrädern
30944,01.01.2021,01.01.2021,17,01.01.2021,18,8401241,888,Nein,Fahrrad,Fahrraddiebstahl,Einfacher Diebstahl von Fahrrädern
30945,01.01.2021,01.01.2021,20,01.01.2021,20,2100104,2800,Nein,Herrenfahrrad,Fahrraddiebstahl,Sonstiger schwerer Diebstahl von Fahrrädern
30946,01.01.2021,01.01.2021,5,01.01.2021,8,10400941,2527,Nein,diverse Fahrräder,Keller- und Bodeneinbruch,Sonstiger schwerer Diebstahl in/aus Keller/Bod...


### Preprocesing raw data
##### Change columns name 
##### Merging data category

In [5]:
# Change columns names

df.columns = ["Record_date", "Stealing_date", "Stealing_hour", "Report_stealing_date",
              "Report_stealing_hour", "Berlin_code_area", "Bike_value", "Unsuccesful_attempt",
              "Bike_type", "Crime_type", "Crime_reason"]

# Change data category in Unsuccesful_attempt for three categories and change column name to english

df["Unsuccesful_attempt"].replace({'Nein' : 'No', 'Ja' : 'Yes', 'Unbekannt' : 'Unknown'}, inplace=True)

# Merge two category "Fahrrad" and "diverse Fahrrader" as one category "Bike" and change column name to english

df["Bike_type"].replace({"Herrenfahrrad" : "Men", "Damenfahrrad" : "Women", "Fahrrad" : "Bike", "Mountainbike" : "Mountain",
                         "Kinderfahrrad" : "Child", "diverse Fahrräder" : "Bike", "Rennrad" : "Racing", "Lastenfahrrad" : "Cargo"}, inplace=True)

# Change column Crime_type vales for english

df["Crime_type"].replace(to_replace=["Fahrraddiebstahl", "Keller- und Bodeneinbruch"], value=["Theft", "Break-in"], inplace=True)

# Change type of category crime_reason to english
# Category "aggrevated theft" = theft with break-in, theft with robbery (aggresor has to use force) 

df["Crime_reason"].replace(to_replace=["Sonstiger schwerer Diebstahl von Fahrrädern", 
                                       "Einfacher Diebstahl von Fahrrädern",
                                       "Sonstiger schwerer Diebstahl in/aus Keller/Boden von Fahrrädern", 
                                       "Einfacher Diebstahl aus Keller/Boden von Fahrrädern"],
                           value=["Aggravated theft", "Theft", "Aggravated theft from basement/floor", "Theft from basement/floor"], inplace=True)

#### Spliting column and create new three columns contain day, month, year data

In [6]:
## Creat three new columns (day, month, year) from one column (Record_date)
# Copy data frame to df_new
df_new = df.copy()

# Spliting column Record_date in original data frame copy 
df_new_col = df_new["Record_date"].str.split('.', expand=True)

# Concatenation spliting new three row with copy dataframe
df_new_concat_2 = pd.concat([df_new, df_new_col], axis=1)

In [7]:
# Moving three new columns for secound, third and fourth position in DataFrame
col_to_move = df_new_concat_2.pop(0)
col_to_move_1 = df_new_concat_2.pop(1)
col_to_move_2 = df_new_concat_2.pop(2)

df_new_concat_2.insert(1, 0, col_to_move)
df_new_concat_2.insert(2, 1, col_to_move_1)
df_new_concat_2.insert(3, 2, col_to_move_2)

In [8]:
# Rename three new columns as: Record_day, Record_month, Record_year

df_new_concat_2.rename(columns={0: "Record_day", 1: "Record_month", 2 : "Record_year"}, inplace=True)

In [9]:
# Display new dataframe

df_new_concat_2

Unnamed: 0,Record_date,Record_day,Record_month,Record_year,Stealing_date,Stealing_hour,Report_stealing_date,Report_stealing_hour,Berlin_code_area,Bike_value,Unsuccesful_attempt,Bike_type,Crime_type,Crime_reason
0,17.07.2022,17,07,2022,17.07.2022,0,17.07.2022,0,11400929,100,No,Cargo,Theft,Theft
1,17.07.2022,17,07,2022,15.07.2022,19,16.07.2022,18,10200524,604,No,Women,Break-in,Aggravated theft from basement/floor
2,17.07.2022,17,07,2022,16.07.2022,18,17.07.2022,4,12601134,4319,No,Men,Break-in,Aggravated theft from basement/floor
3,17.07.2022,17,07,2022,15.07.2022,7,15.07.2022,16,3100103,500,No,Men,Theft,Aggravated theft
4,17.07.2022,17,07,2022,17.07.2022,12,17.07.2022,12,8100314,1500,No,Men,Theft,Aggravated theft
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30943,02.01.2021,02,01,2021,02.01.2021,9,02.01.2021,15,12500930,200,No,Women,Theft,Aggravated theft
30944,01.01.2021,01,01,2021,01.01.2021,17,01.01.2021,18,8401241,888,No,Bike,Theft,Theft
30945,01.01.2021,01,01,2021,01.01.2021,20,01.01.2021,20,2100104,2800,No,Men,Theft,Aggravated theft
30946,01.01.2021,01,01,2021,01.01.2021,5,01.01.2021,8,10400941,2527,No,Bike,Break-in,Aggravated theft from basement/floor


#### Load csv file with LOR (Lebensweltlich orientierte Räume (LOR) = Spatial Planning Unit for Berlin) 
#### The dataset was prepared in advance. File contain columns: 
- Berlin_code_area
- Administrative_districe 
- District 
- Quater
- Street_name 

In [10]:
# Load data about LOR (Lebensweltlich orientierte Räume (LOR) = Spatial Planning Unit which contain columns: Berlin_code_area, 
#Administrative_districe, District, Quater, Street_name 

url = r"https://raw.githubusercontent.com/MSI17819/Berlin_bike_theft_prediction/main/Berlin_LOR_code.csv"

df_region = pd.read_csv(url, sep=';', encoding="ISO-8859-2")

df_region.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 540 entries, 0 to 539
Data columns (total 5 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Berlin_code_area         540 non-null    int64 
 1   Administrative_district  540 non-null    object
 2   District                 540 non-null    object
 3   Quarter                  540 non-null    object
 4   Street_name              540 non-null    object
dtypes: int64(1), object(4)
memory usage: 21.2+ KB


In [11]:
#Join two data frame (left join) with data and LOR dataset
df_merge = pd.merge(df_new_concat_2, df_region, how='left', on='Berlin_code_area')

In [12]:
df_merge

Unnamed: 0,Record_date,Record_day,Record_month,Record_year,Stealing_date,Stealing_hour,Report_stealing_date,Report_stealing_hour,Berlin_code_area,Bike_value,Unsuccesful_attempt,Bike_type,Crime_type,Crime_reason,Administrative_district,District,Quarter,Street_name
0,17.07.2022,17,07,2022,17.07.2022,0,17.07.2022,0,11400929,100,No,Cargo,Theft,Theft,Lichtenberg,Lichtenberg Mitte,Neu-Lichtenberg,Weitlingkiez
1,17.07.2022,17,07,2022,15.07.2022,19,16.07.2022,18,10200524,604,No,Women,Break-in,Aggravated theft from basement/floor,Marzahn-Hellersdorf,Hellersdorf,Hellersdorf Ost,Adele-Sandrock-Straße
2,17.07.2022,17,07,2022,16.07.2022,18,17.07.2022,4,12601134,4319,No,Men,Break-in,Aggravated theft from basement/floor,Reinickendorf,Märkisches Viertel,MV Nord,Märkisches Zentrum
3,17.07.2022,17,07,2022,15.07.2022,7,15.07.2022,16,3100103,500,No,Men,Theft,Aggravated theft,Pankow,Buch,Buch,Karower Chaussee
4,17.07.2022,17,07,2022,17.07.2022,12,17.07.2022,12,8100314,1500,No,Men,Theft,Aggravated theft,Neukölln,Neukölln,Reuterstraße,Donaustraße
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30943,02.01.2021,02,01,2021,02.01.2021,9,02.01.2021,15,12500930,200,No,Women,Theft,Aggravated theft,Reinickendorf,Wittenau/Borsigwalde,Nord 2 - Waidmannslust/Wittenau/Lübars,Wittenau Mitte
30944,01.01.2021,01,01,2021,01.01.2021,17,01.01.2021,18,8401241,888,No,Bike,Theft,Theft,Neukölln,Buckow Nord/Rudow,Rudow,Blumenviertel
30945,01.01.2021,01,01,2021,01.01.2021,20,01.01.2021,20,2100104,2800,No,Men,Theft,Aggravated theft,Friedrichshain-Kreuzberg,Kreuzberg Nord,Südliche Friedrichstadt,Moritzplatz
30946,01.01.2021,01,01,2021,01.01.2021,5,01.01.2021,8,10400941,2527,No,Bike,Break-in,Aggravated theft from basement/floor,Marzahn-Hellersdorf,Kaulsdorf/Mahlsdorf,Mahlsdorf,Pilgramer Straße


### Cleaning data

In [13]:
# Duplicates inspection
df_merge.duplicated().value_counts()

False    30896
True        52
dtype: int64

In [14]:
# Deleting duplicates 
df_merge.drop_duplicates(inplace=True)

In [15]:
# Dataframe inspection after deleting duplicated rows
df_merge.duplicated().value_counts()

False    30896
dtype: int64

In [16]:
# Dataframe inspection about missing values
df_merge.isna().sum()

Record_date                0
Record_day                 0
Record_month               0
Record_year                0
Stealing_date              0
Stealing_hour              0
Report_stealing_date       0
Report_stealing_hour       0
Berlin_code_area           0
Bike_value                 0
Unsuccesful_attempt        0
Bike_type                  0
Crime_type                 0
Crime_reason               0
Administrative_district    0
District                   0
Quarter                    0
Street_name                0
dtype: int64

In [17]:
# Inspection about Unsuccesfull_attempt
df_merge['Unsuccesful_attempt'].value_counts()

No         30789
Yes          102
Unknown        5
Name: Unsuccesful_attempt, dtype: int64

In [18]:
# Filter data only for those where thief happened (Unsuccesfull_attempt == No), exclude Yes and Unknow category.
df_merge = df_merge[df_merge['Unsuccesful_attempt'] == 'No']

# Reset index and save changes

df_merge.reset_index(drop=True, inplace=True)

In [19]:
# Dispay dataframe after merging
df_merge

Unnamed: 0,Record_date,Record_day,Record_month,Record_year,Stealing_date,Stealing_hour,Report_stealing_date,Report_stealing_hour,Berlin_code_area,Bike_value,Unsuccesful_attempt,Bike_type,Crime_type,Crime_reason,Administrative_district,District,Quarter,Street_name
0,17.07.2022,17,07,2022,17.07.2022,0,17.07.2022,0,11400929,100,No,Cargo,Theft,Theft,Lichtenberg,Lichtenberg Mitte,Neu-Lichtenberg,Weitlingkiez
1,17.07.2022,17,07,2022,15.07.2022,19,16.07.2022,18,10200524,604,No,Women,Break-in,Aggravated theft from basement/floor,Marzahn-Hellersdorf,Hellersdorf,Hellersdorf Ost,Adele-Sandrock-Straße
2,17.07.2022,17,07,2022,16.07.2022,18,17.07.2022,4,12601134,4319,No,Men,Break-in,Aggravated theft from basement/floor,Reinickendorf,Märkisches Viertel,MV Nord,Märkisches Zentrum
3,17.07.2022,17,07,2022,15.07.2022,7,15.07.2022,16,3100103,500,No,Men,Theft,Aggravated theft,Pankow,Buch,Buch,Karower Chaussee
4,17.07.2022,17,07,2022,17.07.2022,12,17.07.2022,12,8100314,1500,No,Men,Theft,Aggravated theft,Neukölln,Neukölln,Reuterstraße,Donaustraße
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30784,02.01.2021,02,01,2021,02.01.2021,9,02.01.2021,15,12500930,200,No,Women,Theft,Aggravated theft,Reinickendorf,Wittenau/Borsigwalde,Nord 2 - Waidmannslust/Wittenau/Lübars,Wittenau Mitte
30785,01.01.2021,01,01,2021,01.01.2021,17,01.01.2021,18,8401241,888,No,Bike,Theft,Theft,Neukölln,Buckow Nord/Rudow,Rudow,Blumenviertel
30786,01.01.2021,01,01,2021,01.01.2021,20,01.01.2021,20,2100104,2800,No,Men,Theft,Aggravated theft,Friedrichshain-Kreuzberg,Kreuzberg Nord,Südliche Friedrichstadt,Moritzplatz
30787,01.01.2021,01,01,2021,01.01.2021,5,01.01.2021,8,10400941,2527,No,Bike,Break-in,Aggravated theft from basement/floor,Marzahn-Hellersdorf,Kaulsdorf/Mahlsdorf,Mahlsdorf,Pilgramer Straße


In [25]:
df_merge['Record_date'] = pd.to_datetime(df_merge['Record_date'], format='%d.%m.%Y')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_merge['Record_date'] = pd.to_datetime(df_merge['Record_date'], format='%d.%m.%Y')


In [26]:
# Creat column Record_day_of_week with each name of week day 

df_merge['Record_day_of_week'] = df_merge['Record_date'].dt.day_name()

first_column = df_merge.pop('Record_day_of_week')
  
df_merge.insert(4, 'Record_day_of_week', first_column)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_merge['Record_day_of_week'] = df_merge['Record_date'].dt.day_name()


In [27]:
# Creat column Record_mont_of_week with each name of month

df_merge['Record_month_of_year'] = df_merge['Record_date'].dt.month_name()

secound_column = df_merge.pop('Record_month_of_year')
  
df_merge.insert(4, 'Record_month_of_year', secound_column)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_merge['Record_month_of_year'] = df_merge['Record_date'].dt.month_name()


In [28]:
df_merge['Record_date'] = pd.to_datetime(df_merge['Record_date'], dayfirst=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_merge['Record_date'] = pd.to_datetime(df_merge['Record_date'], dayfirst=True)


In [29]:
# Changes the format into two columns (Stealing_date, Report_stealing_date) from an object to datetime format

df_merge['Stealing_date'] = pd.to_datetime(df_merge['Stealing_date'], dayfirst=True)

df_merge['Report_stealing_date'] = pd.to_datetime(df_merge['Report_stealing_date'], dayfirst=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_merge['Stealing_date'] = pd.to_datetime(df_merge['Stealing_date'], dayfirst=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_merge['Report_stealing_date'] = pd.to_datetime(df_merge['Report_stealing_date'], dayfirst=True)


In [30]:
# Changes format into two columns (Report_stealing_hour, Stealing_hour) from intiger to datetime hour format

df_merge['Report_stealing_hour'] = pd.to_datetime(df_merge['Report_stealing_hour'], format='%H').dt.time

df_merge['Stealing_hour'] = pd.to_datetime(df_merge['Stealing_hour'], format='%H').dt.time

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_merge['Report_stealing_hour'] = pd.to_datetime(df_merge['Report_stealing_hour'], format='%H').dt.time
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_merge['Stealing_hour'] = pd.to_datetime(df_merge['Stealing_hour'], format='%H').dt.time


In [31]:
df_merge

Unnamed: 0,Record_date,Record_day,Record_month,Record_year,Record_month_of_year,Record_day_of_week,Stealing_date,Stealing_hour,Report_stealing_date,Report_stealing_hour,Berlin_code_area,Bike_value,Unsuccesful_attempt,Bike_type,Crime_type,Crime_reason,Administrative_district,District,Quarter,Street_name
0,2022-07-17,17,07,2022,July,Sunday,2022-07-17,00:00:00,2022-07-17,00:00:00,11400929,100,No,Cargo,Theft,Theft,Lichtenberg,Lichtenberg Mitte,Neu-Lichtenberg,Weitlingkiez
1,2022-07-17,17,07,2022,July,Sunday,2022-07-15,19:00:00,2022-07-16,18:00:00,10200524,604,No,Women,Break-in,Aggravated theft from basement/floor,Marzahn-Hellersdorf,Hellersdorf,Hellersdorf Ost,Adele-Sandrock-Straße
2,2022-07-17,17,07,2022,July,Sunday,2022-07-16,18:00:00,2022-07-17,04:00:00,12601134,4319,No,Men,Break-in,Aggravated theft from basement/floor,Reinickendorf,Märkisches Viertel,MV Nord,Märkisches Zentrum
3,2022-07-17,17,07,2022,July,Sunday,2022-07-15,07:00:00,2022-07-15,16:00:00,3100103,500,No,Men,Theft,Aggravated theft,Pankow,Buch,Buch,Karower Chaussee
4,2022-07-17,17,07,2022,July,Sunday,2022-07-17,12:00:00,2022-07-17,12:00:00,8100314,1500,No,Men,Theft,Aggravated theft,Neukölln,Neukölln,Reuterstraße,Donaustraße
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30784,2021-01-02,02,01,2021,January,Saturday,2021-01-02,09:00:00,2021-01-02,15:00:00,12500930,200,No,Women,Theft,Aggravated theft,Reinickendorf,Wittenau/Borsigwalde,Nord 2 - Waidmannslust/Wittenau/Lübars,Wittenau Mitte
30785,2021-01-01,01,01,2021,January,Friday,2021-01-01,17:00:00,2021-01-01,18:00:00,8401241,888,No,Bike,Theft,Theft,Neukölln,Buckow Nord/Rudow,Rudow,Blumenviertel
30786,2021-01-01,01,01,2021,January,Friday,2021-01-01,20:00:00,2021-01-01,20:00:00,2100104,2800,No,Men,Theft,Aggravated theft,Friedrichshain-Kreuzberg,Kreuzberg Nord,Südliche Friedrichstadt,Moritzplatz
30787,2021-01-01,01,01,2021,January,Friday,2021-01-01,05:00:00,2021-01-01,08:00:00,10400941,2527,No,Bike,Break-in,Aggravated theft from basement/floor,Marzahn-Hellersdorf,Kaulsdorf/Mahlsdorf,Mahlsdorf,Pilgramer Straße


In [32]:
df_merge.to_csv(r'C:\Users\dell\Desktop\Project\Berlin_bike_theft_prediction\Berlin_bike_thief_file_for_analysis.csv', sep=',', encoding='utf-8', header=True, index=False)