### Import packages

In [51]:
import numpy as np
import pandas as pd
import requests as requests
from io import StringIO
import csv
import os
from datetime import datetime
import pytz
from time import time
import matplotlib.pyplot as plt

### Preparing raw dataset for analysis
#### All information about the dataset can be found in the [README.MD](https://github.com/MSI17819/Berlin_bike_theft_prediction/blob/main/README.md) file

#### Create empty dataframe

In [52]:
# Module-level accumulator: data_pulling_function() rebinds this global with the downloaded records.
df = pd.DataFrame()

#### Data-pulling function: downloads the Berlin bike theft records (from 01/01/2021 up to today) from the source URL.
#### The data are updated every day, or at least once every two to three days, so each time the function is run the dataset includes the newly added records.

In [53]:
def data_pulling_function(url="https://www.internetwache-polizei-berlin.de/vdb/Fahrraddiebstahl.csv",
                          timeout=30):
    """Download the bike theft CSV and add its rows to the global ``df``.

    Parameters
    ----------
    url : str
        Address of the CSV file to download.
    timeout : float
        Seconds to wait for the server before giving up, so the notebook
        cannot hang forever on a stalled connection.

    Returns
    -------
    pandas.DataFrame
        The updated global ``df``.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.

    Notes
    -----
    Rows are appended to whatever ``df`` already holds, so calling the
    function twice in the same kernel session stores every record twice.
    The function also sets the pandas option ``display.max_columns`` to
    ``None`` and prints the time of the update.
    """
    global df
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an HTTP error instead of parsing an error page as CSV.
    response.raise_for_status()
    df_raw = pd.read_csv(StringIO(response.text))
    local_time = datetime.now(pytz.timezone('Europe/Warsaw'))
    # DataFrame.append is deprecated and was removed in pandas 2.0; pd.concat is
    # the replacement. An empty accumulator is simply replaced by the new data.
    df = df_raw if df.empty else pd.concat([df, df_raw])
    pd.set_option('display.max_columns', None)
    print('Data update - {}'.format(local_time.strftime("%Y-%m-%d %H:%M:%S")))
    return df

data_pulling_function()

Data update - 2022-09-13 19:52:03


  df = df.append(df_raw)


Unnamed: 0,ANGELEGT_AM,TATZEIT_ANFANG_DATUM,TATZEIT_ANFANG_STUNDE,TATZEIT_ENDE_DATUM,TATZEIT_ENDE_STUNDE,LOR,SCHADENSHOEHE,VERSUCH,ART_DES_FAHRRADS,DELIKT,ERFASSUNGSGRUND
0,12.09.2022,10.09.2022,14,10.09.2022,18,1100309,600,Nein,Herrenfahrrad,Fahrraddiebstahl,Sonstiger schwerer Diebstahl von Fahrrädern
1,12.09.2022,11.09.2022,17,12.09.2022,6,4300517,1068,Nein,Damenfahrrad,Fahrraddiebstahl,Sonstiger schwerer Diebstahl von Fahrrädern
2,12.09.2022,10.09.2022,18,12.09.2022,8,3300411,300,Nein,Herrenfahrrad,Fahrraddiebstahl,Sonstiger schwerer Diebstahl von Fahrrädern
3,12.09.2022,11.09.2022,20,12.09.2022,7,9100408,837,Nein,Herrenfahrrad,Fahrraddiebstahl,Sonstiger schwerer Diebstahl von Fahrrädern
4,12.09.2022,11.09.2022,17,11.09.2022,23,1100206,3289,Nein,Damenfahrrad,Fahrraddiebstahl,Sonstiger schwerer Diebstahl von Fahrrädern
...,...,...,...,...,...,...,...,...,...,...,...
35020,02.01.2021,02.01.2021,9,02.01.2021,15,12500930,200,Nein,Damenfahrrad,Fahrraddiebstahl,Sonstiger schwerer Diebstahl von Fahrrädern
35021,01.01.2021,01.01.2021,9,01.01.2021,11,4200206,290,Nein,Damenfahrrad,Fahrraddiebstahl,Sonstiger schwerer Diebstahl von Fahrrädern
35022,01.01.2021,01.01.2021,17,01.01.2021,18,8401241,888,Nein,Fahrrad,Fahrraddiebstahl,Einfacher Diebstahl von Fahrrädern
35023,01.01.2021,01.01.2021,5,01.01.2021,8,10400941,2527,Nein,diverse Fahrräder,Keller- und Bodeneinbruch,Sonstiger schwerer Diebstahl in/aus Keller/Bod...


### Preprocessing raw data
- Change column names
- Merge data categories

#### Change column names

In [54]:
# English labels, listed in the same order as the columns of the downloaded CSV
# (ANGELEGT_AM ... ERFASSUNGSGRUND); the assignment is purely positional.
english_column_names = [
    "Record_date",
    "Stealing_date",
    "Stealing_hour",
    "Report_stealing_date",
    "Report_stealing_hour",
    "Berlin_code_area",
    "Bike_value",
    "Unsuccesful_attempt",
    "Bike_type",
    "Crime_type",
    "Crime_reason",
]
df.columns = english_column_names

#### Translate the values of *Unsuccesful_attempt* into three English categories

In [55]:
# Translate the German labels to English. The result is assigned back to the
# column: replace(..., inplace=True) on df["..."] acts on an intermediate Series
# and does not update df once pandas Copy-on-Write is enabled.
df["Unsuccesful_attempt"] = df["Unsuccesful_attempt"].replace(
    {'Nein' : 'No', 'Ja' : 'Yes', 'Unbekannt' : 'Unknown'}
)

#### Translate the *Bike_type* values and merge the two categories *Fahrrad* and *diverse Fahrräder* into one category *Bike*

In [56]:
# German -> English bike types; "Fahrrad" and "diverse Fahrräder" both become "Bike".
bike_type_translation = {
    "Herrenfahrrad" : "Men",
    "Damenfahrrad" : "Women",
    "Fahrrad" : "Bike",
    "Mountainbike" : "Mountain",
    "Kinderfahrrad" : "Child",
    "diverse Fahrräder" : "Bike",
    "Rennrad" : "Racing",
    "Lastenfahrrad" : "Cargo",
}
# Assign the result back instead of using inplace=True on df["..."], which acts
# on an intermediate Series and does not update df under pandas Copy-on-Write.
df["Bike_type"] = df["Bike_type"].replace(bike_type_translation)

#### Change column *Crime_type* values

In [57]:
# German -> English crime types. The result is assigned back instead of using
# inplace=True on df["..."], which acts on an intermediate Series and does not
# update df under pandas Copy-on-Write.
df["Crime_type"] = df["Crime_type"].replace(
    {"Fahrraddiebstahl": "Theft", "Keller- und Bodeneinbruch": "Break-in"}
)

#### Translate the categories of *Crime_reason*
#### Category *aggravated theft* = theft with break-in or theft with robbery (the aggressor has to use force)

In [58]:
# German -> English crime reasons, written as an explicit mapping so that each
# source label sits next to its translation.
crime_reason_translation = {
    "Sonstiger schwerer Diebstahl von Fahrrädern": "Aggravated theft",
    "Einfacher Diebstahl von Fahrrädern": "Theft",
    "Sonstiger schwerer Diebstahl in/aus Keller/Boden von Fahrrädern": "Aggravated theft from basement/floor",
    "Einfacher Diebstahl aus Keller/Boden von Fahrrädern": "Theft from basement/floor",
}
# Assign the result back instead of using inplace=True on df["..."], which acts
# on an intermediate Series and does not update df under pandas Copy-on-Write.
df["Crime_reason"] = df["Crime_reason"].replace(crime_reason_translation)

#### Changing column *Record_date* type from object to datetime format

In [59]:
# Source dates are day-first strings such as 12.09.2022, hence dayfirst=True.
df['Record_date'] = pd.to_datetime(df['Record_date'], dayfirst= True)

#### Create support column *date2* for the splitting in the next step

In [60]:
# Helper copy of Record_date used to derive the day/month/year columns; it is dropped again later.
# NOTE(review): Record_date is already datetime at this point, so this second to_datetime call is redundant.
df['date2'] = pd.to_datetime(df['Record_date'], dayfirst= True)

#### Create three new columns with: day, month and year

In [61]:
# Derive Record_day, Record_month and Record_year from the matching
# datetime accessor attribute of the helper column.
for date_part in ('day', 'month', 'year'):
    df['Record_' + date_part] = getattr(df['date2'].dt, date_part)

#### Moving the three new columns to the second, third and fourth positions in the DataFrame

In [62]:
# Pop each derived column and re-insert it right after Record_date
# (target positions 1, 2 and 3).
record_part_columns = ('Record_day', 'Record_month', 'Record_year')
for target_position, column_name in enumerate(record_part_columns, start=1):
    df.insert(target_position, column_name, df.pop(column_name))

#### Changing *Stealing_date* type from object to datetime format

In [63]:
# Source dates are day-first strings such as 10.09.2022, hence dayfirst=True.
df['Stealing_date'] = pd.to_datetime(df['Stealing_date'], dayfirst= True)

#### Changing *Report_stealing_date* type from object to datetime format

In [64]:
# Source dates are day-first strings such as 10.09.2022, hence dayfirst=True.
df['Report_stealing_date'] = pd.to_datetime(df['Report_stealing_date'], dayfirst= True)

#### Change the format of two columns (*Report_stealing_hour*, *Stealing_hour*) from object to datetime hour format

In [65]:
# Parse both hour columns with the '%H' format and keep only the hour component.
for hour_column in ('Stealing_hour', 'Report_stealing_hour'):
    parsed_hours = pd.to_datetime(df[hour_column], format='%H')
    df[hour_column] = parsed_hours.dt.hour

#### Drop support column *date2* from DataFrame

In [66]:
# The helper column is no longer needed once day/month/year have been extracted.
df = df.drop(columns=['date2'])

#### Display new DataFrame

In [67]:
df

Unnamed: 0,Record_date,Record_day,Record_month,Record_year,Stealing_date,Stealing_hour,Report_stealing_date,Report_stealing_hour,Berlin_code_area,Bike_value,Unsuccesful_attempt,Bike_type,Crime_type,Crime_reason
0,2022-09-12,12,9,2022,2022-09-10,14,2022-09-10,18,1100309,600,No,Men,Theft,Aggravated theft
1,2022-09-12,12,9,2022,2022-09-11,17,2022-09-12,6,4300517,1068,No,Women,Theft,Aggravated theft
2,2022-09-12,12,9,2022,2022-09-10,18,2022-09-12,8,3300411,300,No,Men,Theft,Aggravated theft
3,2022-09-12,12,9,2022,2022-09-11,20,2022-09-12,7,9100408,837,No,Men,Theft,Aggravated theft
4,2022-09-12,12,9,2022,2022-09-11,17,2022-09-11,23,1100206,3289,No,Women,Theft,Aggravated theft
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35020,2021-01-02,2,1,2021,2021-01-02,9,2021-01-02,15,12500930,200,No,Women,Theft,Aggravated theft
35021,2021-01-01,1,1,2021,2021-01-01,9,2021-01-01,11,4200206,290,No,Women,Theft,Aggravated theft
35022,2021-01-01,1,1,2021,2021-01-01,17,2021-01-01,18,8401241,888,No,Bike,Theft,Theft
35023,2021-01-01,1,1,2021,2021-01-01,5,2021-01-01,8,10400941,2527,No,Bike,Break-in,Aggravated theft from basement/floor


#### Load the csv file with LOR (Lebensweltlich orientierte Räume (LOR) = Spatial Planning Unit for Berlin) 
#### The dataset was prepared in advance. The file contains the following columns: 
- *Berlin_code_area*
- *Administrative_district*
- *District* 
- *Quarter*
- *Street_name* 

#### Load data about LOR (Lebensweltlich orientierte Räume (LOR) = Spatial Planning Unit)

In [68]:
# Raw GitHub address of the prepared LOR lookup table.
url = r"https://raw.githubusercontent.com/MSI17819/Berlin_bike_theft_prediction/main/Berlin_LOR_code.csv"

# The file is read as semicolon-separated text with ISO-8859-2 encoding.
df_region = pd.read_csv(url, sep=';', encoding="ISO-8859-2")

# Summary of the columns, dtypes and non-null counts of the lookup table.
df_region.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541 entries, 0 to 540
Data columns (total 5 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Berlin_code_area         541 non-null    int64 
 1   Administrative_district  541 non-null    object
 2   District                 541 non-null    object
 3   Quarter                  541 non-null    object
 4   Street_name              541 non-null    object
dtypes: int64(1), object(4)
memory usage: 21.3+ KB


#### Join the two DataFrames (*left join*): the prepared data and the LOR dataset

In [69]:
# Left join: every theft record is kept, and records whose area code is missing
# from the lookup table get NaN in the district columns.
# validate='many_to_one' makes pandas raise a MergeError if an area code occurs
# more than once in df_region, instead of silently multiplying theft rows.
df_merge = pd.merge(df, df_region, how='left', on='Berlin_code_area', validate='many_to_one')

#### Inspect the DataFrame after merging the two datasets 

In [70]:
df_merge

Unnamed: 0,Record_date,Record_day,Record_month,Record_year,Stealing_date,Stealing_hour,Report_stealing_date,Report_stealing_hour,Berlin_code_area,Bike_value,Unsuccesful_attempt,Bike_type,Crime_type,Crime_reason,Administrative_district,District,Quarter,Street_name
0,2022-09-12,12,9,2022,2022-09-10,14,2022-09-10,18,1100309,600,No,Men,Theft,Aggravated theft,Mitte,Zentrum,Alexanderplatz,Oranienburger Straße
1,2022-09-12,12,9,2022,2022-09-11,17,2022-09-12,6,4300517,1068,No,Women,Theft,Aggravated theft,Charlottenburg-Wilmersdorf,Charlottenburg Zentrum,Mierendorffplatz,Tegeler Weg
2,2022-09-12,12,9,2022,2022-09-10,18,2022-09-12,8,3300411,300,No,Men,Theft,Aggravated theft,Pankow,Nördliches Weißensee,Karow,Karow Nord
3,2022-09-12,12,9,2022,2022-09-11,20,2022-09-12,7,9100408,837,No,Men,Theft,Aggravated theft,Treptow-Köpenick,Treptow-Köpenick 1,Johannisthal,Johannisthal Ost
4,2022-09-12,12,9,2022,2022-09-11,17,2022-09-11,23,1100206,3289,No,Women,Theft,Aggravated theft,Mitte,Zentrum,Regierungsviertel,Unter den Linden
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35020,2021-01-02,2,1,2021,2021-01-02,9,2021-01-02,15,12500930,200,No,Women,Theft,Aggravated theft,Reinickendorf,Wittenau/Borsigwalde,Nord 2 - Waidmannslust/Wittenau/Lübars,Wittenau Mitte
35021,2021-01-01,1,1,2021,2021-01-01,9,2021-01-01,11,4200206,290,No,Women,Theft,Aggravated theft,Charlottenburg-Wilmersdorf,Charlottenburg West,Heerstraße,Kranzallee
35022,2021-01-01,1,1,2021,2021-01-01,17,2021-01-01,18,8401241,888,No,Bike,Theft,Theft,Neukölln,Buckow Nord/Rudow,Rudow,Blumenviertel
35023,2021-01-01,1,1,2021,2021-01-01,5,2021-01-01,8,10400941,2527,No,Bike,Break-in,Aggravated theft from basement/floor,Marzahn-Hellersdorf,Kaulsdorf/Mahlsdorf,Mahlsdorf,Pilgramer Straße


### Cleaning data

#### Duplicates inspection

In [71]:
df_merge.duplicated().value_counts()

False    34962
True        63
dtype: int64

#### Deleting duplicates

In [72]:
 # Remove fully identical rows, keeping the first occurrence of each.
 df_merge = df_merge.drop_duplicates(keep='first')

#### DataFrame inspection after deleting duplicated rows

In [73]:
df_merge.duplicated().value_counts()

False    34962
dtype: int64

#### Inspect the DataFrame for missing values

In [74]:
df_merge.isna().sum()

Record_date                0
Record_day                 0
Record_month               0
Record_year                0
Stealing_date              0
Stealing_hour              0
Report_stealing_date       0
Report_stealing_hour       0
Berlin_code_area           0
Bike_value                 0
Unsuccesful_attempt        0
Bike_type                  0
Crime_type                 0
Crime_reason               0
Administrative_district    1
District                   1
Quarter                    1
Street_name                1
dtype: int64

#### Inspect the categories of *Unsuccesful_attempt*

In [75]:
df_merge['Unsuccesful_attempt'].value_counts()

No         34831
Yes          119
Unknown       12
Name: Unsuccesful_attempt, dtype: int64

#### Keep only the records where the theft either succeeded or was only attempted (*Unsuccesful_attempt == No*, *Unsuccesful_attempt == Yes*); exclude records with the *Unknown* category.

In [76]:
# Boolean mask selecting the rows whose attempt status is 'No' or 'Yes'.
known_attempt_status = df_merge['Unsuccesful_attempt'].isin(['No', 'Yes'])
df_merge = df_merge[known_attempt_status]

#### Inspect the DataFrame after dropping records with the *Unknown* category

In [77]:
df_merge['Unsuccesful_attempt'].value_counts()

No     34831
Yes      119
Name: Unsuccesful_attempt, dtype: int64

#### Reset index and save changes

In [78]:
# Renumber the rows from 0 after the removals above; the old index is discarded.
df_merge = df_merge.reset_index(drop=True)

#### Display DataFrame

In [79]:
df_merge

Unnamed: 0,Record_date,Record_day,Record_month,Record_year,Stealing_date,Stealing_hour,Report_stealing_date,Report_stealing_hour,Berlin_code_area,Bike_value,Unsuccesful_attempt,Bike_type,Crime_type,Crime_reason,Administrative_district,District,Quarter,Street_name
0,2022-09-12,12,9,2022,2022-09-10,14,2022-09-10,18,1100309,600,No,Men,Theft,Aggravated theft,Mitte,Zentrum,Alexanderplatz,Oranienburger Straße
1,2022-09-12,12,9,2022,2022-09-11,17,2022-09-12,6,4300517,1068,No,Women,Theft,Aggravated theft,Charlottenburg-Wilmersdorf,Charlottenburg Zentrum,Mierendorffplatz,Tegeler Weg
2,2022-09-12,12,9,2022,2022-09-10,18,2022-09-12,8,3300411,300,No,Men,Theft,Aggravated theft,Pankow,Nördliches Weißensee,Karow,Karow Nord
3,2022-09-12,12,9,2022,2022-09-11,20,2022-09-12,7,9100408,837,No,Men,Theft,Aggravated theft,Treptow-Köpenick,Treptow-Köpenick 1,Johannisthal,Johannisthal Ost
4,2022-09-12,12,9,2022,2022-09-11,17,2022-09-11,23,1100206,3289,No,Women,Theft,Aggravated theft,Mitte,Zentrum,Regierungsviertel,Unter den Linden
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34945,2021-01-02,2,1,2021,2021-01-02,9,2021-01-02,15,12500930,200,No,Women,Theft,Aggravated theft,Reinickendorf,Wittenau/Borsigwalde,Nord 2 - Waidmannslust/Wittenau/Lübars,Wittenau Mitte
34946,2021-01-01,1,1,2021,2021-01-01,9,2021-01-01,11,4200206,290,No,Women,Theft,Aggravated theft,Charlottenburg-Wilmersdorf,Charlottenburg West,Heerstraße,Kranzallee
34947,2021-01-01,1,1,2021,2021-01-01,17,2021-01-01,18,8401241,888,No,Bike,Theft,Theft,Neukölln,Buckow Nord/Rudow,Rudow,Blumenviertel
34948,2021-01-01,1,1,2021,2021-01-01,5,2021-01-01,8,10400941,2527,No,Bike,Break-in,Aggravated theft from basement/floor,Marzahn-Hellersdorf,Kaulsdorf/Mahlsdorf,Mahlsdorf,Pilgramer Straße


In [80]:
df_merge.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34950 entries, 0 to 34949
Data columns (total 18 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   Record_date              34950 non-null  datetime64[ns]
 1   Record_day               34950 non-null  int64         
 2   Record_month             34950 non-null  int64         
 3   Record_year              34950 non-null  int64         
 4   Stealing_date            34950 non-null  datetime64[ns]
 5   Stealing_hour            34950 non-null  int64         
 6   Report_stealing_date     34950 non-null  datetime64[ns]
 7   Report_stealing_hour     34950 non-null  int64         
 8   Berlin_code_area         34950 non-null  int64         
 9   Bike_value               34950 non-null  int64         
 10  Unsuccesful_attempt      34950 non-null  object        
 11  Bike_type                34950 non-null  object        
 12  Crime_type               34950 n

#### Export DataFrame to csv file for future prediction analysis

In [34]:
# Write into the current working directory instead of a hard-coded absolute
# Windows path that exists on only one machine, so the export runs anywhere.
output_csv_path = os.path.join(os.getcwd(), 'Berlin_bike_thief_file_for_analysis.csv')
df_merge.to_csv(output_csv_path, sep=',', encoding='utf-8', header=True, index=False)