### Import packages

In [1]:
import numpy as np
import pandas as pd
import requests as requests
from io import StringIO
import csv
import os
from datetime import datetime
import pytz
from time import time
import matplotlib.pyplot as plt

### Preparing raw dataset for analysis
#### All information about the dataset can be found in the [README.MD](https://github.com/MSI17819/Berlin_bike_theft_prediction/blob/main/README.md) file

#### Create empty dataframe

In [2]:
# Empty module-level frame; data_pulling_function() rebinds it through `global df`.
df = pd.DataFrame()

#### Function that pulls the data from the URL below, which holds the Berlin bike theft records from 01/01/2021 up to today.
#### The data are updated every day, or at least once every two to three days, so each run of the function picks up the new records.

In [3]:
def data_pulling_function():
    """Download the Berlin bike-theft CSV and add its rows to the global ``df``.

    The file at ``url`` is fetched with ``requests`` and parsed with
    ``pd.read_csv``.  While the global ``df`` is still empty the download
    simply becomes ``df``; otherwise the downloaded rows are concatenated
    below the existing ones with their original row labels, so calling the
    function twice in one session stores every downloaded row twice.

    Side effects: rebinds the global ``df``, sets the pandas option
    ``display.max_columns`` to ``None`` and prints the time of the update in
    the Europe/Warsaw time zone.

    Returns:
        pandas.DataFrame: the updated global ``df``.

    Raises:
        requests.RequestException: if the request times out or the server
            answers with an HTTP error status.
    """
    global df
    url = "https://www.internetwache-polizei-berlin.de/vdb/Fahrraddiebstahl.csv"
    # Fail fast instead of hanging indefinitely or parsing an HTTP error page as CSV.
    response = requests.get(url, timeout=60)
    response.raise_for_status()
    df_raw = pd.read_csv(StringIO(response.text))
    local_time = datetime.now(pytz.timezone('Europe/Warsaw'))
    # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
    # pd.concat is the supported replacement.  The empty accumulator is skipped
    # so the dtypes parsed by read_csv are kept untouched.
    df = df_raw if df.empty else pd.concat([df, df_raw])
    pd.set_option('display.max_columns', None)
    print('Data update - {}'.format(local_time.strftime("%Y-%m-%d %H:%M:%S")))
    return df

# Run the download; the returned frame is shown as the cell output.
data_pulling_function()

Data update - 2022-08-11 21:47:16


  df = df.append(df_raw)


Unnamed: 0,ANGELEGT_AM,TATZEIT_ANFANG_DATUM,TATZEIT_ANFANG_STUNDE,TATZEIT_ENDE_DATUM,TATZEIT_ENDE_STUNDE,LOR,SCHADENSHOEHE,VERSUCH,ART_DES_FAHRRADS,DELIKT,ERFASSUNGSGRUND
0,10.08.2022,10.08.2022,0,10.08.2022,0,3400619,300,Nein,Mountainbike,Fahrraddiebstahl,Sonstiger schwerer Diebstahl von Fahrrädern
1,10.08.2022,09.08.2022,15,10.08.2022,0,7601544,800,Nein,Herrenfahrrad,Fahrraddiebstahl,Sonstiger schwerer Diebstahl von Fahrrädern
2,10.08.2022,09.08.2022,15,10.08.2022,6,5100211,212,Nein,Herrenfahrrad,Fahrraddiebstahl,Sonstiger schwerer Diebstahl von Fahrrädern
3,10.08.2022,08.08.2022,20,09.08.2022,12,3601449,600,Nein,Damenfahrrad,Fahrraddiebstahl,Sonstiger schwerer Diebstahl von Fahrrädern
4,10.08.2022,09.08.2022,14,09.08.2022,14,3601450,499,Nein,Herrenfahrrad,Fahrraddiebstahl,Einfacher Diebstahl von Fahrrädern
...,...,...,...,...,...,...,...,...,...,...,...
32595,02.01.2021,01.01.2021,12,02.01.2021,15,11300826,377,Nein,Damenfahrrad,Fahrraddiebstahl,Sonstiger schwerer Diebstahl von Fahrrädern
32596,01.01.2021,01.01.2021,5,01.01.2021,8,10400941,2527,Nein,diverse Fahrräder,Keller- und Bodeneinbruch,Sonstiger schwerer Diebstahl in/aus Keller/Bod...
32597,01.01.2021,01.01.2021,17,01.01.2021,18,8401241,888,Nein,Fahrrad,Fahrraddiebstahl,Einfacher Diebstahl von Fahrrädern
32598,01.01.2021,01.01.2021,20,01.01.2021,20,2100104,2800,Nein,Herrenfahrrad,Fahrraddiebstahl,Sonstiger schwerer Diebstahl von Fahrrädern


### Preprocessing raw data
- Change column names
- Merge data categories

#### Change columns names

In [4]:
# Map each German source column to its English name explicitly.  Assigning a
# positional list to df.columns would silently mislabel the data if the
# publisher ever reordered the columns, and it fails with a length mismatch
# when the cell is re-run after extra columns have been added.
COLUMN_RENAMES = {
    "ANGELEGT_AM": "Record_date",
    "TATZEIT_ANFANG_DATUM": "Stealing_date",
    "TATZEIT_ANFANG_STUNDE": "Stealing_hour",
    "TATZEIT_ENDE_DATUM": "Report_stealing_date",
    "TATZEIT_ENDE_STUNDE": "Report_stealing_hour",
    "LOR": "Berlin_code_area",
    "SCHADENSHOEHE": "Bike_value",
    "VERSUCH": "Unsuccesful_attempt",
    "ART_DES_FAHRRADS": "Bike_type",
    "DELIKT": "Crime_type",
    "ERFASSUNGSGRUND": "Crime_reason",
}
df = df.rename(columns=COLUMN_RENAMES)

# rename() ignores labels it cannot find, so verify the result explicitly.
missing_columns = sorted(set(COLUMN_RENAMES.values()) - set(df.columns))
if missing_columns:
    raise KeyError("Expected columns missing after renaming: {}".format(missing_columns))

#### Translate the three categories in *Unsuccesful_attempt* to English

In [5]:
# Translate the German attempt flags to English.  The result is assigned back
# to the column: replace(..., inplace=True) on df[...] acts on an intermediate
# Series and, with pandas Copy-on-Write, leaves df itself unchanged.
df["Unsuccesful_attempt"] = df["Unsuccesful_attempt"].replace(
    {'Nein': 'No', 'Ja': 'Yes', 'Unbekannt': 'Unknown'}
)

#### Translate the *Bike_type* categories and merge the two categories *Fahrrad* and *diverse Fahrräder* into one category, *Bike*

In [6]:
# Translate the bike types to English; "Fahrrad" and "diverse Fahrräder" both
# become "Bike".  The result is assigned back to the column because
# replace(..., inplace=True) on df[...] acts on an intermediate Series and,
# with pandas Copy-on-Write, leaves df itself unchanged.
df["Bike_type"] = df["Bike_type"].replace({
    "Herrenfahrrad": "Men",
    "Damenfahrrad": "Women",
    "Fahrrad": "Bike",
    "Mountainbike": "Mountain",
    "Kinderfahrrad": "Child",
    "diverse Fahrräder": "Bike",
    "Rennrad": "Racing",
    "Lastenfahrrad": "Cargo",
})

#### Change column *Crime_type* values

In [7]:
# Translate the crime types to English.  The result is assigned back to the
# column because replace(..., inplace=True) on df[...] acts on an intermediate
# Series and, with pandas Copy-on-Write, leaves df itself unchanged.
df["Crime_type"] = df["Crime_type"].replace({
    "Fahrraddiebstahl": "Theft",
    "Keller- und Bodeneinbruch": "Break-in",
})

#### Change the categories of *Crime_reason*
#### Category *aggravated theft* = theft with break-in or theft with robbery (the aggressor has to use force)

In [8]:
# Translate the recorded crime reasons to English.  The result is assigned
# back to the column because replace(..., inplace=True) on df[...] acts on an
# intermediate Series and, with pandas Copy-on-Write, leaves df itself unchanged.
df["Crime_reason"] = df["Crime_reason"].replace({
    "Sonstiger schwerer Diebstahl von Fahrrädern": "Aggravated theft",
    "Einfacher Diebstahl von Fahrrädern": "Theft",
    "Sonstiger schwerer Diebstahl in/aus Keller/Boden von Fahrrädern": "Aggravated theft from basement/floor",
    "Einfacher Diebstahl aus Keller/Boden von Fahrrädern": "Theft from basement/floor",
})

#### Changing column *Record_date* type from object to datetime format

In [9]:
# Parse the day-first date strings in Record_date into datetime values.
df = df.assign(Record_date=pd.to_datetime(df['Record_date'], dayfirst=True))

#### Create support column *date2* for the splitting in the next step

In [10]:
# Helper column holding the parsed record date; appended as the last column.
df = df.assign(date2=pd.to_datetime(df['Record_date'], dayfirst=True))

#### Create three new columns with: day, month and year

In [11]:
# Split the helper column into day, month and year; the three columns are
# appended to the frame in this order.
df = df.assign(
    Record_day=df['date2'].dt.day,
    Record_month=df['date2'].dt.month,
    Record_year=df['date2'].dt.year,
)

#### Move the three new columns to the second, third and fourth positions in the DataFrame

In [12]:
# Relocate the three date-part columns to positions 1, 2 and 3, directly
# after the first column.  pop() removes the column before insert() re-adds it.
for position, column_name in enumerate(('Record_day', 'Record_month', 'Record_year'), start=1):
    df.insert(position, column_name, df.pop(column_name))

#### Changing *Stealing_date* type from object to datetime format

In [13]:
# Parse the day-first date strings in Stealing_date into datetime values.
df = df.assign(Stealing_date=pd.to_datetime(df['Stealing_date'], dayfirst=True))

#### Changing *Report_stealing_date* type from object to datetime format

In [14]:
# Parse the day-first date strings in Report_stealing_date into datetime values.
df = df.assign(Report_stealing_date=pd.to_datetime(df['Report_stealing_date'], dayfirst=True))

#### Parse the two columns *Report_stealing_hour* and *Stealing_hour* with the datetime hour format (`%H`) and keep the hour as an integer

In [15]:
# Run both hour columns through the '%H' parser and store the extracted hour.
for hour_column in ('Stealing_hour', 'Report_stealing_hour'):
    df[hour_column] = pd.to_datetime(df[hour_column], format='%H').dt.hour

#### Drop support column *date2* from DataFrame

In [16]:
# The helper column is no longer needed once the date parts exist.
df = df.drop(columns=['date2'])

#### Display new DataFrame

In [17]:
# Show the preprocessed frame as the cell output.
df

Unnamed: 0,Record_date,Record_day,Record_month,Record_year,Stealing_date,Stealing_hour,Report_stealing_date,Report_stealing_hour,Berlin_code_area,Bike_value,Unsuccesful_attempt,Bike_type,Crime_type,Crime_reason
0,2022-08-10,10,8,2022,2022-08-10,0,2022-08-10,0,3400619,300,No,Mountain,Theft,Aggravated theft
1,2022-08-10,10,8,2022,2022-08-09,15,2022-08-10,0,7601544,800,No,Men,Theft,Aggravated theft
2,2022-08-10,10,8,2022,2022-08-09,15,2022-08-10,6,5100211,212,No,Men,Theft,Aggravated theft
3,2022-08-10,10,8,2022,2022-08-08,20,2022-08-09,12,3601449,600,No,Women,Theft,Aggravated theft
4,2022-08-10,10,8,2022,2022-08-09,14,2022-08-09,14,3601450,499,No,Men,Theft,Theft
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32595,2021-01-02,2,1,2021,2021-01-01,12,2021-01-02,15,11300826,377,No,Women,Theft,Aggravated theft
32596,2021-01-01,1,1,2021,2021-01-01,5,2021-01-01,8,10400941,2527,No,Bike,Break-in,Aggravated theft from basement/floor
32597,2021-01-01,1,1,2021,2021-01-01,17,2021-01-01,18,8401241,888,No,Bike,Theft,Theft
32598,2021-01-01,1,1,2021,2021-01-01,20,2021-01-01,20,2100104,2800,No,Men,Theft,Aggravated theft


#### Load csv file with LOR (Lebensweltlich orientierte Räume (LOR) = Spatial Planning Unit for Berlin) 
#### The dataset was prepared in advance. File contain columns: 
- *Berlin_code_area*
- *Administrative_district*
- *District* 
- *Quarter*
- *Street_name* 

#### Load data about LOR (Lebensweltlich orientierte Räume (LOR) = Spatial Planning Unit)

In [18]:
# Address of the prepared LOR lookup table in the project repository.
url = r"https://raw.githubusercontent.com/MSI17819/Berlin_bike_theft_prediction/main/Berlin_LOR_code.csv"

# The file is read as semicolon-delimited text in ISO-8859-2.
df_region = pd.read_csv(filepath_or_buffer=url, delimiter=';', encoding="ISO-8859-2")

# Print column names, dtypes and non-null counts of the lookup table.
df_region.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 540 entries, 0 to 539
Data columns (total 5 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Berlin_code_area         540 non-null    int64 
 1   Administrative_district  540 non-null    object
 2   District                 540 non-null    object
 3   Quarter                  540 non-null    object
 4   Street_name              540 non-null    object
dtypes: int64(1), object(4)
memory usage: 21.2+ KB


#### Join the two DataFrames (*left join*): the prepared data and the LOR dataset

In [19]:
# Left join: every theft record is kept and gains the LOR columns that match
# its Berlin_code_area.
df_merge = df.merge(right=df_region, on='Berlin_code_area', how='left')

#### Inspect the DataFrame after merging the two datasets

In [20]:
# Show the merged frame as the cell output.
df_merge

Unnamed: 0,Record_date,Record_day,Record_month,Record_year,Stealing_date,Stealing_hour,Report_stealing_date,Report_stealing_hour,Berlin_code_area,Bike_value,Unsuccesful_attempt,Bike_type,Crime_type,Crime_reason,Administrative_district,District,Quarter,Street_name
0,2022-08-10,10,8,2022,2022-08-10,0,2022-08-10,0,3400619,300,No,Mountain,Theft,Aggravated theft,Pankow,Südliches Pankow,Schönholz/Wilhelmsruh/Rosenthal,Wilhelmsruh
1,2022-08-10,10,8,2022,2022-08-09,15,2022-08-10,0,7601544,800,No,Men,Theft,Aggravated theft,Tempelhof-Schöneberg,Marienfelde / Lichtenrade,Lichtenrade Süd,John-Locke-Straße
2,2022-08-10,10,8,2022,2022-08-09,15,2022-08-10,6,5100211,212,No,Men,Theft,Aggravated theft,Spandau,Spandau Mitte / Nord,Falkenhagener Feld,An der Kappe
3,2022-08-10,10,8,2022,2022-08-08,20,2022-08-09,12,3601449,600,No,Women,Theft,Aggravated theft,Pankow,Nördlicher Prenzlauer Berg,Prenzlauer Berg Ost,Michelangelostraße
4,2022-08-10,10,8,2022,2022-08-09,14,2022-08-09,14,3601450,499,No,Men,Theft,Theft,Pankow,Nördlicher Prenzlauer Berg,Prenzlauer Berg Ost,Volkspark Prenzlauer Berg
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32595,2021-01-02,2,1,2021,2021-01-01,12,2021-01-02,15,11300826,377,No,Women,Theft,Aggravated theft,Lichtenberg,Lichtenberg Nord,Frankfurter Allee Süd,Frankfurter Allee Süd
32596,2021-01-01,1,1,2021,2021-01-01,5,2021-01-01,8,10400941,2527,No,Bike,Break-in,Aggravated theft from basement/floor,Marzahn-Hellersdorf,Kaulsdorf/Mahlsdorf,Mahlsdorf,Pilgramer Straße
32597,2021-01-01,1,1,2021,2021-01-01,17,2021-01-01,18,8401241,888,No,Bike,Theft,Theft,Neukölln,Buckow Nord/Rudow,Rudow,Blumenviertel
32598,2021-01-01,1,1,2021,2021-01-01,20,2021-01-01,20,2100104,2800,No,Men,Theft,Aggravated theft,Friedrichshain-Kreuzberg,Kreuzberg Nord,Südliche Friedrichstadt,Moritzplatz


### Cleaning data

#### Duplicates inspection

In [21]:
# Count rows that repeat an earlier row in every column (True) versus all other rows (False).
df_merge.duplicated().value_counts()

False    32543
True        57
dtype: int64

#### Deleting duplicates

In [22]:
 df_merge = df_merge.drop_duplicates(keep='first')  # keep the first copy of each repeated row

#### DataFrame inspection after deleting duplicated rows

In [23]:
# Re-count duplicated rows; only False should remain after the previous step.
df_merge.duplicated().value_counts()

False    32543
dtype: int64

#### Inspect the DataFrame for missing values

In [24]:
# Number of missing values in each column.
df_merge.isna().sum()

Record_date                0
Record_day                 0
Record_month               0
Record_year                0
Stealing_date              0
Stealing_hour              0
Report_stealing_date       0
Report_stealing_hour       0
Berlin_code_area           0
Bike_value                 0
Unsuccesful_attempt        0
Bike_type                  0
Crime_type                 0
Crime_reason               0
Administrative_district    0
District                   0
Quarter                    0
Street_name                0
dtype: int64

#### Inspect the categories of *Unsuccesful_attempt*

In [25]:
# Number of records in each Unsuccesful_attempt category.
df_merge['Unsuccesful_attempt'].value_counts()

No         32425
Yes          112
Unknown        6
Name: Unsuccesful_attempt, dtype: int64

#### Keep only the records where the theft either succeeded or failed (*Unsuccesful_attempt == No* or *Unsuccesful_attempt == Yes*); exclude records with the *Unknown* category.

In [26]:
# Keep only the rows whose attempt flag is exactly 'No' or 'Yes'.
has_known_outcome = df_merge['Unsuccesful_attempt'].isin(['No', 'Yes'])
df_merge = df_merge[has_known_outcome]

#### Inspect the DataFrame after dropping records with the *Unknown* category

In [27]:
# Re-count the Unsuccesful_attempt categories after the filter above.
df_merge['Unsuccesful_attempt'].value_counts()

No     32425
Yes      112
Name: Unsuccesful_attempt, dtype: int64

#### Reset index and save changes

In [28]:
# Renumber the rows from 0 and discard the old row labels.
df_merge = df_merge.reset_index(drop=True)

#### Display DataFrame

In [29]:
# Show the cleaned, re-indexed frame as the cell output.
df_merge

Unnamed: 0,Record_date,Record_day,Record_month,Record_year,Stealing_date,Stealing_hour,Report_stealing_date,Report_stealing_hour,Berlin_code_area,Bike_value,Unsuccesful_attempt,Bike_type,Crime_type,Crime_reason,Administrative_district,District,Quarter,Street_name
0,2022-08-10,10,8,2022,2022-08-10,0,2022-08-10,0,3400619,300,No,Mountain,Theft,Aggravated theft,Pankow,Südliches Pankow,Schönholz/Wilhelmsruh/Rosenthal,Wilhelmsruh
1,2022-08-10,10,8,2022,2022-08-09,15,2022-08-10,0,7601544,800,No,Men,Theft,Aggravated theft,Tempelhof-Schöneberg,Marienfelde / Lichtenrade,Lichtenrade Süd,John-Locke-Straße
2,2022-08-10,10,8,2022,2022-08-09,15,2022-08-10,6,5100211,212,No,Men,Theft,Aggravated theft,Spandau,Spandau Mitte / Nord,Falkenhagener Feld,An der Kappe
3,2022-08-10,10,8,2022,2022-08-08,20,2022-08-09,12,3601449,600,No,Women,Theft,Aggravated theft,Pankow,Nördlicher Prenzlauer Berg,Prenzlauer Berg Ost,Michelangelostraße
4,2022-08-10,10,8,2022,2022-08-09,14,2022-08-09,14,3601450,499,No,Men,Theft,Theft,Pankow,Nördlicher Prenzlauer Berg,Prenzlauer Berg Ost,Volkspark Prenzlauer Berg
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32532,2021-01-02,2,1,2021,2021-01-01,12,2021-01-02,15,11300826,377,No,Women,Theft,Aggravated theft,Lichtenberg,Lichtenberg Nord,Frankfurter Allee Süd,Frankfurter Allee Süd
32533,2021-01-01,1,1,2021,2021-01-01,5,2021-01-01,8,10400941,2527,No,Bike,Break-in,Aggravated theft from basement/floor,Marzahn-Hellersdorf,Kaulsdorf/Mahlsdorf,Mahlsdorf,Pilgramer Straße
32534,2021-01-01,1,1,2021,2021-01-01,17,2021-01-01,18,8401241,888,No,Bike,Theft,Theft,Neukölln,Buckow Nord/Rudow,Rudow,Blumenviertel
32535,2021-01-01,1,1,2021,2021-01-01,20,2021-01-01,20,2100104,2800,No,Men,Theft,Aggravated theft,Friedrichshain-Kreuzberg,Kreuzberg Nord,Südliche Friedrichstadt,Moritzplatz


In [30]:
# Print column names, dtypes and non-null counts of the cleaned frame.
df_merge.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32537 entries, 0 to 32536
Data columns (total 18 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   Record_date              32537 non-null  datetime64[ns]
 1   Record_day               32537 non-null  int64         
 2   Record_month             32537 non-null  int64         
 3   Record_year              32537 non-null  int64         
 4   Stealing_date            32537 non-null  datetime64[ns]
 5   Stealing_hour            32537 non-null  int64         
 6   Report_stealing_date     32537 non-null  datetime64[ns]
 7   Report_stealing_hour     32537 non-null  int64         
 8   Berlin_code_area         32537 non-null  int64         
 9   Bike_value               32537 non-null  int64         
 10  Unsuccesful_attempt      32537 non-null  object        
 11  Bike_type                32537 non-null  object        
 12  Crime_type               32537 n

#### Export DataFrame to csv file for future prediction analysis

In [31]:
# Export the cleaned data.  The file goes to the current working directory by
# default instead of a machine-specific absolute path, so the notebook also
# runs on other computers; set BIKE_THEFT_OUTPUT_DIR to choose another folder.
output_dir = os.environ.get('BIKE_THEFT_OUTPUT_DIR', os.getcwd())
os.makedirs(output_dir, exist_ok=True)
output_path = os.path.join(output_dir, 'Berlin_bike_thief_file_for_analysis.csv')
df_merge.to_csv(output_path, sep=',', encoding='utf-8', header=True, index=False)