### Import packages

In [1]:
import numpy as np
import pandas as pd
import requests as requests
from io import StringIO
import csv
import os
from datetime import datetime
import pytz
from time import time
import matplotlib.pyplot as plt

### Preparing the raw dataset for analysis
#### All information about the dataset can be found in the [README.MD](https://github.com/MSI17819/Berlin_bike_theft_prediction/blob/main/README.md) file

#### Create empty dataframe

In [2]:
# Module-level accumulator: data_pulling_function() appends the downloaded rows to it via `global df`.
df = pd.DataFrame()

#### Data-pulling function: downloads the Berlin bike theft records (from 01/01/2021 up to today) from the source URL.
#### The data are updated every day, or at least once every two or three days, so each run of the function picks up the new records.

In [3]:
def data_pulling_function():
    """Download the Berlin bike theft CSV and append it to the global ``df``.

    The CSV is fetched over HTTP, parsed with pandas and concatenated onto the
    module-level ``df`` (which is rebound to the combined frame). A timestamp
    of the update, taken in the Europe/Warsaw time zone, is printed.

    Side effects:
        * rebinds the global ``df``;
        * sets the pandas option ``display.max_columns`` to ``None``.

    Returns:
        pandas.DataFrame: the global ``df`` after the new rows were added.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.Timeout: if the server does not answer within the timeout.
    """
    global df
    url = "https://www.internetwache-polizei-berlin.de/vdb/Fahrraddiebstahl.csv"
    # A timeout keeps the notebook from hanging forever on a stalled connection.
    response = requests.get(url, timeout=60)
    # Fail loudly on 4xx/5xx instead of parsing an HTML error page as CSV.
    response.raise_for_status()
    df_raw = pd.read_csv(StringIO(response.text))
    local_time = datetime.now(pytz.timezone('Europe/Warsaw'))
    # DataFrame.append was deprecated and later removed from pandas; pd.concat
    # is the supported equivalent. Skipping the concat while `df` is still
    # empty avoids mixing an empty frame into the result's dtype inference.
    df = df_raw if df.empty else pd.concat([df, df_raw])
    pd.set_option('display.max_columns', None)
    print('Data update - {}'.format(local_time.strftime("%Y-%m-%d %H:%M:%S")))
    return df

# Each call appends the freshly downloaded rows to the global `df`; re-running this cell
# without first re-running the `df = pd.DataFrame()` cell accumulates the download again.
data_pulling_function()

Data update - 2022-08-26 14:51:49


  df = df.append(df_raw)


Unnamed: 0,ANGELEGT_AM,TATZEIT_ANFANG_DATUM,TATZEIT_ANFANG_STUNDE,TATZEIT_ENDE_DATUM,TATZEIT_ENDE_STUNDE,LOR,SCHADENSHOEHE,VERSUCH,ART_DES_FAHRRADS,DELIKT,ERFASSUNGSGRUND
0,25.08.2022,25.08.2022,2,25.08.2022,6,2500834,621,Nein,Herrenfahrrad,Fahrraddiebstahl,Sonstiger schwerer Diebstahl von Fahrrädern
1,25.08.2022,24.08.2022,18,25.08.2022,8,7601237,140,Nein,Damenfahrrad,Fahrraddiebstahl,Sonstiger schwerer Diebstahl von Fahrrädern
2,25.08.2022,22.08.2022,20,24.08.2022,9,2500729,900,Nein,Damenfahrrad,Fahrraddiebstahl,Sonstiger schwerer Diebstahl von Fahrrädern
3,25.08.2022,25.08.2022,8,25.08.2022,14,6400737,500,Nein,Damenfahrrad,Fahrraddiebstahl,Sonstiger schwerer Diebstahl von Fahrrädern
4,25.08.2022,24.08.2022,17,24.08.2022,17,4300623,668,Nein,Damenfahrrad,Fahrraddiebstahl,Sonstiger schwerer Diebstahl von Fahrrädern
...,...,...,...,...,...,...,...,...,...,...,...
33726,02.01.2021,02.01.2021,13,02.01.2021,13,8401243,1700,Nein,Herrenfahrrad,Fahrraddiebstahl,Sonstiger schwerer Diebstahl von Fahrrädern
33727,01.01.2021,01.01.2021,20,01.01.2021,20,2100104,2800,Nein,Herrenfahrrad,Fahrraddiebstahl,Sonstiger schwerer Diebstahl von Fahrrädern
33728,01.01.2021,01.01.2021,17,01.01.2021,18,8401241,888,Nein,Fahrrad,Fahrraddiebstahl,Einfacher Diebstahl von Fahrrädern
33729,01.01.2021,01.01.2021,5,01.01.2021,8,10400941,2527,Nein,diverse Fahrräder,Keller- und Bodeneinbruch,Sonstiger schwerer Diebstahl in/aus Keller/Bod...


### Preprocessing raw data
- Change column names
- Merge data categories

#### Change column names

In [4]:
# Positional rename: the English labels must stay in the same order as the
# columns delivered by the source CSV (a length mismatch raises).
english_column_names = [
    "Record_date",
    "Stealing_date",
    "Stealing_hour",
    "Report_stealing_date",
    "Report_stealing_hour",
    "Berlin_code_area",
    "Bike_value",
    "Unsuccesful_attempt",
    "Bike_type",
    "Crime_type",
    "Crime_reason",
]
df = df.set_axis(english_column_names, axis="columns")

#### Translate the three categories of *Unsuccesful_attempt* into English

In [5]:
# Translate the German attempt flags into English.
# The result is assigned back to the column: calling replace(..., inplace=True)
# on a selected column is chained assignment and does not update `df` when
# pandas Copy-on-Write is active.
attempt_labels = {'Nein' : 'No', 'Ja' : 'Yes', 'Unbekannt' : 'Unknown'}
df["Unsuccesful_attempt"] = df["Unsuccesful_attempt"].replace(attempt_labels)

#### Translate the *Bike_type* categories and merge *Fahrrad* and *diverse Fahrräder* into the single category *Bike*

In [6]:
# Translate the German bike types into English; "Fahrrad" and
# "diverse Fahrräder" both collapse into the single category "Bike".
# The result is assigned back to the column: calling replace(..., inplace=True)
# on a selected column is chained assignment and does not update `df` when
# pandas Copy-on-Write is active.
bike_type_labels = {
    "Herrenfahrrad" : "Men",
    "Damenfahrrad" : "Women",
    "Fahrrad" : "Bike",
    "Mountainbike" : "Mountain",
    "Kinderfahrrad" : "Child",
    "diverse Fahrräder" : "Bike",
    "Rennrad" : "Racing",
    "Lastenfahrrad" : "Cargo",
}
df["Bike_type"] = df["Bike_type"].replace(bike_type_labels)

#### Change column *Crime_type* values

In [7]:
# Translate the two German offence types into English.
# The result is assigned back to the column: calling replace(..., inplace=True)
# on a selected column is chained assignment and does not update `df` when
# pandas Copy-on-Write is active.
crime_type_labels = {
    "Fahrraddiebstahl": "Theft",
    "Keller- und Bodeneinbruch": "Break-in",
}
df["Crime_type"] = df["Crime_type"].replace(crime_type_labels)

#### Change the categories of *Crime_reason*
#### Category *aggravated theft* = theft with break-in or theft with robbery (the aggressor has to use force)

In [8]:
# Translate the four German recording reasons into English.
# The result is assigned back to the column: calling replace(..., inplace=True)
# on a selected column is chained assignment and does not update `df` when
# pandas Copy-on-Write is active.
crime_reason_labels = {
    "Sonstiger schwerer Diebstahl von Fahrrädern": "Aggravated theft",
    "Einfacher Diebstahl von Fahrrädern": "Theft",
    "Sonstiger schwerer Diebstahl in/aus Keller/Boden von Fahrrädern": "Aggravated theft from basement/floor",
    "Einfacher Diebstahl aus Keller/Boden von Fahrrädern": "Theft from basement/floor",
}
df["Crime_reason"] = df["Crime_reason"].replace(crime_reason_labels)

#### Changing column *Record_date* type from object to datetime format

In [9]:
# The source delivers dates as DD.MM.YYYY. An explicit format is strict,
# whereas `dayfirst=True` is only a parsing preference and can fall back to
# another interpretation for individual values.
df['Record_date'] = pd.to_datetime(df['Record_date'], format='%d.%m.%Y')

#### Create support column *date2* for the splitting in the next step

In [10]:
# Helper column used to derive day/month/year. Record_date was already
# converted to datetime in the preceding cell, so this is effectively a copy.
df = df.assign(date2=pd.to_datetime(df['Record_date'], dayfirst=True))

#### Create three new columns with: day, month and year

In [11]:
# Derive the calendar components of the record date from the helper column.
record_dates = df['date2'].dt
for part in ('day', 'month', 'year'):
    df['Record_' + part] = getattr(record_dates, part)

#### Move the three new columns to the second, third and fourth positions in the DataFrame

In [12]:
# Relocate the derived columns so they sit directly after Record_date
# (positions 1, 2 and 3): each column is removed and re-inserted in turn.
for position, column in enumerate(('Record_day', 'Record_month', 'Record_year'), start=1):
    df.insert(position, column, df.pop(column))

#### Changing *Stealing_date* type from object to datetime format

In [13]:
# The source delivers dates as DD.MM.YYYY. An explicit format is strict,
# whereas `dayfirst=True` is only a parsing preference and can fall back to
# another interpretation for individual values.
df['Stealing_date'] = pd.to_datetime(df['Stealing_date'], format='%d.%m.%Y')

#### Changing *Report_stealing_date* type from object to datetime format

In [14]:
# The source delivers dates as DD.MM.YYYY. An explicit format is strict,
# whereas `dayfirst=True` is only a parsing preference and can fall back to
# another interpretation for individual values.
df['Report_stealing_date'] = pd.to_datetime(df['Report_stealing_date'], format='%d.%m.%Y')

#### Convert the two columns (*Report_stealing_hour*, *Stealing_hour*) to integer hour values by parsing them with the `%H` datetime format

In [15]:
# Parse each hour column with the %H format and keep only the hour component.
for hour_column in ('Stealing_hour', 'Report_stealing_hour'):
    parsed_hours = pd.to_datetime(df[hour_column], format='%H')
    df[hour_column] = parsed_hours.dt.hour

#### Drop support column *date2* from DataFrame

In [16]:
# The helper column has served its purpose once day/month/year exist.
df = df.drop(columns=['date2'])

#### Display new DataFrame

In [17]:
# Display the frame with the English column names and the parsed date/hour columns.
df

Unnamed: 0,Record_date,Record_day,Record_month,Record_year,Stealing_date,Stealing_hour,Report_stealing_date,Report_stealing_hour,Berlin_code_area,Bike_value,Unsuccesful_attempt,Bike_type,Crime_type,Crime_reason
0,2022-08-25,25,8,2022,2022-08-25,2,2022-08-25,6,2500834,621,No,Men,Theft,Aggravated theft
1,2022-08-25,25,8,2022,2022-08-24,18,2022-08-25,8,7601237,140,No,Women,Theft,Aggravated theft
2,2022-08-25,25,8,2022,2022-08-22,20,2022-08-24,9,2500729,900,No,Women,Theft,Aggravated theft
3,2022-08-25,25,8,2022,2022-08-25,8,2022-08-25,14,6400737,500,No,Women,Theft,Aggravated theft
4,2022-08-25,25,8,2022,2022-08-24,17,2022-08-24,17,4300623,668,No,Women,Theft,Aggravated theft
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33726,2021-01-02,2,1,2021,2021-01-02,13,2021-01-02,13,8401243,1700,No,Men,Theft,Aggravated theft
33727,2021-01-01,1,1,2021,2021-01-01,20,2021-01-01,20,2100104,2800,No,Men,Theft,Aggravated theft
33728,2021-01-01,1,1,2021,2021-01-01,17,2021-01-01,18,8401241,888,No,Bike,Theft,Theft
33729,2021-01-01,1,1,2021,2021-01-01,5,2021-01-01,8,10400941,2527,No,Bike,Break-in,Aggravated theft from basement/floor


#### Load csv file with LOR (Lebensweltlich orientierte Räume = Spatial Planning Units for Berlin)
#### The dataset was prepared in advance. The file contains the columns:
- *Berlin_code_area*
- *Administrative_district*
- *District*
- *Quarter*
- *Street_name*

#### Load data about LOR (Lebensweltlich orientierte Räume = Spatial Planning Units)

In [18]:
# Raw GitHub location of the pre-built LOR lookup table.
url = r"https://raw.githubusercontent.com/MSI17819/Berlin_bike_theft_prediction/main/Berlin_LOR_code.csv"

# The file is read as semicolon-separated text decoded as ISO-8859-2.
df_region = pd.read_csv(url, sep=';', encoding="ISO-8859-2")

# Summarise columns, dtypes and non-null counts of the lookup table.
df_region.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 540 entries, 0 to 539
Data columns (total 5 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Berlin_code_area         540 non-null    int64 
 1   Administrative_district  540 non-null    object
 2   District                 540 non-null    object
 3   Quarter                  540 non-null    object
 4   Street_name              540 non-null    object
dtypes: int64(1), object(4)
memory usage: 21.2+ KB


#### Join two data frame (*left join*) with prepared data and LOR dataset

In [19]:
# Left join: every theft record is kept and enriched with the LOR description
# of its Berlin_code_area; records without a matching code get NaN there.
# NOTE(review): consider validate='many_to_one' to guard against duplicated
# codes in df_region multiplying rows — confirm the lookup keys are unique first.
df_merge = df.merge(df_region, on='Berlin_code_area', how='left')

#### Inspection DataFrame after merging two datasets 

In [20]:
# Display the joined frame; the LOR description columns are appended on the right.
df_merge

Unnamed: 0,Record_date,Record_day,Record_month,Record_year,Stealing_date,Stealing_hour,Report_stealing_date,Report_stealing_hour,Berlin_code_area,Bike_value,Unsuccesful_attempt,Bike_type,Crime_type,Crime_reason,Administrative_district,District,Quarter,Street_name
0,2022-08-25,25,8,2022,2022-08-25,2,2022-08-25,6,2500834,621,No,Men,Theft,Aggravated theft,Friedrichshain-Kreuzberg,Friedrichshain Ost,Frankfurter Allee Süd FK,Revaler Straße
1,2022-08-25,25,8,2022,2022-08-24,18,2022-08-25,8,7601237,140,No,Women,Theft,Aggravated theft,Tempelhof-Schöneberg,Marienfelde / Lichtenrade,Marienfelde Nord,Kirchstraße
2,2022-08-25,25,8,2022,2022-08-22,20,2022-08-24,9,2500729,900,No,Women,Theft,Aggravated theft,Friedrichshain-Kreuzberg,Friedrichshain Ost,Frankfurter Allee Nord,Pettenkofer Straße
3,2022-08-25,25,8,2022,2022-08-25,8,2022-08-25,14,6400737,500,No,Women,Theft,Aggravated theft,Steglitz-Zehlendorf,Zehlendorf Nord/Wannsee,Zehlendorf Südwest,Nikolassee
4,2022-08-25,25,8,2022,2022-08-24,17,2022-08-24,17,4300623,668,No,Women,Theft,Aggravated theft,Charlottenburg-Wilmersdorf,Charlottenburg Zentrum,Otto-Suhr-Allee/Kantstraße,Karl-August-Platz
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33726,2021-01-02,2,1,2021,2021-01-02,13,2021-01-02,13,8401243,1700,No,Men,Theft,Aggravated theft,Neukölln,Buckow Nord/Rudow,Rudow,Alt-Rudow
33727,2021-01-01,1,1,2021,2021-01-01,20,2021-01-01,20,2100104,2800,No,Men,Theft,Aggravated theft,Friedrichshain-Kreuzberg,Kreuzberg Nord,Südliche Friedrichstadt,Moritzplatz
33728,2021-01-01,1,1,2021,2021-01-01,17,2021-01-01,18,8401241,888,No,Bike,Theft,Theft,Neukölln,Buckow Nord/Rudow,Rudow,Blumenviertel
33729,2021-01-01,1,1,2021,2021-01-01,5,2021-01-01,8,10400941,2527,No,Bike,Break-in,Aggravated theft from basement/floor,Marzahn-Hellersdorf,Kaulsdorf/Mahlsdorf,Mahlsdorf,Pilgramer Straße


### Cleaning data

#### Duplicates inspection

In [21]:
# Count rows that exactly repeat an earlier row (True) versus first occurrences (False).
df_merge.duplicated().value_counts()

False    33670
True        61
dtype: int64

#### Deleting duplicates

In [22]:
 # Keep the first occurrence of every fully identical row and discard the repeats.
 # NOTE(review): the leading space on this cell is tolerated by IPython but is an
 # IndentationError once the notebook is exported to a plain .py script.
 df_merge = df_merge.drop_duplicates(keep='first')

#### DataFrame inspection after deleting duplicated rows

In [23]:
# Re-run the duplicate check to confirm that no repeated rows remain.
df_merge.duplicated().value_counts()

False    33670
dtype: int64

#### DataFrame inspection about missing values

In [24]:
# Per-column count of missing values; the LOR columns are NaN wherever the
# left join found no matching Berlin_code_area.
df_merge.isna().sum()

Record_date                0
Record_day                 0
Record_month               0
Record_year                0
Stealing_date              0
Stealing_hour              0
Report_stealing_date       0
Report_stealing_hour       0
Berlin_code_area           0
Bike_value                 0
Unsuccesful_attempt        0
Bike_type                  0
Crime_type                 0
Crime_reason               0
Administrative_district    1
District                   1
Quarter                    1
Street_name                1
dtype: int64

#### Inspection of the categories in *Unsuccesful_attempt*

In [25]:
# Frequency of each attempt category before the filtering step.
df_merge['Unsuccesful_attempt'].value_counts()

No         33546
Yes          114
Unknown       10
Name: Unsuccesful_attempt, dtype: int64

#### Keep only records where the theft was either completed or only attempted (*Unsuccesful_attempt == No* or *Unsuccesful_attempt == Yes*); exclude records with the *Unknown* category.

In [26]:
# Keep only rows whose attempt flag is 'No' or 'Yes'; every other value is dropped.
known_attempt = df_merge['Unsuccesful_attempt'].isin(['No', 'Yes'])
df_merge = df_merge[known_attempt]

#### Inspection of the DataFrame after dropping records with the *Unknown* category

In [27]:
# Confirm that only the 'No' and 'Yes' categories remain after the filter.
df_merge['Unsuccesful_attempt'].value_counts()

No     33546
Yes      114
Name: Unsuccesful_attempt, dtype: int64

#### Reset index and save changes

In [28]:
# Renumber the rows 0..n-1 after duplicates and 'Unknown' records were removed;
# the old index is discarded rather than kept as a column.
df_merge = df_merge.reset_index(drop=True)

#### Display DataFrame

In [29]:
# Display the cleaned and re-indexed frame.
df_merge

Unnamed: 0,Record_date,Record_day,Record_month,Record_year,Stealing_date,Stealing_hour,Report_stealing_date,Report_stealing_hour,Berlin_code_area,Bike_value,Unsuccesful_attempt,Bike_type,Crime_type,Crime_reason,Administrative_district,District,Quarter,Street_name
0,2022-08-25,25,8,2022,2022-08-25,2,2022-08-25,6,2500834,621,No,Men,Theft,Aggravated theft,Friedrichshain-Kreuzberg,Friedrichshain Ost,Frankfurter Allee Süd FK,Revaler Straße
1,2022-08-25,25,8,2022,2022-08-24,18,2022-08-25,8,7601237,140,No,Women,Theft,Aggravated theft,Tempelhof-Schöneberg,Marienfelde / Lichtenrade,Marienfelde Nord,Kirchstraße
2,2022-08-25,25,8,2022,2022-08-22,20,2022-08-24,9,2500729,900,No,Women,Theft,Aggravated theft,Friedrichshain-Kreuzberg,Friedrichshain Ost,Frankfurter Allee Nord,Pettenkofer Straße
3,2022-08-25,25,8,2022,2022-08-25,8,2022-08-25,14,6400737,500,No,Women,Theft,Aggravated theft,Steglitz-Zehlendorf,Zehlendorf Nord/Wannsee,Zehlendorf Südwest,Nikolassee
4,2022-08-25,25,8,2022,2022-08-24,17,2022-08-24,17,4300623,668,No,Women,Theft,Aggravated theft,Charlottenburg-Wilmersdorf,Charlottenburg Zentrum,Otto-Suhr-Allee/Kantstraße,Karl-August-Platz
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33655,2021-01-02,2,1,2021,2021-01-02,13,2021-01-02,13,8401243,1700,No,Men,Theft,Aggravated theft,Neukölln,Buckow Nord/Rudow,Rudow,Alt-Rudow
33656,2021-01-01,1,1,2021,2021-01-01,20,2021-01-01,20,2100104,2800,No,Men,Theft,Aggravated theft,Friedrichshain-Kreuzberg,Kreuzberg Nord,Südliche Friedrichstadt,Moritzplatz
33657,2021-01-01,1,1,2021,2021-01-01,17,2021-01-01,18,8401241,888,No,Bike,Theft,Theft,Neukölln,Buckow Nord/Rudow,Rudow,Blumenviertel
33658,2021-01-01,1,1,2021,2021-01-01,5,2021-01-01,8,10400941,2527,No,Bike,Break-in,Aggravated theft from basement/floor,Marzahn-Hellersdorf,Kaulsdorf/Mahlsdorf,Mahlsdorf,Pilgramer Straße


In [30]:
# Summarise column dtypes and non-null counts of the final frame before export.
df_merge.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33660 entries, 0 to 33659
Data columns (total 18 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   Record_date              33660 non-null  datetime64[ns]
 1   Record_day               33660 non-null  int64         
 2   Record_month             33660 non-null  int64         
 3   Record_year              33660 non-null  int64         
 4   Stealing_date            33660 non-null  datetime64[ns]
 5   Stealing_hour            33660 non-null  int64         
 6   Report_stealing_date     33660 non-null  datetime64[ns]
 7   Report_stealing_hour     33660 non-null  int64         
 8   Berlin_code_area         33660 non-null  int64         
 9   Bike_value               33660 non-null  int64         
 10  Unsuccesful_attempt      33660 non-null  object        
 11  Bike_type                33660 non-null  object        
 12  Crime_type               33660 n

#### Export DataFrame to csv file for future prediction analysis

In [31]:
# Write the cleaned dataset into the current working directory instead of a
# machine-specific absolute path, so the export works on any checkout of the
# project and does not embed a local user directory in the notebook.
output_csv_path = os.path.join(os.getcwd(), 'Berlin_bike_thief_file_for_analysis.csv')
df_merge.to_csv(output_csv_path, sep=',', encoding='utf-8', header=True, index=False)