In [1]:
# Import Libraries 
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import csv
import time
import researchpy as rp

from sklearn import metrics, preprocessing, linear_model, model_selection
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import ExtraTreesClassifier
from scipy import stats
from tqdm import tqdm

In [2]:
#Lese CSV ein und caste Dates zu datetime64
def open_train():
    dataset = pd.read_csv('train.csv')
    dataset['Dates'] = pd.to_datetime(dataset['Dates']) 
    return dataset

def open_test():
    dataset = pd.read_csv('test.csv')
    dataset['Dates'] = pd.to_datetime(dataset['Dates']) 
    return dataset

In [46]:
def preprocessing_data():
    train_data = open_train()
    test_data = open_test()
    
    #Uppercase
    train_data['DayOfWeek'] = train_data['DayOfWeek'].str.upper()
    train_data['Address'] = train_data['Address'].str.upper()
    test_data['DayOfWeek'] = train_data['DayOfWeek'].str.upper()
    test_data['Address'] = train_data['Address'].str.upper()
    
    #Show amount of Data
    print('Columns Trainset: ')
    print(train_data.shape)
    print('Columns Testset: ')
    print(test_data.shape)
    
    #Drop Duplicates
    train_data = train_data.drop_duplicates()
    print('Columns after Duplicates Trainset: ')
    
    #Outliers Elimination
    train_data = train_data[(np.abs(stats.zscore(train_data['X'])) < 3)]
    train_data = train_data[(np.abs(stats.zscore(train_data['Y'])) < 3)]
    print('Columns after after Z-Score Analysis Trainset: ')
    print(train_data.shape)

    #Split Data
    for x in [train_data, test_data]:
        x['years'] = x['Dates'].dt.year
        x['months'] = x['Dates'].dt.month
        x['days'] = x['Dates'].dt.day
        x['hours'] = x['Dates'].dt.hour
        x['minutes'] = x['Dates'].dt.minute
        x['seconds'] = x['Dates'].dt.second
        
    #Drop Dates
    train_data = train_data.drop(['Dates'], axis = 1)
    test_data = test_data.drop(['Dates'], axis = 1)
    
    #Drop Attributes 
    train_data = train_data.drop(['Address','Resolution','Descript', 'minutes', 'seconds'], axis = 1)
    test_data = test_data.drop(['Address', 'minutes', 'seconds'], axis = 1)
    
    # Alle Leerzeichen löschen
    train_data.columns = train_data.columns.str.replace(' ', '')
    test_data.columns = test_data.columns.str.replace(' ', '')
    
     #Transformation
    train_data['Category'] = LabelEncoder().fit_transform(train_data.Category)
    feature_cols =['DayOfWeek', 'PdDistrict']
    
    #Transform Data into binary 
    train_data = pd.get_dummies(train_data, columns=feature_cols)
    test_data = pd.get_dummies(test_data, columns=feature_cols)
    
    # Missing Values Kontrolle
    ms1 = train_data.isnull().any()
    ms2 = train_data.isnull().any()
    print(ms1)
    print(ms2)
    
    #Make Feature Columns
    feature_cols = [x for x in train_data if x!='Category']
    
    X = train_data[feature_cols]
    y = train_data['Category']
    
    x_train, x_test,y_train, y_test = model_selection.train_test_split(X, y, train_size=.80)
    
    return x_train, x_test,y_train, y_test, test_data

In [47]:
x_train, x_test,y_train, y_test, test_data = preprocessing_data()

Columns Trainset: 
(878049, 9)
Columns Testset: 
(884262, 7)
Columns after Duplicates Trainset: 
Columns after after Z-Score Analysis Trainset: 
(875659, 9)
Category                 False
X                        False
Y                        False
years                    False
months                   False
days                     False
hours                    False
DayOfWeek_FRIDAY         False
DayOfWeek_MONDAY         False
DayOfWeek_SATURDAY       False
DayOfWeek_SUNDAY         False
DayOfWeek_THURSDAY       False
DayOfWeek_TUESDAY        False
DayOfWeek_WEDNESDAY      False
PdDistrict_BAYVIEW       False
PdDistrict_CENTRAL       False
PdDistrict_INGLESIDE     False
PdDistrict_MISSION       False
PdDistrict_NORTHERN      False
PdDistrict_PARK          False
PdDistrict_RICHMOND      False
PdDistrict_SOUTHERN      False
PdDistrict_TARAVAL       False
PdDistrict_TENDERLOIN    False
dtype: bool
Category                 False
X                        False
Y                        

In [48]:
x_train.to_pickle('x_train')
x_test.to_pickle('x_test')
y_train.to_pickle('y_train')
y_test.to_pickle('y_test')
test_data.to_pickle('test_data')

In [51]:
#Check Mappting of Transformation
train_data = open_train()
le = preprocessing.LabelEncoder()
train_data['Category'] = le.fit_transform(train_data.Category)
le.classes_

array(['ARSON', 'ASSAULT', 'BAD CHECKS', 'BRIBERY', 'BURGLARY',
       'DISORDERLY CONDUCT', 'DRIVING UNDER THE INFLUENCE',
       'DRUG/NARCOTIC', 'DRUNKENNESS', 'EMBEZZLEMENT', 'EXTORTION',
       'FAMILY OFFENSES', 'FORGERY/COUNTERFEITING', 'FRAUD', 'GAMBLING',
       'KIDNAPPING', 'LARCENY/THEFT', 'LIQUOR LAWS', 'LOITERING',
       'MISSING PERSON', 'NON-CRIMINAL', 'OTHER OFFENSES',
       'PORNOGRAPHY/OBSCENE MAT', 'PROSTITUTION', 'RECOVERED VEHICLE',
       'ROBBERY', 'RUNAWAY', 'SECONDARY CODES', 'SEX OFFENSES FORCIBLE',
       'SEX OFFENSES NON FORCIBLE', 'STOLEN PROPERTY', 'SUICIDE',
       'SUSPICIOUS OCC', 'TREA', 'TRESPASS', 'VANDALISM', 'VEHICLE THEFT',
       'WARRANTS', 'WEAPON LAWS'], dtype=object)