In [None]:
# IPython magics kept for reference; every line is commented out, so this cell runs nothing.
# %lsmagic
# %env
# %time
# %%timeit

### Imports & settings

In [None]:
%matplotlib inline
import sklearn.cluster as cluster
import matplotlib.pyplot as plt
import numpy as np
import re
import pandas as pd
import math
from sklearn.neighbors import KNeighborsClassifier as KNNCLF
from sklearn.neighbors import KNeighborsRegressor as KNNREG
from sklearn.preprocessing import Normalizer as normalizer
from sklearn.model_selection import cross_val_score as cross_val
from sklearn.model_selection import train_test_split as tt_split
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
from sklearn.metrics import confusion_matrix
from sklearn.neural_network import MLPClassifier
from scipy import stats
from sklearn.feature_selection import RFE

### Data import and NaN removal

In [None]:
# Load the raw crime records.
Crimes = pd.read_csv('DataScience/crime.csv')

# Keep only rows whose district and UCR code are finite numbers (drops NaN/inf).
has_district = np.isfinite(Crimes['Police_Districts'])
has_ucr_code = np.isfinite(Crimes['UCR_General'])
Crimes = Crimes[has_district & has_ucr_code]

# `DataFrame.as_matrix()` was removed in pandas 1.0; `.values` yields the same
# array and works on both old and current pandas releases.
CrimesData = Crimes.values

# Six columns selected by position. Later cells read them as:
#   row[0] -> compared against the district number, row[1] -> timestamp string,
#   row[2] -> hour, row[3] -> prediction target, row[4] / row[5] -> lon / lat.
datainit = CrimesData[:, [10, 2, 5, 8, 12, 13]]

In [None]:
# Preview the first rows only instead of rendering the whole frame.
Crimes.head()

### Filter for a certain district

In [None]:
# Having : District, Date & Crime type
# Collect the rows (restricted to five positional columns) whose first
# selected column equals the chosen district number.
District = 1
district_view = CrimesData[:, [10, 2, 8, 12, 13]]
FilterData = [row for row in district_view if row[0] == District]


### Sort by date

In [None]:
from datetime import datetime


def _timestamp_of(row):
    """Parse the 'YYYY-MM-DD HH:MM:SS' string stored in row[1]."""
    return datetime.strptime(row[1], "%Y-%m-%d %H:%M:%S")


# Rows of `datainit` in ascending timestamp order (a plain Python list of rows).
data = sorted(datainit, key=_timestamp_of)

In [None]:
# NOTE(review): this rebinds `data` to the unsorted `datainit`, discarding the
# date-sorted list built by the preceding "Sort by date" cell. The positional
# train/test split further down then follows the row order of `datainit` —
# confirm that this is intended.
data = datainit

### Make the date an int

In [None]:
# Expand every record into numeric fields:
#   [row[0], year, month, day, hour, minute, lon, lat, row[3]]
# Year, month, day and minute are sliced out of the timestamp string in row[1]
# after removing its separators ("YYYY-MM-DD HH:MM:SS" -> "YYYYMMDDHHMMSS");
# the hour is read from row[2].
newData = []
for item in data:
    date = re.sub(r'[ \-:]', '', item[1])
    year, month, day = date[:4], date[4:6], date[6:8]
    minute = date[10:12]
    # The original also wrote int(item[2]) back into `item`, which is a row of
    # the source array; nothing used that value, so the input is left untouched.
    hour = item[2]
    lon, lat = item[4], item[5]
    newData.append([item[0], int(year), int(month), int(day), int(hour), int(minute), lon, lat, item[3]])
data = np.array(newData)

In [None]:
# (rows, columns) of the expanded record array.
data.shape

### Split - not random (first 80% of rows train, last 20% test)

In [None]:
# Positional split: the first 80% of the rows (in their current order) train the
# models, the remaining 20% are held out. No shuffling is applied.
TRAIN_FRACTION = 0.80
# Feature columns of `data`: month, day, hour, minute, lon, lat.
# Column 0 and column 1 (the year) are left out; column 8 is the label.
FEATURE_COLUMNS = [2, 3, 4, 5, 6, 7]
LABEL_COLUMN = 8

# NOTE(review): scikit-learn's Normalizer rescales each *row* (sample) to unit
# norm; it is not a per-feature scaler — confirm this is the intended scaling.
norm = normalizer()

dLength = np.shape(data)[0]
trainSize = int(dLength * TRAIN_FRACTION)

# Fit on the training rows only; the held-out rows are merely transformed.
X_train = norm.fit_transform(data[:trainSize, FEATURE_COLUMNS])
y_train = data[:trainSize, LABEL_COLUMN]
X_test = norm.transform(data[trainSize:, FEATURE_COLUMNS])
y_test = data[trainSize:, LABEL_COLUMN]
len(y_train)

In [None]:
# (training rows, feature columns)
X_train.shape

In [None]:
# Recursive feature elimination wrapped around a small MLP (hidden layers of 5 and 3 units).
# NOTE(review): RFE ranks features through the estimator's `coef_` or
# `feature_importances_` attribute, whereas MLPClassifier exposes `coefs_`;
# verify that this cell actually runs with the installed scikit-learn version.
estimator = MLPClassifier(alpha=1e-5, activation='logistic', hidden_layer_sizes=(5,3), learning_rate_init=0.01, early_stopping=True)
selector = RFE(estimator).fit(X_train, y_train)
# Display the rank assigned to each feature column.
selector.ranking_

In [None]:
# (training rows, feature columns)
X_train.shape

### PCA

In [None]:
# Fit a 6-component PCA on the training features and tabulate the share of
# variance explained by each component, individually and cumulatively.
pca = PCA(n_components=6)
pca.fit(X_train)
transformed_data = pca.transform(X_train)

variance = pca.explained_variance_ratio_
cumsum_variance = variance.cumsum()

# 1-based component numbers: table index here, x axis of the plot below.
axis = list(range(1, len(variance) + 1))
df = pd.DataFrame(
    {'% of Variance': variance, 'Cumsum % of Variance': cumsum_variance},
    index=axis,
    columns=['% of Variance', 'Cumsum % of Variance'],
)

df

In [None]:
# Cumulative explained variance against the number of components; also saved to disk.
fig, ax = plt.subplots()
ax.plot(axis, cumsum_variance)
ax.set_title("Describes variance per component")
ax.set_xlabel('number of components')
ax.set_ylabel('Described Variance')
fig.savefig('PCA_plot.png')
plt.show()

## KNN Classification

### Find best k

In [None]:
# Number of held-out rows.
y_test.shape[0]

In [None]:
# Sweep k over 40..59 and record, for each k, the accuracy of a
# distance-weighted KNN classifier on the held-out rows.
# Rule of thumb: k ~ sqrt(number of samples); sqrt(37000) ~ 192.
# Earlier scratch notes listed 242 and "52!!"; 52 is the k used further down.
# NOTE(review): k is tuned on the same held-out rows that are later used to
# report the final score — consider a separate validation split.
scores = []
k = range(40, 60)

# The label lists do not depend on k, so build them once rather than on every
# iteration (this also avoids shadowing the loop variable inside a comprehension).
y_real = list(y_test)
y = list(y_train)

for n_neighbors in k:
    knn = KNNCLF(n_neighbors=n_neighbors, n_jobs=-1, weights='distance')
    y_ = knn.fit(X_train, y)
    scores.append(y_.score(X_test, y_real))

In [None]:
# Held-out score for every k tried above; also saved to disk.
fig, ax = plt.subplots()
ax.plot(k, scores)
ax.set_title("Test for optimal K-value")
ax.set_xlabel('K-values')
# A scikit-learn classifier's score() is the mean accuracy as a fraction in
# [0, 1], not a precision in percent, so label the axis accordingly.
ax.set_ylabel('Accuracy')
fig.savefig('bestK.png')
plt.show()

### Init KNN with best k

In [None]:
# Fit the distance-weighted KNN classifier with k = 52 on the training rows
# and keep the held-out rows ready for prediction.
X, y = X_train, list(y_train)
X_predict, y_real = X_test, list(y_test)
knn = KNNCLF(weights='distance', n_neighbors=52, n_jobs=-1)
y_ = knn.fit(X, y)

### Prediction

In [None]:
# Predict a label for every held-out row with the fitted classifier and display the result.
y_predict = y_.predict(X_predict)
y_predict

### Probability of the predicted value to be true

In [None]:
# Per-row class probabilities; keep only the highest probability of each row
# (the one belonging to the predicted class) and display their average.
y_proba_predict = y_.predict_proba(X_predict)
y_proba = list(map(max, y_proba_predict))
np.mean(y_proba)

### Score of how good the model is

In [None]:
# Score of the fitted classifier on the held-out rows against their real labels.
y_.score(X_predict, y_real)

### Confusion matrix

In [None]:
# Cross-tabulate real labels against predicted labels for the held-out rows.
confusion_matrix(y_real, y_predict)

### Heatmap

In [None]:
# 2-D histogram of (real, predicted) label pairs on a 50 x 50 grid.
heatmap, xedges, yedges = np.histogram2d(y_real, y_predict, bins=50)
extent = [xedges[0], xedges[-1], yedges[0], yedges[-1]]

fig = plt.figure()
plt.title("Real vs Predicted values")
plt.xlabel('Real value(UCR Code)')
plt.ylabel('Predicted value(UCR Code)')
# histogram2d bins its first argument (the real values) along axis 0, which
# imshow would draw vertically and top-down. Transpose and use origin='lower'
# so real values run along x and predicted values along y, matching `extent`
# and the axis labels above.
plt.imshow(heatmap.T, extent=extent, origin='lower')
fig.savefig('HeatMap.png')
plt.show()

### Scatter plot showing the real values and the predicted values

In [None]:
# Real and predicted labels of every held-out row, drawn against the row's
# position in the test set. The original passed the whole 6-column feature
# matrix as x, which does not match the one-label-per-row y values and makes
# scatter() fail; a 1-D sample index gives both series a common x axis.
sample_index = np.arange(len(y_real))

plt.subplot(111)
plt.title("Real & Predicted values")
plt.xlabel('Test sample index')
plt.ylabel('UCR Code')
plt.scatter(sample_index, y_real, marker='.', c='r', s=200, label='real')
plt.scatter(sample_index, y_predict, marker=',', c='g', s=100, label='prediction')
plt.axis('tight')
plt.legend()

plt.show()

### 10-fold Cross Validation KNN

In [None]:
# 10-fold cross-validation of `knn` (as configured in an earlier cell) on the training rows.
knnscores = cross_val(knn, X_train, list(y_train), cv=10)
knnscoreMean = knnscores.mean()

# Central 95% interval of a normal distribution that has the fold scores' mean
# and standard deviation.
knnciRaw = stats.norm.interval(0.95, loc=knnscoreMean, scale=knnscores.std())
ci_low, ci_high = knnciRaw
# NOTE(review): `knnci` is the full width (upper - lower) of that interval, yet
# later cells print it after "+-" and pass it as `yerr`, both of which usually
# expect a half-width — confirm which is intended.
knnci = ci_high - ci_low

KNN with k = 52, using all features except the year, yields 26.4% +- 4.1%

In [None]:
# Print the mean cross-validation score and `knnci`, each with three decimals.
print('%0.003f +- %0.003f' % (knnscoreMean,knnci))

In [None]:
# Mean of the per-fold cross-validation scores.
knnscores.mean()

### Silhouette analysis

In [None]:
# Mean silhouette score of K-Means for several candidate cluster counts.
range_n_clusters = [13, 14, 15, 16, 17]
silhouette_avg = []

# Columns 1 and 2 of `data` hold the year and the month.
# NOTE(review): confirm these are the intended clustering features.
cluster_features = data[:, [1, 2]]

for n_clusters in range_n_clusters:
    # KMeans' `n_jobs` argument was deprecated in scikit-learn 0.23 and removed
    # in 1.0, so it is omitted to keep the cell runnable on current releases.
    clusterer = KMeans(n_clusters=n_clusters)
    cluster_labels = clusterer.fit_predict(cluster_features)
    silhouette_avg.append(silhouette_score(cluster_features, cluster_labels))

plt.plot(range_n_clusters, silhouette_avg)
plt.xlabel('number of clusters')
plt.ylabel('mean silhouette score')
plt.show()

### ANN

In [None]:
# (training rows, feature columns)
X_train.shape

28,37% (5,3)

In [None]:
# Train one single-hidden-layer MLP per width 1..49 and record each model's
# score on the held-out rows. `clf` is left bound to the last model trained.
mlp_params = dict(alpha=1e-5, activation='logistic', learning_rate_init=0.01, early_stopping=True)

scores = []
for n in range(1, 50):
    clf = MLPClassifier(hidden_layer_sizes=n, **mlp_params)
    scores.append(clf.fit(X_train, y_train).score(X_test, y_test))
scores

In [None]:
# Layer count reported by the most recently fitted `clf`.
clf.n_layers_

In [None]:
# Hidden-layer configuration that `clf` was constructed with.
clf.hidden_layer_sizes

### 10-fold cross validation ANN

In [None]:
# 10-fold cross-validation of an MLP with logistic activation (other settings
# left at their defaults) on the training rows.
clf = MLPClassifier(activation='logistic', alpha=1e-5)
annscores = cross_val(clf, X_train, list(y_train), cv=10)
annscoreMean = annscores.mean()

# Central 95% interval of a normal distribution that has the fold scores' mean
# and standard deviation.
annciRaw = stats.norm.interval(0.95, loc=annscoreMean, scale=annscores.std())
ci_low, ci_high = annciRaw
# NOTE(review): `annci` is the full width (upper - lower) of that interval, yet
# later cells print it after "+-" and pass it as `yerr`, both of which usually
# expect a half-width — confirm which is intended.
annci = ci_high - ci_low

In [None]:
# This cell used to copy `ciRaw`, `ci` and `scoreMean` into the ANN summary
# variables, but no cell in this notebook defines those names: on a fresh
# kernel it raised NameError, and on a stale kernel it overwrote the ANN
# results with leftover values. The ANN cross-validation cell already assigns
# all three variables, so simply display them.
annscoreMean, annciRaw, annci

In [None]:
# Print the mean cross-validation score and `annci`, each with three decimals.
print('%0.003f +- %0.003f' % (annscoreMean,annci))

ANN using all features except the year yields 27.7% +- 3.6%

### Bar plot with error bars

In [None]:
# Side-by-side bars of the mean 10-fold cross-validation score of the KNN and
# ANN models, with `knnci` / `annci` drawn as error bars.

# `width` was used below but never defined anywhere in the notebook, so the
# cell raised NameError on a fresh kernel. The original value is not recorded;
# 0.35 keeps the two bars (at x = 1 and x = 2) clearly separated.
width = 0.35

fig, ax = plt.subplots()
knnBar = ax.bar(1, knnscores.mean(), width, color='r', yerr=knnci)
annBar = ax.bar(2, annscores.mean(), width, color='y', yerr=annci)

ax.set_ylabel('True predictions (%)')
ax.set_title('10-fold cross validation for knn and ann')
ax.set_xlabel('knn(left) and ann(right)')
# Name the two bars on the axis instead of showing bare numeric ticks.
ax.set_xticks([1, 2])
ax.set_xticklabels(['knn', 'ann'])

ax.legend((knnBar[0], annBar[0]), ('knn', 'ann'))

fig.savefig('knn and ann bar plot with errors (k = 52) year not in features vector')

plt.show()