# Data Preprocessing and Visualization

## Importing the libraries and the dataset

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import warnings 
warnings.filterwarnings('ignore')

In [None]:
# Read the semicolon-delimited crime table and preview its first rows.
dataset = pd.read_csv('crimes.csv', delimiter=';')
dataset.head(n=5)

## Statistical description

In [None]:
# Dimensions of the loaded table as a (rows, columns) tuple.
dataset.shape

In [None]:
# Summary statistics produced by pandas' default describe() call.
dataset.describe()

## Boxplot

In [None]:
import seaborn as sns
# Apply seaborn's theme with the 'whitegrid' axes style to the plots that follow.
sns.set(style='whitegrid')

In [None]:
# Boxplots of the dataset passed in wide form (one box per plotted column),
# then render the figure.
sns.boxplot(data = dataset)
plt.show()

## Scatterplot (quantitative variables only)

In [None]:
# Switch the seaborn axes style to 'ticks' for the plots that follow.
sns.set(style='ticks')

In [None]:
# Grid of pairwise plots between the dataset's variables.
# NOTE(review): seaborn is expected to keep only numeric columns here - confirm.
sns.pairplot(dataset)
plt.show()

## Correlation 

In [None]:
# Pearson correlation matrix restricted to the numeric columns.
# Since pandas 2.0, DataFrame.corr no longer drops non-numeric columns by
# itself (numeric_only defaults to False), so a text column would make it
# raise. Filtering with select_dtypes works on both old and new pandas.
numeric_columns = dataset.select_dtypes(include='number')
correlations = numeric_columns.corr(method='pearson')

In [None]:
# Annotated heatmap of the correlation matrix on a square 10x10 inch canvas.
f, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(data=correlations, annot=True, ax=ax)
plt.show()

## Independent variables and labels

In [None]:
# Column 0 is kept aside as the row labels used to annotate the plots;
# columns 1 through 7 form the feature matrix fed to scaling/PCA/clustering.
labels = dataset.iloc[:, 0].values
X = dataset.iloc[:, 1:8].values
print(labels)

## Principal Component Analysis and Standardization

In [None]:
# Image is reused by the later illustration cells; `display` is imported but
# not called in the cells visible here.
from IPython.display import display, Image
# Embed the static illustration stored in pca.png as this cell's output.
Image(filename='pca.png')

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [None]:
# Standardize the features first, then project the standardized data with PCA.
# PCA() is created without n_components, so no component is discarded here.
scaler = StandardScaler().fit(X)
X_scaler = scaler.transform(X)

pca = PCA()
X_pca = pca.fit_transform(X_scaler)

In [None]:
# Fraction of the total variance explained by each principal component.
print(pca.explained_variance_ratio_)

In [None]:
# Variance share retained by the first two components (the ones plotted in 2-D).
print(pca.explained_variance_ratio_[:2].sum())

In [None]:
# Shape of the PCA-projected data as (rows, components).
X_pca.shape

## Visualization 

In [None]:
# Scatter the observations on the first two principal components and tag
# every point with its row label.
pc1, pc2 = X_pca[:, 0], X_pca[:, 1]
plt.scatter(pc1, pc2)
for name, px, py in zip(labels, pc1, pc2):
    plt.annotate(name, xy=(px, py))

plt.xlabel('PCA_1')
plt.ylabel('PCA_2')
plt.show()

# KMeans Clustering

## KMeans Intuition

In [None]:
# Show kmeans1.png, image 1 of the K-Means intuition series.
Image(filename='kmeans1.png')

In [None]:
# Show kmeans2.png, image 2 of the K-Means intuition series.
Image(filename='kmeans2.png')

In [None]:
# Show kmeans3.png, image 3 of the K-Means intuition series.
Image(filename='kmeans3.png')

In [None]:
# Show kmeans4.png, image 4 of the K-Means intuition series.
Image(filename='kmeans4.png')

In [None]:
# Show kmeans5.png, image 5 of the K-Means intuition series.
Image(filename='kmeans5.png')

In [None]:
# Show kmeans6.png, image 6 of the K-Means intuition series.
Image(filename='kmeans6.png')

In [None]:
# Show kmeans7.png, image 7 of the K-Means intuition series.
Image(filename='kmeans7.png')

## Make KMeans

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Elbow method: fit K-Means for k = 1..10 and record the within-cluster sum
# of squares (inertia) of each fit.
# NOTE(review): the fits use the raw matrix X, not the standardized X_scaler -
# confirm this is intended.
cluster_counts = range(1, 11)
wcss = []
for k in cluster_counts:
    kmeans = KMeans(n_clusters=k, init='k-means++', random_state=0).fit(X)
    wcss.append(kmeans.inertia_)

plt.plot(cluster_counts, wcss, 'bx-')
plt.title('The Elbow method')
plt.xlabel('Number of clusters')
plt.ylabel('wcss')
plt.show()

In [None]:
from kneed import KneeLocator
# Locate the elbow of the decreasing, convex WCSS curve automatically.
k = KneeLocator(
    x=range(1, 11),
    y=wcss,
    curve="convex",
    direction="decreasing",
)
k.elbow

In [None]:
# Plot the curve analysed by KneeLocator together with the knee it found.
k.plot_knee()
plt.show()

In [None]:
# Four-cluster K-Means on the raw (unscaled) feature matrix.
kmeans = KMeans(n_clusters=4, init='k-means++', random_state=0).fit(X)
y_kmeans = kmeans.labels_

In [None]:
# Cluster index assigned to each row by the K-Means fit on the raw features.
y_kmeans

## Clusters Visualization

In [None]:
# Plot the raw-feature K-Means clusters in the plane of the first two
# principal components, one colour per cluster, with every point labelled.
cluster_colours = ('red', 'blue', 'green', 'yellow')
for cluster_id, colour in enumerate(cluster_colours):
    members = y_kmeans == cluster_id
    plt.scatter(X_pca[members, 0], X_pca[members, 1],
                c=colour, label=f'Cluster {cluster_id}')

for name, px, py in zip(labels, X_pca[:, 0], X_pca[:, 1]):
    plt.annotate(name, xy=(px, py))

plt.xlabel('PCA1')
plt.ylabel('PCA2')
plt.title('Clustering')
plt.legend()
plt.show()

In [None]:
# Same four-cluster K-Means, this time on the standardized features.
kmeans2 = KMeans(n_clusters=4, init='k-means++', random_state=0)
y_kmeans_sc = kmeans2.fit(X_scaler).labels_

In [None]:
# Cluster index assigned to each row by the K-Means fit on standardized features.
y_kmeans_sc

In [None]:
# Plot the clusters found on standardized data in the plane of the first two
# principal components, one colour per cluster, with every point labelled.
palette = {0: 'red', 1: 'blue', 2: 'green', 3: 'yellow'}
for cluster_id, colour in palette.items():
    in_cluster = y_kmeans_sc == cluster_id
    plt.scatter(X_pca[in_cluster, 0], X_pca[in_cluster, 1],
                c=colour, label=f'Cluster {cluster_id}')

for name, px, py in zip(labels, X_pca[:, 0], X_pca[:, 1]):
    plt.annotate(name, xy=(px, py))

plt.xlabel('PCA1')
plt.ylabel('PCA2')
plt.title('Clustering')
plt.legend()
plt.show()

In [None]:
# Four-cluster K-Means fitted on the PCA-projected data.
kmeans3 = KMeans(n_clusters=4, init='k-means++', random_state=0)
y_kmeans_pca = kmeans3.fit(X_pca).labels_

In [None]:
# Cluster index assigned to each row by the K-Means fit on the PCA projection.
y_kmeans_pca

In [None]:
# Plot the clusters found on the PCA projection in the plane of the first two
# principal components, one colour per cluster, with every point labelled.
for cluster_id, colour in zip(range(4), ['red', 'blue', 'green', 'yellow']):
    selected = X_pca[y_kmeans_pca == cluster_id]
    plt.scatter(selected[:, 0], selected[:, 1],
                c=colour, label=f'Cluster {cluster_id}')

for name, px, py in zip(labels, X_pca[:, 0], X_pca[:, 1]):
    plt.annotate(name, xy=(px, py))

plt.xlabel('PCA1')
plt.ylabel('PCA2')
plt.title('Clustering')
plt.legend()
plt.show()

## Profiling

In [None]:
# Shape of the centroid matrix from the K-Means fit on standardized data.
kmeans2.cluster_centers_.shape

In [None]:
# Column names of the loaded dataset.
dataset.columns

In [None]:
# Names given to the seven clustered features when building the centroid table
# and exporting the decision tree.
# NOTE(review): hard-coded - presumably mirrors dataset.columns[1:8] in the same
# order; verify against the CSV header.
col_names = ['Meutre', 'Rapt', 'Vol', 'Attaque', 'Viol', 'Larcin','Auto_Theft']

In [None]:
# Centroids of the standardized-data K-Means, still in standardized units.
centroids_scaled = kmeans2.cluster_centers_
cluster_centers = pd.DataFrame(centroids_scaled, columns=col_names)
cluster_centers

In [None]:
# Express the centroids in the original feature units.
# The input is kmeans2.cluster_centers_ rather than the cluster_centers
# variable, so the cell is idempotent: re-running it can never apply the
# inverse transform to values that were already converted back.
cluster_centers = pd.DataFrame(
    data=scaler.inverse_transform(kmeans2.cluster_centers_),
    columns=col_names,
)
cluster_centers

In [None]:
# Summary statistics of the whole dataset, shown again here - presumably as a
# baseline for reading the per-cluster centers.
dataset.describe()

In [None]:
# Append the cluster assignment from the standardized-data K-Means as a new
# 'Cluster' column next to the original columns.
cluster_column = pd.DataFrame({'Cluster': y_kmeans_sc})
dataset_cluster = pd.concat([dataset, cluster_column], axis='columns')
dataset_cluster.head()

In [None]:
# Features for the profiling tree: columns 1 through 7 of the table.
Xc = dataset_cluster.iloc[:, 1:8].values
# Target: the cluster assignment, selected by name. Position 8 only holds it
# when the source CSV has exactly eight columns, so a positional lookup could
# silently pick another column.
yc = dataset_cluster['Cluster'].values

In [None]:
import graphviz
from sklearn import tree
from sklearn.tree import export_graphviz, DecisionTreeClassifier
# Shallow tree (depth <= 5) trained to predict the cluster from the features,
# used to describe the clusters with readable rules.
# random_state is pinned, as for the KMeans fits in this notebook: the
# classifier permutes features at each split, so without a fixed seed the
# exported tree can change from one run to the next.
model = DecisionTreeClassifier(max_depth = 5, random_state = 0)
model.fit(Xc,yc)

In [None]:
# Write the fitted tree to tree.dot in Graphviz DOT format.
tree.export_graphviz(
    model,
    out_file='tree.dot',
    feature_names=col_names,
    label='all',
    filled=True,
    rounded=True,
)
# Render the DOT file to tree.png so the image displayed afterwards always
# matches the model fitted above rather than a stale, manually converted file.
# cleanup=True removes the intermediate source copy written by render().
# Requires the Graphviz `dot` executable on the PATH.
tree_png_path = graphviz.Source.from_file('tree.dot').render(
    'tree', format='png', cleanup=True
)

In [None]:
# Display the decision tree picture stored in tree.png.
Image('tree.png')

# Agglomerative Hierarchical Clustering 

## HC Intuition 

In [None]:
# Show hc1.png, image 1 of the hierarchical clustering intuition series.
Image(filename='hc1.png')

In [None]:
# Show hc2.png, image 2 of the hierarchical clustering intuition series.
Image(filename='hc2.png')

In [None]:
# Show hc3.png, image 3 of the hierarchical clustering intuition series.
Image(filename='hc3.png')

In [None]:
# Show hc4.png, image 4 of the hierarchical clustering intuition series.
Image(filename='hc4.png')

In [None]:
# Show hc5.png, image 5 of the hierarchical clustering intuition series.
Image(filename='hc5.png')

In [None]:
# Show hc6.png, image 6 of the hierarchical clustering intuition series.
Image(filename='hc6.png')

In [None]:
# Show hc7.png, image 7 of the hierarchical clustering intuition series.
Image(filename='hc7.png')

In [None]:
# Show hc8.png, image 8 of the hierarchical clustering intuition series.
Image(filename='hc8.png')

In [None]:
# Show hc9.png, image 9 of the hierarchical clustering intuition series.
Image(filename='hc9.png')

In [None]:
# Show hc10.png, image 10 of the hierarchical clustering intuition series.
Image(filename='hc10.png')

In [None]:
# Show hc11.png, image 11 of the hierarchical clustering intuition series.
Image(filename='hc11.png')

In [None]:
# Show hc12.png, image 12 of the hierarchical clustering intuition series.
Image(filename='hc12.png')

In [None]:
# Show hc13.png, image 13 of the hierarchical clustering intuition series.
Image(filename='hc13.png')

In [None]:
# Show hc14.png, image 14 of the hierarchical clustering intuition series.
Image(filename='hc14.png')

In [None]:
# Show hc15.png, image 15 of the hierarchical clustering intuition series.
Image(filename='hc15.png')

In [None]:
# Show hc16.png, image 16 of the hierarchical clustering intuition series.
Image(filename='hc16.png')

## Make HC

In [None]:
import scipy.cluster.hierarchy as sch
# Build the Ward linkage on the PCA coordinates and draw its dendrogram.
ward_linkage = sch.linkage(X_pca, method='ward')
dendrogram = sch.dendrogram(ward_linkage)
plt.title('Dendrogram')
plt.xlabel('Clusters')
plt.ylabel('Euclidean Distance')
# Render explicitly, like the other plotting cells: without it the cell ends on
# plt.ylabel(...) and echoes that call's return value as extra output.
plt.show()

In [None]:
from sklearn.cluster import AgglomerativeClustering 
# Two-cluster agglomerative clustering with Ward linkage on standardized data.
# The former affinity='euclidean' argument is dropped: `affinity` was
# deprecated in scikit-learn 1.2 and removed in 1.4, while Euclidean distance
# is both the default and the only metric Ward linkage accepts, so the result
# is unchanged and the call works on old and new releases.
hc = AgglomerativeClustering(n_clusters=2, linkage='ward')
hc.fit(X_scaler)
y_hc = hc.labels_

In [None]:
# Cluster index assigned to each row by the agglomerative clustering.
y_hc

In [None]:
# Plot the two hierarchical clusters in the plane of the first two principal
# components, one colour per cluster, with every point labelled.
for cluster_id, colour in enumerate(('red', 'blue')):
    members = y_hc == cluster_id
    plt.scatter(X_pca[members, 0], X_pca[members, 1],
                c=colour, label=f'Cluster {cluster_id}')

for name, px, py in zip(labels, X_pca[:, 0], X_pca[:, 1]):
    plt.annotate(name, xy=(px, py))

plt.xlabel('PCA1')
plt.ylabel('PCA2')
plt.title('Hierachical Clustering')
plt.legend()
plt.show()

## Silhouette score

In [None]:
from sklearn.metrics import silhouette_score

In [None]:
# Grid search: for every cluster count (2..10) and every seed (0..9), fit
# K-Means on the standardized data and record the mean silhouette score.
cluster_range = range(2, 11)
random_range = range(0, 10)

silhouette = []

for n_clusters in cluster_range:
    for seed in random_range:
        fitted = KMeans(n_clusters=n_clusters, random_state=seed).fit(X_scaler)
        score = silhouette_score(X_scaler, fitted.labels_)
        print("For n_clusters=", n_clusters, "and seed=", seed, "silhouette avg:", score)
        # One [n_clusters, seed, score] row per fit, turned into a table later.
        silhouette.append([n_clusters, seed, score])

In [None]:
# Turn the collected [n_clusters, seed, score] rows into a labelled table.
silhouette_columns = ["n_clusters", "seed", "silhouette_score"]
silhouette = pd.DataFrame(data=silhouette, columns=silhouette_columns)
silhouette

In [None]:
# Reshape to a grid: one row per cluster count, one column per seed.
pivot_silhouette = silhouette.pivot_table(
    values="silhouette_score",
    index="n_clusters",
    columns="seed",
)
pivot_silhouette

In [None]:
# Heatmap of the silhouette grid, each cell annotated to three decimals.
plt.figure(figsize=(10, 6))
sns.heatmap(
    pivot_silhouette,
    cmap=sns.cm.rocket_r,
    annot=True,
    fmt='.3f',
    linewidths=.5,
)
plt.tight_layout()
plt.show()