## 1. __Crime Classification Model using Pyspark__

## __2. Setup Spark and load other libraries__

In [36]:
import pyspark 
spark = pyspark.sql.SparkSession.builder \
    .appName("Crime_Classification") \
    .config("spark.executor.memory", "4g") \
    .config("spark.driver.memory", "4g") \
    .getOrCreate()
sc = spark.sparkContext

In [37]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
np.random.seed(60)

In [38]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report

## __3. Data Extraction__

In [39]:
crime_dataset = pd.read_csv('../train.csv')
crime_dataset

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541
...,...,...,...,...,...,...,...,...,...
878044,2003-01-06 00:15:00,ROBBERY,ROBBERY ON THE STREET WITH A GUN,Monday,TARAVAL,NONE,FARALLONES ST / CAPITOL AV,-122.459033,37.714056
878045,2003-01-06 00:01:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Monday,INGLESIDE,NONE,600 Block of EDNA ST,-122.447364,37.731948
878046,2003-01-06 00:01:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Monday,SOUTHERN,NONE,5TH ST / FOLSOM ST,-122.403390,37.780266
878047,2003-01-06 00:01:00,VANDALISM,"MALICIOUS MISCHIEF, VANDALISM OF VEHICLES",Monday,SOUTHERN,NONE,TOWNSEND ST / 2ND ST,-122.390531,37.780607


In [40]:
crime_dataset['Latlong'] = crime_dataset['X']*crime_dataset['Y']
crime_dataset

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y,Latlong
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599,-4624.588916
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599,-4624.588916
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414,-4627.691645
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873,-4627.847257
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541,-4624.699819
...,...,...,...,...,...,...,...,...,...,...
878044,2003-01-06 00:15:00,ROBBERY,ROBBERY ON THE STREET WITH A GUN,Monday,TARAVAL,NONE,FARALLONES ST / CAPITOL AV,-122.459033,37.714056,-4618.426865
878045,2003-01-06 00:01:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Monday,INGLESIDE,NONE,600 Block of EDNA ST,-122.447364,37.731948,-4620.177499
878046,2003-01-06 00:01:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Monday,SOUTHERN,NONE,5TH ST / FOLSOM ST,-122.403390,37.780266,-4624.432596
878047,2003-01-06 00:01:00,VANDALISM,"MALICIOUS MISCHIEF, VANDALISM OF VEHICLES",Monday,SOUTHERN,NONE,TOWNSEND ST / 2ND ST,-122.390531,37.780607,-4623.988577


In [41]:
crime_dataset.drop(['X','Y','Dates'],axis='columns',inplace=True)
crime_dataset

Unnamed: 0,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,Latlong
0,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-4624.588916
1,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-4624.588916
2,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-4627.691645
3,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-4627.847257
4,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-4624.699819
...,...,...,...,...,...,...,...
878044,ROBBERY,ROBBERY ON THE STREET WITH A GUN,Monday,TARAVAL,NONE,FARALLONES ST / CAPITOL AV,-4618.426865
878045,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Monday,INGLESIDE,NONE,600 Block of EDNA ST,-4620.177499
878046,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Monday,SOUTHERN,NONE,5TH ST / FOLSOM ST,-4624.432596
878047,VANDALISM,"MALICIOUS MISCHIEF, VANDALISM OF VEHICLES",Monday,SOUTHERN,NONE,TOWNSEND ST / 2ND ST,-4623.988577


# __Label Encoding__

In [42]:
label_encoder = LabelEncoder()
crime_dataset = crime_dataset[['Category','Descript','DayOfWeek','PdDistrict','Resolution','Address','Latlong']].apply(label_encoder.fit_transform)

new_data = crime_dataset.to_csv('preproccesing_data.csv', index=False)

crime_dataset

Unnamed: 0,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,Latlong
0,37,866,6,4,0,19790,12609
1,21,810,6,4,0,19790,12609
2,21,810,6,4,0,22697,1999
3,16,404,6,4,11,4266,1571
4,16,404,6,5,11,1843,12206
...,...,...,...,...,...,...,...
878044,25,661,1,8,11,15816,29874
878045,16,404,1,2,11,11491,25037
878046,16,404,1,7,11,11315,13199
878047,35,496,1,7,11,22308,14848


In [43]:
from pyspark.sql.functions import col, lower
df = spark.read.format('csv')\
          .option('header','true')\
          .option('inferSchema', 'true')\
          .option('timestamp', 'true')\
          .load('preproccesing_data.csv')


In [44]:
df.columns

['Category',
 'Descript',
 'DayOfWeek',
 'PdDistrict',
 'Resolution',
 'Address',
 'Latlong']

In [45]:
print('Dataframe Structure')
print('----------------------------------')
print(df.printSchema())
print(' ')
print('Dataframe preview')
print(df.show(5))
print(' ')
print('----------------------------------')
print('Total number of rows', df.count())

Dataframe Structure
----------------------------------
root
 |-- Category: integer (nullable = true)
 |-- Descript: integer (nullable = true)
 |-- DayOfWeek: integer (nullable = true)
 |-- PdDistrict: integer (nullable = true)
 |-- Resolution: integer (nullable = true)
 |-- Address: integer (nullable = true)
 |-- Latlong: integer (nullable = true)

None
 
Dataframe preview
+--------+--------+---------+----------+----------+-------+-------+
|Category|Descript|DayOfWeek|PdDistrict|Resolution|Address|Latlong|
+--------+--------+---------+----------+----------+-------+-------+
|      37|     866|        6|         4|         0|  19790|  12609|
|      21|     810|        6|         4|         0|  19790|  12609|
|      21|     810|        6|         4|         0|  22697|   1999|
|      16|     404|        6|         4|        11|   4266|   1571|
|      16|     404|        6|         5|        11|   1843|  12206|
+--------+--------+---------+----------+----------+-------+-------+
only showing

In [46]:
def top_n_list(df,name_column, N):
    print("Total number of unique value of"+' '+name_column+''+':'+' '+str(df.select(name_column).distinct().count()))
    print(' ')
    print('Top'+' '+str(N)+' '+'Crime'+' '+name_column)
    df.groupBy(name_column).count().withColumnRenamed('count','totalValue').orderBy(col('totalValue').desc()).show(N)
    
    
top_n_list(df, 'Resolution',12)


Total number of unique value of Resolution: 17
 
Top 12 Crime Resolution
+----------+----------+
|Resolution|totalValue|
+----------+----------+
|        11|    526790|
|         0|    206403|
|         1|     77004|
|        10|     17101|
|        15|     14534|
|        16|      9585|
|         7|      5564|
|         3|      3976|
|         4|      3934|
|        12|      3714|
|         8|      3332|
|        13|      2504|
+----------+----------+
only showing top 12 rows



## __4. Partition the dataset into Training and Test dataset__

In [47]:
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator 
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression

In [48]:
feature_columns = ['Category','Descript','DayOfWeek','PdDistrict','Address','Latlong']

In [49]:
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures", withStd=True, withMean=False)

+--------+--------+---------+----------+----------+-------+-------+--------------------+
|Category|Descript|DayOfWeek|PdDistrict|Resolution|Address|Latlong|            features|
+--------+--------+---------+----------+----------+-------+-------+--------------------+
|      37|     866|        6|         4|         0|  19790|  12609|[37.0,866.0,6.0,4...|
|      21|     810|        6|         4|         0|  19790|  12609|[21.0,810.0,6.0,4...|
|      21|     810|        6|         4|         0|  22697|   1999|[21.0,810.0,6.0,4...|
|      16|     404|        6|         4|        11|   4266|   1571|[16.0,404.0,6.0,4...|
|      16|     404|        6|         5|        11|   1843|  12206|[16.0,404.0,6.0,5...|
|      16|     406|        6|         2|        11|   1505|  33885|[16.0,406.0,6.0,2...|
|      36|     739|        6|         2|        11|  13322|  29844|[36.0,739.0,6.0,2...|
|      36|     739|        6|         0|        11|  18054|  33285|[36.0,739.0,6.0,0...|
|      16|     404|  

In [50]:
#scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures", withStd=True, withMean=False)
#scalerModel = scaler.fit(df)
#training_scaled = scalerModel.transform(df)
#
#training_scaled.show()

+--------+--------+---------+----------+----------+-------+-------+--------------------+--------------------+
|Category|Descript|DayOfWeek|PdDistrict|Resolution|Address|Latlong|            features|      scaledFeatures|
+--------+--------+---------+----------+----------+-------+-------+--------------------+--------------------+
|      37|     866|        6|         4|         0|  19790|  12609|[37.0,866.0,6.0,4...|[3.46161989324852...|
|      21|     810|        6|         4|         0|  19790|  12609|[21.0,810.0,6.0,4...|[1.96470318265456...|
|      21|     810|        6|         4|         0|  22697|   1999|[21.0,810.0,6.0,4...|[1.96470318265456...|
|      16|     404|        6|         4|        11|   4266|   1571|[16.0,404.0,6.0,4...|[1.49691671059395...|
|      16|     404|        6|         5|        11|   1843|  12206|[16.0,404.0,6.0,5...|[1.49691671059395...|
|      16|     406|        6|         2|        11|   1505|  33885|[16.0,406.0,6.0,2...|[1.49691671059395...|
|      36|

In [55]:
(train_data, test_data) = training_scaled.randomSplit([0.7, 0.3])
train_data.show()

+--------+--------+---------+----------+----------+-------+-------+--------------------+--------------------+
|Category|Descript|DayOfWeek|PdDistrict|Resolution|Address|Latlong|            features|      scaledFeatures|
+--------+--------+---------+----------+----------+-------+-------+--------------------+--------------------+
|       0|      32|        0|         1|        11|   4251|   5245|[0.0,32.0,0.0,1.0...|[0.0,0.1393033916...|
|       0|      32|        0|         3|        11|   4184|  17392|[0.0,32.0,0.0,3.0...|[0.0,0.1393033916...|
|       0|      32|        0|         3|        11|   4396|  16569|[0.0,32.0,0.0,3.0...|[0.0,0.1393033916...|
|       0|      32|        0|         3|        11|   8995|  23414|[0.0,32.0,0.0,3.0...|[0.0,0.1393033916...|
|       0|      32|        0|         3|        11|   8995|  23414|[0.0,32.0,0.0,3.0...|[0.0,0.1393033916...|
|       0|      32|        0|         4|        11|   9081|    238|[0.0,32.0,0.0,4.0...|[0.0,0.1393033916...|
|       0|

In [52]:
lr = LogisticRegression(featuresCol='scaledFeatures', labelCol='Resolution', maxIter=10)
lr_model = lr.fit(train_data)

In [57]:
predictions = lr_model.transform(test_data)
predictions.select('Resolution', 'prediction').show(10)

+----------+----------+
|Resolution|prediction|
+----------+----------+
|         0|      11.0|
|        11|      11.0|
|        11|      11.0|
|         0|      11.0|
|        11|      11.0|
|         0|      11.0|
|        11|      11.0|
|         0|      11.0|
|         0|      11.0|
|        11|      11.0|
+----------+----------+
only showing top 10 rows



In [59]:
evaluator = MulticlassClassificationEvaluator(labelCol="Resolution").setPredictionCol("prediction").evaluate(predictions)
print(' ')
print('--------------------------Accuracy-----------------------------')
print(' ')
print('               accuracy:{}'.format(evaluator))

 
--------------------------Accuracy-----------------------------
 
               accuracy:0.4875876353953737
