In [297]:
# 1 Filtering
import findspark
findspark.init()

import pandas as pd

# read T2
df=pd.read_csv("task3_dataset.csv")

# list the columns
list(df)

# print number of rows and columns 
print (df.shape)

# 1.1 Filter rows
# convert string to datetime
df['TimeStemp'] = pd.to_datetime(df['TimeStemp'])

#Eliminar filas con valores nulos
df.dropna()

#filter data by date
dfMondays = df[((df['TimeStemp'] > '2016-05-02 00:00:00') & (df['TimeStemp'] <= '2016-05-02 23:59:59')) | ((df['TimeStemp'] > '2016-05-9 00:00:00') & (df['TimeStemp'] <= '2016-05-9 23:59:59')) | ((df['TimeStemp'] > '2016-05-16 00:00:00') & (df['TimeStemp'] <= '2016-05-16 23:59:59'))]

# print number of rows and columns
print (dfMondays.shape)

# save data base
dfMondays.to_csv("T3_Mondays.csv", index= False)


(24518, 18)
(4086, 18)


In [298]:
#Catch training and test values from the dataset
from sklearn.model_selection import train_test_split

#featuredColumns = ['GyroscopeStat_x_MEAN','MagneticField_x_MEAN','LinearAcceleration_x_MEAN','MagneticField_COV_z_x']
featuredColumns = ['GyroscopeStat_x_MEAN']
#featuredColumns = ['Pressure_MEAN']
#featuredColumns = ['MagneticField_COV_z_x']

#HERE START NAIVE BAYES LEARNER
X = dfMondays[featuredColumns]
Y = dfMondays[['attack']]

X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size=0.2)

In [299]:
from sklearn.preprocessing import StandardScaler

#Normalize the data
escaler=StandardScaler()
X_train=escaler.fit_transform(X_train)
X_test=escaler.fit_transform(X_test)

In [300]:
#Define the algorithm to use
from sklearn.linear_model import LogisticRegression

algoritm=LogisticRegression(solver='lbfgs')

In [301]:
#Train the model
algoritm.fit(X_train,Y_train.values.ravel())

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [302]:
#Prediction
Y_pred_naive_bayes = algoritm.predict(X_test)

In [303]:
from sklearn.metrics import confusion_matrix

#Confusion matrix
matrix = confusion_matrix(Y_test,Y_pred_naive_bayes)
print("Matriz de confusion:")
print(matrix)

Matriz de confusion:
[[817   0]
 [  1   0]]


In [304]:
#Precision of the algorithm
from sklearn.metrics import precision_score 

precision = precision_score(Y_test, Y_pred_naive_bayes, average='weighted', labels= pd.unique(Y_pred_naive_bayes))
print("Precision del modelo:")
print(precision)

Precision del modelo:
0.9987775061124695


In [305]:
#HERE START DECISION TREE LEARNER
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext

sqlCtx = SQLContext(sc)

#Convert the dataframe from pandas to sql
dfMondays = sqlCtx.createDataFrame(dfMondays)

In [306]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

#Merge selected columns into one
assembler = VectorAssembler(inputCols = featuredColumns, outputCol="features")
assembled = assembler.transform(dfMondays)

#Catch train and test data from chosen columns
(trainingData, testData) = assembled.randomSplit([0.8,0.2], seed=123123) 

trainingData.count(), testData.count()

(3289, 797)

In [307]:
from pyspark.ml.classification import DecisionTreeClassifier

#Start decision tree clasificator 
dt = DecisionTreeClassifier(labelCol="attack", featuresCol="features", maxDepth=5, minInstancesPerNode=20, impurity="gini")

In [308]:
from pyspark.ml import Pipeline

#Normalize data
pipeline = Pipeline(stages=[dt])
model = pipeline.fit(trainingData)

In [309]:
#Get predictions from the model

predictions = model.transform(testData)

In [310]:
from pyspark.sql.types import DoubleType

#Convert the attack column from int to double to prevent subsequent failure
predictions = predictions.withColumn("attack", predictions["attack"].cast("double"))

In [311]:
#Compare first 10 rows (not necesary)
predictions = predictions.select("prediction","attack")
predictions.show(10)

+----------+------+
|prediction|attack|
+----------+------+
|       0.0|   0.0|
|       0.0|   0.0|
|       0.0|   0.0|
|       0.0|   0.0|
|       0.0|   0.0|
|       0.0|   0.0|
|       0.0|   0.0|
|       0.0|   0.0|
|       0.0|   0.0|
|       0.0|   0.0|
+----------+------+
only showing top 10 rows



In [312]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

#Show success percentage of the model with the test data
evaluator = MulticlassClassificationEvaluator(labelCol="attack", predictionCol="prediction",metricName="accuracy")
accuracy = evaluator.evaluate(predictions)

print("Accuracy:")
print(accuracy)

Accuracy:
1.0


In [313]:
from pyspark.mllib.evaluation import MulticlassMetrics

metrics = MulticlassMetrics(predictions.rdd.map(tuple))

In [314]:
#Confusion matrix from the decision tree
metrics.confusionMatrix().toArray().transpose()

array([[797.]])