In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler


In [2]:
df = pd.read_csv('Enumerated dataset.csv')
df.head()

Unnamed: 0,Age,Education level,Number of use Internet and Internet-related services,Internet/ Digital devices skills level,Secure of person's digital devices,Legitimacy of a website,Password with personal information,Aware of the danger of clicking pop-up screens,Attention to person's account privacy,Protection of social media services for personal info,...,blogs1,etc.1,Government websites (e.g. CERT)1,I do not feel that I keep myself updated1,Rely on automatic updates1,Internet service provider ISPs1,Other- Messages from Communication Company1,Others: Twitter1,Other- Twitter1,etc..1
0,0,3,2,1,3,4,0,4,4,2,...,1,1,0,0,0,0,0,0,0,0
1,0,2,2,1,4,0,4,4,4,3,...,1,1,0,0,0,0,0,0,0,0
2,0,2,2,1,4,3,3,4,4,3,...,1,1,0,0,0,0,0,0,0,0
3,0,2,2,1,4,2,0,3,1,3,...,1,1,0,0,0,0,0,0,0,0
4,0,2,2,1,4,3,1,4,3,2,...,1,1,0,0,0,0,0,0,0,0


In [3]:
# Finding the best features to use for the traning
df.corrwith(df['Victim of cybercrime']).sort_values( ascending=False)

Victim of cybercrime                                                                                 1.000000
Bank                                                                                                 0.145149
Online personal information is not secure enough                                                     0.135610
No one                                                                                               0.109586
Responsibility of those offering online/Internet-based servicesin raising awareness of cybercrime    0.105763
                                                                                                       ...   
I don't know.1                                                                                      -0.086462
Don’t know but will ask my friends for advice                                                       -0.106505
Well protected against cybercrime                                                                   -0.106858
Laws are e

#### Pick the best correlation values, test the best number of combination

In [4]:
# As we have too many columns, correlation map will not be useful
#corr = df.corr()
#corr.style.background_gradient(cmap='coolwarm')

In [5]:
# Finding the best features to use for the traning and store it
corrArray=df.corrwith(df['Victim of cybercrime']).sort_values( ascending=False)
# pick the best 10 columns. 5 from the top and 5 from the last
top5=corrArray[1:6]
top5= list(top5.index)
top10= list(corrArray[1:11].index)
top15= list(corrArray[1:16].index)
top3= list(corrArray[1:4].index)
top2= list(corrArray[1:3].index)
toplist= [top2, top3, top5, top10, top15]

top5

['Bank',
 'Online personal information is not secure enough',
 'No one',
 'Responsibility of those offering online/Internet-based servicesin raising awareness of cybercrime',
 'Education level']

In [6]:
buttom5=corrArray[-5:]
buttom5= list(buttom5.index)
buttom10= list(corrArray[-10:].index)
buttom15= list(corrArray[-15:].index)
buttom20= list(corrArray[-20:].index)
buttom3= list(corrArray[-3:].index)
buttom2= list(corrArray[-2:].index)
buttomlist= [buttom2, buttom3, buttom5, buttom10, buttom15]
buttom5

["I don't know.1",
 'Don’t know but will ask my friends for advice',
 'Well protected against cybercrime',
 'Laws are effective in managing the cybercrime problem',
 'Report cybercrime']

In [7]:
#Scores lists
logistic_regression= []
Decision_Tree= []
Random_forest= []
svc=[]
ANN=[]
#Cross values lists
logCrossVal= []
DecisionTree_CrossVal= []
RandomForest_crossVal= []
svc_crossVal=[]
ANN_crossval=[]


# Test Models

In [8]:
for i in range (0, len(toplist)):

    # Logistic Regression
    predictors = df[toplist[i] + buttomlist[i]] # training , X
    target = df['Victim of cybercrime'] # prediction , Y

    x_train, x_test, y_train, y_test = train_test_split(predictors, target, test_size = 0.20, random_state = 0)

    # Building model
    logreg = LogisticRegression()
    logreg.fit(x_train, y_train)

    # getting the prediction
    y_pred=logreg.predict(x_test)

    t=logreg.score(x_train, y_train)
    test= metrics.accuracy_score(y_test, y_pred)
    # precision tp / (tp + fp)
    precision = metrics.precision_score(y_test, y_pred)
    # recall: tp / (tp + fn)
    recall = metrics.recall_score(y_test, y_pred)

    # Store the results
    logistic_regression.append({'training':t,
                                'testing': test,
                                'precision':precision,
                                'recall': recall  })

    #Cross Validation
    scores = cross_val_score(logreg, x_train, y_train, cv=5)
    logCrossVal.append(scores)

    # ####### Decision Tree Classifier ####### #
    
    clf = DecisionTreeClassifier(criterion='entropy', max_depth=3).fit(x_train, y_train)
    # Predicting the y values corresponding to X_test_sm
    clf_pred = clf.predict(x_test)
    t=clf.score(x_train, y_train)

    test= metrics.accuracy_score(y_test, clf_pred)

    # precision tp / (tp + fp)
    precision = metrics.precision_score(y_test, clf_pred)

    # recall: tp / (tp + fn)
    recall = metrics.recall_score(y_test, clf_pred)

    Decision_Tree.append({'training':t,
                                'testing': test,
                                'precision':precision,
                                'recall': recall  })

    # Cross validation
    scores = cross_val_score(clf, x_train, y_train, cv=5)
    DecisionTree_CrossVal.append(scores)
    
    # ####### Random Forest ####### #
    class_forest = RandomForestClassifier(n_estimators = 5, criterion = 'entropy', random_state = 0).fit(x_train, y_train)
    preds_class = class_forest.predict(x_test)
    t= class_forest.score(x_train, y_train)

    test=metrics.accuracy_score(y_test, preds_class)

    # precision tp / (tp + fp)
    precision = metrics.precision_score(y_test, preds_class)

    # recall: tp / (tp + fn)
    recall = metrics.recall_score(y_test, preds_class)

    Random_forest.append({'training':t,
                                'testing': test,
                                'precision':precision,
                                'recall': recall  })

    # cross validation
    scores = cross_val_score(class_forest, x_train, y_train, cv=5)
    RandomForest_crossVal.append(scores)
    
    
    # ####### Support Vector Classifair ####### #
    class_sv = SVC(kernel = 'linear', random_state = 0)
    class_sv.fit(x_train, y_train)
    preds_class = class_sv.predict(x_test)
    
    t=class_sv.score(x_train, y_train)

    test=metrics.accuracy_score(y_test, preds_class)

    # precision tp / (tp + fp)
    precision = metrics.precision_score(y_test, preds_class)

    # recall: tp / (tp + fn)
    recall = metrics.recall_score(y_test, preds_class)

    
    svc.append({'training':t,
                                'testing': test,
                                'precision':precision,
                                'recall': recall  })

    # cross validation
    scores = cross_val_score(class_sv, x_train, y_train, cv=5)
    svc_crossVal.append(scores)

## Testing Results

#### Logistic Regression

In [9]:
ind= [2, 3, 5, 10, 15]
print('----Logistic Regression----')
print(pd.DataFrame(logistic_regression,  index =ind))

----Logistic Regression----
    training   testing  precision  recall
2   0.961856  0.958848   0.830508     1.0
3   0.961856  0.958848   0.830508     1.0
5   0.963918  0.958848   0.830508     1.0
10  0.963918  0.958848   0.830508     1.0
15  0.962887  0.958848   0.830508     1.0


#### Decision Tree Classifair

In [10]:
print('----Decision Tree Classifair----')
print(pd.DataFrame(Decision_Tree,  index =ind))

----Decision Tree Classifair----
    training   testing  precision  recall
2   0.961856  0.958848   0.830508     1.0
3   0.961856  0.958848   0.830508     1.0
5   0.967010  0.958848   0.830508     1.0
10  0.967010  0.958848   0.830508     1.0
15  0.967010  0.958848   0.830508     1.0


#### Random forest

In [11]:
print('----Random forest----')
print(pd.DataFrame(Random_forest,  index =ind))

----Random forest----
    training   testing  precision    recall
2   0.964948  0.954733   0.839286  0.959184
3   0.967010  0.962963   0.884615  0.938776
5   0.987629  0.946502   0.833333  0.918367
10  0.994845  0.958848   0.867925  0.938776
15  0.992784  0.958848   0.842105  0.979592


#### Support Vector Classifair

In [12]:
print('----Support Vector Classifair----')
print(pd.DataFrame(svc,  index =ind))

----Support Vector Classifair----
    training   testing  precision  recall
2   0.961856  0.958848   0.830508     1.0
3   0.961856  0.958848   0.830508     1.0
5   0.961856  0.958848   0.830508     1.0
10  0.961856  0.958848   0.830508     1.0
15  0.961856  0.958848   0.830508     1.0


## Cross Validation Results

In [13]:
# Find the mean and standard deviation
print('----Logistic Regression----')
logdf=pd.DataFrame(logCrossVal,  index = ind)
logdf['mean()']= logdf.mean(axis=1)
logdf['std()']= logdf.std(axis=1)
print(logdf)

print('\n----Decision Tree Classifair----')
DTdf= pd.DataFrame(DecisionTree_CrossVal,  index =ind)
DTdf['mean()']= DTdf.mean(axis=1)
DTdf['std()']= DTdf.std(axis=1)
print(DTdf)

print('\n----Random forest----')
Randf= pd.DataFrame(RandomForest_crossVal,  index =ind)
Randf['mean()']= Randf.mean(axis=1)
Randf['std()']= Randf.std(axis=1)
print(Randf)

print('\n----Support Vector Classifair----')
svcdf= pd.DataFrame(svc_crossVal,  index =ind)
svcdf['mean()']= svcdf.mean(axis=1)
svcdf['std()']= svcdf.std(axis=1)
print(svcdf)



----Logistic Regression----
           0         1         2         3         4    mean()     std()
2   0.953608  0.974227  0.963918  0.958763  0.958763  0.961856  0.006992
3   0.953608  0.974227  0.958763  0.958763  0.958763  0.960825  0.006992
5   0.953608  0.979381  0.969072  0.958763  0.958763  0.963918  0.009221
10  0.953608  0.979381  0.958763  0.953608  0.958763  0.960825  0.009560
15  0.948454  0.979381  0.943299  0.953608  0.958763  0.956701  0.012457

----Decision Tree Classifair----
           0         1         2         3         4    mean()     std()
2   0.953608  0.974227  0.963918  0.958763  0.958763  0.961856  0.006992
3   0.953608  0.969072  0.963918  0.958763  0.958763  0.960825  0.005257
5   0.958763  0.979381  0.969072  0.963918  0.953608  0.964948  0.008868
10  0.958763  0.969072  0.953608  0.963918  0.953608  0.959794  0.006011
15  0.958763  0.969072  0.953608  0.958763  0.953608  0.958763  0.005647

----Random forest----
           0         1         2       

## ANN

In [27]:
# Selecting features
# Save X data
predictors = df.drop('Victim of cybercrime',axis=1)

# Encode our target
le = LabelEncoder()
target = le.fit_transform(df['Victim of cybercrime'])

In [28]:
# Train test split
X_train, X_test, y_train, y_test = train_test_split(predictors, target, test_size= 0.20, random_state = 0)

In [29]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [30]:
ann = tf.keras.models.Sequential()

In [31]:
# The 1st hidden layer will have 5 units and the activation function is relu

ann.add(tf.keras.layers.Dense(units=5, activation='relu'))

In [32]:
# The 2nd hidden layer will have 5 units and the activation function is relu

ann.add(tf.keras.layers.Dense(units=5, activation='relu'))

In [33]:
# The output layer has only one unit since it is binary classification --> Only 2 states
# The activation function is the sigmoid

ann.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

In [34]:

ann.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

In [35]:
# epochs represent number of time the modle see the data 
# Increase the number till we HAPPY with the final result !
# Till now ANN is the best way 
# for training

ann.fit(X_train, y_train, batch_size = 32, epochs = 200)

Train on 970 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200


Epoch 79/200
Epoch 80/200
Epoch 81/200
Epoch 82/200
Epoch 83/200
Epoch 84/200
Epoch 85/200
Epoch 86/200
Epoch 87/200
Epoch 88/200
Epoch 89/200
Epoch 90/200
Epoch 91/200
Epoch 92/200
Epoch 93/200
Epoch 94/200
Epoch 95/200
Epoch 96/200
Epoch 97/200
Epoch 98/200
Epoch 99/200
Epoch 100/200
Epoch 101/200
Epoch 102/200
Epoch 103/200
Epoch 104/200
Epoch 105/200
Epoch 106/200
Epoch 107/200
Epoch 108/200
Epoch 109/200
Epoch 110/200
Epoch 111/200
Epoch 112/200
Epoch 113/200
Epoch 114/200
Epoch 115/200
Epoch 116/200
Epoch 117/200
Epoch 118/200
Epoch 119/200
Epoch 120/200
Epoch 121/200
Epoch 122/200
Epoch 123/200
Epoch 124/200
Epoch 125/200
Epoch 126/200
Epoch 127/200
Epoch 128/200
Epoch 129/200
Epoch 130/200
Epoch 131/200
Epoch 132/200
Epoch 133/200
Epoch 134/200
Epoch 135/200
Epoch 136/200
Epoch 137/200
Epoch 138/200
Epoch 139/200
Epoch 140/200
Epoch 141/200
Epoch 142/200
Epoch 143/200
Epoch 144/200
Epoch 145/200
Epoch 146/200
Epoch 147/200
Epoch 148/200
Epoch 149/200
Epoch 150/200
Epoch 151/200

Epoch 156/200
Epoch 157/200
Epoch 158/200
Epoch 159/200
Epoch 160/200
Epoch 161/200
Epoch 162/200
Epoch 163/200
Epoch 164/200
Epoch 165/200
Epoch 166/200
Epoch 167/200
Epoch 168/200
Epoch 169/200
Epoch 170/200
Epoch 171/200
Epoch 172/200
Epoch 173/200
Epoch 174/200
Epoch 175/200
Epoch 176/200
Epoch 177/200
Epoch 178/200
Epoch 179/200
Epoch 180/200
Epoch 181/200
Epoch 182/200
Epoch 183/200
Epoch 184/200
Epoch 185/200
Epoch 186/200
Epoch 187/200
Epoch 188/200
Epoch 189/200
Epoch 190/200
Epoch 191/200
Epoch 192/200
Epoch 193/200
Epoch 194/200
Epoch 195/200
Epoch 196/200
Epoch 197/200
Epoch 198/200
Epoch 199/200
Epoch 200/200


<tensorflow.python.keras.callbacks.History at 0x7f9ba4f9f950>

In [36]:
# epochs represent number of time the modle see the data 
# Increase the number till we HAPPY with the final result !
# Till now ANN is the best way 
# for testing


#ann.fit(X_test, y_test, batch_size = 32, epochs = 50)