In [78]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score


In [41]:
df = pd.read_csv('Enumerated dataset.csv')
df.head()

Unnamed: 0,Age,Education level,Number of use Internet and Internet-related services,Internet/ Digital devices skills level,Secure of person's digital devices,Legitimacy of a website,Password with personal information,Aware of the danger of clicking pop-up screens,Attention to person's account privacy,Protection of social media services for personal info,...,blogs1,etc.1,Government websites (e.g. CERT)1,I do not feel that I keep myself updated1,Rely on automatic updates1,Internet service provider ISPs1,Other- Messages from Communication Company1,Others: Twitter1,Other- Twitter1,etc..1
0,0,3,2,1,3,4,0,4,4,2,...,1,1,0,0,0,0,0,0,0,0
1,0,2,2,1,4,0,4,4,4,3,...,1,1,0,0,0,0,0,0,0,0
2,0,2,2,1,4,3,3,4,4,3,...,1,1,0,0,0,0,0,0,0,0
3,0,2,2,1,4,2,0,3,1,3,...,1,1,0,0,0,0,0,0,0,0
4,0,2,2,1,4,3,1,4,3,2,...,1,1,0,0,0,0,0,0,0,0


In [42]:
# Finding the best features to use for the traning
df.corrwith(df['Victim of cybercrime']).sort_values( ascending=False)

Victim of cybercrime                                                                                 1.000000
Bank                                                                                                 0.145149
Online personal information is not secure enough                                                     0.135610
No one                                                                                               0.109586
Responsibility of those offering online/Internet-based servicesin raising awareness of cybercrime    0.105763
                                                                                                       ...   
I don't know.1                                                                                      -0.086462
Don’t know but will ask my friends for advice                                                       -0.106505
Well protected against cybercrime                                                                   -0.106858
Laws are e

In [43]:
# As we have too many columns, correlation map will not be useful
#corr = df.corr()
#corr.style.background_gradient(cmap='coolwarm')

In [44]:
# Finding the best features to use for the traning and store it
corrArray=df.corrwith(df['Victim of cybercrime']).sort_values( ascending=False)
# pick the best 10 columns. 5 from the top and 5 from the last
top5=corrArray[1:6]
top5= list(top5.index)
top10= list(corrArray[1:11].index)
top15= list(corrArray[1:16].index)
top20= list(corrArray[1:21].index)
top3= list(corrArray[1:4].index)

top5

['Bank',
 'Online personal information is not secure enough',
 'No one',
 'Responsibility of those offering online/Internet-based servicesin raising awareness of cybercrime',
 'Education level']

In [45]:
buttom5=corrArray[-5:]
buttom5= list(buttom5.index)
buttom10= list(corrArray[-10:].index)
buttom15= list(corrArray[-15:].index)
buttom20= list(corrArray[-20:].index)
buttom20= list(corrArray[-20:].index)
buttom3= list(corrArray[-3:].index)
buttom5

["I don't know.1",
 'Don’t know but will ask my friends for advice',
 'Well protected against cybercrime',
 'Laws are effective in managing the cybercrime problem',
 'Report cybercrime']

# Logistic Regression

In [46]:
predictors = df[top5 + buttom5] # training , X
target = df['Victim of cybercrime'] # prediction , Y


x_train, x_test, y_train, y_test = train_test_split(predictors, target, test_size = 0.20, random_state = 0)


In [47]:
# Building model
logreg = LogisticRegression()
logreg.fit(x_train, y_train)

LogisticRegression()

In [48]:
# getting the prediction
y_pred=logreg.predict(x_test)

In [49]:
print("Training Accuracy:", logreg.score(x_train, y_train))
print("Testing Accuracy:", metrics.accuracy_score(y_test, y_pred))
# precision tp / (tp + fp)
precision = metrics.precision_score(y_test, y_pred)
print('Precision: %f' % precision)
# recall: tp / (tp + fn)
recall = metrics.recall_score(y_test, y_pred)
print('Recall: %f' % recall)

Training Accuracy: 0.9639175257731959
Testing Accuracy: 0.9588477366255144
Precision: 0.830508
Recall: 1.000000


In [87]:
scores = cross_val_score(logreg, x_train, y_train, cv=5)
scores

array([0.95360825, 0.97938144, 0.96907216, 0.95876289, 0.95876289])

In [88]:
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

0.96 accuracy with a standard deviation of 0.01


# Decision Tree Classifier

In [50]:
clf = DecisionTreeClassifier(criterion='entropy', max_depth=3).fit(x_train, y_train)

In [51]:
# Predicting the y values corresponding to X_test_sm
clf_pred = clf.predict(x_test)

In [52]:
print("Training Accuracy:", clf.score(x_train, y_train))
print("Testing Accuracy:", metrics.accuracy_score(y_test, clf_pred))
# precision tp / (tp + fp)
precision = metrics.precision_score(y_test, clf_pred)
print('Precision: %f' % precision)
# recall: tp / (tp + fn)
recall = metrics.recall_score(y_test, clf_pred)
print('Recall: %f' % recall)

Training Accuracy: 0.9670103092783505
Testing Accuracy: 0.9588477366255144
Precision: 0.830508
Recall: 1.000000


In [89]:
scores = cross_val_score(clf, x_train, y_train, cv=5)
scores

array([0.95360825, 0.97938144, 0.96907216, 0.95876289, 0.95876289])

In [90]:
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

0.96 accuracy with a standard deviation of 0.01


# Random Forest

In [53]:
class_forest = RandomForestClassifier(n_estimators = 5, criterion = 'entropy', random_state = 0).fit(x_train, y_train)

In [54]:
preds_class = class_forest.predict(x_test)

In [55]:
print("Training Accuracy:", class_forest.score(x_train, y_train))
print("Testing Accuracy:", metrics.accuracy_score(y_test, preds_class))
# precision tp / (tp + fp)
precision = metrics.precision_score(y_test, preds_class)
print('Precision: %f' % precision)
# recall: tp / (tp + fn)
recall = metrics.recall_score(y_test, preds_class)
print('Recall: %f' % recall)

Training Accuracy: 0.9876288659793815
Testing Accuracy: 0.9465020576131687
Precision: 0.833333
Recall: 0.918367


In [91]:
scores = cross_val_score(class_forest, x_train, y_train, cv=5)
scores

array([0.94845361, 0.96391753, 0.91237113, 0.95360825, 0.94329897])

In [92]:
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

0.94 accuracy with a standard deviation of 0.02


# Support Vector Classification

In [56]:
class_sv = SVC(kernel = 'linear', random_state = 0)

class_sv.fit(x_train, y_train)
preds_class = class_sv.predict(x_test)

In [57]:
print("Training Accuracy:", class_sv.score(x_train, y_train))
print("Testing Accuracy:", metrics.accuracy_score(y_test, preds_class))
# precision tp / (tp + fp)
precision = metrics.precision_score(y_test, preds_class)
print('Precision: %f' % precision)
# recall: tp / (tp + fn)
recall = metrics.recall_score(y_test, preds_class)
print('Recall: %f' % recall)

Training Accuracy: 0.9618556701030928
Testing Accuracy: 0.9588477366255144
Precision: 0.830508
Recall: 1.000000


In [93]:
scores = cross_val_score(class_sv, x_train, y_train, cv=5)
scores

array([0.95360825, 0.9742268 , 0.96391753, 0.95876289, 0.95876289])

In [94]:
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

0.96 accuracy with a standard deviation of 0.01
