In [37]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score


In [38]:
# Load the enumerated dataset. The first CSV column is an unnamed row counter
# (pandas labels it "Unnamed: 0"); read it as the index so it is not treated
# as a candidate feature when columns are ranked by correlation later on.
df = pd.read_csv('Enumerated dataset.csv', index_col=0)
df.head()

Unnamed: 0,Age,Education level,Number of use Internet and Internet-related services,Internet/ Digital devices skills level,Secure of person's digital devices,Legitimacy of a website,Password with personal information,Aware of the danger of clicking pop-up screens,Attention to person's account privacy,Protection of social media services for personal info,...,blogs1,etc.1,Government websites (e.g. CERT)1,I do not feel that I keep myself updated1,Rely on automatic updates1,Internet service provider ISPs1,Other- Messages from Communication Company1,Others: Twitter1,Other- Twitter1,etc..1
0,0,3,2,1,3,4,0,4,4,2,...,1,1,0,0,0,0,0,0,0,0
1,0,2,2,1,4,0,4,4,4,3,...,1,1,0,0,0,0,0,0,0,0
2,0,2,2,1,4,3,3,4,4,3,...,1,1,0,0,0,0,0,0,0,0
3,0,2,2,1,4,2,0,3,1,3,...,1,1,0,0,0,0,0,0,0,0
4,0,2,2,1,4,3,1,4,3,2,...,1,1,0,0,0,0,0,0,0,0


In [39]:
# Rank every column by its correlation with the target label, most positive
# first, to shortlist the features worth training on.
(
    df.corrwith(df['Victim of cybercrime'])
    .sort_values(ascending=False)
)

Victim of cybercrime                                                                                 1.000000
Bank                                                                                                 0.145149
Online personal information is not secure enough                                                     0.135610
No one                                                                                               0.109586
Responsibility of those offering online/Internet-based servicesin raising awareness of cybercrime    0.105763
                                                                                                       ...   
I don't know.1                                                                                      -0.086462
Don’t know but will ask my friends for advice                                                       -0.106505
Well protected against cybercrime                                                                   -0.106858
Laws are e

#### Pick the features with the strongest correlations and test how many of them to combine

In [40]:
# With this many columns a full correlation heatmap would be unreadable, so it is left disabled
#corr = df.corr()
#corr.style.background_gradient(cmap='coolwarm')

In [41]:
# Rank every column by its correlation with the target, most positive first.
# Columns whose correlation is undefined (NaN, e.g. a constant column) are
# dropped so they can never be selected at either end of the ranking.
corrArray = (
    df.corrwith(df['Victim of cybercrime'])
    .dropna()
    .sort_values(ascending=False)
)

# The target correlates perfectly with itself; remove it by label instead of
# assuming it always sits at position 0 of the ranking.
positive_ranking = corrArray.drop('Victim of cybercrime', errors='ignore')

# The k most positively correlated features for each candidate size k.
# (The most negatively correlated ones are taken from the tail of corrArray.)
top2 = list(positive_ranking.index[:2])
top3 = list(positive_ranking.index[:3])
top5 = list(positive_ranking.index[:5])
top10 = list(positive_ranking.index[:10])
top15 = list(positive_ranking.index[:15])
toplist = [top2, top3, top5, top10, top15]

top5

['Bank',
 'Online personal information is not secure enough',
 'No one',
 'Responsibility of those offering online/Internet-based servicesin raising awareness of cybercrime',
 'Education level']

In [42]:
# Most negatively correlated features: the last k labels of the descending
# correlation ranking, for several values of k.
buttom2, buttom3, buttom5, buttom10, buttom15, buttom20 = (
    corrArray.index[-k:].tolist() for k in (2, 3, 5, 10, 15, 20)
)
# buttom20 is computed but not part of the list that is iterated over.
buttomlist = [buttom2, buttom3, buttom5, buttom10, buttom15]
buttom5

["I don't know.1",
 'Don’t know but will ask my friends for advice',
 'Well protected against cybercrime',
 'Laws are effective in managing the cybercrime problem',
 'Report cybercrime']

In [43]:
# Hold-out metrics per model (filled by appending one entry per run)
logistic_regression, Decision_Tree, Random_forest, svc = [], [], [], []
# Cross-validation fold scores per model (filled the same way)
logCrossVal, DecisionTree_CrossVal, RandomForest_crossVal, svc_crossVal = [], [], [], []


# Test Models

In [44]:
def _evaluate_classifier(model, x_train, x_test, y_train, y_test, cv=5):
    """Fit ``model`` and score it on the hold-out split and by cross-validation.

    Parameters
    ----------
    model : unfitted scikit-learn classifier; it is fitted in place on the
        training split.
    x_train, x_test : feature tables of the training and hold-out splits.
    y_train, y_test : target labels matching those splits.
    cv : number of cross-validation folds, run on the training split only.

    Returns
    -------
    tuple
        ``(summary, fold_scores)`` where ``summary`` is a dict with the keys
        'training', 'testing', 'precision' and 'recall', and ``fold_scores``
        is the array returned by ``cross_val_score``.
    """
    model.fit(x_train, y_train)
    predictions = model.predict(x_test)

    summary = {
        'training': model.score(x_train, y_train),
        'testing': metrics.accuracy_score(y_test, predictions),
        # precision: tp / (tp + fp)
        'precision': metrics.precision_score(y_test, predictions),
        # recall: tp / (tp + fn)
        'recall': metrics.recall_score(y_test, predictions),
    }

    fold_scores = cross_val_score(model, x_train, y_train, cv=cv)
    print("%0.2f accuracy with a standard deviation of %0.2f" % (fold_scores.mean(), fold_scores.std()))
    return summary, fold_scores


# Start from empty result lists so that re-running this cell does not stack a
# second batch of rows on top of the first one.
for stale_results in (logistic_regression, Decision_Tree, Random_forest, svc,
                      logCrossVal, DecisionTree_CrossVal, RandomForest_crossVal, svc_crossVal):
    stale_results.clear()

target = df['Victim of cybercrime']  # prediction, Y

# NOTE(review): toplist/buttomlist were ranked on the full dataset, including
# rows that end up in the test split below, so hold-out scores may be optimistic.
for top_features, bottom_features in zip(toplist, buttomlist):
    predictors = df[top_features + bottom_features]  # training, X

    x_train, x_test, y_train, y_test = train_test_split(
        predictors, target, test_size=0.20, random_state=0
    )

    logreg = LogisticRegression()
    # random_state pins the tree's tie-breaking between equally good splits,
    # making it reproducible like the forest and the SVC below.
    clf = DecisionTreeClassifier(criterion='entropy', max_depth=3, random_state=0)
    class_forest = RandomForestClassifier(n_estimators=5, criterion='entropy', random_state=0)
    class_sv = SVC(kernel='linear', random_state=0)

    # Same evaluation for every model; the order fixes the order of the printed lines.
    for model, holdout_results, cv_results in (
        (logreg, logistic_regression, logCrossVal),
        (clf, Decision_Tree, DecisionTree_CrossVal),
        (class_forest, Random_forest, RandomForest_crossVal),
        (class_sv, svc, svc_crossVal),
    ):
        summary, fold_scores = _evaluate_classifier(model, x_train, x_test, y_train, y_test)
        holdout_results.append(summary)
        cv_results.append(fold_scores)

0.96 accuracy with a standard deviation of 0.01
0.96 accuracy with a standard deviation of 0.01
0.96 accuracy with a standard deviation of 0.01
0.96 accuracy with a standard deviation of 0.01
0.96 accuracy with a standard deviation of 0.01
0.96 accuracy with a standard deviation of 0.01
0.95 accuracy with a standard deviation of 0.01
0.96 accuracy with a standard deviation of 0.01
0.96 accuracy with a standard deviation of 0.01
0.96 accuracy with a standard deviation of 0.01
0.94 accuracy with a standard deviation of 0.02
0.96 accuracy with a standard deviation of 0.01
0.96 accuracy with a standard deviation of 0.01
0.96 accuracy with a standard deviation of 0.01
0.95 accuracy with a standard deviation of 0.02
0.96 accuracy with a standard deviation of 0.01
0.96 accuracy with a standard deviation of 0.01
0.96 accuracy with a standard deviation of 0.01
0.95 accuracy with a standard deviation of 0.01
0.96 accuracy with a standard deviation of 0.01


## Testing Results

#### Logistic Regression

In [45]:
# Row label k = run that used the k most positively plus the k most negatively
# correlated features. Labels are derived from toplist so they cannot drift
# out of sync with the feature sets that were actually evaluated.
print('----Logistic Regression----')
print(pd.DataFrame(logistic_regression, index=[len(features) for features in toplist]))

----Logistic Regression----
    training   testing  precision  recall
2   0.961856  0.958848   0.830508     1.0
3   0.961856  0.958848   0.830508     1.0
5   0.963918  0.958848   0.830508     1.0
10  0.963918  0.958848   0.830508     1.0
15  0.962887  0.958848   0.830508     1.0


#### Decision Tree Classifier

In [46]:
# Row label k = run that used the k most positively plus the k most negatively
# correlated features. Labels are derived from toplist so they cannot drift
# out of sync with the feature sets that were actually evaluated.
print('----Decision Tree Classifair----')
print(pd.DataFrame(Decision_Tree, index=[len(features) for features in toplist]))

----Decision Tree Classifair----
    training   testing  precision  recall
2   0.961856  0.958848   0.830508     1.0
3   0.961856  0.958848   0.830508     1.0
5   0.967010  0.958848   0.830508     1.0
10  0.967010  0.958848   0.830508     1.0
15  0.967010  0.958848   0.830508     1.0


#### Random forest

In [47]:
# Row label k = run that used the k most positively plus the k most negatively
# correlated features. Labels are derived from toplist so they cannot drift
# out of sync with the feature sets that were actually evaluated.
print('----Random forest----')
print(pd.DataFrame(Random_forest, index=[len(features) for features in toplist]))

----Random forest----
    training   testing  precision    recall
2   0.964948  0.954733   0.839286  0.959184
3   0.967010  0.962963   0.884615  0.938776
5   0.987629  0.946502   0.833333  0.918367
10  0.994845  0.958848   0.867925  0.938776
15  0.992784  0.958848   0.842105  0.979592


#### Support Vector Classifier

In [48]:
# Row label k = run that used the k most positively plus the k most negatively
# correlated features. Labels are derived from toplist so they cannot drift
# out of sync with the feature sets that were actually evaluated.
print('----Support Vector Classifair----')
print(pd.DataFrame(svc, index=[len(features) for features in toplist]))

----Support Vector Classifair----
    training   testing  precision  recall
2   0.961856  0.958848   0.830508     1.0
3   0.961856  0.958848   0.830508     1.0
5   0.961856  0.958848   0.830508     1.0
10  0.961856  0.958848   0.830508     1.0
15  0.961856  0.958848   0.830508     1.0


## Cross Validation Results

In [49]:
# One table per model: rows are the feature-set sizes (k top + k bottom
# features), columns are the individual cross-validation folds.
# Row labels come from toplist instead of a hand-maintained list.
feature_counts = [len(features) for features in toplist]
cv_tables = (
    ('----Logistic Regression----', logCrossVal),
    ('\n----Decision Tree Classifair----', DecisionTree_CrossVal),
    ('\n----Random forest----', RandomForest_crossVal),
    ('\n----Support Vector Classifair----', svc_crossVal),
)
for heading, model_fold_scores in cv_tables:
    print(heading)
    print(pd.DataFrame(model_fold_scores, index=feature_counts))

----Logistic Regression----
           0         1         2         3         4
2   0.953608  0.974227  0.963918  0.958763  0.958763
3   0.953608  0.974227  0.958763  0.958763  0.958763
5   0.953608  0.979381  0.969072  0.958763  0.958763
10  0.953608  0.979381  0.958763  0.953608  0.958763
15  0.948454  0.979381  0.943299  0.953608  0.958763

----Decision Tree Classifair----
           0         1         2         3         4
2   0.953608  0.974227  0.963918  0.958763  0.958763
3   0.953608  0.969072  0.963918  0.958763  0.958763
5   0.958763  0.979381  0.969072  0.963918  0.953608
10  0.958763  0.969072  0.953608  0.963918  0.953608
15  0.958763  0.969072  0.953608  0.958763  0.953608

----Random forest----
           0         1         2         3         4
2   0.953608  0.969072  0.953608  0.953608  0.958763
3   0.948454  0.963918  0.932990  0.958763  0.953608
5   0.948454  0.963918  0.912371  0.953608  0.943299
10  0.958763  0.974227  0.927835  0.948454  0.958763
15  0.958763  