In [1]:
import csv
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
import sklearn.metrics as metrics
import pandas
import re
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
stopwords_list = stopwords.words('english')

# Category names accepted in the label column of the input CSVs.
# Order matters: get_data() encodes a category name as its position in this
# list, so reordering or inserting entries changes every integer class id.
label_list = [
    'First Party Collection/Use',
    'Third Party Sharing/Collection',
    'User Choice/Control',
    'User Access, Edit and Deletion',
    'Data Retention',
    'Data Security',
    'Policy Change',
    'Do Not Track',
    'International and Specific Audiences',
]

def get_data(texts):
    """Turn CSV rows into a bag-of-words matrix and integer labels.

    Parameters
    ----------
    texts : iterable of rows
        Each non-empty row is indexed as ``row[0]`` (the document text) and
        ``row[1]`` (the label). Empty rows (``[]``) are skipped.

    Returns
    -------
    (vector_arr, labels)
        ``vector_arr`` is the dense array produced by fitting a fresh
        ``CountVectorizer`` on all kept documents; ``labels`` is a list of
        ints, one per kept row, in the same order.

    A label that appears in ``label_list`` is encoded as its index there;
    any other label is passed through ``int()``.
    """
    #drop blank lines before splitting text from label
    rows = [row for row in texts if row != []]

    #first column is the text, second column is the label
    documents = [row[0] for row in rows]
    encoded = [
        label_list.index(row[1]) if row[1] in label_list else int(row[1])
        for row in rows
    ]

    #vocabulary is learned from every document passed in, then densified
    counts = CountVectorizer().fit_transform(documents)

    return counts.toarray(),encoded


#function to give statistics on model accuracy
def stats(true_y,pred_y):
    """Return ``[accuracy, precision, recall, f1]`` for a set of predictions.

    Precision, recall and F1 are computed with ``average='weighted'``.

    ``zero_division=0`` is passed explicitly: when a metric is undefined for
    some class (for example a class that is never predicted), scikit-learn's
    default policy scores it as 0 *and* emits an UndefinedMetricWarning.
    Requesting 0 directly keeps the same numbers without the warning.

    Parameters
    ----------
    true_y : sequence of labels
        Ground-truth class ids.
    pred_y : sequence of labels
        Predicted class ids, aligned with ``true_y``.

    Returns
    -------
    list of float
        ``[accuracy, precision, recall, f1]`` in that order.
    """
    #shared keyword arguments for the three averaged metrics
    averaged = {'average': 'weighted', 'zero_division': 0}

    accuracy = metrics.accuracy_score(true_y,pred_y)
    precision = metrics.precision_score(true_y,pred_y,**averaged)
    recall = metrics.recall_score(true_y,pred_y,**averaged)
    f1 = metrics.f1_score(true_y,pred_y,**averaged)
    return [accuracy,precision,recall,f1]

#read a csv file into a list of rows, leaving out blank lines
def _read_rows(path):
    """Return the non-empty rows of the CSV file at ``path`` as a list.

    Blank lines are removed here so that ``len()`` of the result equals the
    number of rows get_data() will actually vectorize; the train/test split
    below relies on that count.
    """
    #newline='' is what the csv module asks for; the with-block closes the file
    with open(path, newline='') as handle:
        return [row for row in csv.reader(handle) if row != []]

#get filename
filename = input("Train file name: ")
train_texts = _read_rows(filename)
test_filename = input("Test file name: ")
test_texts = _read_rows(test_filename)
#test rows go first so they occupy the leading slice after vectorizing
texts = test_texts + train_texts

#get vectors
#both sets are vectorized together so they share one vocabulary
all_x,all_y = get_data(texts)
n_test = len(test_texts)
test_x = all_x[:n_test]
test_y = all_y[:n_test]
train_x = all_x[n_test:]
train_y = all_y[n_test:]
print(len(test_y))
#declare stats array
headers = ["accuracy","precision","recall","f1"]
rows = ["logistic","bayes","svm"]
statistics = []

#logistic regression
reg = LogisticRegression(max_iter=500)
reg.fit(train_x,train_y)
log_pred = reg.predict(test_x)
log_matrix = metrics.confusion_matrix(test_y,log_pred)
statistics.append(stats(test_y,log_pred))

#bayes classifier
bayes = MultinomialNB()
bayes.fit(train_x,train_y)
bayes_pred = bayes.predict(test_x)
bayes_matrix = metrics.confusion_matrix(test_y,bayes_pred)
statistics.append(stats(test_y,bayes_pred))

#support vector machine
#fixed seed so the svm row is reproducible from run to run
machine = LinearSVC(max_iter=1000,random_state=0)
machine.fit(train_x,train_y)
svm_pred = machine.predict(test_x)
svm_matrix = metrics.confusion_matrix(test_y,svm_pred)
statistics.append(stats(test_y,svm_pred))

#print statistics
print(pandas.DataFrame(statistics,rows,headers))
print()
print("logistic confusion matrix")
print(log_matrix)
print()
print("bayes confusion matrix")
print(bayes_matrix)
print()
print("svm confusion matrix")
print(svm_matrix)

Train file name: alldata.csv
Test file name: iot_clean.csv
257


  _warn_prf(average, modifier, msg_start, len(result))


          accuracy  precision    recall        f1
logistic  0.630350   0.640492  0.630350  0.618871
bayes     0.587549   0.621844  0.587549  0.595979
svm       0.618677   0.633099  0.618677  0.616296

logistic confusion matrix
[[70  4  3  1  2  2  0  0  0]
 [15 40  2  0  2  3  0  0  0]
 [12 10 13  1  0  0  0  0  0]
 [ 6  1  2 11  2  0  0  0  0]
 [ 1  1  1  3  5  0  0  0  0]
 [ 7  0  1  0  0 13  0  0  1]
 [ 0  0  1  0  0  0  2  0  0]
 [ 1  0  0  0  0  0  0  0  0]
 [ 5  2  1  1  0  1  0  0  8]]

bayes confusion matrix
[[44  6  3  4 10  8  0  6  1]
 [10 42  3  1  2  3  0  0  1]
 [ 7 11 13  1  0  0  0  2  2]
 [ 4  0  2 12  1  0  0  2  1]
 [ 0  1  0  2  7  1  0  0  0]
 [ 2  1  0  0  0 17  0  1  1]
 [ 0  0  1  0  0  0  2  0  0]
 [ 1  0  0  0  0  0  0  0  0]
 [ 1  2  0  1  0  0  0  0 14]]

svm confusion matrix
[[61  5  6  0  4  5  0  1  0]
 [16 39  5  0  1  1  0  0  0]
 [ 9 10 15  1  0  0  1  0  0]
 [ 4  2  3 11  2  0  0  0  0]
 [ 0  1  0  3  6  1  0  0  0]
 [ 5  0  1  0  0 16  0  0  0]
 [ 0 