In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score
import nltk
nltk.download('stopwords')

# Step 1: Data Collection
# Assuming we have a list of policies and their corresponding labels
df = pd.read_csv('dataset_testv2.csv', delimiter = ',')

policies = df['original_text']
labels = df['acceptable']

# Step 2: Data Preprocessing and Labeling
# No preprocessing needed as an example, but we can add text cleaning if required

# Step 3: Feature Extraction
vectorizer = TfidfVectorizer(decode_error='replace', encoding='utf-8')
X = vectorizer.fit_transform(policies.values.astype('U'))

# Step 4: Model Training
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=31)

#Cross Validation
hidden_layers = range(1,20)
cv_scores = []
for maxLayer in hidden_layers:
    kt = MLPClassifier(hidden_layer_sizes = maxLayer)
    scores = cross_val_score(kt, X_train, y_train, cv=10, scoring='accuracy')
    cv_scores.append(scores.mean())
maxlayers_best = hidden_layers [np.argmax(cv_scores)]
print('cv_scores:', cv_scores)
print('maxLayer_best:',maxlayers_best)

classifier = MLPClassifier(solver='lbfgs',alpha=1e-5,hidden_layer_sizes=maxlayers_best,learning_rate='adaptive',max_iter=1000, random_state=2)
trained_classifier=classifier.fit(X_train, y_train)
predicted_label = trained_classifier.predict(X_test)

#Evaluation of Trained Model                                    
print('Confusion Matrix')
cfm= confusion_matrix(y_test,predicted_label)
print(cfm)
print('---------')

print('Classification Accuracy')
accuracy=accuracy_score(y_test,predicted_label)
print(accuracy)
print('---------')

print('Precision Score')
precision11 = precision_score(y_test, predicted_label,average='macro')
print(precision11)
print('---------')

print('Recall Score')
recall = recall_score(y_test,predicted_label,average='macro')
print(recall)
print('---------')

print('F1 Score')
F1 = f1_score(y_test,predicted_label,average='macro')
print(F1)
print('---------')

#Prediction
new_policy = "It does not track you!"
X_new = vectorizer.transform([new_policy])

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mehme\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.












cv_scores: [0.618890977443609, 0.7141604010025062, 0.6981516290726816, 0.693233082706767, 0.7390037593984962, 0.724906015037594, 0.7230889724310777, 0.7336466165413534, 0.7265977443609022, 0.7336779448621554, 0.7266290726817042, 0.7125, 0.7248120300751879, 0.7143170426065163, 0.7213345864661653, 0.72484335839599, 0.7195802005012532, 0.7160714285714286, 0.7178258145363408]
maxLayer_best: 5
Confusion Matrix
[[53 19]
 [17 53]]
---------
Classification Accuracy
0.7464788732394366
---------
Precision Score
0.7466269841269841
---------
Recall Score
0.7466269841269841
---------
F1 Score
0.7464788732394366
---------


