In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score

# Step 1: Data Collection
# Assuming we have a list of policies and their corresponding labels
df = pd.read_csv('dataset_testv2.csv', delimiter = ',')

policies = df['original_text']
labels = df['acceptable']

# Step 2: Data Preprocessing and Labeling
# No preprocessing needed as an example, but we can add text cleaning if required
#policies = df['original_text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

# Step 3: Feature Extraction
vectorizer = TfidfVectorizer(decode_error='replace', encoding='utf-8')
X = vectorizer.fit_transform(policies.values.astype('U'))

# Step 4: Model Training
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)

classifier = DecisionTreeClassifier()
trained_classifier=classifier.fit(X_train, y_train)
predicted_label = trained_classifier.predict(X_test)

#Evaluation of Trained Model                                    
print('Confusion Matrix')
cfm= confusion_matrix(y_test,predicted_label)
print(cfm)
print('---------')

print('Classification Accuracy')
accuracy=accuracy_score(y_test,predicted_label)
print(accuracy)
print('---------')

print('Precision Score')
precision11 = precision_score(y_test, predicted_label,average='macro')
print(precision11)
print('---------')

print('Recall Score')
recall = recall_score(y_test,predicted_label,average='macro')
print(recall)
print('---------')

print('F1 Score')
F1 = f1_score(y_test,predicted_label,average='macro')
print(F1)
print('---------')

#Prediction
new_policy = "It does not track you!"
X_new = vectorizer.transform([new_policy])

Confusion Matrix
[[39 23]
 [29 51]]
---------
Classification Accuracy
0.6338028169014085
---------
Precision Score
0.6313593004769475
---------
Recall Score
0.6332661290322581
---------
F1 Score
0.631168831168831
---------
