In [28]:
# load data

import json

data = None
with open('./classifier_data.json') as file :
    data = json.load(file)

allChunks = list()
for policy in data :
    allChunks.extend(policy['chunks'])
    
labels = None
with open('./labels.json') as file :
    labels = json.load(file)

In [48]:
# logistic regression classifier

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

def logisticRegression(labelName, data, printRes) :
    df = pd.DataFrame(data)
    if labelName == 'important' :
        df['important'] = df['important'].map({ True: 1, False: 0 })
    
    X = df['text']
    y = df[labelName]

    vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
    X_tfidf = vectorizer.fit_transform(X)

    X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

    model = LogisticRegression()
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    if printRes :
        print(f'Classification Report for {labelName}:')
        print(classification_report(y_test, y_pred))
        print(f'Accuracy: {accuracy_score(y_test, y_pred):.2f}')
    
    return model

logisticRegression('important', allChunks, True)
print('\n-----------------------------------------------------------\n')
logisticRegression('label', allChunks, True)

Classification Report for important:
              precision    recall  f1-score   support

           0       0.85      0.95      0.90       528
           1       0.74      0.48      0.58       164

    accuracy                           0.84       692
   macro avg       0.80      0.71      0.74       692
weighted avg       0.83      0.84      0.82       692

Accuracy: 0.84

-----------------------------------------------------------

Classification Report for label:
                             precision    recall  f1-score   support

                Advertising       0.69      0.60      0.64        15
          Children's policy       1.00      0.88      0.93         8
                    Contact       0.68      0.62      0.65        21
                    Cookies       0.65      0.81      0.72        21
     Data in other counries       0.75      0.35      0.48        17
             Data retention       0.81      0.76      0.79        17
                Data rights       0.72    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [64]:
chunksByLabel = dict()
for label in labels :
    chunksByLabel[label] = list()
for policy in data :
    for chunk in policy['chunks'] :
        chunksByLabel[chunk['label']].append(chunk)

importanceModels = dict()
for label, chunks in chunksByLabel.items() :
    print(label)
    print(len(chunks))
    try :
        models[label] = logisticRegression('important', chunks, True)
    except :
        print('error')
    print('\n-----------------------------------------------------------\n')

Intro
131
error

-----------------------------------------------------------

TOC
13
error

-----------------------------------------------------------

Modification
72
Classification Report for important:
              precision    recall  f1-score   support

           0       0.93      1.00      0.97        14
           1       0.00      0.00      0.00         1

    accuracy                           0.93        15
   macro avg       0.47      0.50      0.48        15
weighted avg       0.87      0.93      0.90        15

Accuracy: 0.93

-----------------------------------------------------------

Personal data
320
Classification Report for important:
              precision    recall  f1-score   support

           0       0.94      0.58      0.71        26
           1       0.77      0.97      0.86        38

    accuracy                           0.81        64
   macro avg       0.85      0.78      0.79        64
weighted avg       0.84      0.81      0.80        64

Accuracy

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

In [None]:
import numpy as np
from sklearn.metrics import f1_score

labelModel = logisticRegression(allChunks, 'label', False)

df = pd.DataFrame(allChunks)
df['important'] = df['important'].map({ True: 1, False: 0 })
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

predicted_importance = []

for text in X_test :
    predicted_identifier = labelsModel.predict([text])[0]
    
    importance_model = importanceModels.get(predicted_identifier)
    if importance_model is not None :
        importance_prediction = importanceModel.predict([text])[0]
    else :
        print(predictied_identifier)
        # importance_prediction = 0

    predicted_importance.append(importance_prediction)

# Step 3: Compute the F1 score
f1 = f1_score(y_test_importance, predicted_importance)
print(f"F1 Score for Important vs. Not Important: {f1:.2f}")

In [26]:
# neural network classifier

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, LSTM, GlobalAveragePooling1D, Dropout
from tensorflow.keras.utils import to_categorical

def NN(labelName) :
    df = pd.DataFrame(allChunks)
    if labelName == 'important' :
        df['important'] = df['important'].map({ True: 1, False: 0 })
        y = df[labelName]
    else :
        label_mapping = dict()
        for i, label in enumerate(labels) :
            label_mapping[label] = i
        df['label'] = df['label'].map(label_mapping)
        y = to_categorical(df['label'], num_classes=len(labels))
    
    X = df['text']

    MAX_NUM_WORDS = 100000
    MAX_SEQUENCE_LENGTH = 200

    tokenizer = Tokenizer(num_words=MAX_NUM_WORDS, oov_token='<OOV>')
    tokenizer.fit_on_texts(X)
    X_sequences = tokenizer.texts_to_sequences(X)

    X_padded = pad_sequences(X_sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post', truncating='post')
    X_train, X_test, y_train, y_test = train_test_split(X_padded, y, test_size=0.2, random_state=42)
    
    model = None
    if labelName == 'important' :
        model = Sequential([
            Embedding(input_dim=MAX_NUM_WORDS, output_dim=128),
            GlobalAveragePooling1D(),
            Dense(64, activation='relu'),
            Dropout(0.5),
            Dense(1, activation='sigmoid')
        ])
        model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    else :
        model = Sequential([
            Embedding(input_dim=MAX_NUM_WORDS, output_dim=128),
            GlobalAveragePooling1D(),  # Can also try GRU or a simple dense layer
            Dropout(0.5),
            Dense(64, activation='relu'),
            Dense(len(labels), activation='softmax')
        ])
        model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    
    history = model.fit(
        X_train, y_train,
        validation_data=(X_test, y_test),
        epochs=10,
        batch_size=32,
        verbose=1
    )
    
    loss, accuracy = model.evaluate(X_test, y_test, verbose=1)
    print(f'Test Accuracy: {accuracy:.2f}')
    y_pred = (model.predict(X_test) > 0.5).astype('int32')
    print('Classification Report:')
    print(classification_report(y_test, y_pred))

# NN('important')
NN('label')

  arr = np.asarray(values, dtype=dtype)


Epoch 1/10
[1m87/87[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 36ms/step - accuracy: 0.1477 - loss: 3.0811 - val_accuracy: 0.2211 - val_loss: 2.8477
Epoch 2/10
[1m87/87[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 38ms/step - accuracy: 0.1709 - loss: 2.8437 - val_accuracy: 0.2139 - val_loss: 2.6987
Epoch 3/10
[1m87/87[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 35ms/step - accuracy: 0.1994 - loss: 2.6949 - val_accuracy: 0.2558 - val_loss: 2.6245
Epoch 4/10
[1m87/87[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 35ms/step - accuracy: 0.2146 - loss: 2.6189 - val_accuracy: 0.2847 - val_loss: 2.5799
Epoch 5/10
[1m87/87[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 36ms/step - accuracy: 0.2508 - loss: 2.5236 - val_accuracy: 0.2977 - val_loss: 2.4892
Epoch 6/10
[1m87/87[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 34ms/step - accuracy: 0.2801 - loss: 2.4209 - val_accuracy: 0.3136 - val_loss: 2.4197
Epoch 7/10
[1m87/87[0m [32m━━━━

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
