In [28]:
import numpy as np
import pandas as pd
import scipy
import skmultilearn
import os
import nltk
from nltk.corpus import stopwords, twitter_samples
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
import re
import gensim
import glob
from bs4 import BeautifulSoup
from urllib.request import urlopen
import urllib
import random
import torch
import torch.nn as nn
nltk.download("punkt");
from argparse import Namespace
import json
from numpyencoder import NumpyEncoder
import pickle

[nltk_data] Downloading package punkt to /Users/kaushik/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [29]:
import os
import sys

# Make the project's `src` package importable from this notebook.
# The checkout location can be overridden with the DS5500_PROJECT_ROOT
# environment variable; the original local path is kept as the default so
# existing setups keep working unchanged.
PROJECT_ROOT = os.environ.get(
    "DS5500_PROJECT_ROOT",
    "/Users/kaushik/MyStuff/Workspace/NEU/DS5500/Project/DS5500_CapstoneProject",
)
# Re-running the cell must not stack duplicate entries on sys.path: drop any
# earlier occurrence, then put the project root at the front again.
while PROJECT_ROOT in sys.path:
    sys.path.remove(PROJECT_ROOT)
sys.path.insert(0, PROJECT_ROOT)

In [30]:
from src.config import cfg
from src.data import prepOPPCorpus
from src.data import preprocess

from src.models import CNN
from src import models

from src.main import main
from src.main import driver

from src.utils import gen
from src.utils import metrics
from src.utils import embeddings

import itertools
import numpy as np
import pandas as pd

In [31]:
# Define experiment name
experiment_name = "CNN_W_FE_U_MF5_50"
#experiment_name = "CNN_W_FE_M"

In [32]:
# Locate the directory holding the best parameters for the chosen experiment,
# then read the run ID stored next to them.
best_params_dirname = "best_params_" + experiment_name
experiment_dpath = os.path.join(cfg.PARAM.BEST_PARAM_DPATH, best_params_dirname)
run_id_fpath = os.path.join(experiment_dpath, "run_ID.txt")
run_id = gen.loadID(run_id_fpath)

In [33]:
# Read the stored best-parameter dictionary and expose its entries as
# attributes (params.seed, params.cuda, ...).
best_param_fpath = os.path.join(experiment_dpath, "best_param_dict.json")
param_dict = gen.loadParams(best_param_fpath)
params = Namespace(**param_dict)

In [34]:
# Display the run ID loaded from run_ID.txt.
run_id

'741d584255d04d6baa70b0c7839124aa'

In [35]:
# Three copies of the same sample privacy-policy segment (automatic data
# collection on mobile devices), kept as a small input batch.
text = [
    (
        "When You access the Service by or through a mobile device, We may collect certain information automatically, "
        "including, but not limited to, the type of mobile device You use, Your mobile device unique ID, the IP address of"
        " Your mobile device, Your mobile operating system, the type of mobile Internet browser You use, unique device identifiers "
        "and other diagnostic data."
    )
] * 3

In [37]:
import mlflow

In [38]:
# Look up which MLflow experiment the run belongs to, then build the path of
# the run's artifacts folder inside the configured model registry.
run_info = mlflow.get_run(run_id=run_id).info
experiment_id = run_info.experiment_id
artifacts_dir = os.path.join(
    cfg.MLFLOW.MODEL_REGISTRY,
    experiment_id,
    run_id,
    "artifacts",
)

In [39]:
# Display the artifacts directory path built above.
artifacts_dir

'/Users/kaushik/MyStuff/Workspace/NEU/DS5500/Project/DS5500_CapstoneProject/mlflow_registry/5/741d584255d04d6baa70b0c7839124aa/artifacts'

# Get optimal thresholds

In [9]:
# Load the artifacts stored for this run; the cells below read the "params",
# "model", "tokenizer" and "label_encoder" entries from the result.
artifacts = driver.loadRunArtifacts(run_id=run_id)

In [10]:
# Seed through the project helper. When the notebook is read top to bottom,
# `params` here is the Namespace built from best_param_dict.json; it is
# rebound to artifacts["params"] a few cells further down.
gen.setSeeds(seed=params.seed)

In [11]:
# Pick the compute device through the project helper, driven by params.cuda.
# NOTE(review): presumably falls back to CPU when CUDA is unavailable — confirm in gen.setDevice.
device = gen.setDevice(cuda=params.cuda)

In [12]:
# Pull the individual pieces out of the loaded run artifacts. Note that this
# rebinds `params` to the version stored with the run.
params = artifacts["params"]
tokenizer = artifacts["tokenizer"]
label_encoder = artifacts["label_encoder"]
classes = label_encoder.classes
# Place the restored model on the selected device.
model = artifacts["model"].to(device)

In [13]:
# Load the dataset through the project helper, configured by `cfg`. The cells
# below use its `segment_text` and `category` columns.
df = gen.loadDataset(cfg)

🟢 Dataset loaded!


In [14]:
# Clean every segment with the project's text cleaner, using the lower/stem
# settings stored with the run. The column is overwritten in place.
df["segment_text"] = df["segment_text"].apply(
    preprocess.cleanText,
    lower=params.lower,
    stem=params.stem,
)

In [15]:
# Keep the raw category column and encode it with the run's label encoder.
cats = df["category"]
y = label_encoder.encode(cats)

In [16]:
# Inverse-frequency weight per class index.
# Each row of `cats` is iterated as a collection of category names, so the
# rows are flattened into one list before counting.
cats_list = list(itertools.chain.from_iterable(cats.values))
# minlength keeps one count per known class even when the highest-index
# classes never occur; without it the weight vector would come out shorter
# than the number of classes.
# NOTE(review): assumes class_to_index has exactly one entry per class — confirm.
counts = np.bincount(
    [label_encoder.class_to_index[cat_] for cat_ in cats_list],
    minlength=len(label_encoder.class_to_index),
)
# A class with no occurrences gets weight 0.0 instead of an infinite weight
# from dividing by zero.
cat_weights = {i: (1.0 / count if count > 0 else 0.0) for i, count in enumerate(counts)}

In [17]:
# Re-seed right before the split below, now using the seed from the run's own
# params. NOTE(review): presumably done so the split is reproducible — confirm
# that train_test_split_multilabel draws from the RNGs seeded here.
gen.setSeeds(seed=params.seed)

In [18]:
# Two-stage split: first carve off the training portion, then divide the
# remainder evenly into validation and test sets.
X = df["segment_text"].to_numpy()
X_train, X_, y_train, y_ = preprocess.train_test_split_multilabel(
    X=X,
    y=y,
    train_size=params.train_size,
)
X_val, X_test, y_val, y_test = preprocess.train_test_split_multilabel(
    X=X_,
    y=y_,
    train_size=0.5,
)
# Keep a readable copy of the test split with its labels decoded.
test_columns = {"segment_text": X_test, "category": label_encoder.decode(y_test)}
test_df = pd.DataFrame(test_columns)

In [19]:
# Report the split proportions (train vs. the two equal halves of the
# remainder) and hand all six arrays to the project's split-statistics helper.
train_pct = (params.train_size)*100
holdout_pct = (1-params.train_size)/2*100
split_arrays = [X_train, X_val, X_test, y_train, y_val, y_test]

print("-" * 60)
print("Successfully split the dataset into {:g}% train, {:g}% val and {:g}% test!".format(train_pct, holdout_pct, holdout_pct))
print("Number of unique segments in total: {}".format(len(X)))
metrics.splitStatistics(splitlist=split_arrays)

------------------------------------------------------------
Successfully split the dataset into 70% train, 15% val and 15% test!
Number of unique segments in total: 3471
TRAIN SET
Number of unique segments: 2422
Percentage of segments containing each of the following categories:
                                      Counts Percentage
Data Retention                            55      1.91%
Data Security                            147       5.1%
Do Not Track                              22      0.76%
First Party Collection/Use               845      29.3%
International and Specific Audiences     211      7.32%
Introductory/Generic                     273      9.47%
Policy Change                             83      2.88%
Practice not covered                      90      3.12%
Privacy contact information              142      4.92%
Third Party Sharing/Collection           661     22.92%
User Access, Edit and Deletion           104      3.61%
User Choice/Control                      251   

In [20]:
# Convert the text in each split to token sequences with the run's tokenizer.
# dtype=object is used when wrapping the result in an array.
# NOTE(review): presumably because sequences differ in length — confirm.
# The tuple of raw splits is evaluated before any name is rebound.
X_train, X_val, X_test = (
    np.array(tokenizer.texts_to_sequences(split_texts), dtype=object)
    for split_texts in (X_train, X_val, X_test)
)

In [21]:
# Wrap the train and validation splits in CNN datasets, then build one
# dataloader per dataset with the configured batch size.
train_dataset, val_dataset = (
    CNN.CNNDataset(X=features, y=targets, max_filter_size=params.max_filter_size)
    for features, targets in ((X_train, y_train), (X_val, y_val))
)
train_dataloader, val_dataloader = (
    dataset.create_dataloader(batch_size=params.batch_size)
    for dataset in (train_dataset, val_dataset)
)

In [22]:
# Show the run's parameters, then build the loss / optimizer / scheduler that
# the Trainer constructor below expects.
print(f"Parameters: {json.dumps(params.__dict__, indent=2, cls=NumpyEncoder)}")

# Per-class weights ordered by class index (explicitly sorted rather than
# relying on dict insertion order), as float32 on the same device as the
# model so the loss can be evaluated against the model's outputs.
cat_weights_tensor = torch.as_tensor(
    np.array([cat_weights[class_idx] for class_idx in sorted(cat_weights)]),
    dtype=torch.float32,
    device=device,
)
loss_fn = nn.BCEWithLogitsLoss(weight=cat_weights_tensor)
optimizer = torch.optim.Adam(model.parameters(), lr=params.lr)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="min", factor=0.05, patience=5)

Parameters: {
  "dataset": "majority",
  "seed": 2021,
  "cuda": true,
  "lower": true,
  "stem": false,
  "train_size": 0.7,
  "char_level": false,
  "max_filter_size": 5,
  "batch_size": 128,
  "embedding_dim": 204,
  "num_filters": 209,
  "hidden_dim": 484,
  "dropout_p": 0.7482758802337606,
  "lr": 0.0003189955986529326,
  "num_epochs": 200,
  "embed": null,
  "freeze_embed": false,
  "patience": 10,
  "threshold": 0.4383414089679718,
  "num_samples": 3471
}


In [23]:
# Assemble the project Trainer around the restored model; no tuning trial is
# attached (trial=None).
trainer = models.Trainer(
    model=model,
    device=device,
    loss_fn=loss_fn,
    optimizer=optimizer,
    scheduler=scheduler,
    trial=None,
)

In [24]:
# Run the trainer's evaluation step over the training dataloader. The first
# returned value is discarded; the other two are kept as y_true and y_prob.
# NOTE(review): the thresholds computed next are therefore derived from the
# training split, while val_dataloader built above is never used — confirm
# this is intended.
_, y_true, y_prob = trainer.eval_step(dataloader=train_dataloader)

In [25]:
# Compute thresholds from the true labels and predicted probabilities; the
# helper returns two values, bound here as `thresholds` and `multi_thresholds`.
# NOTE(review): `multi_thresholds` looks like one threshold per class (12
# values are displayed below) — confirm in metrics. The helper name is spelled
# "getOptimalTreshold" in the project API, so it is kept as-is here.
thresholds, multi_thresholds = metrics.getOptimalTreshold(y_true=y_true, y_prob=y_prob)

In [26]:
# Display the multi_thresholds value computed above.
multi_thresholds

[0.3721136,
 0.39934087,
 0.74468327,
 0.3627222,
 0.27153248,
 0.19452152,
 0.15533948,
 0.09685168,
 0.1824223,
 0.2859817,
 0.42126584,
 0.24379988]

In [27]:
# Persist the thresholds as "<experiment_name>.pkl"; the path is relative, so
# the file lands in the current working directory.
thresholds_fpath = "{}.pkl".format(experiment_name)
with open(thresholds_fpath, "wb") as thresholds_file:
    pickle.dump(multi_thresholds, thresholds_file)

In [44]:
# Seven sample privacy-statement segments, passed to driver.productionPredict
# in the next cell.
segments_processed = ["If you use a Microsoft product with an account provided by an organization you are affiliated with, such as your work or school account, that organization can",
"Control and administer your Microsoft product and product account, including controlling privacy-related settings of the product or product account.",
"Access and process your data, including the interaction data, diagnostic data, and the contents of your communications and files associated with your Microsoft product and product accounts.",
"If you lose access to your work or school account (in event of change of employment, for example), you may lose access to products and the content associated with those products, including those you acquired on your own behalf, if you used your work or school account to sign in to such products.",
"Many Microsoft products are intended for use by organizations, such as schools and businesses. Please see the Enterprise and developer products section of this privacy statement. If your organization provides you with access to Microsoft products, your use of the Microsoft products is subject to your organization's policies, if any. You should direct your privacy inquiries, including any requests to exercise your data protection rights, to your organization’s administrator. When you use social features in Microsoft products, other users in your network may see some of your activity. To learn more about the social features and other functionality, please review documentation or help content specific to the Microsoft product. Microsoft is not responsible for the privacy or security practices of our customers, which may differ from those set forth in this privacy statement.",
"When you use a Microsoft product provided by your organization, Microsoft’s processing of your personal data in connection with that product is governed by a contract between Microsoft and your organization. Microsoft processes your personal data to provide the product to your organization and you, and in some cases for Microsoft’s business operations related to providing the product as described in the Enterprise and developer products section. As mentioned above, if you have questions about Microsoft’s processing of your personal data in connection with providing products to your organization, please contact your organization. If you have questions about Microsoft’s business operations in connection with providing products to your organization as provided in the Product Terms, please contact Microsoft as described in the How to contact us section. For more information on our business operations, please see the Enterprise and developer products section.","For Microsoft products provided by your K-12 school, including Microsoft 365 Education, Microsoft will:"]

In [45]:
# Run the production prediction path on the sample segments for this run,
# with the multi-threshold option switched on.
# NOTE(review): as recorded in the output, this call fails with
# "IndexError: index 7 is out of bounds for axis 0 with size 7". Seven
# segments are passed here and twelve thresholds were computed above, so
# productionPredict presumably indexes a per-segment array with a per-class
# index — the cause is inside driver and must be confirmed and fixed there.
driver.productionPredict(segments_processed, run_id, multi_threshold = True)

IndexError: index 7 is out of bounds for axis 0 with size 7