In [73]:
import numpy as np
import pandas as pd
import scipy
import skmultilearn
import os
import nltk
from nltk.corpus import stopwords, twitter_samples
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
import re
import gensim
import glob
from bs4 import BeautifulSoup
from urllib.request import urlopen
import urllib
import random
import torch
import torch.nn as nn
# Fetch NLTK's "punkt" tokenizer package; the trailing ";" keeps the call's return value out of the cell output.
nltk.download("punkt");

[nltk_data] Downloading package punkt to /Users/kaushik/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Path variables

In [78]:
# Corpus root, expressed relative to the current working directory.
data_root_dpath = os.path.join("..", "..", "OPP-115")
processed_data_dpath = os.path.join(data_root_dpath, "processed_data")

# The two category-model CSVs sit side by side in the processed-data directory.
majority_dataset_fpath, union_dataset_fpath = (
    os.path.join(processed_data_dpath, csv_name)
    for csv_name in (
        "master_catmodel_dataset_majority.csv",
        "master_catmodel_dataset_union.csv",
    )
)

# Embeddings

In [79]:
# Load the CSV referenced by majority_dataset_fpath into a DataFrame.
df = pd.read_csv(majority_dataset_fpath)
# A bare name as the cell's last expression renders the frame as the cell output.
df

Unnamed: 0,segment_text,category
0,Privacy Policy Sci-News.com is committed to ...,[0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
1,Information that Sci-News.com May Collect Onli...,[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
2,"- if you contact us, we may keep a record of t...",[0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
3,- details of your visits to our site including...,[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
4,Sci-News.com does not knowingly collect or sol...,[0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1.]
...,...,...
3466,Information Collected at Mohegan Sun In addi...,[1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
3467,Management of User Information Certain infor...,[0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
3468,Special Note About Children This site is not...,[0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
3469,If You Have a Question If you have a questio...,[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]


In [80]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re

In [81]:
# Download NLTK's stopword corpus so stopwords.words() can read it.
nltk.download("stopwords")
# English stopword list; serves as the default for preprocess()'s `stopwords` argument.
STOPWORDS = stopwords.words("english")
print (STOPWORDS[:5])
# Porter stemmer instance; not referenced by any cell visible in this section.
porter = PorterStemmer()

['i', 'me', 'my', 'myself', 'we']


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kaushik/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [125]:
def preprocess(text, stopwords=STOPWORDS):
    """
        Normalise a text segment before tokenisation.

        Steps, in order: lower-case the text, squeeze runs of spaces into a
        single space, drop everything from a "//" to the end of that line,
        and trim leading/trailing whitespace.

        :param: text: text string
        :param: stopwords: list of stopwords; accepted but not used, because
                stopword removal is switched off in this version
        :return: cleaned text
    """
    # Earlier experiments (stopword removal, dropping parenthesised words,
    # punctuation spacing, stripping non-alphanumeric / non-ASCII characters)
    # are disabled; only the substitutions below are applied.
    # NOTE(review): the "//" rule also cuts URLs such as "http://..." and any
    # text that follows them on the same line -- confirm this is intended.
    substitutions = (
        (" +", " "),      # remove multiple spaces
        (r'//.*', ''),    # remove "//" and the remainder of the line
    )

    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.strip()

In [126]:
# Work on a copy so the raw frame `df` keeps its original text.
preprocessed_df = df.copy()
# Clean every segment with preprocess().
preprocessed_df.segment_text = preprocessed_df.segment_text.apply(preprocess)
# Print the first segment before and after cleaning as a visual sanity check.
print (f"{df.segment_text.values[0]}\n\n{preprocessed_df.segment_text.values[0]}")

Privacy Policy   Sci-News.com is committed to protecting and respecting your privacy. To better inform you of our policy concerning user privacy, we have adopted the following terms. Please note that these terms are subject to change, and any such changes will be included on this page.  

privacy policy sci-news.com is committed to protecting and respecting your privacy. to better inform you of our policy concerning user privacy, we have adopted the following terms. please note that these terms are subject to change, and any such changes will be included on this page.


In [127]:
def Corpus2Tokens(corpus_df, clean = False):
    """
        Convert OPP-115 corpus into a dictionary of tokens with indices
        :param: corpus_df: DataFrame whose first column holds the segment text
        :param: clean: currently unused; kept so existing callers keep working
        :return: dictionary with keys and values as words and indices; the key
                 None is mapped to 0 and the lower-cased tokens are numbered
                 from 1 in sorted order
    """
    token_set = set()

    # Walk the first column's values directly. Feeding the index labels from
    # iterrows() into iloc only works for a default 0..n-1 index and breaks
    # (IndexError or wrong rows) on filtered, shuffled or re-indexed frames.
    # The emptiness guard keeps a frame without rows/columns returning {None: 0}.
    if not corpus_df.empty:
        for segment in corpus_df.iloc[:, 0]:
            # update() grows the set in place instead of rebuilding it per row
            token_set.update(token.lower() for token in nltk.word_tokenize(segment))

    # Index 0 is reserved for the None key; real tokens start at 1.
    corpus_tokens_idx = {None: 0}
    for idx, token in enumerate(sorted(token_set), 1):
        corpus_tokens_idx[token] = idx

    return corpus_tokens_idx

In [128]:
# Vocabulary size; the count includes the None entry that Corpus2Tokens maps to index 0.
len(Corpus2Tokens(preprocessed_df))

6461

In [129]:
# NOTE(review): this rebuilds the vocabulary and displays every entry; consider storing the result once and showing only a slice.
Corpus2Tokens(preprocessed_df)

{None: 0,
 '!': 1,
 '$': 2,
 '%': 3,
 '&': 4,
 "'": 5,
 "''": 6,
 "'cookies": 7,
 "'d": 8,
 "'do": 9,
 "'email": 10,
 "'find": 11,
 "'ll": 12,
 "'mrs": 13,
 "'my": 14,
 "'opt-out": 15,
 "'personally": 16,
 "'profile": 17,
 "'re": 18,
 "'s": 19,
 "'secure": 20,
 "'services": 21,
 "'shared": 22,
 "'spoofed": 23,
 "'stay": 24,
 "'surfing": 25,
 "'ve": 26,
 "'www.companyx.com": 27,
 '(': 28,
 ')': 29,
 '*': 30,
 '+': 31,
 '+1': 32,
 ',': 33,
 '-': 34,
 '--': 35,
 '-for': 36,
 '-our': 37,
 '-that': 38,
 '-to': 39,
 '.': 40,
 '..': 41,
 '.com': 42,
 '.edu': 43,
 '.flash': 44,
 '.gov': 45,
 '.mil': 46,
 '.net': 47,
 '.org': 48,
 '/': 49,
 '0.75': 50,
 '0160': 51,
 '06382': 52,
 '07': 53,
 '08': 54,
 '1': 55,
 '1-201-559-3882': 56,
 '1-3': 57,
 '1-800-345-7669': 58,
 '1-800-543-5335': 59,
 '1-800-658-8372': 60,
 '1-800-767-3771': 61,
 '1-800-824-2665': 62,
 '1-800-843-2665': 63,
 '1-800-856-2518': 64,
 '1-800-883-8895': 65,
 '1-800-allstate': 66,
 '1-800-nytimes': 67,
 '1-800-the-book': 68,
 '