In [1]:
import pandas as pd
import numpy as np
import subprocess
import pickle
import os
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

# Show the NumPy version this notebook was executed with
np.__version__

'1.20.0'

# Download raw data from Hugging Face

In [2]:
# from datasets import load_dataset
# dataset = load_dataset("yahoo_answers_topics")
# dataset['train'].to_csv("raw_data/train.csv")
# dataset['test'].to_csv("raw_data/test.csv")

# Lemmatize sentence

In [3]:
def nltk_pos_tagger(nltk_tag):
    """Map an NLTK part-of-speech tag to the matching WordNet category.

    Only the leading letter of the tag is inspected:
    J -> adjective, V -> verb, N -> noun, R -> adverb.
    Returns None for every other tag.
    """
    # Attribute names are resolved lazily so `wordnet` is only touched
    # when a prefix actually matches.
    prefix_to_attr = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, attr_name in prefix_to_attr:
        if nltk_tag.startswith(prefix):
            return getattr(wordnet, attr_name)
    return None

def lemmatize_sentence(sentence, lemmatizer=None):
    """Lemmatize a sentence, keeping alphabetic tokens only.

    The sentence is split into runs of ASCII letters (digits, punctuation
    and every other character are dropped), each token is POS-tagged, and
    tokens whose tag maps to a WordNet category are lemmatized with that
    category. Tokens without a WordNet category are kept unchanged.

    Parameters
    ----------
    sentence : str
        Raw text to process.
    lemmatizer : optional
        Object exposing ``lemmatize(word, pos)``. When omitted, a new
        ``WordNetLemmatizer`` is created, so the function no longer depends
        on a module-level ``lemmatizer`` having been defined by another cell.

    Returns
    -------
    str
        The processed tokens joined by single spaces.
    """
    if lemmatizer is None:
        lemmatizer = WordNetLemmatizer()

    tokenizer = RegexpTokenizer(r'[a-zA-Z]+')  # letters only; r'\w+' would also keep digits/underscores
    tagged_tokens = nltk.pos_tag(tokenizer.tokenize(sentence))

    lemmas = []
    for word, pos_tag in tagged_tokens:
        wordnet_tag = nltk_pos_tagger(pos_tag)
        if wordnet_tag is None:
            # No WordNet category for this tag: keep the token as-is.
            lemmas.append(word)
        else:
            lemmas.append(lemmatizer.lemmatize(word, wordnet_tag))
    return " ".join(lemmas)

def save_obj(obj, name):
    """Pickle ``obj`` to the file ``name + '.pkl'``.

    Parameters
    ----------
    obj : object
        Any picklable object.
    name : str
        Target path without the ``.pkl`` extension.

    The parent directory of the target is created when it does not exist,
    so a batch is not lost to a FileNotFoundError after it has been computed.
    """
    path = name + '.pkl'
    parent_dir = os.path.dirname(path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(path, 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    """Return the object stored in the pickle file ``name + '.pkl'``."""
    pkl_path = name + '.pkl'
    # NOTE: unpickling can execute arbitrary code; only load files you trust.
    with open(pkl_path, 'rb') as handle:
        payload = pickle.load(handle)
    return payload

In [4]:
# Example: lemmatize one sentence to sanity-check the helpers above
example_sentence = "If the light had not been turned off, the house would have catched on fire."
lemmatizer = WordNetLemmatizer()
lemmatize_sentence(example_sentence)

'If the light have not be turn off the house would have catch on fire'

In [5]:
def _count_newlines(file_path, block_size=1 << 20):
    """Return the number of newline bytes in ``file_path`` (the figure ``wc -l`` reports)."""
    total = 0
    with open(file_path, 'rb') as f:
        for block in iter(lambda: f.read(block_size), b''):
            total += block.count(b'\n')
    return total


def preprocessing_lemmatize(file_path, output_prefix, row_per_time):
    """Lemmatize a CSV of questions/answers and pickle the result in batches.

    The CSV is read once, ``row_per_time`` records at a time. Columns at
    positions 2-5 are taken as topic, question title, question content and
    best answer. For each record the three text fields are joined with
    spaces (missing fields become '.') and passed to ``lemmatize_sentence``.

    For the batch starting at record offset ``k`` two pickles are written via
    ``save_obj``: ``{output_prefix}_corpus_{k}`` (list of lemmatized strings)
    and ``{output_prefix}_labels_{k}`` (array of topic values).

    Parameters
    ----------
    file_path : str
        Path of the CSV file; its first line is treated as a header.
    output_prefix : str
        Prefix of the pickle files to write.
    row_per_time : int
        Number of records per batch.
    """
    text_columns = ['question_title', 'question_content', 'best_answer']

    # Physical line count minus the header line. Counted in Python rather than
    # by shelling out to `wc -l`, so paths with spaces or shell metacharacters
    # are safe. Used for the progress message only.
    num_lines = _count_newlines(file_path) - 1
    print("Processing {} ({} lines): ".format(file_path, num_lines))

    try:
        # chunksize streams the file once instead of re-parsing it from the
        # top for every batch.
        reader = pd.read_csv(file_path, usecols=[2, 3, 4, 5], chunksize=row_per_time)
    except pd.errors.EmptyDataError:
        reader = ()  # completely empty file: nothing to process

    for batch_index, raw_data in enumerate(reader):
        if raw_data.empty:
            continue
        skiprows = batch_index * row_per_time  # record offset of this batch
        print("{} - {}:".format(skiprows, skiprows + row_per_time), end=" ")

        raw_data.columns = ['topic', 'question_title', 'question_content', 'best_answer']
        # get labels
        labels = raw_data['topic'].values

        # get corpus: one string per record, missing text replaced by '.'
        text = raw_data[text_columns].astype('string').fillna('.')
        sentences = (text['question_title'] + " "
                     + text['question_content'] + " "
                     + text['best_answer'])
        corpus = []
        for i, sentence in enumerate(sentences, start=1):
            if i % 500 == 0:
                print(i, end=' ')
            corpus.append(lemmatize_sentence(sentence))

        # save files
        save_obj(corpus, "{}_corpus_{}".format(output_prefix, skiprows))
        save_obj(labels, "{}_labels_{}".format(output_prefix, skiprows))
        print()

    print("Finished!")

In [6]:
# file_path = os.path.join("raw_data", "test.csv")
# output_prefix = os.path.join("lemmatized_data", "test")
# row_per_time = 5000
# preprocessing_lemmatize(file_path, output_prefix, row_per_time)

In [7]:
# file_path = os.path.join("raw_data", "train.csv")
# output_prefix = os.path.join("lemmatized_data", "train")
# row_per_time = 5000
# preprocessing_lemmatize(file_path, output_prefix, row_per_time)

# Document term matrix (DTM)

In [8]:
def load_lemmatized_sentences(filepath_prefix, start_file, end_file, row_per_time, loader=None):
    """Load and concatenate the pickled batches of lemmatized sentences and labels.

    Batches are identified by their offset. Every offset from ``start_file``
    to ``end_file`` (inclusive) in steps of ``row_per_time`` is loaded from
    ``{filepath_prefix}_labels_{offset}`` and ``{filepath_prefix}_corpus_{offset}``.

    Parameters
    ----------
    filepath_prefix : str
        Common prefix of the batch files (without ``_labels_``/``_corpus_``).
    start_file, end_file : int
        First and last batch offsets to load (both inclusive).
    row_per_time : int
        Step between consecutive batch offsets; must be positive.
    loader : callable, optional
        Function mapping a path (without extension) to the stored object.
        Defaults to ``load_obj``.

    Returns
    -------
    list
        ``[sentences, labels]`` as two 1-D NumPy arrays. ``labels`` is float64
        because the concatenation is seeded with an empty float array; cast it
        as needed.

    Raises
    ------
    ValueError
        If ``row_per_time`` is not positive (the offsets would never advance).
    """
    if row_per_time <= 0:
        raise ValueError("row_per_time must be a positive integer")
    if loader is None:
        loader = load_obj

    label_parts = [np.array([])]  # empty float seed keeps the label dtype at float64
    sentence_parts = []
    # Walk the batch offsets; each iteration loads the files for *that* offset.
    for offset in range(start_file, end_file + 1, row_per_time):
        label_path = "{}_labels_{}".format(filepath_prefix, offset)
        corpus_path = "{}_corpus_{}".format(filepath_prefix, offset)
        label_parts.append(np.asarray(loader(label_path)))
        sentence_parts.append(np.array(loader(corpus_path)))

    labels = np.concatenate(label_parts)
    sentences = np.concatenate(sentence_parts) if sentence_parts else np.array([])
    return [sentences, labels]

In [9]:
# data = [[i, sentences[i], sentences[i], labels[i]] for i in range(len(labels))]
# data = pd.DataFrame(data, columns=['ID', 'text', 'selected_text', 'labels'])
# data.to_csv("data.csv", index=False)

In [10]:
# data = [str(int(labels[i])) + '\t' + sentences[i].replace(' ', ',') + '\n' for i in range(len(labels))]
# with open('test.tab', 'w') as fp:
#     for i in range(len(data)):
#         fp.write(data[i])

In [11]:
# Location and batch layout of the pickled, lemmatized test split
filepath_prefix = os.path.join("lemmatized_data", "test")
start_file, end_file = 0, 0  # end_file: 55000
row_per_time = 5000

sentences, labels = load_lemmatized_sentences(filepath_prefix, start_file, end_file, row_per_time)
# Add a trailing axis so the sentences form a single-column 2-D array
sentences = np.expand_dims(np.array(sentences), axis=1)
labels = labels.astype(int)

In [12]:
# Hold out 20% of the loaded sentences for evaluation (shuffled, fixed seed)
X, y = sentences, labels
X_train_str, X_test_str, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0, shuffle=True
)
# Flatten the single-column string arrays back to 1-D
X_train_str = X_train_str.ravel()
X_test_str = X_test_str.ravel()

In [13]:
# Bag-of-words settings: unigrams + bigrams, bounded by document frequency
ngram_range = (1, 2)
min_df = 0.005
max_df = 0.25
stopwords = nltk.corpus.stopwords.words("english")

vectorizer_options = dict(
    input='content',
    decode_error='ignore',
    strip_accents='ascii',
    lowercase=True,
    stop_words=stopwords,
    token_pattern=r'\b[a-zA-Z]{3,}\b',
    max_features=1000,
    max_df=max_df,
    min_df=min_df,
    ngram_range=ngram_range,
)
vectorizer = CountVectorizer(**vectorizer_options)

# Fit the vocabulary on the training text only, then apply it to the test text
X_train_sparse = vectorizer.fit_transform(X_train_str)
X_test_sparse = vectorizer.transform(X_test_str)

In [22]:
from sklearn.svm import SVC

# Densify each count matrix once and reuse it for fitting and scoring
X_train_dense = X_train_sparse.toarray()
X_test_dense = X_test_sparse.toarray()

svc = SVC(C=1.0, kernel="rbf")
clf = svc.fit(X_train_dense, y_train)
print("train accuracy:", clf.score(X_train_dense, y_train))
print("test accuracy:", clf.score(X_test_dense, y_test))

train accuracy: 0.64375
test accuracy: 0.411


In [25]:
# (n_documents, n_features) of the training matrix; no need to densify for the shape
X_train_sparse.shape

(4000, 1000)

In [28]:
from art.attacks.evasion import FastGradientMethod
from art.estimators.classification import SklearnClassifier
# !pip install --upgrade numpy==1.20

# Smallest and largest value in the training matrix, used as the wrapper's clip bounds
min_pixel_value, max_pixel_value = np.amin(X_train_sparse), np.amax(X_train_sparse)
clip_range = (min_pixel_value, max_pixel_value)
classifier = SklearnClassifier(model=clf, clip_values=clip_range)

def fgAttack(X, epsilon):
    """Conduct a Fast Gradient Method attack on X with strength epsilon.

    A ``FastGradientMethod`` is built around the module-level ``classifier``
    with ``eps=epsilon``; the adversarial examples it generates for ``X``
    are returned.
    """
    return FastGradientMethod(estimator=classifier, eps=epsilon).generate(x=X)

In [29]:
# Peek at the first two test rows (slice first, then densify only those rows)
X_test_sparse[:2, :].toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [30]:
# Generate adversarial test examples for the first two test rows (epsilon = 3)
first_two_test_rows = X_test_sparse.toarray()[:2, :]
X_test_adv = fgAttack(first_two_test_rows, 3)

# Generate adversarial train examples
# X_train_adv = fgAttack(X_train_sparse.toarray(), 3)

In [31]:
# Display the adversarial examples returned by fgAttack for the two test rows
X_test_adv

array([[0., 3., 0., ..., 0., 0., 0.],
       [0., 3., 0., ..., 0., 0., 3.]], dtype=float32)

0.8314