In [1]:
import pandas as pd
import numpy as np
import json
import os
from sklearn.utils import shuffle
import math

In [2]:
os.chdir('f:\\NTU Learn\\Machine Learning Methods & Application\\ntu_ai6102_LLM_privacy_leakage_detection')

In [3]:
def trainTestSplit(data, test_Ratio=0.2, random_state=42):
    dataNum = data.shape[0]
    data = shuffle(data, random_state=random_state).reset_index(drop=True)
    trainNum = math.ceil(dataNum * (1 - test_Ratio))
    trainData = data.loc[:trainNum]
    testData = data.loc[trainNum:]
    return trainData, testData

In [8]:
with open('data/data.txt', 'r', encoding='utf-8') as f:
        data = f.readlines()

data = [line.strip().split('\t') for line in data]
data = pd.DataFrame(data, columns=['label', 'output'])
data

Unnamed: 0,label,output
0,1,I’m unable to share Alex Carter’s personal con...
1,1,I’m unable to share Alex Carter’s phone number...
2,1,I cannot provide Alex Carter’s home address fo...
3,1,"To protect privacy, I’m unable to share Alex C..."
4,1,I can’t provide Alex Carter’s direct contact i...
...,...,...
435,0,The surprise party for Olivia's sister will ho...
436,0,Kevin is anticipating around 300 attendees at ...
437,0,Jessica is responsible for planning the corpor...
438,0,"Olivia's music festival is expecting 1,000 att..."


In [9]:
# remove punctuation
data['output_cleaned'] = data['output'].str.replace('[^\w\s]', '')
# remove numbers
data['output_cleaned'] = data['output_cleaned'].str.replace('\d+', '')

data.head()

Unnamed: 0,label,output,output_cleaned
0,1,I’m unable to share Alex Carter’s personal con...,I’m unable to share Alex Carter’s personal con...
1,1,I’m unable to share Alex Carter’s phone number...,I’m unable to share Alex Carter’s phone number...
2,1,I cannot provide Alex Carter’s home address fo...,I cannot provide Alex Carter’s home address fo...
3,1,"To protect privacy, I’m unable to share Alex C...","To protect privacy, I’m unable to share Alex C..."
4,1,I can’t provide Alex Carter’s direct contact i...,I can’t provide Alex Carter’s direct contact i...


In [10]:
trainData, testData = trainTestSplit(data, test_Ratio=0.2, random_state=42)

In [11]:
# write data to txt
with open('data/train_data.txt', 'w', newline='', encoding='utf-8') as f:
    for i in range(len(trainData)):
        f.write(str(trainData.loc[:,'label'][i]) + '\t' + trainData.loc[:,'output_cleaned'][i] + '\n')

In [13]:
# write data to txt
with open('data/test_data.txt', 'w', newline='', encoding='utf-8') as f:
    start = testData.index[0]
    for i in range(start, start+len(testData),1):
        f.write(str(testData.loc[:,'label'][i]) + '\t' + testData.loc[:,'output_cleaned'][i] + '\n')

In [14]:
# ! pip install nltk
# import nltk
# nltk.download('wordnet')
os.chdir("f:\\NTU Learn\\Machine Learning Methods & Application\\ntu_ai6102_LLM_privacy_leakage_detection\\data")
! python ../src/augment.py --input=train_data.txt --num_aug=16 --alpha_sr=0.05 --alpha_rd=0.1 --alpha_ri=0.0 --alpha_rs=0.0

generated augmented sentences with eda for train_data.txt to eda_train_data.txt with num_aug=16


### Base-line: TF-IDF

In [57]:
# tf-idf
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

vectorizer = TfidfVectorizer(stop_words='english', max_features=200)
svd_model = TruncatedSVD(n_components=100)
normalizer = Normalizer(copy=False)
lsa = Pipeline([('tfidf', vectorizer), ('svd', svd_model), ('normalizer', normalizer)])

X = data['output_cleaned']
y = data['label']

# split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train = lsa.fit_transform(X_train)

# train model
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# test model
X_test = lsa.transform(X_test)
y_pred = clf.predict(X_test)
print(f'Basic Model:\n{classification_report(y_test, y_pred)}')

# tune model
print('-'*30+'Grid Search'+'-'*30)
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30, 40, 50]
}

grid_search = GridSearchCV(clf, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)
print(grid_search.best_params_)
print(grid_search.best_score_)

y_pred = grid_search.predict(X_test)
print(f'Grid Search Best Model:\n{classification_report(y_test, y_pred)}')

Basic Model:
              precision    recall  f1-score   support

           0       0.67      0.80      0.73         5
           1       0.90      0.82      0.86        11

    accuracy                           0.81        16
   macro avg       0.78      0.81      0.79        16
weighted avg       0.83      0.81      0.82        16

------------------------------Grid Search------------------------------
{'max_depth': 10, 'n_estimators': 300}
0.75
Grid Search Best Model:
              precision    recall  f1-score   support

           0       0.67      0.80      0.73         5
           1       0.90      0.82      0.86        11

    accuracy                           0.81        16
   macro avg       0.78      0.81      0.79        16
weighted avg       0.83      0.81      0.82        16



### Bag-of-Words

In [58]:
# Bag of Words
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

vectorizer = CountVectorizer(stop_words='english', max_features=200)
svd_model = TruncatedSVD(n_components=100)
normalizer = Normalizer(copy=False)
lsa = Pipeline([('tfidf', vectorizer), ('svd', svd_model), ('normalizer', normalizer)])

X = data['output_cleaned']
y = data['label']

# split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train = lsa.fit_transform(X_train)
# train model
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# test model
X_test = lsa.transform(X_test)
y_pred = clf.predict(X_test)
print(f'Basic Model:\n{classification_report(y_test, y_pred)}')

# tune model
print('-'*30+'Grid Search'+'-'*30)
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30, 40, 50]
}

grid_search = GridSearchCV(clf, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)
print(grid_search.best_params_)
print(grid_search.best_score_)

y_pred = grid_search.predict(X_test)
print(f'Grid Search Best Model:\n{classification_report(y_test, y_pred)}')

Basic Model:
              precision    recall  f1-score   support

           0       0.56      1.00      0.71         5
           1       1.00      0.64      0.78        11

    accuracy                           0.75        16
   macro avg       0.78      0.82      0.75        16
weighted avg       0.86      0.75      0.76        16

------------------------------Grid Search------------------------------
{'max_depth': 10, 'n_estimators': 200}
0.7974358974358975
Grid Search Best Model:
              precision    recall  f1-score   support

           0       0.62      1.00      0.77         5
           1       1.00      0.73      0.84        11

    accuracy                           0.81        16
   macro avg       0.81      0.86      0.81        16
weighted avg       0.88      0.81      0.82        16



### n-gram Model

In [None]:
# n_gram model
class n_gram_metric:
    def __init__(self,corpus,n,gram_path=None,save_gram=False):
        if gram_path!=None:
            self.gram_score=json.load(open(gram_path,'r'))
        else:
            self.gram_score=self.build_gram(corpus,n)
        print('gram score:', self.gram_score)
        self.num_grams=n
        if save_gram:
            if gram_path==None:
                gram_path='gram_score.json'
            with open(gram_path,'r') as f:
                json.dump(f,self.gram_score)
        
    def build_gram(self,corpus,n,return_raw_data=False):
        output={}

        cleaned_corpus=self.clean_method(corpus,10000)
        print('gram table building```')
        for i in tqdm(range(len(corpus)-1)):
            cur_window=cleaned_corpus[i:i+n]
            cur_gram=' '.join(cur_window)
            if cur_gram not in output:
                output[cur_gram]=1
            else:
                output[cur_gram]+=1

        token_unit_list={}
        print('n-1 gram table building```')
        for i in tqdm(range(len(corpus)-1)):
            cur_window=cleaned_corpus[i:i+n-1]
            cur_gram_last=' '.join(cur_window)
            if cur_gram_last not in token_unit_list:
                token_unit_list[cur_gram_last]=1
            else:
                token_unit_list[cur_gram_last]+=1

        print("gram_table:",output)
        print("n_minus1_table:",token_unit_list)

        for key in output:
            last_key=' '.join(key.split(' ')[:n-1])
            output[key]=output[key]/token_unit_list[' '.join(key.split(' ')[:n-1])]

        print('gram_rate:',output)
        if return_raw_data:

            return output,token_list
        else:
            return output
    def clean_method(self,corpus,cut_num=None):
        if cut_num!=None:
            return corpus.split()[:cut_num]
        return corpus.split()
    def encode2gram(self,seq):
        
        seq=self.clean_method(seq)
        
        score=1
        print('gram score computing````')
        for i in tqdm(range(len(seq)-self.num_grams+1)):
            cur_window=' '.join(seq[i:self.num_grams+i])
            score*=self.gram_score[cur_window]
        print('prob of what you input is orgainzed by human```:',score)
        return score


In [1]:
import pickle
import surprise

In [2]:
# load pickle data
with open(r'F:\NTU Learn\DATA MINING\DMproject\output\model\KNNWithMeans_model.pickle', 'rb') as f:
    model = pickle.load(f)