In [1]:
import os

In [2]:
import shutil
import glob

In [3]:
import spacy
import argparse
import glob
import spacy
import re
import sys
import numpy as np
import ntpath
import os
import en_core_web_sm

In [4]:
from spacy.matcher import Matcher
# NOTE(review): a `global` statement at module level has no effect in Python.
# The only assignment to `final_data` in this notebook is a local inside main().
global final_data

In [5]:
# Load the "en_core_web_sm" spaCy pipeline once; the redact_* helpers that rely
# on named-entity / part-of-speech tags all call this shared `nlp` object.
nlp = spacy.load("en_core_web_sm")

In [6]:
def get_files(args):
    """Collect the input files selected by the parsed arguments.

    Two argument layouts are supported:

    * ``args.folder`` + ``args.pattern`` -- every entry of ``folder`` whose
      name matches the regular expression ``pattern`` (``re.match``, i.e.
      anchored at the start of the name).
    * ``args.input`` -- a glob pattern or a list of glob patterns. This is
      what the ``--input`` option declared by this script produces; without
      this branch the function fails with ``AttributeError`` because the
      parser defines neither ``--folder`` nor ``--pattern``.

    Args:
        args: Namespace-like object carrying ``folder``/``pattern`` or
            ``input``.

    Returns:
        list[str]: Matching paths. For glob input the result only contains
        regular files, is sorted within each pattern and has duplicates
        removed. An empty list is returned when nothing is selected.
    """
    folder_path = getattr(args, "folder", None)
    if folder_path is not None:
        pattern = args.pattern
        return [
            os.path.join(folder_path, name)
            for name in os.listdir(folder_path)
            if re.match(pattern, name)
        ]

    patterns = getattr(args, "input", None) or []
    if isinstance(patterns, str):
        # A single pattern may be handed over as a bare string.
        patterns = [patterns]

    files = []
    seen = set()
    for pattern in patterns:
        # Sort so the processing order does not depend on the file system.
        for path in sorted(glob.glob(pattern)):
            if os.path.isfile(path) and path not in seen:
                seen.add(path)
                files.append(path)
    return files

In [7]:
def read_text_file(file_to_read):
    """Return the complete contents of a text file.

    The file is decoded as UTF-8 explicitly instead of using the platform's
    default encoding, so reading agrees with the UTF-8 encoding used when the
    redacted result is written back out.

    Args:
        file_to_read: Path of the file to read.

    Returns:
        str: The decoded file contents.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(file_to_read, 'r', encoding="utf-8") as f:
        return f.read()

In [8]:
def unicode_char(word):
    """Build the redaction mask for ``word``.

    Returns a string made of the full-block character (U+2588) repeated
    ``len(word)`` times, so the mask is as long as the value it replaces.
    """
    full_block = "\u2588"
    mask_length = len(word)
    return full_block * mask_length

In [9]:
def redact_names(text):
    """Mask every PERSON entity that spaCy finds in ``text``.

    Each distinct entity string is replaced everywhere it occurs in the text
    by a mask of the same length (see ``unicode_char``).

    Args:
        text: The text to redact.

    Returns:
        str: ``text`` with all occurrences of the detected names masked.
    """
    doc = nlp(text)
    names = {ent.text for ent in doc.ents if ent.label_ == "PERSON"}
    names.discard("")  # str.replace("") would insert a mask between every character
    # Replace longer names first: masking "John" before "John Smith" would
    # stop the longer string from matching and leave "Smith" readable.
    for name in sorted(names, key=lambda s: (-len(s), s)):
        text = text.replace(name, unicode_char(name))
    return text

In [10]:
def redact_entities(text):
    """Mask PERSON/ORG entities and all noun / proper-noun tokens in ``text``.

    The strings to hide are the texts of entities labelled ``PERSON`` or
    ``ORG`` plus the texts of tokens tagged ``NOUN`` or ``PROPN``. Every
    occurrence of each string is replaced by a mask of the same length.
    Replacement is plain substring replacement, so a target that also
    appears inside a longer word is masked there as well.

    Args:
        text: The text to redact.

    Returns:
        str: The redacted text.
    """
    doc = nlp(text)
    targets = {ent.text for ent in doc.ents if ent.label_ in ("PERSON", "ORG")}
    targets.update(token.text for token in doc if token.pos_ in ("NOUN", "PROPN"))
    targets.discard("")  # str.replace("") would insert a mask between every character
    # Longest first, so a short target that is part of a longer one cannot
    # break the longer match and leave its remainder unmasked.
    for target in sorted(targets, key=lambda s: (-len(s), s)):
        text = text.replace(target, unicode_char(target))
    return text


In [11]:
import re

def redact_dates(text):
    """Replace date-like substrings of ``text`` with a fixed-width mask.

    Four shapes are recognised, each delimited by word boundaries:
    ``D/M/Y`` and ``Y/M/D`` (with ``/`` or ``-`` as separator),
    ``D <word> Y`` and ``<word> D[,|-] Y``, where ``<word>`` is any run of
    3-9 ASCII letters. Every match is replaced by ``unicode_char('DATE')``,
    so the mask length does not depend on the length of the matched date.
    """
    day_month_year = r'\b(\d{1,2})[/-](\d{1,2})[/-](\d{2,4}|\d{4})\b|'
    year_month_day = r'\b(\d{4})[/-](\d{1,2})[/-](\d{1,2})\b|'
    day_word_year = r'\b(\d{1,2}) ([A-Za-z]{3,9}) (\d{2,4}|\d{4})\b|'
    word_day_year = r'\b([A-Za-z]{3,9}) (\d{1,2})[,-]? (\d{2,4}|\d{4})\b'

    # The first three pieces end in "|", so concatenation yields one alternation.
    date_regex = re.compile(day_month_year + year_month_day + day_word_year + word_day_year)
    mask = unicode_char('DATE')
    return date_regex.sub(mask, text)


In [12]:
def redact_phones(text):
    """Replace phone-number-like digit groups in ``text`` with a fixed mask.

    The pattern accepts ``ddd?ddd?dddd`` with an optional ``-``, ``.`` or
    whitespace separator, ``(ddd) ddd?dddd``, or ten consecutive digits.
    Each match is replaced by ``unicode_char("PHONE")``.
    """
    phone_matcher = re.compile(r"\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}|\d{10}")
    mask = unicode_char("PHONE")
    return phone_matcher.sub(mask, text)

In [13]:
def redact_gender(text):
    """Mask gendered pronouns in ``text``.

    The words "he", "she", "him", "her", "his" and "hers" are matched as
    whole words, ignoring case, so capitalised forms such as a
    sentence-initial "He" or "Her" are redacted too. Each match is replaced
    by a mask of the same length.

    Args:
        text: The text to redact.

    Returns:
        str: The redacted text.
    """
    gender_terms = ["he", "she", "him", "her", "his", "hers"]
    pattern = re.compile(
        r"\b(?:" + "|".join(re.escape(term) for term in gender_terms) + r")\b",
        re.IGNORECASE,
    )
    # A callable replacement keeps the mask as long as the matched word.
    return pattern.sub(lambda match: unicode_char(match.group(0)), text)

In [14]:
def redact_address(text):
    """Mask every GPE or LOC entity that spaCy finds in ``text``.

    Each distinct entity string is replaced everywhere it occurs in the text
    by a mask of the same length (see ``unicode_char``).

    Args:
        text: The text to redact.

    Returns:
        str: ``text`` with all occurrences of the detected places masked.
    """
    doc = nlp(text)
    places = {ent.text for ent in doc.ents if ent.label_ in ("GPE", "LOC")}
    places.discard("")  # str.replace("") would insert a mask between every character
    # Longest first: masking "New York" before "New York City" would stop
    # the longer string from matching and leave "City" readable.
    for place in sorted(places, key=lambda s: (-len(s), s)):
        text = text.replace(place, unicode_char(place))
    return text


In [15]:
def stats(args, text, file):
    """Redact ``text``, report redaction statistics and return the result.

    A redaction step runs when its flag on ``args`` (``names``, ``dates``,
    ``phones``, ``genders``, ``address``) is truthy. A flag that is missing
    from ``args`` counts as enabled, so a namespace without any of these
    attributes still gets every redaction applied.

    After redaction, each whitespace-separated word that contains the mask
    character (U+2588) is counted. The counts go to ``write_tostatfile``
    when ``args.stats`` names a destination, or to ``write_stdout`` when
    ``args.stats`` is empty or the literal ``"stdout"``.

    Args:
        args: Parsed command-line namespace.
        text: The text to redact.
        file: Path of the file the text came from; used to name the stats file.

    Returns:
        str: The redacted text, so the caller can pass it on to ``output``.
    """
    redactors = (
        ("names", redact_names),
        ("dates", redact_dates),
        ("phones", redact_phones),
        ("genders", redact_gender),
        ("address", redact_address),
    )
    for flag, redactor in redactors:
        if getattr(args, flag, True):
            text = redactor(text)

    words = text.split()
    redacted_terms = {}
    for word in words:
        if "\u2588" in word:
            redacted_terms[word] = redacted_terms.get(word, 0) + 1

    stats_target = getattr(args, "stats", None)
    # "stdout" is a console request, not the name of a folder to create.
    if stats_target and stats_target != "stdout":
        write_tostatfile(redacted_terms, len(words), file, args)
    else:
        write_stdout(redacted_terms, len(words))
    return text


In [16]:
def write_tostatfile(redacted_terms, count, file, args):
    """Write redaction statistics for one input file into the stats folder.

    The report is written to ``<args.stats>/<basename of file>.stats``. Only
    the base name of ``file`` is used, so an input path with directories
    (or an absolute path) cannot point outside the stats folder or into a
    sub-directory that does not exist. The file is written as UTF-8 because
    the terms contain the U+2588 mask character.

    Args:
        redacted_terms: Mapping of masked word -> number of occurrences.
        count: Total number of words in the redacted text.
        file: Path of the input file the statistics belong to.
        args: Parsed namespace; ``args.stats`` is the destination folder,
            created (including parents) if missing.
    """
    folder_path = args.stats
    os.makedirs(folder_path, exist_ok=True)
    stats_path = os.path.join(folder_path, os.path.basename(file) + ".stats")
    with open(stats_path, "w", encoding="utf-8") as f:
        f.write(f"Total Words: {count}\n")
        for term, occurrences in redacted_terms.items():
            f.write(f"{term}: {occurrences}\n")


In [17]:
def write_stdout(redacted_terms, count):
    """Print redaction statistics to standard output.

    Prints the total word count followed by one ``term: occurrences`` line
    per redacted term, the same layout ``write_tostatfile`` writes to disk.

    Args:
        redacted_terms: Mapping of masked word -> number of occurrences.
        count: Total number of words in the redacted text.
    """
    print(f"Total Words: {count}")
    for term, occurrences in redacted_terms.items():
        print(f"{term}: {occurrences}")


In [18]:
def output(args, complete_data, files):
    """Emit the redacted text of one input file.

    Behaviour depends on ``args.output``:

    * ``'stdout'`` -- print a banner naming the source file, then the text.
    * ``'stderr'`` -- print "No Error Found" to standard error.
    * anything else -- treat the value (surrounding single quotes stripped)
      as a folder relative to the current working directory and write the
      text to ``<folder>/<basename of files>.redacted`` as UTF-8.

    The destination path is built with ``os.path.join`` rather than a
    hard-coded backslash, so the file lands inside the folder on every
    platform. The folder, including missing parents, is created on demand.

    Args:
        args: Parsed namespace providing ``output``.
        complete_data: The redacted text to emit.
        files: Path of the source file (a single path, despite the name).
    """
    if args.output == 'stdout':
        print("\n","******* Redacted data output from ", files,"file","*******","\n",complete_data)
    elif args.output == 'stderr':
        print("No Error Found", file = sys.stderr)
    else:
        folder_path = os.path.join(os.getcwd(), str(args.output).strip('\''))
        os.makedirs(folder_path, exist_ok=True)
        # ntpath.basename splits on both "/" and "\\" separators.
        final_path = os.path.join(folder_path, ntpath.basename(files) + '.redacted')
        with open(final_path, "w", encoding="utf-8") as final_file:
            final_file.write(complete_data)

In [21]:
def main(parser):
    """Parse the command line and redact every selected input file.

    For each file returned by ``get_files`` the text is read, passed through
    ``stats`` and the value ``stats`` returns is handed to ``output``. When
    statistics are requested on stdout, a banner naming the file (without
    its extension) is printed first.

    Args:
        parser: A configured ``argparse.ArgumentParser``; its
            ``parse_args()`` result drives the run.
    """
    args = parser.parse_args()
    for file in get_files(args):
        if args.stats == 'stdout':
            # splitext drops only the final extension; splitting on the first
            # "." would turn a path such as "./docs/a.txt" into "".
            print("\n" + "******* Stats after redacting the file", os.path.splitext(file)[0] + " *******" + '\n')
        text = read_text_file(file)
        final_data = stats(args, text, file)
        output(args, final_data, file)

if __name__ == "__main__":
    # Command-line interface: input patterns, one on/off switch per
    # redaction category, the output destination and an optional stats target.
    # NOTE(review): main() calls parser.parse_args(), which reads sys.argv;
    # inside a notebook kernel that argv lacks the required options, which
    # matches the usage error recorded in this cell's output.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--input",
        type=str,
        required=True,
        nargs='*',
        help="It takes the patterns of the input files",
    )
    parser.add_argument(
        "--names",
        action="store_true",
        help="It helps in redacting names",
    )
    parser.add_argument(
        "--dates",
        action="store_true",
        help="It helps in redacting dates",
    )
    parser.add_argument(
        "--phones",
        action="store_true",
        help="It helps in redacting phones",
    )
    parser.add_argument(
        "--genders",
        action="store_true",
        help="It helps in redacting genders",
    )
    parser.add_argument(
        "--address",
        action="store_true",
        help="It helps in redacting address",
    )
    parser.add_argument(
        "--output",
        type=str,
        required=True,
        help="It takes the output file path",
    )
    parser.add_argument(
        "--stats",
        help="It provides the stats of the redacted flags",
    )
    main(parser)

usage: ipykernel_launcher.py [-h] --input [INPUT ...] [--names] [--dates]
                             [--phones] [--genders] [--address] --output
                             OUTPUT [--stats STATS]
ipykernel_launcher.py: error: the following arguments are required: --input, --output


SystemExit: 2