<a href="https://colab.research.google.com/github/LifeHashed/BostonHousePricing/blob/main/RegexMatching.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Importing relevant libraries

In [3]:
import re

#Processing


Email and phone number redaction


In [37]:
def replace_emails_and_phones(text):
    # Regular expression patterns for emails and phone numbers
    email_pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
    phone_pattern = r'\+?\d{1,4}[\s-]?\(?\d{1,4}\)?[\s-]?\d{1,4}[\s-]?\d{1,4}[\s-]?\d{1,9}'


    # Replace emails and phone numbers with 'x' of the same length
    text = re.sub(email_pattern, lambda x: 'x' * len(x.group()), text)
    text = re.sub(phone_pattern, lambda x: 'x' * len(x.group()), text)

    return text

Card no. and Aadhar redaction

In [5]:
#Luhn's algorithm is used to validate card no.
def get_digit(number):
    if number < 9:
        return number
    return number // 10 + number % 10

# Return the number of digits in d
def get_size(d):
    return len(str(d))

# Return the first k number of digits from number.
# If the number of digits in number is less than k, return number.
def get_prefix(number, k):
    num_str = str(number)
    if len(num_str) > k:
        return int(num_str[:k])
    return number

# Return true if the digit d is a prefix for number
def prefix_matched(number, d):
    return get_prefix(number, get_size(d)) == d

# Get the result from Step 2: sum of double even place digits
def sum_of_double_even_place(number):
    sum = 0
    num_str = str(number)
    for i in range(get_size(number) - 2, -1, -2):
        sum += get_digit(int(num_str[i]) * 2)
    return sum

# Return sum of odd-place digits in number
def sum_of_odd_place(number):
    sum = 0
    num_str = str(number)
    for i in range(get_size(number) - 1, -1, -2):
        sum += int(num_str[i])
    return sum

# Return true if the card number is valid using Luhn's algorithm
def luhn_check(number):
    return (get_size(number) >= 13 and get_size(number) <= 16) and \
           (prefix_matched(number, 4) or
            prefix_matched(number, 5) or
            prefix_matched(number, 37) or
            prefix_matched(number, 6)) and \
           ((sum_of_double_even_place(number) + sum_of_odd_place(number)) % 10 == 0)

  # Should return True if valid, False if not


In [6]:
def redact_card_numbers(text):
    # Regular expression pattern for card numbers (13-19 digits)
    card_pattern = r'\b(?:\d[ -]*?){13,19}\b'

    def replace_if_valid(match):
        card_number = re.sub(r'[^\d]', '', match.group())  # Strip spaces and dashes
        if luhn_check(card_number):
            return 'x' * len(match.group())
        return match.group()

    # Replace valid card numbers with 'x' of the same length
    text = re.sub(card_pattern, replace_if_valid, text)

    return text

#Validating Output

In [None]:
text = """
Hello John,

Please contact me at john.doe@example.com or +1-234-567-8901.
You can also reach me at jane_doe123@gmail.com or at (123) 456-7890.

Best regards,
Jane
"""

processed_text = replace_emails_and_phones(text)
print(processed_text)


Hello John,

Please contact me at xxxxxxxxxxxxxxxxxxxx or xxxxxxxxxxxxxxx.
You can also reach me at xxxxxxxxxxxxxxxxxxxxx or at (xxxxxxxxxxxxx.

Best regards,
Jane



##Using custom input


In [31]:
user_input = input("Enter the text: ")

Enter the text: My phone number is 9831911890


In [32]:
processed_text = replace_emails_and_phones(user_input)

# Print the result
print("\nProcessed Text:\n")
print(processed_text)


Processed Text:

My phone number is 9831911890


In [9]:
processed_text = redact_card_numbers(processed_text)

# Print the result
print("\nProcessed Text:\n")
print(processed_text)


Processed Text:

SEX 69


##Named Entity Recognition

In [10]:
import pandas as pd
import spacy
import requests
from bs4 import BeautifulSoup
nlp = spacy.load("en_core_web_sm")
pd.set_option("display.max_rows", 200)

###Taking Input and categorising it

In [11]:
content = "Trinamool Congress leader Mahua Moitra has moved the Supreme Court against her expulsion from the Lok Sabha over the cash-for-query allegations against her. Moitra was ousted from the Parliament last week after the Ethics Committee of the Lok Sabha found her guilty of jeopardising national security by sharing her parliamentary portal's login credentials with businessman Darshan Hiranandani."

doc = nlp(content)

for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

Congress 10 18 ORG
Mahua Moitra 26 38 PERSON
the Supreme Court 49 66 ORG
the Lok Sabha 94 107 PERSON
Moitra 157 163 ORG
Parliament 184 194 ORG
last week 195 204 DATE
the Ethics Committee 211 231 ORG
Darshan Hiranandani 373 392 PERSON


###Visualisation

In [12]:
from spacy import displacy
displacy.render(doc, style="ent")

Function to replace names with 'Redacted'

In [13]:
def replace_names(text):
  doc = nlp(text)
  new_text = text
  for ent in reversed(doc.ents):
    if ent.label_ == "PERSON":
      new_text = new_text[:ent.start_char] + "x"*10 + new_text[ent.end_char:]
  return new_text


###Printing

In [14]:
redacted_content = replace_names(content)
print(redacted_content)

Trinamool Congress leader xxxxxxxxxx has moved the Supreme Court against her expulsion from xxxxxxxxxx over the cash-for-query allegations against her. Moitra was ousted from the Parliament last week after the Ethics Committee of the Lok Sabha found her guilty of jeopardising national security by sharing her parliamentary portal's login credentials with businessman xxxxxxxxxx.


##Pdf Upload

In [15]:
!pip install PyPDF2
import PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


###Reading text from pdf

In [16]:
with open('Example.pdf', 'rb') as pdf_file_obj:
    pdf_file_reader = PyPDF2.PdfReader(pdf_file_obj)

    num_of_pages = len(pdf_file_reader.pages)
    print('No. of pages : ', num_of_pages)

    pdf_text=""
    for page in range(num_of_pages):
        page = pdf_file_reader.pages[page]
        pdf_text += page.extract_text()
        print(page.extract_text())



FileNotFoundError: [Errno 2] No such file or directory: 'Example.pdf'

###Adding our various redaction features on it

In [None]:
redacted_pdf_text_names=replace_names(pdf_text)
print (redacted_pdf_text_names)

ACCEPTANCE OF Mahindra and Mahindra Limited (M&M) FULL-TIME OFFER 
 
 
Respected Sir/Ma’am, 
I am xxxxxxxxxx, a fourth-year undergraduate student of the Mechanical 
Engineering Department, NIT Durgapur. I am glad to accept the Full Time Offer 
provided to me by your esteemed organisation Mahindra and Mahindra 
Limited (M&M).  
Please accept this as my formal acceptance. I hereby attach below my contact 
details. 
Name: xxxxxxxxxxRoll No: 20ME8020 
Registration No:  20U10091 
Contact No: +91-8000214296 
Email ID: 7596ykumar@gmail.com 
Alternate xxxxxxxxxx ID: yk.20u10091@btech.nitdgp.ac.in 
 
Yours sincerely, 
xxxxxxxxxx:  
 
  
 



In [None]:
redacted_pdf_text_emails_and_phones=replace_emails_and_phones(redacted_pdf_text_names)
print (redacted_pdf_text_emails_and_phones)


ACCEPTANCE OF Mahindra and Mahindra Limited (M&M) FULL-TIME OFFER 
 
 
Respected Sir/Ma’am, 
I am xxxxxxxxxx, a fourth-year undergraduate student of the Mechanical 
Engineering Department, NIT Durgapur. I am glad to accept the Full Time Offer 
provided to me by your esteemed organisation Mahindra and Mahindra 
Limited (M&M).  
Please accept this as my formal acceptance. I hereby attach below my contact 
details. 
Name: xxxxxxxxxxRoll No: 20ME8020 
Registration No:  20Uxxxxx 
Contact No: xxxxxxxxxxxxxx 
Email ID: xxxxxxxxxxxxxxxxxxxx 
Alternate xxxxxxxxxx ID: xxxxxxxxxxxxxxxxxxxxxxxxxxxxxx 
 
Yours sincerely, 
xxxxxxxxxx:  
 
  
 



#Web Integration


In [17]:
!pip install flask flask-cors

from flask import Flask, request, jsonify
from flask_cors import CORS



Initialising flask app

In [18]:
from flask import Flask, request, jsonify
from flask_cors import CORS
app = Flask(__name__)
CORS(app)

<flask_cors.extension.CORS at 0x7c3ce0797e80>

Implementing routes

In [40]:
from pyngrok import ngrok
@app.route('/redact', methods=['POST'])
def red():
    data = request.json
    input_text = data.get('text', '')
    redacted_text = replace_emails_and_phones(input_text)

    return jsonify({"redacted_text": redacted_text})

# Set your ngrok authentication token
ngrok.set_auth_token('2lfAiQSC7OPQK6E81seZziiHbEh_5Ud2g6Pk9VU2a8nwJmyY1')

# Expose the Flask app on port 5000
public_url = ngrok.connect(5000)
print(f"Public URL: {public_url}")

# Now run the Flask app
app.run(host='0.0.0.0', port=5000)




Public URL: NgrokTunnel: "https://daee-34-74-174-207.ngrok-free.app" -> "http://localhost:5000"
 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5000
 * Running on http://172.28.0.12:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
