<a href="https://colab.research.google.com/github/Nandana-Rajesh/Feature-Extraction/blob/main/re_word_tokenizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Regular Expressions

In [1]:
#import regular expression library
import re

In [2]:
# Sample passage about DUK; the regex examples in the next cells search through it.
text = '''DUK was established by the Government of Kerala by upgrading the Indian Institute of Information Technology & Management-Kerala (IIITM-K), which was established by the Government in the year 2000. DUK will commence the application process for MSc, MTech, MBA and PG Diploma programs for the academic year 2023 – 2024 in the month of March-May. As a student on these programs, you will form a part of an exciting learning environment in a verdant campus in Technocity, Thiruvananthapuram. Duk is a great University'''

In [3]:
# Find every occurrence of "DUK" in the passage.
# re.findall takes the pattern to look for (duk_occ) and the string to scan (text)
# and returns a list of all non-overlapping matches, in order of appearance.
# Matching is case-sensitive by default, so the mixed-case "Duk" is not returned.
duk_occ = "DUK"
re.findall(pattern=duk_occ, string=text)

['DUK', 'DUK']

In [4]:
# Same search, but compiled with re.IGNORECASE so that "Duk" is matched as well.
duk_occ = "DUK"
re.compile(duk_occ, flags=re.IGNORECASE).findall(text)

['DUK', 'DUK', 'Duk']

In [5]:
# The alternation operator | lets one pattern match either "DUK" or "Kerala";
# the matches are collected in the order they occur in the text.
duk_kerala = "DUK|Kerala"
[hit.group() for hit in re.finditer(duk_kerala, text, flags=re.IGNORECASE)]

['DUK', 'Kerala', 'Kerala', 'DUK', 'Duk']

In [6]:
# Small table of gift items (name, quantity, price) used to extract purely
# alphabetic words in the next cell.
# Each row ends with an explicit "\n": a bare backslash at the end of a line inside
# a normal string literal joins the lines with nothing in between, which glued the
# rows together (e.g. "25.63Tshirt").
gifts = (
    "Basketball    2    25.63\n"
    "Tshirt     4   53.92\n"
    "Sneakers    1    30.58\n"
    "Mask    10   80.54\n"
    "GiftCard    2    50.00"
)

In [7]:
# One or more consecutive letters; the IGNORECASE flag makes the lowercase
# class accept capitals too, so whole words like "GiftCard" are returned.
words = '[a-z]+'
re.compile(words, re.IGNORECASE).findall(gifts)

['Basketball', 'Tshirt', 'Sneakers', 'Mask', 'GiftCard']

In [8]:
# Extract every run of digits from a passage that mixes spelled-out and numeric values.
news = '''Sixty-six undergraduate students from an urban college in New York City participated in this study. Students participated in this research study as part of a requirement for a class. Nineteen participants were excluded from the study for meeting one or more exclusion criteria including not completing the sentence unscrambling task, not providing ratings for each of the traits, or failing either one of the two attention checks, leaving the total number of eligible participants at forty-seven. Participants included 36 women aged 18 to 52 years old (M = 20.52, SD = 6.97), 10 men aged 18 to 28 years old (M = 23.5, SD = 12.36), and one individual who did not disclose their sex'''

# Raw string: in a normal string literal "\d" is an invalid escape sequence, which
# recent Python versions warn about. The pattern value itself is unchanged.
numbers = r"\d+"
re.findall(numbers, news)

['36',
 '18',
 '52',
 '20',
 '52',
 '6',
 '97',
 '10',
 '18',
 '28',
 '23',
 '5',
 '12',
 '36']

In [9]:
# \d matches a single digit and + repeats it, so a digits-only pattern splits a
# decimal such as 20.52 into '20' and '52' at the ".".
# Here the digits may be followed by an optional fractional part (a literal dot and
# more digits), which keeps each decimal in one piece. Because that part is optional,
# a one-digit integer such as "7" still matches; the group is non-capturing so that
# findall returns the whole number rather than just the fraction.
all_numbers = r"\d+(?:\.\d+)?"
re.findall(all_numbers, news)

['36', '18', '52', '20.52', '6.97', '10', '18', '28', '23.5', '12.36']

In [10]:
txt = "DUK is in Thiruvananthapuram"
# ^ and $ anchor the pattern: the string must start with "DUK" and end with "puram".
anchored = re.compile("^DUK.*puram$")
x = anchored.search(txt)
print(x)

<re.Match object; span=(0, 28), match='DUK is in Thiruvananthapuram'>


In [11]:
txt = "DUK is in Thiruvananthapuram"
# Collect every (case-sensitive) occurrence of the substring "tha".
x = [hit.group() for hit in re.finditer("tha", txt)]
print(x)

['tha']


In [12]:
txt = "DUK is in Thiruvananthapuram"
# When the pattern does not occur at all, findall returns an empty list.
x = re.compile("Kottayam").findall(txt)
print(x)

[]


In [13]:
txt = "DUK is in Thiruvananthapuram"
# Raw string so that \s reaches the regex engine untouched; in a normal string
# literal "\s" is an invalid escape sequence that recent Python versions warn about.
x = re.search(r"\s", txt)

# start() gives the index of the first character of the match.
print("The first white-space character is located in position:", x.start())

The first white-space character is located in position: 3


In [14]:
txt = "DUK is in Thiruvananthapuram"
# Split at every whitespace character. The pattern is a raw string because "\s"
# in a normal string literal is an invalid escape sequence.
x = re.split(r"\s", txt)
print(x)

['DUK', 'is', 'in', 'Thiruvananthapuram']


In [15]:
txt = "DUK is in Thiruvananthapuram"
# maxsplit=1 stops after the first split, leaving the rest of the string intact.
# It is passed by keyword because passing maxsplit positionally is deprecated in
# recent Python versions; the raw string avoids the invalid "\s" escape.
x = re.split(r"\s", txt, maxsplit=1)
print(x)

['DUK', 'is in Thiruvananthapuram']


In [16]:
txt = "DUK is in Thiruvananthapuram"
# Replace every whitespace character with "2023". The pattern is a raw string
# because "\s" in a normal string literal is an invalid escape sequence.
x = re.sub(r"\s", "2023", txt)
print(x)

DUK2023is2023in2023Thiruvananthapuram


In [17]:
txt = "DUK is in Thiruvananthapuram"
# count=2 limits the substitution to the first two whitespace characters.
# It is passed by keyword because passing count positionally is deprecated in
# recent Python versions; the raw string avoids the invalid "\s" escape.
x = re.sub(r"\s", "2023", txt, count=2)
print(x)

DUK2023is2023in Thiruvananthapuram


In [18]:
txt = "DUK is in Thiruvananthapuram"
# search() returns a Match object describing the first hit.
needle = re.compile("Thir")
x = needle.search(txt)
print(x)  # prints the Match object's repr (span and matched text)

<re.Match object; span=(10, 14), match='Thir'>


In [19]:
txt = "DUK is in Thiruvananthapuram"
# \b is a word boundary: match a word that starts with "T", then take the
# rest of its word characters.
starts_with_t = re.compile(r"\bT\w+")
x = starts_with_t.search(txt)
print(x[0])  # x[0] is the full matched text, same as x.group()

Thiruvananthapuram


In [20]:
# Extract e-mail addresses from free text.
text = (
    "Please contact Digital University Kerala at contact@duk.ac.in for further information."
    "You can also give feedback at feedback@duk.ac.in"
)

# Shape matched: <local part>@<domain>.<top-level domain>.
# re.IGNORECASE lets the lowercase character classes accept capitals as well, so an
# address such as "Admin@DUK.AC.IN" is returned whole instead of truncated or missed.
email_pattern = re.compile(r"[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+", re.IGNORECASE)
emails = email_pattern.findall(text)
print(emails)

['contact@duk.ac.in', 'feedback@duk.ac.in']


In [21]:
# Validate a phone number
import re

# Optional leading "+", a non-zero first digit, then 7 to 14 further digits.
# \Z anchors at the very end of the string; "$" would also accept a trailing
# newline, letting "+12223334444\n" pass validation.
validate_phone_number_pattern = r"^\+?[1-9][0-9]{7,14}\Z"
# A Match object for a valid number, None otherwise. Kept in a variable because
# only the last expression of a cell is displayed.
validation_match = re.match(validate_phone_number_pattern, "+12223334444")

# Extract phone numbers from a string (same shape, without the anchors)
extract_phone_number_pattern = r"\+?[1-9][0-9]{7,14}"
re.findall(extract_phone_number_pattern, 'You can reach me out at +9197343323 and +56667778888')  # returns ['+9197343323', '+56667778888']

['+9197343323', '+56667778888']

In [22]:
#lets scrape some real-data and extract phone numbers using Regular Expressions
import urllib.request
import re

In [23]:
# Page that the phone numbers are scraped from.
url = "http://www.summet.com/dmsi/html/codesamples/addresses.html"

# The context manager closes the HTTP response even if reading fails, and the
# timeout (in seconds) keeps the cell from hanging if the host does not answer.
with urllib.request.urlopen(url, timeout=30) as response:
    html = response.read()

# read() returns bytes; decode them to text before applying the regex.
htmlStr = html.decode()

# Numbers formatted as "(ddd) ddd-dddd". Raw string so the backslash escapes
# reach the regex engine instead of being invalid string escapes.
pdata = re.findall(r"\(\d{3}\) \d{3}-\d{4}", htmlStr)

for item in pdata:
    print(item)

(257) 563-7401
(372) 587-2335
(786) 713-8616
(793) 151-6230
(492) 709-6392
(654) 393-5734
(404) 960-3807
(314) 244-6306
(947) 278-5929
(684) 579-1879
(389) 737-2852
(660) 663-4518
(608) 265-2215
(959) 119-8364
(468) 353-2641
(248) 675-4007
(939) 353-1107
(570) 873-7090
(302) 259-2375
(717) 450-4729
(453) 391-4650
(559) 104-5475
(387) 142-9434
(516) 745-4496
(326) 677-3419
(746) 679-2470
(455) 430-0989
(490) 936-4694
(985) 834-8285
(662) 661-1446
(802) 668-8240
(477) 768-9247
(791) 239-9057
(832) 109-0213
(837) 196-3274
(268) 442-2428
(850) 676-5117
(861) 546-5032
(176) 805-4108
(715) 912-6931
(993) 554-0563
(357) 616-5411
(121) 347-0086
(304) 506-6314
(425) 288-2332
(145) 987-4962
(187) 582-9707
(750) 558-3965
(492) 467-3131
(774) 914-2510
(888) 106-8550
(539) 567-3573
(693) 337-2849
(545) 604-9386
(221) 156-5026
(414) 876-0865
(932) 726-8645
(726) 710-9826
(622) 594-1662
(948) 600-8503
(605) 900-7508
(716) 977-5775
(368) 239-8275
(725) 342-0650
(711) 993-5187
(882) 399-5084
(287) 755-

# Word Tokenizer

In [24]:
#install NLTK library
!pip install nltk



In [28]:
import nltk
from nltk.tokenize import word_tokenize

# word_tokenize needs the Punkt tokenizer data. Older NLTK releases load it from the
# 'punkt' package, while newer ones (3.8.2 and later) look for 'punkt_tab' and raise
# LookupError without it, so fetch both to keep the cell working across versions.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

# `text` was reassigned by the e-mail example, so the passage is defined again here.
text = '''DUK was established by the Government of Kerala by upgrading the Indian Institute of Information Technology & Management-Kerala (IIITM-K), which was established by the Government in the year 2000. DUK will commence the application process for MSc, MTech, MBA and PG Diploma programs for the academic year 2023 – 2024 in the month of March-May. As a student on these programs, you will form a part of an exciting learning environment in a verdant campus in Technocity, Thiruvananthapuram. Duk is a great University'''
# Split the passage into word and punctuation tokens.
print(word_tokenize(text))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


['DUK', 'was', 'established', 'by', 'the', 'Government', 'of', 'Kerala', 'by', 'upgrading', 'the', 'Indian', 'Institute', 'of', 'Information', 'Technology', '&', 'Management-Kerala', '(', 'IIITM-K', ')', ',', 'which', 'was', 'established', 'by', 'the', 'Government', 'in', 'the', 'year', '2000', '.', 'DUK', 'will', 'commence', 'the', 'application', 'process', 'for', 'MSc', ',', 'MTech', ',', 'MBA', 'and', 'PG', 'Diploma', 'programs', 'for', 'the', 'academic', 'year', '2023', '–', '2024', 'in', 'the', 'month', 'of', 'March-May', '.', 'As', 'a', 'student', 'on', 'these', 'programs', ',', 'you', 'will', 'form', 'a', 'part', 'of', 'an', 'exciting', 'learning', 'environment', 'in', 'a', 'verdant', 'campus', 'in', 'Technocity', ',', 'Thiruvananthapuram', '.', 'Duk', 'is', 'a', 'great', 'University']


# Sentence Tokenizer

In [29]:
from nltk.tokenize import sent_tokenize

# Split the passage into a list of sentences and show it.
sentences = sent_tokenize(text)
print(sentences)

['DUK was established by the Government of Kerala by upgrading the Indian Institute of Information Technology & Management-Kerala (IIITM-K), which was established by the Government in the year 2000.', 'DUK will commence the application process for MSc, MTech, MBA and PG Diploma programs for the academic year 2023 – 2024 in the month of March-May.', 'As a student on these programs, you will form a part of an exciting learning environment in a verdant campus in Technocity, Thiruvananthapuram.', 'Duk is a great University']


In [30]:
import re

# Named `sample` rather than `str`: assigning to `str` would shadow the built-in
# type for every cell run afterwards in the same kernel.
sample = 'an example word:cat!!'
# Match the literal "word:" followed by exactly three word characters.
match = re.search(r'word:\w\w\w', sample)
# search() returns None when nothing matches, so test the result before using it.
if match:
    print('found', match.group())  # 'found word:cat'
else:
    print('did not find')

found word:cat
