In [2]:
# Import the necessary library
import re

## Scenario 1: Financial Review Article Analysis 

In [6]:
scenario_1 = """The quarterly financial review of George’s Corporation presented insights into its performance
across regions. The North American division reported revenue growth of 12%, with net profits of
$4.5 million, despite a 5% increase in operational costs. Europe, on the other hand, contributed
€3.2 million in profits but faced a decline in year-over-year revenue by 8%. The Asia-Pacific
region excelled, with a remarkable 20% growth in revenue, accompanied by ¥2 billion in
operational expenses. The CFO emphasized that the company's strategic investments in green
technology totaling $1.2 million globally would ensure sustained growth. Future projections
estimate a 10% increase in revenue and net profits to rise by $5 million over the next fiscal year"""

In [7]:
# Display the text to see how it it inputted in python
scenario_1

"The quarterly financial review of George’s Corporation presented insights into its performance\nacross regions. The North American division reported revenue growth of 12%, with net profits of\n$4.5 million, despite a 5% increase in operational costs. Europe, on the other hand, contributed\n€3.2 million in profits but faced a decline in year-over-year revenue by 8%. The Asia-Pacific\nregion excelled, with a remarkable 20% growth in revenue, accompanied by ¥2 billion in\noperational expenses. The CFO emphasized that the company's strategic investments in green\ntechnology totaling $1.2 million globally would ensure sustained growth. Future projections\nestimate a 10% increase in revenue and net profits to rise by $5 million over the next fiscal year"

### 1. Monetary Analysis

In [8]:
# Define a pattern that matches currency amounts followed by either "million" or "billion."
pattern = re.compile("([$€¥]\d+(\.\d+)? (million|billion))")

In [9]:
"""Searches for all occurrences of the defined currency pattern in the string
'scenario_1' and stores the matches in the variable 'match'.""" 

match = pattern.findall(scenario_1)
match

[('$4.5 million', '.5', 'million'),
 ('€3.2 million', '.2', 'million'),
 ('¥2 billion', '', 'billion'),
 ('$1.2 million', '.2', 'million'),
 ('$5 million', '', 'million')]

In [10]:
# Creates a list of the first element of each match found in 'match',extracting the relevant monetary amounts."""  

monetary_amounts = [match[0] for match in match]
monetary_amounts

['$4.5 million', '€3.2 million', '¥2 billion', '$1.2 million', '$5 million']

In [17]:
# Categorization
categories = {
    "revenue": [],
    "profits": [],
    "costs": [],
    "investments": [],
}

for match in monetary_amounts:
    # Check for "revenue" before or after the match
    if "revenue" in scenario_1.split(match)[0][-12:]:
        categories["revenue"].append(match)
    # Check for "profits" before or after the match
    elif "profits" in scenario_1.split(match)[0][-50:] or "profits" in scenario_1.split(match)[1][:12]:
        categories["profits"].append(match)
    # Check for "costs" after the match (with a broader search range)
    elif "expenses" in scenario_1.split(match)[1][:50]:
        categories["costs"].append(match)
    # Check for "investments" before the match
    elif "investments" in scenario_1.split(match)[0][-60:]:
        categories["investments"].append(match)

# Output Results
print("Monetary Amounts Categorized:")
categories

Monetary Amounts Categorized:


{'revenue': [],
 'profits': ['$4.5 million', '€3.2 million', '$5 million'],
 'costs': ['¥2 billion'],
 'investments': ['$1.2 million']}

### 2. Trend Extraction

In [8]:
pattern_2 = re.compile("(\d+%)")

In [9]:
match_2 = pattern_2.findall(scenario_1)
match_2

['12%', '5%', '8%', '20%', '10%']

In [10]:
# Categorization
percentage_categories = {
    "growth": [],
    "decline": [],
    "projections": [],
}

""" Iterates through each match in 'match_2', checking specific keywords in
    the surrounding text of 'scenario_1' to categorize the matches into 'growth', 'decline', or 'projections'.
    """
for match in match_2:
    if "growth" in scenario_1.split(match)[0][-20:] or "growth"  in scenario_1.split(match)[1][:10]:
        percentage_categories["growth"].append(match)
    elif "decline" in scenario_1.split(match)[0][-100:]:
        percentage_categories["decline"].append(match)
    elif "projections" in scenario_1.split(match)[0][-40:]:
        percentage_categories["projections"].append(match)
        
print("\nPercentage Changes Categorized:")
print(percentage_categories)


Percentage Changes Categorized:
{'growth': ['12%', '5%', '20%'], 'decline': ['8%'], 'projections': ['10%']}


### 3. Prediction Redaction

In [11]:
'''This line replaces the phrase "Future projections\nestimate" in 'scenario_1' with
   "[Projections Redacted]" and stores the result in 'new_scenario'.'''

new_scenario = re.sub("Future projections\nestimate", "[Projections Redacted]", scenario_1)

In [12]:
print(new_scenario)

The quarterly financial review of George’s Corporation presented insights into its performance
across regions. The North American division reported revenue growth of 12%, with net profits of
$4.5 million, despite a 5% increase in operational costs. Europe, on the other hand, contributed
€3.2 million in profits but faced a decline in year-over-year revenue by 8%. The Asia-Pacific
region excelled, with a remarkable 20% growth in revenue, accompanied by ¥2 billion in
operational expenses. The CFO emphasized that the company's strategic investments in green
technology totaling $1.2 million globally would ensure sustained growth. [Projections Redacted] a 10% increase in revenue and net profits to rise by $5 million over the next fiscal year


## Scenario 2: Academic Research Abstract 

In [1]:
scenario_2 = """Banking institutions in developing economies are undergoing a massive transformation. Studies
by Ahmed & Kofi (2020), Smith et al. (2019), and Brown (2022) emphasize the role of digital
banking in enhancing customer satisfaction. These findings are consistent with Johnson's (2018)
exploration of AI-driven banking solutions. Despite these advancements, reports from Baker and
Thompson (2021) highlight persistent challenges in cybersecurity. Additionally, the latest research
(Harper et al., 2023) suggests that over 60% of banks are now investing in blockchain technology
to address these issues."""

### 1. Extracting Citations

In [3]:
pattern_2 = re.compile(r"[A-Z][a-z]+(?: & [A-Z][a-z]+)?(?: et al\.)?(?:'s)? \(\d{4}\)|\(Harper et al\., \d{4}\)")
match_2 = pattern_2.findall(scenario_2)
match_2

['Ahmed & Kofi (2020)',
 'Smith et al. (2019)',
 'Brown (2022)',
 "Johnson's (2018)",
 'Thompson (2021)',
 '(Harper et al., 2023)']

In [5]:
# Separate into single-author and multi-author references
single_author = []
multi_author = []

for match in match_2:
    # Check if the match contains conjunctions or "et al." to determine authorship
    if " & " in match or "et al." in match or " and " in match_2:
        multi_author.append(match)
    else:
        single_author.append(match)

# Print categorized references
print("Single-Author References:", single_author)
print("Multi-Author References:", multi_author)

Single-Author References: ['Brown (2022)', "Johnson's (2018)", 'Thompson (2021)']
Multi-Author References: ['Ahmed & Kofi (2020)', 'Smith et al. (2019)', '(Harper et al., 2023)']


### 2. Analyzing Trends

In [16]:
# Regex pattern for technological terms
tech_pattern = r"\b(?:digital|AI-driven|blockchain|cybersecurity)\b(?: [a-z]+)?(?: [a-z]+)?(?: [a-z]+)?"

# Find all technological terms
technological_terms = re.findall(tech_pattern, scenario_2, re.IGNORECASE)

# Clean and print results
technological_terms = [term.strip() for term in technological_terms]
print("Technological Terms:", technological_terms)

Technological Terms: ['digital', 'AI-driven banking solutions', 'cybersecurity', 'blockchain technology']


### 3. Year Analysis

In [17]:
pattern_3 = re.compile(r"(\d{4})")
years_match = pattern_3.findall(scenario_2)
years_match

['2020', '2019', '2022', '2018', '2021', '2023']

In [18]:
print(len(years_match))

6


In [19]:
sub_years = re.sub(pattern_3, "[Year Redacted]", scenario_2)

In [20]:
sub_years

"Banking institutions in developing economies are undergoing a massive transformation. Studies\nby Ahmed & Kofi ([Year Redacted]), Smith et al. ([Year Redacted]), and Brown ([Year Redacted]) emphasize the role of digital\nbanking in enhancing customer satisfaction. These findings are consistent with Johnson's ([Year Redacted])\nexploration of AI-driven banking solutions. Despite these advancements, reports from Baker and\nThompson ([Year Redacted]) highlight persistent challenges in cybersecurity. Additionally, the latest research\n(Harper et al., [Year Redacted]) suggests that over 60% of banks are now investing in blockchain technology\nto address these issues."

## Scenario 3: Transaction Audit Report 

In [21]:
scenario_3 = """*The finance team at ABC Inc. released a report detailing recent transactions: 
T20001 - Completed 
T20002 - Pending 
T20003 - Failed 
T20004 - Completed 
Additionally, the report included refund transactions such as R2001 - Approved and P3003 - 
Pending. The management highlighted that failed transactions (e.g., T20003) require immediate 
review. They also noted a spike in refunds, with over 15% of total transactions categorized as 
"Refunds."
"""

### 1. Transaction Categorization

In [22]:
report_match = re.findall("[TRP]\d+", scenario_3)
report_match

['T20001', 'T20002', 'T20003', 'T20004', 'R2001', 'P3003', 'T20003']

In [23]:
# Initialize classifications
categories_2 = {
    "Completed": [],
    "Pending": [],
    "Failed": [],
    "Other": []
}


for match in report_match:
    if "Completed" in scenario_3.split(match)[1][:15]: 
        categories_2["Completed"].append(match)
    elif "Pending" in scenario_3.split(match)[1][:15]:
        categories_2["Pending"].append(match)
    elif "Failed" in scenario_3.split(match)[1][:15]:
        categories_2["Failed"].append(match)
    elif "Approved" in scenario_3.split(match)[1][:15]:
        categories_2["Other"].append(match)


print("\nCategorized:")
print(categories_2)


Categorized:
{'Completed': ['T20001', 'T20004'], 'Pending': ['T20002', 'P3003'], 'Failed': ['T20003', 'T20003'], 'Other': ['R2001']}


In [24]:
# calculates the number of items in each category ('Completed', 'Pending', 'Failed', and 'Other') from 'categories_2' and stores the counts in respective variables.
number_of_completed = len(categories_2["Completed"])
number_of_pending = len(categories_2["Pending"])
number_of_failed = len(set(categories_2["Failed"]))
number_of_other = len(categories_2["Other"])

In [25]:
# Print the number of items in each category

print(f"The number of transactions that fall under 'Completed' is {number_of_completed}.")
print(f"The number of transactions that fall under 'Pending' is {number_of_pending}.")
print(f"The number of transactions that fall under 'Failed' is {number_of_failed}.")
print(f"The number of transactions that fall under 'Other' is {number_of_other}.")

The number of transactions that fall under 'Completed' is 2.
The number of transactions that fall under 'Pending' is 2.
The number of transactions that fall under 'Failed' is 1.
The number of transactions that fall under 'Other' is 1.


### 2. Redaction

In [26]:
redaction = re.sub("Failed", " [Under Review]", scenario_3, flags=re.I)

In [27]:
redaction

'*The finance team at ABC Inc. released a report detailing recent transactions: \nT20001 - Completed \nT20002 - Pending \nT20003 -  [Under Review] \nT20004 - Completed \nAdditionally, the report included refund transactions such as R2001 - Approved and P3003 - \nPending. The management highlighted that  [Under Review] transactions (e.g., T20003) require immediate \nreview. They also noted a spike in refunds, with over 15% of total transactions categorized as \n"Refunds."\n'

### 3. Refund Analysis

In [28]:
refund = re.findall(r"([RP]\d+)", scenario_3)

In [29]:
refund

['R2001', 'P3003']

In [30]:
print("Total number of refunds:", len(refund))

Total number of refunds: 2


## Scenario 4: Customer Communication Analysis 

In [31]:
scenario_4 = """*The customer support team at Global Banking Inc. compiled a list of queries received via email 
and web submissions: 
• Email: helpdesk@globalbanking.com 
• Email: complaints@globalbanking.com 
• Website: www.globalbanking.com/feedback 
• Website: https://secure.globalbanking.com/contact 
To improve support quality, management wants to analyze the communication channels and redact 
sensitive information before sharing the data with external partners. *"""

### 1. Extraction Tasks

In [32]:
pattern_4 = re.compile("([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})")

In [33]:
email_address_match = pattern_4.findall(scenario_4)
email_address_match

['helpdesk@globalbanking.com', 'complaints@globalbanking.com']

In [34]:
pattern_5 = re.compile("(https?://[^\s]+|www\.[^\s]+)")

In [35]:
web_url_match = pattern_5.findall(scenario_4)
web_url_match

['www.globalbanking.com/feedback', 'https://secure.globalbanking.com/contact']

### 2. Redaction Tasks

In [36]:
email_redaction = re.sub(pattern_4, "[email protected]", scenario_4)
new_scenario_4 = re.sub(pattern_5, "[Website Redacted]", email_redaction)
print(new_scenario_4)

*The customer support team at Global Banking Inc. compiled a list of queries received via email 
and web submissions: 
• Email: [email protected] 
• Email: [email protected] 
• Website: [Website Redacted] 
• Website: [Website Redacted] 
To improve support quality, management wants to analyze the communication channels and redact 
sensitive information before sharing the data with external partners. *


### 3. Count Analysis

In [37]:
print("Total number of email addresses extracted:", len(email_address_match))
print("Total number of URLs extracted:", len(web_url_match))

Total number of email addresses extracted: 2
Total number of URLs extracted: 2


## Scenario 5: Blame AI for your grueling Job Interview 

In [38]:
scenario_5 = """Job interviews are laden with opportunities for humiliation. Who wants to describe their greatest 
weakness to a panel of peers? Or be made to feel like a quiz show contestant with brainteasers like 
“how many golf balls can you fit in a Boeing 747?”. Unfortunately for job seekers, the rigmarole 
is getting increasingly out of hand.  
Demands from hiring committees in the tech sector are piling up. That means more interviews but 
also more technical tests. Alongside coding evaluations come requests for essays, lengthy take
home assignments and even days spent working with existing teams. One friend in the Bay Area 
made it through multiple interviews rounds only to be presented with a final challenge to 
“entertain” the company’s leadership. There were no other instructions. She didn’t get the job.  
Recruiters will say that this is not being done to make life difficult for job seekers but because it 
is growing harder to find the right candidates. The blame, they say, lies with job seekers 
themselves. Online postings make speculative applications easy to fire off. In the UK, the Institute 
of Student Employers reported receiving a record 1.2mn applications for 17,000 graduate 
vacancies this year. Human resources software maker Workday reports that the number of global 
job applications is growing four times faster than job openings.  
This surplus includes those from candidates who are logging into AI chatbot ChatGPT to tailor 
their application with skills they may not possess. Some even try to trick recruiting software by 
writing in white text — listing requirements they lack in ways that will be invisible to the human 
eye but picked up by screening software.  
From an employer’s point of view, therefore, adding new hoops for candidates to jump through 
makes sense. AI-assisted applications can mask poor candidates whose failings are revealed in 
multiple interviews. And the likeable smooth talker who sails through in-person meetings may 
come undone by on-site tests or work trials.  
At some companies it is not enough to be good at your job, either. You need to show commitment 
to the company ethos. Amazon is known for assessing candidates on its 16 leadership principles. 
Fail to prove your customer obsession or ability to think big and you’ll find yourself back on the 
job market. The problem is that adding more interviews and tests exhaust candidates and 
interviewers and take everyone’s time away from the real work. In even more galling news, they 
may not even be productive. In 2016, Google declared that four interviews were enough to predict 
whether someone should be hired. According to the company, anything more than that had 
diminishing returns.   
Still, this rule does not count the barriers put in place prior to interviews. Young job seekers 
sometimes complain that their parents still believe it’s possible to put on a smart suit and hand a 
resume to the front desk when looking for work. In reality, the standard process already includes 
an online application, resume screening and online assessment before on-site meetings take place. 
Application tracking software like Oracle’s Taleo are used to filter out candidates before they get 
a chance to interact with anyone at a company. Unsuccessful applicants can be ghosted.  
As the process expands, so does the time it takes to secure a job. Research from US human 
resources adviser Josh Bersin put the average at 45 days. In fields like tech, it can be far longer. 
Software engineer Rohit Verma has blogged about his experience securing roles in large US tech 
companies. At Meta, he writes that it took about four months from referral to job offer.   
This would be more palatable if it wasn’t for the tech sector’s newfound love of mass lay-offs. 
After engaging in a hiring spree during the pandemic, the sector cut its workforce by an estimated 
264,000 last year, according to crowdsourced site Layoffs.fyi. So far this year companies including 
TikTok and Snap have cut over 149,000 jobs. That means some employees who engaged in these 
intensive interview processes are now out of a job. Where tech leads, other sectors tend to follow. 
Expect the hiring process to grow more excruciating in your own line of work soon enough.  
The good news is that there is always an exception. Sometimes, landing a job can be as simple as 
sending a tweet. In 2019, a 28-year-old Brit was hired to lead Tesla’s social media after he posted 
a picture of a giant ram with the caption: “Look at this absolute unit”. Still, taking Elon Musk’s 
fancy proved no more foolproof than any other hiring concept. Within a year, the social media 
manager had left the role. 
elaine.moore@ft.com  
Copyright The Financial Times Limited 2024. All rights reserved."""

### 1. Text Search and Matching

#### a. Finding Words

In [39]:
pattern_5a = re.compile("job", flags=re.I)

In [40]:
word_match = pattern_5a.findall(scenario_5)
print(word_match)

['Job', 'job', 'job', 'job', 'job', 'job', 'job', 'job', 'job', 'job', 'job', 'job', 'job', 'job', 'job']


#### b. Locating Sentences with "AI"

In [41]:
# Split text into sentences using periods as delimiters
sentences = re.split(r'(?<!\d)\.\s+', scenario_5)

In [42]:
# Find and print sentences containing the word "AI"
matching_sentences = [sentence for sentence in sentences if re.search(r'\bAI\b', sentence)]

print("Sentences containing 'AI':")
for sentence in matching_sentences:
    print(f"\n - {sentence.strip()}")

Sentences containing 'AI':

 - This surplus includes those from candidates who are logging into AI chatbot ChatGPT to tailor 
their application with skills they may not possess

 - AI-assisted applications can mask poor candidates whose failings are revealed in 
multiple interviews


#### c. Validating Email:

In [43]:
# Using regex patterns for emails in Scenario 4

email_match = re.match(pattern_4, scenario_5)
print(email_match)

None


### 2. Data Sanitization 

#### a. Removing Numbers

In [44]:
pattern_5b = re.compile("(\d+)")

In [45]:
numbers_match = pattern_5b.sub("[NUMBER]", scenario_5)
numbers_match

'Job interviews are laden with opportunities for humiliation. Who wants to describe their greatest \nweakness to a panel of peers? Or be made to feel like a quiz show contestant with brainteasers like \n“how many golf balls can you fit in a Boeing [NUMBER]?”. Unfortunately for job seekers, the rigmarole \nis getting increasingly out of hand.  \nDemands from hiring committees in the tech sector are piling up. That means more interviews but \nalso more technical tests. Alongside coding evaluations come requests for essays, lengthy take\nhome assignments and even days spent working with existing teams. One friend in the Bay Area \nmade it through multiple interviews rounds only to be presented with a final challenge to \n“entertain” the company’s leadership. There were no other instructions. She didn’t get the job.  \nRecruiters will say that this is not being done to make life difficult for job seekers but because it \nis growing harder to find the right candidates. The blame, they say, 

#### b. Redacting Emails:

In [46]:
email_address_pattern = re.compile("[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")

In [47]:
email_address_match_02 = email_address_pattern.sub("[EMAIL REDACTED]", numbers_match)
email_address_match_02

'Job interviews are laden with opportunities for humiliation. Who wants to describe their greatest \nweakness to a panel of peers? Or be made to feel like a quiz show contestant with brainteasers like \n“how many golf balls can you fit in a Boeing [NUMBER]?”. Unfortunately for job seekers, the rigmarole \nis getting increasingly out of hand.  \nDemands from hiring committees in the tech sector are piling up. That means more interviews but \nalso more technical tests. Alongside coding evaluations come requests for essays, lengthy take\nhome assignments and even days spent working with existing teams. One friend in the Bay Area \nmade it through multiple interviews rounds only to be presented with a final challenge to \n“entertain” the company’s leadership. There were no other instructions. She didn’t get the job.  \nRecruiters will say that this is not being done to make life difficult for job seekers but because it \nis growing harder to find the right candidates. The blame, they say, 

#### c. Removing Special Characters

In [48]:
remove_special_characters = re.sub('[.,!?\'":;()\-]', " ", email_address_match_02)
remove_special_characters

'Job interviews are laden with opportunities for humiliation  Who wants to describe their greatest \nweakness to a panel of peers  Or be made to feel like a quiz show contestant with brainteasers like \n“how many golf balls can you fit in a Boeing [NUMBER] ”  Unfortunately for job seekers  the rigmarole \nis getting increasingly out of hand   \nDemands from hiring committees in the tech sector are piling up  That means more interviews but \nalso more technical tests  Alongside coding evaluations come requests for essays  lengthy take\nhome assignments and even days spent working with existing teams  One friend in the Bay Area \nmade it through multiple interviews rounds only to be presented with a final challenge to \n“entertain” the company’s leadership  There were no other instructions  She didn’t get the job   \nRecruiters will say that this is not being done to make life difficult for job seekers but because it \nis growing harder to find the right candidates  The blame  they say  

### 3. Text Extraction and Tokenization

#### a. Splitting Text by Sentences

In [49]:
print("Total number of sentences in text:", len(sentences))

Total number of sentences in text: 49


#### b. Extracting Dates

In [50]:
years_match_02 = re.findall(pattern_3, scenario_5)

print("All year refernces in text:")
print("\n",years_match_02)

All year refernces in text:

 ['2016', '2019', '2024']


#### c. Identify Capitalized Words

In [51]:
capitalized_words_pattern = r'\b[A-Z]+\b'

In [52]:
capitalized_words_match = re.findall(capitalized_words_pattern, scenario_5)
print("List of all capitalized words in text: ")
print("\n",capitalized_words_match)

List of all capitalized words in text: 

 ['UK', 'AI', 'AI', 'US', 'US']


### 4. Pattern Compilation for Reusability 

#### a. Define Patterns for Key Concepts:

##### • Extracting email addresses

In [53]:
email_pattern = re.compile(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}')

In [54]:
email_compile_match = email_pattern.findall(scenario_5)
email_compile_match

['elaine.moore@ft.com']

##### • Identifying sentences with words like "AI" or "job"

In [55]:
# Define the reusable pattern using re.compile
ai_jobs_pattern = re.compile(r'\b(?:AI|job)\b', re.IGNORECASE)

# Split text into sentences
sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', scenario_5)

# Find sentences containing "AI" or "job"
matching_sentences = [sentence for sentence in sentences if ai_jobs_pattern.search(sentence)]

# Print matching sentences
for sentence in matching_sentences:
    print(sentence)

Job interviews are laden with opportunities for humiliation.
Unfortunately for job seekers, the rigmarole 
is getting increasingly out of hand.
She didn’t get the job.
 
Recruiters will say that this is not being done to make life difficult for job seekers but because it 
is growing harder to find the right candidates.
The blame, they say, lies with job seekers 
themselves.
Human resources software maker Workday reports that the number of global 
job applications is growing four times faster than job openings.
 
This surplus includes those from candidates who are logging into AI chatbot ChatGPT to tailor 
their application with skills they may not possess.
AI-assisted applications can mask poor candidates whose failings are revealed in 
multiple interviews.
 
At some companies it is not enough to be good at your job, either.

Fail to prove your customer obsession or ability to think big and you’ll find yourself back on the 
job market.
Young job seekers 
sometimes complain that their p

##### • Validating transaction-like formats

In [56]:
validating_formats_patterns = re.compile('[A-Z]\d+')

In [58]:
validating_formats_match = validating_formats_patterns.findall(scenario_5)
print(validating_formats_match)

[]


### 5. Text Statistics and Insights 

#### a. Count Instances

In [59]:
keyword_match_01 = re.findall("interview", scenario_5, flags=re.I)
print("The word 'interview' appears", len(keyword_match_01), "times in the write-up.")

The word 'interview' appears 9 times in the write-up.


In [60]:
keyword_match_02 = re.findall("application", scenario_5, flags=re.I)
print("The word 'application' appears", len(keyword_match_02), "times in the write-up.")

The word 'application' appears 7 times in the write-up.


#### b. Word Length Analysis

In [61]:
word_len_match = re.findall(r'\b\w{9,}\b', scenario_5, flags=re.I)

print('List of  all words of length greater than 8 characters:')
print('\n', word_len_match)

List of  all words of length greater than 8 characters:

 ['interviews', 'opportunities', 'humiliation', 'contestant', 'brainteasers', 'Unfortunately', 'rigmarole', 'increasingly', 'committees', 'interviews', 'technical', 'Alongside', 'evaluations', 'assignments', 'interviews', 'presented', 'challenge', 'entertain', 'leadership', 'instructions', 'Recruiters', 'difficult', 'candidates', 'themselves', 'speculative', 'applications', 'Institute', 'Employers', 'receiving', 'applications', 'vacancies', 'resources', 'applications', 'candidates', 'application', 'recruiting', 'requirements', 'invisible', 'screening', 'therefore', 'candidates', 'applications', 'candidates', 'interviews', 'companies', 'commitment', 'assessing', 'candidates', 'leadership', 'principles', 'obsession', 'interviews', 'candidates', 'interviewers', 'productive', 'interviews', 'According', 'diminishing', 'interviews', 'sometimes', 'application', 'screening', 'assessment', 'Application', 'candidates', 'Unsuccessful', 'app

#### c. Extract Quoted Texts

In [62]:
quoted_texts_pattern = re.compile(r'“([^”]+)”', flags=re.I)

In [63]:
quoted_phrases = quoted_texts_pattern.findall(scenario_5)

# Print the quoted phrases
print(quoted_phrases)

['how many golf balls can you fit in a Boeing 747?', 'entertain', 'Look at this absolute unit']
