In [1]:
import pandas as pd
import numpy as np

In [2]:
terms = pd.read_csv("data/Standardised terms.csv")
terms

Unnamed: 0,Optimal performance
0,Utilise resources
1,Enhance productivity
2,Conduct an analysis
3,Maintain a high standard
4,Implement best practices
5,Ensure compliance
6,Streamline operations
7,Foster innovation
8,Drive growth
9,Leverage synergies


In [3]:
with open('data/sample_text.txt', 'r') as f:
    sample_text = f.read()
print(sample_text)

In today's meeting, we discussed a variety of issues affecting our department. The weather was unusually sunny, a pleasant backdrop to our serious discussions. We came to the consensus that we need to do better in terms of performance. Sally brought doughnuts, which lightened the mood. It's important to make good use of what we have at our disposal. During the coffee break, we talked about the upcoming company picnic. We should aim to be more efficient and look for ways to be more creative in our daily tasks. Growth is essential for our future, but equally important is building strong relationships with our team members. As a reminder, the annual staff survey is due next Friday. Lastly, we agreed that we must take time to look over our plans carefully and consider all angles before moving forward. On a side note, David mentioned that his cat is recovering well from surgery.


In [4]:
import re
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download("punkt")

input_text = sample_text
def preparing_sentence(sentences):
    sentences = [sentence.lower().strip() for sentence in sentences]
    sentences = [re.sub(r'[^a-z0-9\s]', ' ', sentence) for sentence in sentences]
    sentences = [[word for word in word_tokenize(sentence) if word not in set(stopwords.words('english'))] for sentence in sentences]
    return sentences

sentences = sent_tokenize(input_text)
sentences = preparing_sentence(sentences)
print(sentences)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\magad\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\magad\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


[['today', 'meeting', 'discussed', 'variety', 'issues', 'affecting', 'department'], ['weather', 'unusually', 'sunny', 'pleasant', 'backdrop', 'serious', 'discussions'], ['came', 'consensus', 'need', 'better', 'terms', 'performance'], ['sally', 'brought', 'doughnuts', 'lightened', 'mood'], ['important', 'make', 'good', 'use', 'disposal'], ['coffee', 'break', 'talked', 'upcoming', 'company', 'picnic'], ['aim', 'efficient', 'look', 'ways', 'creative', 'daily', 'tasks'], ['growth', 'essential', 'future', 'equally', 'important', 'building', 'strong', 'relationships', 'team', 'members'], ['reminder', 'annual', 'staff', 'survey', 'due', 'next', 'friday'], ['lastly', 'agreed', 'must', 'take', 'time', 'look', 'plans', 'carefully', 'consider', 'angles', 'moving', 'forward'], ['side', 'note', 'david', 'mentioned', 'cat', 'recovering', 'well', 'surgery']]


In [5]:
standardized_phrases = list(terms['Optimal performance'].values)
standardized_phrases = [s.lower() for s in standardized_phrases]
standardized_phrases

['utilise resources',
 'enhance productivity',
 'conduct an analysis',
 'maintain a high standard',
 'implement best practices',
 'ensure compliance',
 'streamline operations',
 'foster innovation',
 'drive growth',
 'leverage synergies',
 'demonstrate leadership',
 'exercise due diligence',
 'maximize stakeholder value',
 'prioritise tasks',
 'facilitate collaboration',
 'monitor performance metrics',
 'execute strategies',
 'gauge effectiveness',
 'champion change']

In [6]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
standardized_matrix = vectorizer.fit_transform(standardized_phrases)

for snt in sentences:
    sentence_vector = vectorizer.transform([' '.join(snt)])
    similarity_scores = cosine_similarity(sentence_vector, standardized_matrix)
    best_match_index = similarity_scores.argmax()
    best_match = standardized_phrases[best_match_index]

    if similarity_scores[0][best_match_index] > 0.5:
        print(f"Replace '{' '.join(snt)}' with '{best_match}'")

Replace 'came consensus need better terms performance' with 'monitor performance metrics'
Replace 'aim efficient look ways creative daily tasks' with 'prioritise tasks'
Replace 'growth essential future equally important building strong relationships team members' with 'drive growth'
Replace 'reminder annual staff survey due next friday' with 'exercise due diligence'
