In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from difflib import SequenceMatcher

# Ensure the necessary NLTK data is downloaded.
# word_tokenize relies on the Punkt tokenizer models: older NLTK releases load
# them from the 'punkt' package, while newer releases (3.9+) look for
# 'punkt_tab' instead and raise LookupError without it. Both are requested so
# the cell works on either; nltk.download skips packages already up to date.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Function to tokenize and clean text
def preprocess_text(text):
    """Return *text* lower-cased, tokenized and stripped of noise words.

    The text is lower-cased and split with NLTK's ``word_tokenize``. A token
    is kept only if it is purely alphabetic and is not in NLTK's English
    stop-word list. The surviving tokens are joined with single spaces.

    Args:
        text: The raw text to clean.

    Returns:
        A single space-separated string of the kept tokens (empty if none
        survive the filter).
    """
    english_stopwords = set(stopwords.words('english'))

    kept_tokens = []
    for token in word_tokenize(text.lower()):
        # Skip punctuation/numbers/mixed tokens and common stop words.
        if not token.isalpha() or token in english_stopwords:
            continue
        kept_tokens.append(token)

    return " ".join(kept_tokens)

# Function to compare two reports
def compare_reports(past_report, current_report):
    """Compare two reports and summarise how their vocabulary weights shifted.

    Both reports are cleaned with ``preprocess_text``. The result combines:

    * a character-level similarity ratio of the two cleaned texts, computed
      with ``difflib.SequenceMatcher`` and shown as a percentage;
    * a TF-IDF model fitted on just these two cleaned texts, from which every
      term whose weight is higher in the current report is listed under
      "Strengths" and every term whose weight is lower under "Weaknesses".

    NOTE: the strength/weakness labels reflect only the direction of the
    TF-IDF weight change for a term; no sentiment analysis is performed.

    Args:
        past_report: Text of the earlier report.
        current_report: Text of the later report.

    Returns:
        A multi-line, human-readable string with the similarity percentage
        and the two comma-separated keyword lists (or a "No significant ..."
        placeholder when a list is empty).
    """
    # Preprocess the reports
    past_clean = preprocess_text(past_report)
    current_clean = preprocess_text(current_report)

    # Use SequenceMatcher to find the similarity ratio
    similarity_ratio = SequenceMatcher(None, past_clean, current_clean).ratio()

    strength_keywords = []
    weakness_keywords = []

    # Vectorize the texts using TF-IDF
    vectorizer = TfidfVectorizer()
    try:
        vectors = vectorizer.fit_transform([past_clean, current_clean])
    except ValueError:
        # scikit-learn raises ValueError ("empty vocabulary") when neither
        # cleaned text contains a usable token, e.g. both reports are empty
        # or consist only of stop words. There is then nothing to compare,
        # so both keyword lists simply stay empty.
        vectors = None

    if vectors is not None:
        # Row 0 holds the past report's weights, row 1 the current report's;
        # columns line up with the vectorizer's feature names, so the three
        # sequences can be walked in lockstep without any index lookups.
        past_weights, current_weights = vectors.toarray()
        feature_names = vectorizer.get_feature_names_out()
        for word, past_value, current_value in zip(
            feature_names, past_weights, current_weights
        ):
            if current_value > past_value:
                strength_keywords.append(word)
            elif current_value < past_value:
                weakness_keywords.append(word)

    # Output the comparison
    output = f"Similarity Ratio between reports: {similarity_ratio*100:.2f}%\n"
    output += "\nStrengths in the current report:\n"
    output += ", ".join(strength_keywords) if strength_keywords else "No significant improvements."

    output += "\n\nWeaknesses in the current report:\n"
    output += ", ".join(weakness_keywords) if weakness_keywords else "No significant weaknesses."

    return output

# Example input: an earlier report and a later report to compare.
# The triple-quoted literals include the newline right after the opening
# quotes and right before the closing quotes.
past_report = """
The institution had strong infrastructure but lacked in faculty training programs. The classrooms were well-maintained, but the equipment in labs was outdated.
"""
current_report = """
The infrastructure remains strong, and the faculty training programs have improved. However, the classrooms now show signs of wear, and the lab equipment is still outdated.
"""

# Compare the reports and print the formatted summary string that
# compare_reports returns.
result = compare_reports(past_report, current_report)
print(result)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...


Similarity Ratio between reports: 64.63%

Strengths in the current report:
however, improved, lab, remains, show, signs, still, wear

Weaknesses in the current report:
classrooms, equipment, faculty, infrastructure, institution, labs, lacked, outdated, programs, strong, training


[nltk_data]   Unzipping corpora\stopwords.zip.
