# Project Description Summarizer for MentorMatrix

This notebook implements a text summarization model specifically tuned for academic project descriptions. It can generate concise summaries that highlight the key aspects of project proposals, research papers, and technical documents.

In [None]:
# Install required packages
!pip install bert-extractive-summarizer transformers datasets torch nltk scikit-learn rouge-score Flask

Collecting bert-extractive-summarizer
  Downloading bert_extractive_summarizer-0.10.1-py3-none-any.whl.metadata (15 kB)
Downloading bert_extractive_summarizer-0.10.1-py3-none-any.whl (25 kB)
Installing collected packages: bert-extractive-summarizer
Successfully installed bert-extractive-summarizer-0.10.1


## Data Preparation and Preprocessing

We'll load a dataset and prepare it for training our model. We'll use the CNN/DailyMail dataset for general summarization capability and add a few custom academic project descriptions for domain-specific tuning.

In [None]:
from transformers import pipeline
from datasets import load_dataset
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import pandas as pd
import re

# Ensure NLTK resources are downloaded.
# 'punkt_tab' is requested alongside 'punkt' because recent NLTK releases load
# the tokenizer data under that name; with quiet=True an unknown resource name
# is simply skipped on older releases.
for nltk_resource in ('punkt', 'punkt_tab', 'wordnet', 'omw-1.4'):
    nltk.download(nltk_resource, quiet=True)

# Start from empty lists so the cells below still find these names (and can
# still add the custom project examples) if the dataset cannot be loaded.
train_texts, train_summaries = [], []
test_texts, test_summaries = [], []

# Load the CNN/DailyMail dataset
try:
    dataset = load_dataset("cnn_dailymail", "3.0.0")

    # Slice the rows first and pick the columns afterwards, so only the rows
    # we keep are read instead of a whole column being pulled in and then cut.
    # Fewer samples are used for faster training.
    train_subset = dataset["train"][:500]
    test_subset = dataset["test"][:50]

    # Plain lists: later cells call .extend() on the training lists.
    train_texts = list(train_subset["article"])
    train_summaries = list(train_subset["highlights"])

    test_texts = list(test_subset["article"])
    test_summaries = list(test_subset["highlights"])

    print(f"Loaded {len(train_texts)} training samples and {len(test_texts)} test samples.")
except Exception as e:
    # Best effort: report the failure and continue with the empty defaults.
    print(f"Error loading dataset: {e}")

# Initialize lemmatizer (shared by preprocess_text)
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Reduce a document to lowercase, lemmatized tokens.

    When the text contains labelled sections (``Title:``, ``Abstract:``,
    ``Problem Statement:``, ``Methodology:``, ``Tech Stack:`` and the
    synonyms listed in ``patterns``), only the bodies of those sections are
    kept, in that order. Otherwise the whole text is used.

    Args:
        text: Raw document text.

    Returns:
        The selected text, lowercased, tokenized and lemmatized, joined with
        single spaces. If any step fails, the error is printed and the text
        is returned instead (whitespace-normalized if normalization itself
        succeeded, unchanged otherwise).
    """
    try:
        # Tidy each line but KEEP the line breaks: the section patterns below
        # rely on end-of-line to know where a section body stops. Collapsing
        # newlines first would make every section run to the end of the text.
        tidy_lines = (re.sub(r'\s+', ' ', line).strip() for line in text.splitlines())
        structured_text = '\n'.join(line for line in tidy_lines if line)

        # Fully flattened text: used when no labelled section is found, and
        # returned as the fallback if a later step raises.
        text = re.sub(r'\s+', ' ', text).strip()

        # A line starting with a short "Label:" (one or more words) opens a new
        # section, so it must not be absorbed as a continuation of the previous one.
        next_label = r'(?![A-Za-z][A-Za-z /&-]{0,40}:)'
        section_body = r'([^\n]+(?:\n' + next_label + r'[^\n]+)*)'
        patterns = {
            'title': r'(?i)\b(title|project title):\s*([^\n]+)',
            'abstract': r'(?i)\b(abstract|summary):\s*' + section_body,
            'problem': r'(?i)\b(problem statement|problem|challenge):\s*' + section_body,
            'methodology': r'(?i)\b(methodology|proposed solution|approach):\s*' + section_body,
            'tech': r'(?i)\b(tech stack|technologies|tools):\s*' + section_body,
        }

        # Extract key sections using the patterns common in project descriptions
        sections = {}
        for key, pattern in patterns.items():
            match = re.search(pattern, structured_text)
            if match:
                # Multi-line bodies are flattened to a single line
                sections[key] = re.sub(r'\s+', ' ', match.group(2)).strip()

        # Keep only the labelled sections when any were found
        processed_text = ' '.join(sections.values()) if sections else text

        # Tokenize and lemmatize
        tokens = word_tokenize(processed_text.lower())
        lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokens]
        return " ".join(lemmatized_tokens)
    except Exception as e:
        print(f"Error processing text: {e}")
        return text  # Fall back to the (possibly normalized) input text

# Custom project descriptions for domain-specific training.
# Every entry uses the same labelled layout (Title, Abstract, Problem Statement,
# Proposed Methodology, Tech Stack), which is the layout preprocess_text looks for.
project_descriptions = [
    """Title: AI-Powered Resume Analyzer
    Abstract: A machine learning-based tool that analyzes resumes and provides feedback on improvements.
    Problem Statement: Recruiters receive thousands of resumes, making it difficult to manually analyze each one efficiently.
    Proposed Methodology: Using NLP and ML algorithms to extract key information from resumes, match them with job descriptions, and provide AI-generated suggestions.
    Tech Stack: Python, TensorFlow, Node.js, MongoDB, React""",
    
    """Title: Smart Agriculture Monitoring System
    Abstract: An IoT-based system for monitoring soil conditions and automating irrigation.
    Problem Statement: Traditional farming methods lack precision in water usage and monitoring of soil health.
    Proposed Methodology: Utilizing IoT sensors to collect real-time data on soil moisture, pH, and temperature. Machine learning algorithms predict optimal watering schedules.
    Tech Stack: Arduino, ESP32, Python, TensorFlow, React Native""",
    
    """Title: Blockchain-based Academic Credential Verification
    Abstract: A secure platform for storing and verifying academic credentials using blockchain technology.
    Problem Statement: Traditional credential verification is time-consuming and vulnerable to fraud.
    Proposed Methodology: Implementing a distributed ledger to store encrypted credential records that can be easily verified by authorized parties without central authority.
    Tech Stack: Ethereum, Solidity, Node.js, React, IPFS"""
]

# Reference summaries, one per entry of project_descriptions and in the same order
# (the two lists are extended and evaluated in parallel further down).
project_summaries = [
    "An AI-driven resume analysis tool that uses NLP to enhance resume quality by comparing it with job descriptions and providing targeted improvement suggestions.",
    "An IoT-based smart agriculture system that monitors soil conditions and automates irrigation using sensors and machine learning to optimize water usage.",
    "A blockchain platform for secure, immutable storage and verification of academic credentials, eliminating fraud and reducing verification time."
]

# Blend the domain-specific examples into the general training pool.
# Repeating the (description, summary) pairs five times oversamples them
# relative to the news articles; the pairs stay index-aligned because both
# lists are repeated the same way.
train_texts.extend(project_descriptions * 5)
train_summaries.extend(project_summaries * 5)

print(f"Updated dataset size: {len(train_texts)} training samples")

## Train-Test Split

Splitting our data into training, validation, and test sets.

In [None]:
from sklearn.model_selection import train_test_split

# NOTE(review): this cell rebinds test_texts / test_summaries, replacing the
# values taken from the dataset's own test split in the data-preparation cell.
# NOTE(review): the project examples were appended several times before this
# split, so copies of the same example can end up in more than one split.

# Stage 1: hold out 30% of the pool; the remaining 70% is the training set
train_texts, temp_texts, train_summaries, temp_summaries = train_test_split(
    train_texts,
    train_summaries,
    test_size=0.30,
    random_state=42,
)

# Stage 2: divide the held-out 30% one-third / two-thirds,
# i.e. 10% validation and 20% test relative to the full pool
val_texts, test_texts, val_summaries, test_summaries = train_test_split(
    temp_texts,
    temp_summaries,
    test_size=2/3,
    random_state=42,
)

print(f"Training set: {len(train_texts)} samples")
print(f"Validation set: {len(val_texts)} samples")
print(f"Test set: {len(test_texts)} samples")

## Model Training

We'll fine-tune a pre-trained PEGASUS model, which is specialized for summarization tasks.

In [None]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizer, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset

class SummarizationDataset(Dataset):
    """Torch dataset that tokenizes (document, summary) pairs on access.

    Each item is a dict with ``input_ids``, ``attention_mask`` and ``labels``
    tensors, all padded to a fixed length.

    Args:
        texts: Source documents.
        summaries: Reference summaries, index-aligned with ``texts``.
        tokenizer: Callable tokenizer returning ``input_ids`` and
            ``attention_mask``; its ``model_max_length`` and ``pad_token_id``
            attributes are honoured when present.
        max_input_length: Upper bound on input tokens per document.
        max_target_length: Upper bound on target tokens per summary.

    Raises:
        ValueError: If ``texts`` and ``summaries`` differ in length.
    """

    def __init__(self, texts, summaries, tokenizer, max_input_length=1024, max_target_length=128):
        if len(texts) != len(summaries):
            raise ValueError(
                f"texts and summaries must have the same length "
                f"(got {len(texts)} and {len(summaries)})"
            )

        self.tokenizer = tokenizer
        self.texts = texts
        self.summaries = summaries

        # Never request more tokens than the tokenizer declares the model can
        # take; a longer padded sequence can fail inside the model.
        model_limit = getattr(tokenizer, "model_max_length", None)
        if isinstance(model_limit, int) and model_limit > 0:
            max_input_length = min(max_input_length, model_limit)
            max_target_length = min(max_target_length, model_limit)

        self.max_input_length = max_input_length
        self.max_target_length = max_target_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        summary = str(self.summaries[idx])

        # Apply the shared preprocessing (section extraction, lowercasing, lemmatization)
        text = preprocess_text(text)

        # Tokenize inputs
        inputs = self.tokenizer(text, max_length=self.max_input_length,
                                padding="max_length", truncation=True)

        # Tokenize targets
        targets = self.tokenizer(summary, max_length=self.max_target_length,
                                 padding="max_length", truncation=True)

        labels = torch.tensor(targets["input_ids"])

        # Mark padded label positions with -100 so the loss skips them;
        # otherwise the model is also trained (and scored) on predicting padding.
        pad_token_id = getattr(self.tokenizer, "pad_token_id", None)
        if pad_token_id is not None:
            labels[labels == pad_token_id] = -100

        return {
            "input_ids": torch.tensor(inputs["input_ids"]),
            "attention_mask": torch.tensor(inputs["attention_mask"]),
            "labels": labels,
        }

# Load the PEGASUS tokenizer and model (checkpoint fine-tuned on XSum)
model_name = "google/pegasus-xsum"
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name)

# Use the GPU when torch can see one, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
model.to(device)

# Prepare datasets
train_dataset = SummarizationDataset(train_texts, train_summaries, tokenizer)
val_dataset = SummarizationDataset(val_texts, val_summaries, tokenizer)

# Set up training arguments
# NOTE(review): newer transformers releases name the first strategy argument
# `eval_strategy`; `evaluation_strategy` is kept here to match the pinned
# minimum version - confirm against the installed release.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    # Checkpoints must be saved on the same schedule as evaluation, otherwise
    # load_best_model_at_end=True is rejected when the arguments are created.
    save_strategy="epoch",
    learning_rate=5e-5,
    weight_decay=0.01,
    per_device_train_batch_size=4,  # Reduce batch size if memory issues occur
    per_device_eval_batch_size=4,
    num_train_epochs=3,  # Reduced epochs for quicker training
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,  # lower eval_loss is better
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Train the model
print("Starting model training...")
trainer.train()
print("Training complete!")

## Model Evaluation

Evaluating our model using ROUGE scores and examining example outputs.

In [None]:
import pickle
from rouge_score import rouge_scorer
import numpy as np

def evaluate_model(model, tokenizer, test_texts, test_summaries, num_examples=5):
    """Generate summaries for the first few test items and report ROUGE F1.

    For every evaluated item the source text (first 500 characters), the
    reference summary, the generated summary and the ROUGE-1/2/L F1 scores
    are printed, followed by the averages over all evaluated items.

    Args:
        model: Seq2seq model exposing ``generate``, ``eval``/``train`` and ``device``.
        tokenizer: Tokenizer matching ``model``.
        test_texts: Source documents.
        test_summaries: Reference summaries, index-aligned with ``test_texts``.
        num_examples: Maximum number of items to evaluate.

    Returns:
        dict mapping each ROUGE type that received at least one score to its
        mean F1 (as a plain float). Empty when nothing was evaluated.
    """
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    all_scores = {'rouge1': [], 'rouge2': [], 'rougeL': []}

    # Stay within what the tokenizer declares the model can take
    max_input_tokens = min(1024, tokenizer.model_max_length)

    # Generate in eval mode (dropout off); the previous mode is restored afterwards
    was_training = model.training
    model.eval()

    example_pairs = zip(test_texts[:num_examples], test_summaries[:num_examples])
    for i, (text, reference_summary) in enumerate(example_pairs):
        # Same preprocessing as used for the training inputs
        processed_text = preprocess_text(text)

        # Tokenize onto the model's own device rather than a notebook global
        inputs = tokenizer(
            processed_text, max_length=max_input_tokens, truncation=True, return_tensors="pt"
        ).to(model.device)
        summary_ids = model.generate(
            inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=150,
            min_length=30,  # Ensure summaries aren't too short
            num_beams=4,
            length_penalty=2.0,  # Encourage longer summaries
            early_stopping=True
        )
        generated_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

        print(f"Example {i+1}:\n{'-'*30}")
        print(f"Original Text:\n{text[:500]}..." if len(text) > 500 else f"Original Text:\n{text}")
        print(f"\nReference Summary:\n{reference_summary}")
        print(f"\nGenerated Summary:\n{generated_summary}")

        # Calculate ROUGE scores (reference first, candidate second)
        scores = scorer.score(reference_summary, generated_summary)
        print("\nROUGE Scores:")
        for rouge_type, score in scores.items():
            print(f"{rouge_type}: F1={score.fmeasure:.4f}")
            all_scores[rouge_type].append(score.fmeasure)

        print("=" * 80)

    model.train(was_training)

    # Calculate average scores
    averages = {
        rouge_type: float(np.mean(scores_list))
        for rouge_type, scores_list in all_scores.items()
        if scores_list
    }
    print("\nAverage ROUGE Scores:")
    for rouge_type, average in averages.items():
        print(f"Average {rouge_type}: {average:.4f}")

    return averages

# Score the first five samples of the test split
print("Evaluating model on general test samples...")
evaluate_model(model, tokenizer, test_texts[:5], test_summaries[:5])

# Score the hand-written project descriptions.
# NOTE(review): these same descriptions were appended to the training pool
# earlier, so these scores are not a held-out measurement.
print("\nEvaluating model on project description samples...")
evaluate_model(model, tokenizer, project_descriptions, project_summaries)

# Persist model, tokenizer and preprocessing function together in one pickle.
# NOTE(review): pickle stores preprocess_text by reference (module and name),
# not its code, so whatever loads model.pkl must define that function under the
# same name. Unpickling also executes code from the file - load trusted files only.
saved_objects = {
    'model': model,
    'tokenizer': tokenizer,
    'preprocess_fn': preprocess_text
}
with open('model.pkl', 'wb') as file:
    pickle.dump(saved_objects, file)

print("\nModel saved successfully!")

## Testing with Custom Project Descriptions

Let's test our model with some custom project descriptions that are similar to what we might see in MentorMatrix.

In [None]:
# Test with new project descriptions.
# These use the same labelled layout (Title, Abstract, Problem Statement,
# Proposed Methodology, Tech Stack) as the training examples and have no
# reference summaries; they are only summarized and printed below.
new_project_descriptions = [
    """Title: Neural Network-based Stock Market Predictor
    Abstract: This project implements a neural network to predict stock market trends using historical data and news sentiment analysis.
    Problem Statement: Stock market prediction is challenging due to its volatile nature and multiple influencing factors.
    Proposed Methodology: Combining LSTM neural networks with sentiment analysis of financial news to predict short-term market movements.
    Tech Stack: Python, TensorFlow, Keras, NLTK, Flask, React""",
    
    """Title: Secure Medical Records Management System
    Abstract: A blockchain-based platform for secure storage and sharing of medical records between healthcare providers.
    Problem Statement: Traditional medical record systems lack interoperability and have security vulnerabilities.
    Proposed Methodology: Implementing a permissioned blockchain network where healthcare providers can securely access and update patient records with proper authorization.
    Tech Stack: Hyperledger Fabric, Node.js, Express, MongoDB, React""",
    
    """Title: Mentor Matrix Platform
    Abstract: A comprehensive platform connecting students with faculty mentors for research and project collaboration.
    Problem Statement: Students often struggle to find appropriate mentors for their research interests, while faculty have limited visibility of potential mentees.
    Proposed Methodology: Creating a matching algorithm based on research interests, skills, and availability to suggest optimal student-mentor pairings.
    Tech Stack: MERN stack (MongoDB, Express.js, React, Node.js), AI for matching"""
]

# Function to summarize a new project description
def summarize_project(text, model, tokenizer, max_length=100, min_length=30):
    """Generate a concise summary for a project description.

    Args:
        text: Project description to summarize.
        model: Seq2seq model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer matching ``model``.
        max_length: Maximum length of the generated summary, in tokens.
        min_length: Minimum length of the generated summary, in tokens.

    Returns:
        The decoded summary string, with special tokens removed.
    """
    processed_text = preprocess_text(text)

    # Stay within what the tokenizer declares the model can take, and place
    # the tensors on the model's own device instead of a notebook global.
    max_input_tokens = min(1024, tokenizer.model_max_length)
    inputs = tokenizer(
        processed_text, max_length=max_input_tokens, truncation=True, return_tensors="pt"
    ).to(model.device)

    summary_ids = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=max_length,
        min_length=min_length,
        num_beams=4,
        length_penalty=2.0,
        early_stopping=True
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Summarize each new description and print it next to its source text
print("Testing with new project descriptions:\n")
for project_number, description in enumerate(new_project_descriptions, start=1):
    print(f"Project {project_number}:\n{'-'*30}")
    print(f"Description:\n{description}\n")
    generated = summarize_project(description, model, tokenizer)
    print(f"Generated Summary:\n{generated}\n")
    print("=" * 80)

## Creating API for Flask Integration

Let's create a function that writes a small Flask app (`app.py`) exposing the summarizer as an HTTP endpoint for summarizing project descriptions.

In [None]:
def create_summarizer_api_code():
    """Write ``app.py``, a Flask API that serves the saved summarizer.

    The generated file embeds the current source of ``preprocess_text`` so
    that the pickled bundle (which refers to that function by name) can be
    loaded, and so inference uses the same preprocessing as training.
    The generated code is printed and saved to ``app.py`` in the working
    directory.

    Raises:
        OSError: If the source of ``preprocess_text`` cannot be retrieved.
    """
    # Local imports: only needed here, to embed preprocess_text's source
    import inspect
    import textwrap

    preprocess_source = textwrap.dedent(inspect.getsource(preprocess_text)).rstrip()

    api_template = '''
# app.py - Flask API for Project Summarizer
import os
import pickle
import re

import nltk
import torch
from flask import Flask, request, jsonify
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

app = Flask(__name__)

# NLTK resources needed by preprocess_text
for _resource in ('punkt', 'punkt_tab', 'wordnet', 'omw-1.4'):
    nltk.download(_resource, quiet=True)

lemmatizer = WordNetLemmatizer()

# Same preprocessing as the training notebook. It has to be defined before
# model.pkl is loaded: the pickle refers to this function by name and looks it
# up in the __main__ module, so start this file with "python app.py".
__PREPROCESS_TEXT_SOURCE__

# Load the model
# NOTE: unpickling runs code from the file - only load a model.pkl you created.
with open('model.pkl', 'rb') as file:
    model_data = pickle.load(file)
    model = model_data['model']
    tokenizer = model_data['tokenizer']
    preprocess_text = model_data['preprocess_fn']

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Never feed the model more tokens than the tokenizer says it accepts
MAX_INPUT_TOKENS = min(1024, tokenizer.model_max_length)

def summarize_project(text, max_length=100, min_length=30):
    """Generate a concise summary for a project description"""
    processed_text = preprocess_text(text)
    
    inputs = tokenizer(processed_text, max_length=MAX_INPUT_TOKENS, truncation=True, return_tensors="pt").to(device)
    summary_ids = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=max_length,
        min_length=min_length,
        num_beams=4,
        length_penalty=2.0,
        early_stopping=True
    )
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

def _is_positive_int(value):
    """True for ints greater than zero (booleans excluded)"""
    return isinstance(value, int) and not isinstance(value, bool) and value > 0

@app.route('/summarize', methods=['POST'])
def api_summarize():
    """API endpoint to summarize project descriptions"""
    # silent=True: a missing or malformed JSON body yields None instead of a non-JSON error page
    payload = request.get_json(silent=True)
    if not isinstance(payload, dict) or 'text' not in payload:
        return jsonify({'error': 'Please provide text to summarize'}), 400
    
    project_text = payload['text']
    if not isinstance(project_text, str) or not project_text.strip():
        return jsonify({'error': 'Please provide text to summarize'}), 400
    
    max_length = payload.get('max_length', 100)
    min_length = payload.get('min_length', 30)
    if not (_is_positive_int(max_length) and _is_positive_int(min_length)) or min_length > max_length:
        return jsonify({'error': 'max_length and min_length must be positive integers with min_length <= max_length'}), 400
    
    try:
        summary = summarize_project(project_text, max_length, min_length)
        return jsonify({
            'original': project_text,
            'summary': summary
        })
    except Exception as e:
        return jsonify({'error': str(e)}), 500

if __name__ == '__main__':
    # The interactive debugger allows code execution, so it is off unless
    # FLASK_DEBUG=1 is set explicitly; never enable it on a reachable host.
    app.run(debug=os.environ.get('FLASK_DEBUG') == '1', host='0.0.0.0', port=5000)
'''

    api_code = api_template.replace('__PREPROCESS_TEXT_SOURCE__', preprocess_source)

    print("Flask API code for summarizer:")
    print(api_code)
    
    # Save the API code to a file
    with open('app.py', 'w', encoding='utf-8') as file:
        file.write(api_code)
    
    print("\nAPI code saved to app.py")
    print("\nTo run the API: python app.py")
    print("Then make POST requests to http://localhost:5000/summarize with JSON body: {\"text\": \"your project description\"}")

create_summarizer_api_code()

## Requirements File Generation

Let's create a requirements.txt file for this project.

In [None]:
# Generate requirements.txt
# sentencepiece is listed explicitly because PegasusTokenizer depends on it.
requirements = """
transformers>=4.15.0
sentencepiece>=0.1.96
torch>=1.10.0
nltk>=3.6.5
scikit-learn>=1.0.1
rouge-score>=0.1.2
flask>=2.0.2
pandas>=1.3.4
numpy>=1.21.4
datasets>=1.17.0
"""

# Written without the leading blank line and with a single trailing newline
with open('requirements.txt', 'w', encoding='utf-8') as file:
    file.write(requirements.strip() + "\n")

print("Requirements file created successfully!")
print("To use this model in the MentorMatrix application, integrate it with the summarizer.js utility.")