In [1]:
import pandas as pd 
import re
import os
from collections import Counter

print("Libraries imported successfully.") #pandas for data manipulation and reading csv files
                                          #Re for text patterns
                                          #OS for file paths
                                          #Counter for word frequencys

Libraries imported successfully.


In [2]:
def load_text_content(filepath):
    #Load text content from a specified file
    try:
        with open(filepath, 'r', encoding='latin-1') as f:
            return f.read()
    except FileNotFoundError:
        print(f"Error: File not found at {filepath}")
        return None

def load_keywords(filepath):
    #Load keywords from a CSV file into a list. Try semicolon and comma delimiters
    try:
        df = pd.read_csv(filepath, delimiter=';', encoding='latin-1')
        
        if 'Keyword' not in df.columns:
            df = pd.read_csv(filepath, delimiter=',', encoding='latin-1')

        if 'Keyword' in df.columns:
            return df['Keyword'].dropna().astype(str).tolist()
        else:
            print(f"Error: 'Keyword' column not found in {filepath}. Check column header and delimiter.")
            return []
    except FileNotFoundError:
        print(f"Error: File not found at {filepath}")
        return []

def detect_page_type(text):
    #Detect if the page is a 'Cost Page' or 'City Page' based on content
    text_lower = text.lower()
    if 'kost' in text_lower or 'prijs' in text_lower or 'wat kost' in text_lower:
        return "Cost Page"
    # Check for common Dutch city names
    elif any(city in text_lower for city in ['amsterdam', 'haarlem', 'utrecht', 'rotterdam']):
        return "City Page"
    return "Unknown"

print("Helper functions defined.")

Helper functions defined.


In [7]:
#The functions according to checklist items
def check_page_title(text, main_keyword):
    #Analyze the page title based on the SEO checklist
    match = re.search(r'title:\s*(.*)', text, re.IGNORECASE)
    title = match.group(1).strip() if match else ""
    
    suggestions = []
    score = 100

    if not title:
        suggestions.append("Page title is missing.")
        return {"score": 0, "suggestions": suggestions}

    #Check 1: Main keyword at the start
    if not title.lower().startswith(main_keyword.lower()):
        suggestions.append(f"Title should start with the main keyword '{main_keyword}'.")
        score -= 25
        
    #Check 2: Length (ideal: 50-60 chars)
    if len(title) > 65 or len(title) < 40:
        suggestions.append(f"Title length is {len(title)} characters. Aim for 40-65.")
        score -= 25

    #Check 3: Pipe separator
    if ' | ' not in title:
        suggestions.append("Use ' | ' to separate sections in the title.")
        score -= 25
        
    #Check 4: Word repetition
    words = re.findall(r'\b\w+\b', title.lower())
    word_counts = Counter(words)
    if any(count > 1 for word, count in word_counts.items() if len(word) > 3):
        repeated = [word for word, count in word_counts.items() if count > 1 and len(word) > 3]
        suggestions.append(f"Avoid repeating words in the title. Repeated: {', '.join(repeated)}.")
        score -= 25

    return {"score": max(0, score), "suggestions": suggestions, "title": title}


def check_meta_description(text, main_keyword):
    #Analyze the meta description
    match = re.search(r'meta:\s*(.*)', text, re.IGNORECASE)
    meta = match.group(1).strip() if match else ""
    
    suggestions = []
    score = 100

    if not meta:
        suggestions.append("Meta description is missing.")
        return {"score": 0, "suggestions": suggestions}

    #Check 1: Length (ideal: 120-155 chars)
    if len(meta) > 160 or len(meta) < 100:
        suggestions.append(f"Meta description length is {len(meta)} chars. Aim for 100-160.")
        score -= 30
        
    # Check 2: Main keyword presence
    if main_keyword.lower() not in meta.lower():
        suggestions.append(f"Meta description should include the main keyword '{main_keyword}'.")
        score -= 40
        
    # Check 3: Call-to-Action (CTA)
    cta_words = ['ontdek', 'vergelijk', 'bespaar', 'lees', 'vind']
    if not any(cta.lower() in meta.lower() for cta in cta_words):
        suggestions.append("Include a Call-to-Action (e.g., 'ontdek', 'vergelijk').")
        score -= 30

    return {"score": max(0, score), "suggestions": suggestions, "meta": meta}


def check_headings_and_keywords(text, keywords):
    #Analyze H1, subheadings, and keyword usage
    suggestions = []
    score = 100
    
    # Assume first non-meta/title line is H1
    lines = [line.strip() for line in text.split('\n') if line.strip()]
    content_lines = [line for line in lines if not line.lower().startswith(('meta:', 'title:'))]
    h1 = content_lines[0] if content_lines else ""
    
    # Check 1: H1 matches main keyword
    if not h1:
        suggestions.append("H1 heading is missing.")
        score = 0
    elif h1.lower() != keywords[0].lower():
        suggestions.append(f"H1 should match the main keyword exactly. H1: '{h1}', Keyword: '{keywords[0]}'.")
        score -= 30

    # Check 2: Keyword Density (Keyword Stuffing)
    text_lower = text.lower()
    total_words = len(re.findall(r'\b\w+\b', text_lower))
    main_keyword_count = len(re.findall(r'\b' + re.escape(keywords[0].lower()) + r'\b', text_lower))
    density = (main_keyword_count / total_words) * 100 if total_words > 0 else 0
    
    if density > 2.5:
        suggestions.append(f"Main keyword density is {density:.1f}%, which is high. Avoid keyword stuffing.")
        score -= 40
    elif density < 0.5:
        suggestions.append(f"Main keyword density is low ({density:.1f}%). Use it more naturally.")
        score -= 20
        
    # Check 3: Subheading usage
    # Assume lines shorter than 80 chars without a period are subheadings
    subheadings = [line for line in content_lines[1:] if len(line) < 80 and not line.endswith('.')]
    if len(subheadings) < 3:
        suggestions.append(f"Too few subheadings ({len(subheadings)} found). Use at least 3 to structure the content.")
        score -= 30

    return {"score": max(0, score), "suggestions": suggestions, "h1": h1, "keyword_density": f"{density:.2f}%"}


def check_content_quality_and_links(text):
    #Analyze general quality checks like links and CTAs
    suggestions = []
    score = 100

    # Check 1: Internal Linking
    internal_links = re.findall(r'https?://[^\s]*trustoo[^\s]*', text, re.IGNORECASE)
    if len(internal_links) == 0:
        # Check for simple text mentions if no full URL is found
        if 'top 10' in text.lower():
             pass # Assume it's linked
        else:
            suggestions.append("No internal links to a Top 10 page were found.")
            score -= 40
    
    #Check 2: Value & CTAs
    cta_phrases = ['vraag nu', 'vrijblijvend offertes', 'vergelijk offertes']
    if not any(phrase.lower() in text.lower() for phrase in cta_phrases):
        suggestions.append("Include a clear CTA to request or compare quotes.")
        score -= 30
        
    #Check 3: Readability - Short paragraphs
    paragraphs = [p for p in text.split('\n\n') if len(p.strip()) > 1]
    long_paragraphs = [p for p in paragraphs if len(p.split()) > 150]
    if len(long_paragraphs) > 0:
        suggestions.append(f"{len(long_paragraphs)} paragraphs are too long. Keep them short and scannable.")
        score -= 30

    return {"score": max(0, score), "suggestions": suggestions}

def check_formatting_rules(text):
    #Analyze the text for specific formatting guidelines
    suggestions = []
    score = 100
    
    # Check: Price Formatting (€ 1.200,-)
    # Find all occurrences of the euro symbol
    euro_matches = re.finditer(r'€', text)
    incorrect_prices = []
    
    for match in euro_matches:
        # Get the 20 characters following the € symbol to check the format
        context = text[match.start():match.start() + 20]
        # Regex to check for the CORRECT format: €<space><number>
        correct_format_regex = r"€\s[\d\.,-]+"
        if not re.match(correct_format_regex, context):
            # If the format is incorrect, add to the suggestion
            incorrect_prices.append(context.split()[0])

    if incorrect_prices:
        # show up to 3 examples to keep the report clean
        suggestions.append(f"Found {len(incorrect_prices)} pricing formats that may be incorrect. Expected format: '€ 1.200,-'. Examples found: {', '.join(incorrect_prices[:3])}")
        score -= 40

    # Check 2: Percentage Formatting (30%)
    # Find numbers followed by a space and then a % sign (INCORRECT format)
    incorrect_percentage_matches = re.findall(r'\d+\s+%', text)
    if incorrect_percentage_matches:
        suggestions.append(f"Found {len(incorrect_percentage_matches)} percentage formats with a space before '%'. It should be '30%', not '30 %'.")
        score -= 30

    return {"score": max(0, score), "suggestions": suggestions}


print("Core SEO check functions defined.")

Core SEO check functions defined.


In [8]:
def evaluate_article(text, keywords):  # Main Evaluation Function
    # Run all checks and compiles a full evaluation report
    if not text or not keywords:
        return {"error": "Missing text or keywords."}
        
    page_type = detect_page_type(text)
    main_keyword = keywords[0]

    # Run all individual checks
    title_results = check_page_title(text, main_keyword)
    meta_results = check_meta_description(text, main_keyword)
    headings_results = check_headings_and_keywords(text, keywords)
    quality_results = check_content_quality_and_links(text)
    formatting_results = check_formatting_rules(text)
    
    # Combine all suggestions
    all_suggestions = (
        title_results['suggestions'] +
        meta_results['suggestions'] +
        headings_results['suggestions'] +
        quality_results['suggestions'] +
        formatting_results['suggestions']
    )
    
    # Calculate weighted overall score
    overall_score = (
        title_results['score'] * 0.30 +
        meta_results['score'] * 0.20 +
        headings_results['score'] * 0.35 +
        quality_results['score'] * 0.15 +
        formatting_results['score'] * 0.15
    )
    
    pass_fail = "Pass" if overall_score >= 65 else "Fail"
    
    # Compile the final report object
    evaluation = {
        "page_type": page_type,
        "overall_score": f"{overall_score:.0f}/100",
        "pass_fail": pass_fail,
        "top_suggestions": all_suggestions[:5], # Max 5 suggestions
        "details": {
            "Title": title_results,
            "Meta Description": meta_results,
            "Headings & Keywords": headings_results,
            "Content Quality & Links": quality_results,
            "Formatting Rules": formatting_results
        }
    }
    
    return evaluation

print("Main evaluation function defined.")

Main evaluation function defined.


In [9]:
def generate_text_report(evaluation):  #Report Generation Function
    #Format the evaluation dictionary into a human-readable string report
    if "error" in evaluation:
        return f"Could not generate report: {evaluation['error']}"

    report = f"""
========================================
   SEO PROOFREADER ANALYSIS REPORT
========================================
Page Type: {evaluation['page_type']}
Overall Score: {evaluation['overall_score']}
Recommendation: {evaluation['pass_fail']}
----------------------------------------

### TOP 5 IMPROVEMENT SUGGESTIONS ###
"""
    if not evaluation['top_suggestions']:
        report += "1. Great job! No major issues found.\n"
    else:
        for i, suggestion in enumerate(evaluation['top_suggestions'], 1):
            report += f"{i}. {suggestion}\n"

    report += "\n### DETAILED ANALYSIS ###\n"

    for category, details in evaluation['details'].items():
        report += f"\n--- {category.upper()} (Score: {details['score']:.0f}/100) ---\n"
        if not details['suggestions']:
            report += "  - OK\n"
        else:
            for suggestion in details['suggestions']:
                report += f"  - {suggestion}\n"

    report += "\n========================================\n"
    return report

print("Report generation function defined.")

Report generation function defined.


In [10]:
#Run the Tool on Sample Articles
#Test Case 1: Kostenpagina - Wat kost een thuisbatterij
print("\n--- Running Test Case 1: Kostenpagina - Wat kost een thuisbatterij ---")
article_path_1 = os.path.join('data', 'kostenpagina_thuisbatterij.txt')
keywords_path_1 = os.path.join('data', 'keywords_thuisbatterij.csv')

article_text_1 = load_text_content(article_path_1)
keywords_1 = load_keywords(keywords_path_1)

if article_text_1 and keywords_1:
    evaluation_1 = evaluate_article(article_text_1, keywords_1)
    report_1 = generate_text_report(evaluation_1)
    print(report_1)
    # Optionally save the report
    with open("report_thuisbatterij.txt", "w", encoding="utf-8") as f:
        f.write(report_1)

# Test Case 2: Kostenpagina - Schutting plaatsen
print("\n--- Running Test Case 2: Kostenpagina - Schutting plaatsen ---")
article_path_2 = os.path.join('data', 'kostenpagina_schutting.txt')
keywords_path_2 = os.path.join('data', 'keywords_schutting.csv')

article_text_2 = load_text_content(article_path_2)
keywords_2 = load_keywords(keywords_path_2)

if article_text_2 and keywords_2:
    evaluation_2 = evaluate_article(article_text_2, keywords_2)
    report_2 = generate_text_report(evaluation_2)
    print(report_2)
    with open("report_schutting.txt", "w", encoding="utf-8") as f:
        f.write(report_2)
        
#Test Case 3: Elektricien Haarlem
print("\n--- Running Test Case 3: Elektricien Haarlem ---")
article_path_3 = os.path.join('data', 'elektricien_haarlem.txt')
# Create a dummy keywords list if file doesn't exist.
keywords_path_3 = os.path.join('data', 'keywords_elektricien_haarlem.csv')
keywords_3 = load_keywords(keywords_path_3)
if not keywords_3:
    keywords_3 = ['elektricien Haarlem', 'elektra in Haarlem', '24 uurs elektricien service in Haarlem']

article_text_3 = load_text_content(article_path_3)

if article_text_3 and keywords_3:
    evaluation_3 = evaluate_article(article_text_3, keywords_3)
    report_3 = generate_text_report(evaluation_3)
    print(report_3)
    with open("report_elektricien_haarlem.txt", "w", encoding="utf-8") as f:
        f.write(report_3)

# Test Case 4: Relatietherapie Amsterdam 
print("\n--- Running Test Case 4: Relatietherapie Amsterdam ---")
article_path_4 = os.path.join('data', 'relatietherapie_amsterdam.txt')
keywords_path_4 = os.path.join('data', 'keywords_relatietherapie_amsterdam.csv')

article_text_4 = load_text_content(article_path_4)
keywords_4 = load_keywords(keywords_path_4)

if article_text_4 and keywords_4:
    evaluation_4 = evaluate_article(article_text_4, keywords_4)
    report_4 = generate_text_report(evaluation_4)
    print(report_4)
    # Save the report to a file
    with open("report_relatietherapie_amsterdam.txt", "w", encoding="utf-8") as f:
        f.write(report_4)

print("\n--- All tests completed. Reports are printed above and saved as .txt files. ---")


--- Running Test Case 1: Kostenpagina - Wat kost een thuisbatterij ---

   SEO PROOFREADER ANALYSIS REPORT
Page Type: Cost Page
Overall Score: 96/100
Recommendation: Pass
----------------------------------------

### TOP 5 IMPROVEMENT SUGGESTIONS ###
1. Meta description should include the main keyword 'wat kost een thuisbatterij'.
2. Main keyword density is low (0.5%). Use it more naturally.
3. 3 paragraphs are too long. Keep them short and scannable.

### DETAILED ANALYSIS ###

--- TITLE (Score: 100/100) ---
  - OK

--- META DESCRIPTION (Score: 60/100) ---
  - Meta description should include the main keyword 'wat kost een thuisbatterij'.

--- HEADINGS & KEYWORDS (Score: 80/100) ---
  - Main keyword density is low (0.5%). Use it more naturally.

--- CONTENT QUALITY & LINKS (Score: 70/100) ---
  - 3 paragraphs are too long. Keep them short and scannable.

--- FORMATTING RULES (Score: 100/100) ---
  - OK



--- Running Test Case 2: Kostenpagina - Schutting plaatsen ---

   SEO PROOFREAD