<a href="https://colab.research.google.com/github/MariaMuu/Thesis/blob/main/Sparql_data_validation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import csv
import json
import re
import time
from urllib.parse import quote

import requests

In [None]:
class WikidataSPARQLValidator:
    """Check SPARQL queries against the Wikidata endpoint and report which ones return an answer.

    Each query is sent with an HTTP GET to ``self.endpoint`` using ``self.headers``.
    Results can be collected for a whole dataset, summarised on stdout and
    written to CSV.
    """

    # Heuristic for "the query already limits its result size": an explicit
    # ``LIMIT <n>`` clause as a whole word. A plain substring test would also
    # fire on identifiers such as ``?limit`` or literals such as "unlimited".
    _LIMIT_CLAUSE = re.compile(r'\bLIMIT\s+\d+', re.IGNORECASE)

    def __init__(self):
        self.endpoint = "https://query.wikidata.org/sparql"
        self.headers = {
            'User-Agent': 'SPARQL-Validator/1.0 (your-email@example.com)',
            'Accept': 'application/json'
        }

    @classmethod
    def _has_limit(cls, sparql_query):
        """Return True when the query text contains an explicit ``LIMIT <n>`` clause."""
        return cls._LIMIT_CLAUSE.search(sparql_query) is not None

    @staticmethod
    def _count_results(data):
        """Return ``(has_results, result_count)`` for a parsed SPARQL JSON response.

        ASK responses carry a top-level ``boolean`` member instead of
        ``results.bindings``; the boolean itself (true or false) is the
        query's answer, so it is counted as one result. Every other response
        is counted by its number of bindings.
        """
        if 'boolean' in data:
            return True, 1
        count = len(data.get('results', {}).get('bindings', []))
        return count > 0, count

    def test_query(self, sparql_query):
        """
        Test a single SPARQL query against Wikidata.

        A ``LIMIT 100`` is appended when the query has no LIMIT clause of its
        own, to avoid huge results.

        Returns: (has_results, result_count, error_message)
            ``error_message`` is None when the endpoint answered with HTTP 200
            and a parseable body; otherwise ``has_results`` is False,
            ``result_count`` is 0 and the message describes the failure.
        """
        try:
            # Add LIMIT if not present to avoid huge results
            if not self._has_limit(sparql_query):
                sparql_query += ' LIMIT 100'

            params = {
                'query': sparql_query,
                'format': 'json'
            }

            response = requests.get(
                self.endpoint,
                params=params,
                headers=self.headers,
                timeout=30
            )

            if response.status_code != 200:
                return False, 0, f"HTTP {response.status_code}: {response.text[:200]}"

            has_results, result_count = self._count_results(response.json())
            return has_results, result_count, None

        except requests.exceptions.Timeout:
            return False, 0, "Query timeout"
        # Must come before RequestException: the decode error raised by
        # response.json() in current requests releases is also a
        # RequestException and would otherwise be reported as "Request error".
        except json.JSONDecodeError:
            return False, 0, "Invalid JSON response"
        except requests.exceptions.RequestException as e:
            return False, 0, f"Request error: {str(e)}"
        except Exception as e:
            return False, 0, f"Unexpected error: {str(e)}"

    def validate_dataset(self, qa_pairs, delay=1.0):
        """
        Validate a list of Q&A pairs.

        qa_pairs: list of dicts with 'question' and 'sparql' keys
        delay: seconds to wait between requests (be nice to Wikidata!)

        Returns a list with one dict per pair holding 'index', 'question',
        'sparql', 'has_results', 'result_count', 'error' and 'status'
        ('VALID' when the query returned results, otherwise 'INVALID').
        """
        results = []
        total = len(qa_pairs)

        for i, pair in enumerate(qa_pairs):
            print(f"Testing query {i+1}/{total}: {pair.get('question', 'No question')[:50]}...")

            has_results, count, error = self.test_query(pair['sparql'])

            results.append({
                'index': i,
                'question': pair.get('question', ''),
                'sparql': pair['sparql'],
                'has_results': has_results,
                'result_count': count,
                'error': error,
                'status': 'VALID' if has_results else 'INVALID'
            })

            # Be nice to Wikidata servers; no request follows the last query,
            # so there is nothing to wait for after it.
            if i < total - 1:
                time.sleep(delay)

        return results

    def save_results(self, results, filename='sparql_validation_results.csv'):
        """Save validation results to CSV, one row per result dict, with a header row."""
        with open(filename, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=[
                'index', 'question', 'sparql', 'has_results',
                'result_count', 'error', 'status'
            ])
            writer.writeheader()
            writer.writerows(results)

        print(f"Results saved to {filename}")

    def print_summary(self, results):
        """Print validation summary: totals, percentages and up to five invalid examples."""
        total = len(results)

        print("\n=== VALIDATION SUMMARY ===")
        print(f"Total queries: {total}")
        if total == 0:
            # Percentages are undefined for an empty run; stop before dividing.
            return

        valid = sum(1 for r in results if r['has_results'])
        invalid = total - valid
        print(f"Valid (return results): {valid} ({valid/total*100:.1f}%)")
        print(f"Invalid (no results): {invalid} ({invalid/total*100:.1f}%)")

        # Show some examples of invalid queries
        invalid_examples = [r for r in results if not r['has_results']][:5]
        if invalid_examples:
            print("\nFirst few invalid queries:")
            for ex in invalid_examples:
                print(f"- Q: {ex['question'][:60]}...")
                print(f"  Error: {ex['error'] or 'No results returned'}")
                print()

In [None]:
# Example usage:
def load_csv_dataset(filename):
    """Read Q&A pairs from a CSV file.

    The first column of each row is taken as the question and the second as
    the SPARQL query; both are stripped of surrounding whitespace. Rows with
    fewer than two columns are skipped with a warning. Every row is treated
    as data (no header handling).

    Returns a list of ``{'question': ..., 'sparql': ...}`` dicts, or an empty
    list when the file is missing or cannot be read.
    """
    loaded = []

    try:
        with open(filename, 'r', encoding='utf-8') as handle:
            for row_number, columns in enumerate(csv.reader(handle), start=1):
                # Need at least a question column and a query column
                if len(columns) < 2:
                    print(f"Warning: Row {row_number} doesn't have enough columns, skipping")
                    continue

                question_text = columns[0].strip()
                query_text = columns[1].strip()
                loaded.append({'question': question_text, 'sparql': query_text})

        print(f"Loaded {len(loaded)} Q&A pairs from {filename}")
        return loaded

    except FileNotFoundError:
        print(f"Error: File '{filename}' not found!")
        return []
    except Exception as e:
        print(f"Error loading CSV: {e}")
        return []

In [6]:
def main():
    """Interactively validate the SPARQL queries of a CSV dataset.

    Prompts for the CSV path (an empty answer falls back to ``dataset.csv``)
    and for whether to test every query or only the first N, runs the
    validation, prints a summary, and writes two timestamped CSV files: all
    results, and only the queries that returned no results.

    Returns the list of per-query result dicts, or None when no data could
    be loaded.
    """
    # Load your CSV file
    csv_filename = input("Path to the CSV dataset (default=dataset.csv): ").strip()
    if not csv_filename:
        csv_filename = "dataset.csv"

    qa_pairs = load_csv_dataset(csv_filename)

    if not qa_pairs:
        print("No data loaded. Exiting.")
        return

    # Ask if user wants to test all or just a subset
    test_all = input(f"Test all {len(qa_pairs)} queries? (y/n, default=y): ").strip().lower()

    if test_all == 'n':
        try:
            limit = int(input("How many queries to test? "))
        except ValueError:
            limit = 0
        # Zero or a negative count would slice to an empty or tail subset,
        # so it is handled like any other unusable answer.
        if limit <= 0:
            print("Invalid number, testing first 100...")
            limit = 100
        qa_pairs = qa_pairs[:limit]

    validator = WikidataSPARQLValidator()

    print(f"Starting SPARQL validation for {len(qa_pairs)} queries...")
    print("This may take a while - we wait 1 second between each query to be nice to Wikidata")

    results = validator.validate_dataset(qa_pairs)

    validator.print_summary(results)

    # Save results with timestamp
    import datetime
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    output_filename = f"validation_results_{timestamp}.csv"
    validator.save_results(results, output_filename)

    # Also save just the invalid queries for easy fixing
    invalid_queries = [r for r in results if not r['has_results']]
    if invalid_queries:
        invalid_filename = f"invalid_queries_{timestamp}.csv"
        validator.save_results(invalid_queries, invalid_filename)
        print(f"Invalid queries saved separately to {invalid_filename}")

    return results

# Run the interactive validation when this module is the entry point;
# main()'s return value is kept in the module-level name `results`.
if __name__ == "__main__":
    results = main()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Testing query 1030/6001: Tell me the citation of Childhood obesity: are we ...
Testing query 1031/6001: Is it true that the sublimation temperature of sta...
Testing query 1032/6001: Who had the honour of discovering general relativi...
Testing query 1033/6001: What media company is The Economist owned by?...
Testing query 1034/6001: Who is the child of the sister of Louis, Dauphin o...
Testing query 1035/6001: IS THE BASIC SALARY OF GEORGE STEHENSON LESS THAN ...
Testing query 1036/6001: What is the diplomatic relation of Denmark, that h...
Testing query 1037/6001: What is the official language of lives in Pasi Sil...
Testing query 1038/6001: In what year did Tim Hunt give a Croonian Lecture?...
Testing query 1039/6001: Mention the publishing year of Grand Theft Auto II...
Testing query 1040/6001: What is the organization that regulates Esperanto?...
Testing query 1041/6001: What are comic book series  which start with t