## Inspect the database of Reddit comments scraped from the specified subreddit

- Find the arXiv papers cited in the comments.

In [1]:
import sqlite3
import json
import re
from tabulate import tabulate

In [2]:

def print_sqlite_db_contents(path: str) -> None:
    """Print every table of a SQLite database as a grid, for inspecting scraped comments.

    Args:
        path: Filesystem path of the SQLite database file.

    For each table listed in ``sqlite_master`` the table name is printed,
    followed by either a ``tabulate`` grid of all rows or a notice that the
    table is empty. Nothing is returned.
    """
    conn = sqlite3.connect(path)
    try:
        cursor = conn.cursor()

        cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
        tables = cursor.fetchall()

        for (table_name,) in tables:
            print(f"\nTable: {table_name}\n")

            # Quote the identifier so names containing spaces, dashes or
            # keywords do not break (or alter) the statement.
            quoted_name = '"' + table_name.replace('"', '""') + '"'
            cursor.execute(f"SELECT * FROM {quoted_name}")
            rows = cursor.fetchall()

            # Headers are read from the same cursor that produced the rows,
            # so they always line up with the row tuples.
            column_names = [description[0] for description in cursor.description]

            # now print in a nice table using tabulate
            if rows:
                print(tabulate(rows, headers=column_names, tablefmt='grid'))
            else:
                print("No data available in this table.")
    finally:
        # Release the connection even when a query above raises.
        conn.close()

# Print every table in the database; the relative path is resolved against the kernel's working directory.
print_sqlite_db_contents("../reddit.db")


Table: comments

+---------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [36]:
def find_arxiv_references_in_comments(path: str, output_path: str = "arxiv_papers.json") -> None:
    """Find arXiv abstract links in a SQLite database and save the cited paper IDs.

    Every string value of every table is searched for ``arxiv.org/abs/<id>``
    links. Matching values are printed as a ``tabulate`` grid and the paper
    IDs are written to a JSON file as a list of ``{"Content": "<id>"}``
    records.

    Args:
        path: Filesystem path of the SQLite database file.
        output_path: Destination of the JSON file. Defaults to
            ``arxiv_papers.json`` in the current working directory.

    All distinct IDs found in a value are saved (in order of first
    appearance), not just the first one. When no reference is found, a notice
    is printed and no file is written. Nothing is returned.
    """
    # One pattern for both detection and extraction; group 1 is the paper ID.
    arxiv_pattern = re.compile(r'arxiv\.org\/abs\/(\d+\.\d+)')

    arxiv_references = []  # (table, column, full text) rows for the printed grid
    papers = []            # {"Content": paper_id} records for the JSON file

    conn = sqlite3.connect(path)
    try:
        cursor = conn.cursor()

        # Get the list of tables
        cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
        tables = cursor.fetchall()

        for (table_name,) in tables:
            print(f"\nSearching for Arxiv references in Table: {table_name}\n")

            # Quote the identifier so names containing spaces, dashes or
            # keywords do not break (or alter) the statement.
            quoted_name = '"' + table_name.replace('"', '""') + '"'
            cursor.execute(f"SELECT * FROM {quoted_name}")
            rows = cursor.fetchall()

            # Column names are read from the same cursor that produced the
            # rows, so zip() below pairs each value with its own column.
            column_names = [description[0] for description in cursor.description]

            # Find and collect Arxiv references
            for row in rows:
                for column, value in zip(column_names, row):
                    if not isinstance(value, str):
                        continue
                    # dict.fromkeys drops repeats within one value while
                    # keeping first-seen order.
                    paper_ids = list(dict.fromkeys(arxiv_pattern.findall(value)))
                    if paper_ids:
                        arxiv_references.append((table_name, column, value))
                        papers.extend({"Content": paper_id} for paper_id in paper_ids)
    finally:
        # Release the connection even when a query above raises.
        conn.close()

    if not arxiv_references:
        print("No Arxiv references found in the comments.")
        return

    # Print the Arxiv references using tabulate
    print("\nArxiv References Found:\n")
    print(tabulate(arxiv_references, headers=["Table", "Column", "Content"], tablefmt='grid'))

    # Save Arxiv papers to JSON
    with open(output_path, "w") as f:
        json.dump(papers, f, indent=4)
    print(f"\n Arxiv paper IDs saved to {output_path} for further analysis\n")


# Search the database for arXiv links; the relative path is resolved against the kernel's working directory.
find_arxiv_references_in_comments("../reddit.db")


Searching for Arxiv references in Table: comments


Arxiv References Found:

+----------+----------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Table    | Column   | Content                                                                                                                                                                                                                                                                                                                                                                                                                             