In [27]:
import concurrent.futures
import functools
import os
import sqlite3

import pandas as pd

from src.CrossrefRetriever import Logger  # Adjust the import according to your project structure

In [28]:
# Module-wide logging setup: one logger named after the script, configured
# through the project's Logger helper with a "<script_name>.log" file name.
# NOTE(review): the captured run output in this file shows every message
# emitted several times -- presumably Logger attaches new handlers each time
# it is constructed, so re-running this cell accumulates them. Confirm
# against the Logger implementation.
script_name = "articleAnalyzer"
log_file = script_name + ".log"
logger_instance = Logger(name=script_name, log_file=log_file)
logger = logger_instance.get_logger()

def concurrent_executor(func):
    """
    Decorator that runs a function on a worker thread and waits for it.

    Each call creates a short-lived ``ThreadPoolExecutor``, submits ``func``
    to it and blocks until the result is available. Because the caller waits
    immediately, successive decorated calls still run one after another; the
    decorator only moves the work onto another thread.

    :param func: The callable to wrap.
    :return: A wrapper that returns whatever ``func`` returns and re-raises
        any exception ``func`` raised in the worker thread.
    """
    # Preserve func's __name__/__doc__ so decorated methods stay introspectable.
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        logger.info(f"Executing {func.__name__} concurrently.")
        # Exactly one task is submitted, so a single worker is sufficient.
        with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
            future = executor.submit(func, *args, **kwargs)
            # result() blocks until completion and propagates exceptions.
            result = future.result()
        logger.info(f"Finished executing {func.__name__}.")
        return result
    return wrapper

class ArticleAnalyzer:
    """
    Computes simple counts over an articles table stored in SQLite.

    The table is loaded into a pandas DataFrame, then three figures are
    derived: the total number of cells, the number of non-null 'abstract'
    values, and the number of abstracts mentioning
    'randomized controlled trial'.
    """

    def __init__(self, db_file, table_name):
        """
        Initializes the ArticleAnalyzer with database file and table name.

        No I/O happens here; call :meth:`load_data` or :meth:`analyze` to
        read the table.

        :param db_file: The path to the SQLite database file.
        :param table_name: The name of the table containing the article data.
        """
        self.db_file = db_file
        self.table_name = table_name
        self.articles_df = None  # populated by load_data()
        self.total_cells = 0
        self.abstract_cells = 0
        self.RCT_cells = 0

    def load_data(self) -> None:
        """
        Loads the whole table from the SQLite database into ``articles_df``.

        :raises Exception: Any error from connecting or querying is logged
            and re-raised unchanged.
        """
        try:
            conn = sqlite3.connect(self.db_file)
            try:
                # NOTE: SQL identifiers cannot be bound as query parameters,
                # so table_name is interpolated and must be a trusted value.
                query = f"SELECT * FROM {self.table_name}"
                self.articles_df = pd.read_sql_query(query, conn)
            finally:
                # Close even when the query fails so the handle is not leaked.
                conn.close()
            logger.info(f"Data loaded from table {self.table_name} in {self.db_file}.")
        except Exception as e:
            logger.error(f"Failed to load data from SQLite database. {e}")
            raise

    @concurrent_executor
    def calculate_total_cells(self) -> None:
        """Stores the DataFrame's element count (rows x columns) in ``total_cells``."""
        self.total_cells = self.articles_df.size
        logger.info(f"Total cells calculated: {self.total_cells}")

    @concurrent_executor
    def calculate_abstract_cells(self) -> None:
        """Stores the number of non-null values of the 'abstract' column in ``abstract_cells``."""
        self.abstract_cells = self.articles_df['abstract'].count()
        logger.info(f"Abstract cells calculated: {self.abstract_cells}")

    @staticmethod
    def contains_rct(abstract) -> bool:
        """
        Checks if the abstract contains the keyword 'randomized controlled trial'.

        The value is converted with ``str()`` first, so non-string values
        (e.g. None) are accepted; the match is case-insensitive.

        :param abstract: The abstract text to check.
        :return: True if the keyword is found, False otherwise.
        """
        return 'randomized controlled trial' in str(abstract).lower()

    @concurrent_executor
    def calculate_rct_cells(self) -> None:
        """Stores in ``RCT_cells`` the number of abstracts containing 'randomized controlled trial'."""
        # Summing booleans counts the matches; no intermediate list is needed.
        self.RCT_cells = sum(map(self.contains_rct, self.articles_df['abstract']))
        logger.info(f"RCT cells calculated: {self.RCT_cells}")

    def analyze(self) -> None:
        """Loads the table, then computes the three counts in order."""
        self.load_data()
        self.calculate_total_cells()
        self.calculate_abstract_cells()
        self.calculate_rct_cells()

    def print_results(self) -> None:
        """Prints the three counts to standard output."""
        print(f"Total cells: {self.total_cells}")
        print(f"Cells with abstracts: {self.abstract_cells}")
        print(f"Cells with RCTs: {self.RCT_cells}")

# Example usage
if __name__ == "__main__":
    db_file = 'articles.sqlite'
    table_name = 'articles'

    # Only run the analysis when the database file is actually present.
    if os.path.exists(db_file):
        analyzer = ArticleAnalyzer(db_file, table_name)
        try:
            analyzer.analyze()
            analyzer.print_results()
        except Exception as e:
            # Top-level boundary: report the failure instead of crashing.
            logger.error(f"Analysis failed. {e}")
    else:
        logger.error(f"Database file {db_file} does not exist.")


2024-06-09 20:35:05,249 - articleAnalyzer - INFO - Data loaded from table articles in articles.sqlite.
2024-06-09 20:35:05,249 - articleAnalyzer - INFO - Data loaded from table articles in articles.sqlite.
2024-06-09 20:35:05,249 - articleAnalyzer - INFO - Data loaded from table articles in articles.sqlite.
2024-06-09 20:35:05,249 - INFO - Data loaded from table articles in articles.sqlite.
2024-06-09 20:35:05,249 - articleAnalyzer - INFO - Data loaded from table articles in articles.sqlite.
2024-06-09 20:35:05,249 - articleAnalyzer - INFO - Data loaded from table articles in articles.sqlite.
2024-06-09 20:35:05,254 - articleAnalyzer - INFO - Executing calculate_total_cells concurrently.
2024-06-09 20:35:05,254 - articleAnalyzer - INFO - Executing calculate_total_cells concurrently.
2024-06-09 20:35:05,254 - articleAnalyzer - INFO - Executing calculate_total_cells concurrently.
2024-06-09 20:35:05,254 - INFO - Executing calculate_total_cells concurrently.
2024-06-09 20:35:05,254 - arti

Total cells: 3825000
Cells with abstracts: 146074
Cells with RCTs: 1364
