# Data Processing 
10/2/24

J. Chanenson

## Convert Table To Useful Data

In [1]:
import os
from datetime import datetime

import pandas as pd

In [2]:
# Load the extracted table (path is relative to the working directory)
df = pd.read_csv(filepath_or_buffer="extracted_table_data.csv")
# Ensure the reports/ output folder is present; no error if it already exists
os.makedirs(name='reports', exist_ok=True)

In [3]:
# NOTE: this cell rebinds `df` and replaces 'Title' with integers, so it cannot
# be re-run on its own -- re-run the cell that loads the CSV first.

# Keep only the two columns used below and normalise both to plain strings so
# the .str accessors work. astype() returns a new frame, so nothing below
# writes into a slice of the loaded data (no SettingWithCopyWarning).
df = df[['Title', 'Report']].astype({'Title': str, 'Report': str})

# Both the paper number and the word count are parsed from the untouched title
# text, so keep a handle on it instead of adding a temporary column.
original_titles = df['Title']

# The paper number is the run of digits immediately before "-doc.pdf".
paper_ids = original_titles.str.extract(r'(\d+)-doc\.pdf')[0]
unmatched = original_titles[paper_ids.isna()]
if not unmatched.empty:
    # Fail with the offending titles rather than an opaque NaN-to-int error.
    raise ValueError(
        f"{len(unmatched)} title(s) do not contain '<digits>-doc.pdf'; "
        f"first few: {unmatched.head().tolist()}"
    )

# Word count = last " - "-separated piece of the title with " words" removed.
word_counts = (
    original_titles.str.split(' - ').str[-1].str.replace(' words', '', regex=False)
)

# Drop thousands separators and convert. errors='coerce' turns anything that
# is not a number (e.g. a title with no word count) into NaN, which is then
# counted as 0 words. A plain fillna() on the text cannot do this because the
# earlier astype(str) has already turned missing values into the string 'nan'.
words = (
    pd.to_numeric(word_counts.str.replace(',', '', regex=False), errors='coerce')
    .fillna(0)
    .astype(int)
)

# Final columns: Title (int paper number), Report (str), Words (int).
df = df.assign(Title=paper_ids.astype(int), Words=words)

## Export Failed Papers and Papers That Trigger Review

In [None]:
# Grab Failed Reports
# Rows whose Report is exactly 'Failed', with fully duplicated rows collapsed
is_failed = df['Report'].eq('Failed')
failed_reports = df.loc[is_failed].drop_duplicates()

# Write them out for follow-up (index column omitted)
failed_reports.to_csv('reports/failed_reports.csv', index=False)

# Preview the first few failed rows
failed_reports.head()

In [None]:
# Grab Papers Above % Sim Score

# Similarity cut-off in percent; rows at or above it are exported for review
threshold = 30

# Only rows whose Report contains a literal '%' carry a similarity score
# (assumed to look like "42%" -- the int conversion below fails otherwise).
has_percentage = df['Report'].str.contains('%', regex=False)

# Build the numeric column with .assign so it lands on a new frame instead of
# being written into a slice of df (which raises SettingWithCopyWarning).
percentage_df = df[has_percentage].assign(
    Percentage=lambda frame: frame['Report'].str.replace('%', '', regex=False).astype(int)
)

# Filter the DataFrame for percentages greater than or equal to the threshold
threshold_reports = percentage_df[percentage_df['Percentage'] >= threshold]

# Save one row per (Title, Percentage) pair.
# The file name's spelling ('threashold') is kept as-is so the output path
# does not change.
output_df = threshold_reports[['Title', 'Percentage']].drop_duplicates()
output_df.to_csv('reports/exceeds_threashold.csv', index=False)

threshold_reports.head()

In [None]:
# Build the run summary: timestamp plus unique-title counts.
# (`datetime` is imported in the notebook's import cell at the top.)
current_datetime = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
unique_failed_count = failed_reports['Title'].nunique()
unique_threshold_count = threshold_reports['Title'].nunique()
# Counts distinct Title values, not raw rows, so repeated papers count once
total_unique_rows = df['Title'].nunique()

combined_message = "\n".join([
    f"Report generated on: {current_datetime}",
    f"Total unique rows in the original DataFrame: {total_unique_rows}",
    f"{unique_failed_count} unique papers failed to be checked",
    f"{unique_threshold_count}/{total_unique_rows} papers equal or exceed a sim threshold of {threshold}",
])

# Write the summary next to the CSV exports; the explicit encoding keeps the
# file identical regardless of the platform's default locale.
with open('reports/report_summary.txt', 'w', encoding='utf-8') as f:
    f.write(combined_message)

print(combined_message)