# Data Processing 
10/2/24

J. Chanenson

## Convert Table To Useful Data

In [1]:
import pandas as pd
import os

In [2]:
# Load the extracted table; the path is resolved relative to the working directory
df = pd.read_csv(filepath_or_buffer="extracted_table_data.csv")

# Ensure the folder the export cells write into exists (no error if it already does)
os.makedirs(name='reports', exist_ok=True)

In [3]:
# Keep only the two columns this notebook uses; .copy() gives an independent frame.
# NOTE: this cell overwrites `df` (Title becomes an int), so re-run the load cell
# before re-running this one.
df = df[['Title', 'Report']].copy()

# Normalise both columns to plain strings (missing values become the text 'nan')
df['Title'] = df['Title'].astype(str)
df['Report'] = df['Report'].astype(str)

# Work from the raw title text; both the document id and the word count come from it
raw_titles = df['Title']

# The document id is the run of digits immediately before "-doc.pdf"
doc_ids = raw_titles.str.extract(r'(\d+)-doc\.pdf')[0]
missing_id = doc_ids.isna()
if missing_id.any():
    # Fail loudly with the offending titles rather than with an opaque NaN-cast error
    examples = raw_titles[missing_id].head().tolist()
    raise ValueError(
        f"Could not parse a document id from {int(missing_id.sum())} title(s), e.g. {examples}"
    )

# The word count is the last " - "-separated part of the title with the " words"
# suffix and thousands separators removed (presumably "... - 1,234 words" -- confirm
# against the source data).
word_counts = (
    raw_titles.str.split(' - ').str[-1]
    .str.replace(' words', '', regex=False)
    .str.replace(',', '', regex=False)
    .str.strip()
)

df['Title'] = doc_ids.astype(int)

# Anything that is not a number (e.g. a title with no word-count suffix, or 'nan')
# is coerced to NaN and then counted as 0 words instead of raising.
df['Words'] = pd.to_numeric(word_counts, errors='coerce').fillna(0).astype(int)

## Export Failed Papers and Papers That Trigger Review

In [4]:
# Select the rows whose Report value is exactly 'Failed'
is_failed = df['Report'].eq('Failed')
failed_reports = df.loc[is_failed]

# Write them out (without the index column) for follow-up
failed_reports.to_csv('reports/failed_reports.csv', index=False)

# Preview the first few failed rows
failed_reports.head()

Unnamed: 0,Title,Report,Words
0,9183,Failed,0
1,9863,Failed,0
210,9863,Failed,0
228,9183,Failed,0


In [5]:
# Grab Papers Above % Sim Score

# Similarity cut-off in percent; reports at or above it are exported
threshold = 30

# Keep only rows whose Report contains a literal '%' (this skips e.g. 'Failed').
# .copy() makes an independent frame, so adding a column below does not trigger
# pandas' SettingWithCopyWarning.
percentage_df = df[df['Report'].str.contains('%', regex=False)].copy()

# Strip the '%' sign and convert the remaining text to an integer percentage
percentage_df['Percentage'] = (
    percentage_df['Report'].str.replace('%', '', regex=False).astype(int)
)

# Filter the DataFrame for percentages greater than or equal to the threshold
threshold_reports = percentage_df[percentage_df['Percentage'] >= threshold]

# Save the selected columns to a CSV file.
# NOTE(review): the file name's spelling is kept as-is in case something downstream
# reads this exact path.
output_df = threshold_reports[['Title', 'Percentage']]
output_df.to_csv('reports/exceeds_threashold.csv', index=False)

threshold_reports.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  percentage_df['Percentage'] = percentage_df['Report'].str.replace('%', '').astype(int)


Unnamed: 0,Title,Report,Words,Percentage
20,2086,30%,9924,30
23,1462,31%,8207,31
26,3798,56%,21392,56
27,5623,33%,17809,33
28,4942,36%,19055,36


In [6]:
# Report how many papers met the cut-off, relative to every row in df
summary = f"{len(threshold_reports)}/{len(df)} papers equal or exceed a sim thresashold of {threshold}"
print(summary)

147/1207 papers equal or exceed a sim thresashold of 30
