In [1]:
import pandas as pd
from transformers import BloomTokenizerFast, BloomForCausalLM
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the Excel workbook into the raw frame that every later cell starts from.
seer_workbook = "Seer_All.xlsx"
df = pd.read_excel(seer_workbook)

In [3]:
# Display the column labels of the loaded frame (used to build the column list below).
df.columns

Index(['Patient ID', 'Age recode with single ages and 85+',
       'Site recode ICD-O-3/WHO 2008', 'CS version input original (2004-2015)',
       'RX Summ--Surg Prim Site (1998+)', 'Year of diagnosis',
       'ICD-O-3 Hist/behav', 'CS extension (2004-2015)',
       'First malignant primary indicator', 'Grade (thru 2017)',
       'CS version input current (2004-2015)', 'Primary Site', 'Laterality',
       'Survival months', 'Sex', 'Race/ethnicity',
       'Median household income inflation adj to 2019'],
      dtype='object')

In [4]:
# Columns inspected for missing values and later used for row filtering and
# text templating. One label per line so additions/removals diff cleanly.
cols_to_check = [
    'Patient ID',
    'Age recode with single ages and 85+',
    'Site recode ICD-O-3/WHO 2008',
    'CS version input original (2004-2015)',
    'RX Summ--Surg Prim Site (1998+)',
    'Year of diagnosis',
    'ICD-O-3 Hist/behav',
    'CS extension (2004-2015)',
    'First malignant primary indicator',
    'Grade (thru 2017)',
    'CS version input current (2004-2015)',
    'Primary Site',
    'Laterality',
    'Survival months',
    'Sex',
    'Race/ethnicity',
    'Median household income inflation adj to 2019',
]

In [5]:
# Work on an independent (deep) copy so the raw frame `df` stays untouched.
df_analysis = df.copy(deep=True)


In [6]:
# Map the placeholder strings found in the checked columns to pd.NA so that
# the null-aware pandas methods (isnull / dropna) treat them as missing.
df_analysis[cols_to_check] = df[cols_to_check].replace(
    to_replace=['blank', 'Blank(s)', 'Unknown'],
    value=pd.NA,
)

In [7]:
# Per-column count of NA entries in the analysis frame, i.e. after the
# placeholder strings were converted to pd.NA.
missing_values = df_analysis.loc[:, cols_to_check].isna().sum()

print("Missing values in each column:", missing_values, sep="\n")

Missing values in each column:
Patient ID                                            0
Age recode with single ages and 85+                   0
Site recode ICD-O-3/WHO 2008                          0
CS version input original (2004-2015)            789489
RX Summ--Surg Prim Site (1998+)                  601526
Year of diagnosis                                     0
ICD-O-3 Hist/behav                                    0
CS extension (2004-2015)                         789489
First malignant primary indicator                     0
Grade (thru 2017)                                463888
CS version input current (2004-2015)             789489
Primary Site                                          0
Laterality                                            0
Survival months                                   16449
Sex                                                   0
Race/ethnicity                                     6828
Median household income inflation adj to 2019         0
dtype: int64


In [8]:

# Same per-column NA count, but on the raw frame `df` for comparison.
# Note: this rebinds `missing_values` from the previous cell.
missing_values = df.loc[:, cols_to_check].isna().sum()

print("Missing values in each column:", missing_values, sep="\n")

Missing values in each column:
Patient ID                                       0
Age recode with single ages and 85+              0
Site recode ICD-O-3/WHO 2008                     0
CS version input original (2004-2015)            0
RX Summ--Surg Prim Site (1998+)                  0
Year of diagnosis                                0
ICD-O-3 Hist/behav                               0
CS extension (2004-2015)                         0
First malignant primary indicator                0
Grade (thru 2017)                                0
CS version input current (2004-2015)             0
Primary Site                                     0
Laterality                                       0
Survival months                                  0
Sex                                              0
Race/ethnicity                                   0
Median household income inflation adj to 2019    0
dtype: int64


In [9]:
# Baseline snapshot taken before any rows are dropped; used for per-site totals.
df_original = df_analysis.copy()

# Keep only rows that have a real value in every checked column.
# `cols_to_check` is the list defined earlier in the notebook.
df_cleaned = df_analysis.dropna(subset=cols_to_check)

site_col = 'Site recode ICD-O-3/WHO 2008'
rows_before = df_original.groupby(site_col).size()
# A site whose rows were all dropped does not appear in the cleaned group
# sizes at all; align to the baseline index with 0 so it is reported as
# 0 % remaining instead of NaN.
rows_after = (
    df_cleaned.groupby(site_col).size().reindex(rows_before.index, fill_value=0)
)

# Percentage of each site's rows that survived the dropna.
result = rows_after / rows_before * 100

result_df = pd.DataFrame({'Percentage Remaining': result})

# Tab-separated export, one line per site.
result_df.to_csv('result.txt', sep='\t')



In [11]:
import json

# Destinations for the two serialisations of the cleaned rows.
json_file = 'output.json'
text_file = 'output.txt'

# Build both representations in a single pass over the *cleaned* frame so the
# JSON and text exports describe exactly the same rows. (The text export used
# to iterate over the raw `df`, which still contains the rows dropped during
# cleaning.)
list_template_rows = []
text_lines = []
for index, row in df_cleaned.iterrows():
    # JSON record: column label -> value for this row.
    list_template_rows.append({col: row[col] for col in cols_to_check})
    # Text record: "<lower-cased label> is <value>" pairs joined by commas.
    text_lines.append(", ".join(f"{col.lower()} is {row[col]}" for col in cols_to_check))

with open(json_file, 'w') as f:
    json.dump(list_template_rows, f, indent=2)

# One linearised row per line.
text_template_rows = "\n".join(text_lines)

with open(text_file, 'w', encoding='utf-8') as f:
    f.write(text_template_rows)



Data saved to output.txt


In [12]:


# Write one templated note per cleaned row to doctornote.txt.
# The encoding is pinned to UTF-8 (as the other text writers in this notebook
# do) so the file does not depend on the platform's default encoding.
with open('doctornote.txt', 'w', encoding='utf-8') as file:

    for index, row in df_cleaned.iterrows():
        # Pull out the fields the template refers to.
        patient_id = row['Patient ID']
        age = row['Age recode with single ages and 85+']
        diagnosis_year = row['Year of diagnosis']
        primary_site = row['Primary Site']
        surgery_site = row['RX Summ--Surg Prim Site (1998+)']
        survival_months = row['Survival months']
        sex = row['Sex']
        race_ethnicity = row['Race/ethnicity']
        income = row['Median household income inflation adj to 2019']

        # Create a more sophisticated doctor note.
        # NOTE(review): the template's leading indentation is part of the
        # string and is written to the file verbatim; kept as-is so the
        # output format does not change.
        doctor_note = f"""
        **Patient Information:**
        - Patient ID: {patient_id}
        - Age: {age}
        - Sex: {sex}
        - Race/Ethnicity: {race_ethnicity}

        **Medical History:**
        - Diagnosis Year: {diagnosis_year}
        - Primary Site: {primary_site}
        - Surgery Site: {surgery_site}
        - Survival Months: {survival_months}

        **Recommendations:**
        - The patient has been diagnosed with a primary tumor at the site {primary_site}.
        - A surgical procedure ({surgery_site}) was performed, and the patient has survived for {survival_months} months.
        - Further tests and follow-ups are recommended to monitor the patient's condition.

        **Additional Notes:**
        - Income Level: {income}
        
        """

        # Two extra newlines separate consecutive notes.
        file.write(doctor_note + '\n\n')


In [15]:
import random

# Column labels of the cleaned frame, shuffled in place.
# NOTE(review): the shuffle runs once, before the loop, so every row is written
# with the same column permutation. If a different order per row is wanted,
# the shuffle has to move inside the loop — confirm the intent.
columns = df_cleaned.columns.tolist()

random.shuffle(columns)

# Open the output once instead of once per row. Append mode is kept from the
# original code, so re-running this cell adds another copy of the rows to
# permuted_template.txt rather than replacing the file.
with open('permuted_template.txt', 'a', encoding='utf-8') as file:
    # Iterate over each row
    for index, row in df_cleaned.iterrows():
        # Bullet list of "label: value" pairs in the shuffled column order.
        template_permuted = '- ' + '\n- '.join(f'{column}: {row[column]}' for column in columns)

        # Blank line between consecutive rows.
        file.write(template_permuted + '\n\n')

In [26]:
# Checkpoint identifier passed to both the tokenizer and the model loader.
model_ckpt = "mrm8488/bloom-560m-finetuned-totto-table-to-text"

# Run on the GPU when torch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

tokenizer = BloomTokenizerFast.from_pretrained(model_ckpt)

# Load the weights, then move the model to the selected device.
model = BloomForCausalLM.from_pretrained(model_ckpt)
model = model.to(device)

def explain_and_store_all_rows(df, cols, output_file, max_new_tokens=64):
    """Generate a text explanation for every row of ``df`` and write it to a file.

    Each row is linearised as ``"<lower-cased column> is <value>"`` pairs joined
    by commas, wrapped in a fixed prompt and passed to the module-level
    ``model`` / ``tokenizer`` on ``device``. Results are written to
    ``output_file`` row by row, so a partial run still leaves the rows that
    were already processed on disk.

    Args:
        df: DataFrame whose rows are explained.
        cols: Column labels to include in the linearised table, in order.
        output_file: Path of the UTF-8 text file to (over)write.
        max_new_tokens: Upper bound on the number of tokens generated per row.
            Without an explicit budget ``generate`` falls back to the model's
            default length limit, which counts the prompt tokens as well; a
            prompt covering many columns can use that limit up before
            anything is generated.
    """
    with open(output_file, 'w', encoding='utf-8') as f:
        for index, row in df.iterrows():
            linearized_table = ", ".join(f"{col.lower()} is {row[col]}" for col in cols)

            prompt = f"Explain in detail the following data: {linearized_table}"

            inputs = tokenizer(prompt, return_tensors='pt')

            # Move every tensor of the encoded prompt to the model's device.
            inputs = {key: val.to(device) for key, val in inputs.items()}
            prompt_length = inputs['input_ids'].shape[1]

            with torch.no_grad():
                output = model.generate(**inputs, max_new_tokens=max_new_tokens)

            # A causal LM returns the prompt tokens followed by the
            # continuation; decode only the continuation so the explanation
            # does not repeat the linearised table written just above it,
            # and drop special tokens from the text.
            generated_tokens = output[0][prompt_length:]
            explanation = tokenizer.decode(generated_tokens, skip_special_tokens=True)

            f.write(f"\n\nExample Index: {index}\n")
            f.write("Linearized Table:\n")
            f.write(linearized_table + '\n')
            f.write("Explanation:\n")
            f.write(explanation + '\n')

# Destination for the BLOOM-generated explanations.
output_file = 'bloomtabletotext.txt'

# Explain every cleaned row using the checked columns.
explain_and_store_all_rows(
    df=df_cleaned,
    cols=cols_to_check,
    output_file=output_file,
)




KeyboardInterrupt: 

In [10]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load the T0pp tokenizer (slow implementation) and seq2seq model.
# Note: this rebinds `tokenizer` and `model`, the same names the BLOOM cell
# assigns, so whichever of the two cells ran last determines what they refer to.
t0pp_checkpoint = "bigscience/T0pp"
tokenizer = AutoTokenizer.from_pretrained(t0pp_checkpoint, use_fast=False)
model = AutoModelForSeq2SeqLM.from_pretrained(t0pp_checkpoint)

def generate_summary_with_context(row, cols):
    """Build a prompt from one row and return it together with the model output.

    Args:
        row: Mapping-like row (e.g. a pandas Series) indexed by column label.
        cols: Column labels to include in the prompt, in order.

    Returns:
        A ``(input_text, summary)`` tuple: the prompt that was sent to the
        module-level ``model`` and the decoded generation.
    """
    # "<lower-cased column> is <value>" for each requested column.
    field_descriptions = [f'{col.lower()} is {row[col]}' for col in cols]
    input_text = "Explain the significance of the following data: " + ', '.join(field_descriptions)

    # Encode the prompt, truncating it to at most 1024 tokens.
    token_ids = tokenizer.encode(input_text, return_tensors="pt", max_length=1024, truncation=True)
    generated = model.generate(token_ids)
    summary = tokenizer.decode(generated[0], skip_special_tokens=True)

    return input_text, summary

output_file_path = "t0pptext.txt"

# Generate and write one record at a time so that an interrupted run keeps
# every summary produced so far, instead of holding all results in memory and
# writing only after the whole loop has finished.
# `all_summaries` is still populated for any later cell that inspects it.
all_summaries = []
with open(output_file_path, "w", encoding="utf-8") as output_file:
    for _, row in df_cleaned.iterrows():
        input_text, summary = generate_summary_with_context(row, cols_to_check)
        all_summaries.append((input_text, summary))
        # Prompt, generated text, then a 50-character '=' separator line.
        output_file.write(f"Input Text with Context:\n{input_text}\n\nGenerated Summary:\n{summary}\n\n{'='*50}\n")


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
  return self.fget.__get__(instance, owner)()


: 