In [5]:
import pandas as pd
import numpy as np
import spacy  
import scattertext

# spaCy's small English pipeline; it is later handed to scattertext as its
# `nlp` argument and is also the source of the stop-word list.
nlp = spacy.load('en_core_web_sm')

# Read the source table (path is relative to the current working directory)
df = pd.read_csv('Scene_Graph_Disease_Only.csv')

# Columns kept for the analysis: study/patient identifiers, the reason-for-exam
# text, the disease label, and the two demographic fields.
patient_columns = ['study_id', 'patient_id', 'reason_clean',
                   'disease_category', 'gender', 'age_decile']

patient_df = (
    df[patient_columns]
    # 1) drop rows that are identical across every kept column
    .drop_duplicates()
    # 2) keep only the first row for each (patient, reason text) pair
    .drop_duplicates(subset=['patient_id', 'reason_clean'], keep='first')
    # 3) discard rows whose reason text is missing
    .dropna(subset=['reason_clean'])
)

# Boolean masks selecting the two disease cohorts being compared
is_copd = patient_df['disease_category'] == 'copd/emphysema'
is_hf = patient_df['disease_category'] == 'fluid overload/heart failure'

copd_df = patient_df[is_copd]
hf_df = patient_df[is_hf]

# Stack the cohorts into one frame: COPD rows first, heart-failure rows after
copd_heart_failure_df = pd.concat([copd_df, hf_df])

# Build the scattertext corpus: the text comes from `reason_clean`, the
# category label from `disease_category`, and parsing is done with `nlp`.
corpus_builder = scattertext.CorpusFromPandas(
    copd_heart_failure_df,
    category_col='disease_category',
    text_col='reason_clean',
    nlp=nlp,
)

# Remove spaCy's English stop words from the corpus vocabulary.
# ignore_absences=True presumably keeps this from failing on stop words that
# never occur in the corpus - confirm against the scattertext docs.
corpus = corpus_builder.build().remove_terms(
    nlp.Defaults.stop_words,
    ignore_absences=True,
)

# Term-frequency table with one "<category> freq" column per disease category
corpus_df = corpus.get_term_freq_df()







In [6]:
# Score column -> the corpus category it is computed for.
score_categories = {
    'CHF_Score': 'fluid overload/heart failure',
    'COPD_Score': 'copd/emphysema',
}

# Attach each category's scaled F-score to the term table, rounded to two
# decimals for readability.
for score_col, category in score_categories.items():
    corpus_df[score_col] = corpus.get_scaled_f_scores(category)
    corpus_df[score_col] = corpus_df[score_col].round(2)

# Terms ranked by raw frequency within each category (most frequent first).
# reset_index() moves the term index into an ordinary column.
df_chf = corpus_df.sort_values(by='fluid overload/heart failure freq',
                               ascending=False).reset_index()
df_copd = corpus_df.sort_values(by='copd/emphysema freq',
                                ascending=False).reset_index()

# A notebook cell renders only its final bare expression, so the COPD table
# was previously computed and silently discarded. Show both explicitly.
# `display` is provided by the IPython/Jupyter kernel without an import.
display(df_copd.head(10), df_chf.head(10))

Unnamed: 0,term,copd/emphysema freq,fluid overload/heart failure freq,CHF_Score,COPD_Score
0,/,849,1285,0.88,0.12
1,interval,389,916,0.93,0.07
2,change,364,842,0.93,0.07
3,interval change,322,772,0.93,0.07
4,p,368,740,0.91,0.09
5,edema,273,736,0.94,0.06
6,//,472,717,0.88,0.12
7,s,336,694,0.92,0.08
8,/ p,335,692,0.92,0.08
9,s /,331,689,0.92,0.08


In [7]:
# Options for the scattertext explorer. Heart failure is the focal category
# (labelled "HF"); everything else in the corpus is labelled "COPD".
explorer_options = {
    'category': 'fluid overload/heart failure',
    'category_name': 'HF',
    'not_category_name': 'COPD',
    'minimum_term_frequency': 10,
    'pmi_threshold_coefficient': 0,
    'width_in_pixels': 1000,
    'transform': scattertext.Scalers.dense_rank,
}

# Generate the explorer's HTML for the corpus
html = scattertext.produce_scattertext_explorer(corpus, **explorer_options)

# Write the HTML to disk as UTF-8 (path is relative to the working directory)
with open('./scattertext_copd_hf.html', 'w', encoding='utf-8') as out_file:
    out_file.write(html)