In [43]:
import pandas as pd
import numpy as np
import cv2
import matplotlib.pyplot as plt
import regex as re
from PIL import Image
import tensorflow as tf
from tensorflow.keras.applications.densenet import preprocess_input
from tensorflow.keras.applications import DenseNet121
from keras.models import load_model
from tensorflow.keras.models import Model
from tqdm.notebook import tqdm
from skimage.transform import resize
import albumentations as A
import imageio
import cv2
import os

In [19]:
reports_df = pd.read_csv('/kaggle/input/chest-xrays-indiana-university/indiana_reports.csv')
projections_df = pd.read_csv('/kaggle/input/chest-xrays-indiana-university/indiana_projections.csv')

In [20]:
reports_df.head(10)

Unnamed: 0,uid,MeSH,Problems,image,indication,comparison,findings,impression
0,1,normal,normal,Xray Chest PA and Lateral,Positive TB test,None.,The cardiac silhouette and mediastinum size ar...,Normal chest x-XXXX.
1,2,Cardiomegaly/borderline;Pulmonary Artery/enlarged,Cardiomegaly;Pulmonary Artery,"Chest, 2 views, frontal and lateral",Preop bariatric surgery.,None.,Borderline cardiomegaly. Midline sternotomy XX...,No acute pulmonary findings.
2,3,normal,normal,Xray Chest PA and Lateral,"rib pain after a XXXX, XXXX XXXX steps this XX...",,,"No displaced rib fractures, pneumothorax, or p..."
3,4,"Pulmonary Disease, Chronic Obstructive;Bullous...","Pulmonary Disease, Chronic Obstructive;Bullous...","PA and lateral views of the chest XXXX, XXXX a...",XXXX-year-old XXXX with XXXX.,None available,There are diffuse bilateral interstitial and a...,1. Bullous emphysema and interstitial fibrosis...
4,5,Osteophyte/thoracic vertebrae/multiple/small;T...,Osteophyte;Thickening;Lung,Xray Chest PA and Lateral,Chest and nasal congestion.,,The cardiomediastinal silhouette and pulmonary...,No acute cardiopulmonary abnormality.
5,6,normal,normal,"PA and Lateral Chest. XXXX, XXXX at XXXX",Evaluate for infection,"XXXX, XXXX",Heart size and mediastinal contour are within ...,No acute cardiopulmonary findings.
6,7,Pulmonary Atelectasis/base;Spondylosis/thoraci...,Pulmonary Atelectasis;Spondylosis;Arthritis,Xray Chest PA and Lateral,Preop lumbar surgery,"XXXX, XXXX",The cardiac contours are normal. XXXX basilar ...,Basilar atelectasis. No confluent lobar consol...
7,8,normal,normal,Xray Chest PA and Lateral,XXXX-year-old with XXXX on XXXX. Dyspnea. Hist...,Two views of the chest dated XXXX.,"The heart, pulmonary XXXX and mediastinum are ...",No acute cardiopulmonary disease.
8,9,Calcified Granuloma/lung/upper lobe/right;Dens...,Calcified Granuloma;Density,Xray Chest PA and Lateral,Chest pain today. History of stent placement 7...,"CT XXXX image XXXX, CT abdomen which included ...",The XXXX examination consists of frontal and l...,Increased size of density in the left cardioph...
9,10,Calcified Granuloma/lung/upper lobe/right,Calcified Granuloma,PA and lateral chest x-XXXX XXXX.,"XXXX-year-old male, chest pain.",Chest radiographs XXXX.,The cardiomediastinal silhouette is within nor...,No acute cardiopulmonary process.


In [21]:
reports_df.iloc[8,-2]

'The XXXX examination consists of frontal and lateral radiographs of the chest. The cardiac silhouette is not enlarged. There has been apparent interval increase in low density convexity at the left cardiophrenic XXXX. Calcified granuloma is again seen in the right upper lobe. There is no consolidation, pleural effusion or pneumothorax.'

In [22]:
#printing the first 5 records in the reports
reports_df.head()

Unnamed: 0,uid,MeSH,Problems,image,indication,comparison,findings,impression
0,1,normal,normal,Xray Chest PA and Lateral,Positive TB test,None.,The cardiac silhouette and mediastinum size ar...,Normal chest x-XXXX.
1,2,Cardiomegaly/borderline;Pulmonary Artery/enlarged,Cardiomegaly;Pulmonary Artery,"Chest, 2 views, frontal and lateral",Preop bariatric surgery.,None.,Borderline cardiomegaly. Midline sternotomy XX...,No acute pulmonary findings.
2,3,normal,normal,Xray Chest PA and Lateral,"rib pain after a XXXX, XXXX XXXX steps this XX...",,,"No displaced rib fractures, pneumothorax, or p..."
3,4,"Pulmonary Disease, Chronic Obstructive;Bullous...","Pulmonary Disease, Chronic Obstructive;Bullous...","PA and lateral views of the chest XXXX, XXXX a...",XXXX-year-old XXXX with XXXX.,None available,There are diffuse bilateral interstitial and a...,1. Bullous emphysema and interstitial fibrosis...
4,5,Osteophyte/thoracic vertebrae/multiple/small;T...,Osteophyte;Thickening;Lung,Xray Chest PA and Lateral,Chest and nasal congestion.,,The cardiomediastinal silhouette and pulmonary...,No acute cardiopulmonary abnormality.


In [23]:
#printing the first 5 records in the projections df
projections_df.head()

Unnamed: 0,uid,filename,projection
0,1,1_IM-0001-4001.dcm.png,Frontal
1,1,1_IM-0001-3001.dcm.png,Lateral
2,2,2_IM-0652-1001.dcm.png,Frontal
3,2,2_IM-0652-2001.dcm.png,Lateral
4,3,3_IM-1384-1001.dcm.png,Frontal


In [24]:
print(len(projections_df))
print(len(reports_df))

7466
3851


In [25]:
#counting the no.of unique patients in the dataset 
frontal_mask = projections_df['projection']=='Frontal'
df_frontal = projections_df[frontal_mask]

# Merge the filtered projections dataframe with the reports dataframe
df_merged = pd.merge(df_frontal,reports_df,on='uid')

result_dict = {row['filename']: [row['uid'], row['findings'], row['impression']] for _, row in df_merged.iterrows()}


final_df = pd.DataFrame.from_dict(result_dict, orient='index', columns=['uid', 'findings', 'impression']).reset_index()
final_df = final_df.rename(columns={'index': 'filename'})

# Display the resulting dataframe
final_df.head()


Unnamed: 0,filename,uid,findings,impression
0,1_IM-0001-4001.dcm.png,1,The cardiac silhouette and mediastinum size ar...,Normal chest x-XXXX.
1,2_IM-0652-1001.dcm.png,2,Borderline cardiomegaly. Midline sternotomy XX...,No acute pulmonary findings.
2,3_IM-1384-1001.dcm.png,3,,"No displaced rib fractures, pneumothorax, or p..."
3,4_IM-2050-1001.dcm.png,4,There are diffuse bilateral interstitial and a...,1. Bullous emphysema and interstitial fibrosis...
4,5_IM-2117-1003002.dcm.png,5,The cardiomediastinal silhouette and pulmonary...,No acute cardiopulmonary abnormality.


In [26]:
df_reports = pd.read_csv("/kaggle/input/chest-xrays-indiana-university/indiana_reports.csv")
df_reports.head()
findings_counts = df_reports['findings'].value_counts()

# Count the unique captions and their frequencies in 'impression' column
impression_counts = df_reports['impression'].value_counts()

# Display the counts
print("Findings Counts:")
print(findings_counts)

print("\nImpression Counts:")
print(impression_counts)

Findings Counts:
findings
The heart is normal in size. The mediastinum is unremarkable. The lungs are clear.                                                                                                                                                                                                                                                                                                          51
The heart and lungs have XXXX XXXX in the interval. Both lungs are clear and expanded. Heart and mediastinum normal.                                                                                                                                                                                                                                                                        51
Heart size normal. Lungs are clear. XXXX are normal. No pneumonia, effusions, edema, pneumothorax, adenopathy, nodules or masses.                                                                               

In [30]:
df_filtered = final_df[final_df['impression'].str.contains("No acute cardiopulmonary", na=False)]

# Group by 'impression' and select up to 10 rows from each group
df_top_50_each_group = df_filtered.groupby('impression').head(50)

impression_counts = df_top_50_each_group['impression'].value_counts()
print(impression_counts.head(50))

impression
No acute cardiopulmonary process.                                                                                                                                                                                                                                                                                                                                   50
No acute cardiopulmonary disease.                                                                                                                                                                                                                                                                                                                                   50
No acute cardiopulmonary abnormality. .                                                                                                                                                                                                                                        

In [31]:
len(final_df)

3818

In [32]:
drop_mask = []
img_path = '/kaggle/input/chest-xrays-indiana-university/images/images_normalized'
for i in range(len(df_top_50_each_group)):
    df_top_50_each_group.iloc[i,0] = os.path.join(img_path,df_top_50_each_group.iloc[i,0])
    if type(df_top_50_each_group.iloc[i, 2]) == str:
        drop_mask.append(False)
    elif type(df_top_50_each_group.iloc[i, 2]) == float and type(df_top_50_each_group.iloc[i, 3]) != float:
        df_top_50_each_group.iloc[i, 2] = df_top_50_each_group.iloc[i, 3]
        drop_mask.append(False)
    else:
        drop_mask.append(True)

# Drop the rows where the mask is True
final_df = df_top_50_each_group[~pd.Series(drop_mask).values]

# Reset the index if necessary
final_df.reset_index(drop=True, inplace=True)

# Print the cleaned DataFrame
final_df.head()

Unnamed: 0,filename,uid,findings,impression
0,/kaggle/input/chest-xrays-indiana-university/i...,3,"No displaced rib fractures, pneumothorax, or p...","No displaced rib fractures, pneumothorax, or p..."
1,/kaggle/input/chest-xrays-indiana-university/i...,5,The cardiomediastinal silhouette and pulmonary...,No acute cardiopulmonary abnormality.
2,/kaggle/input/chest-xrays-indiana-university/i...,6,Heart size and mediastinal contour are within ...,No acute cardiopulmonary findings.
3,/kaggle/input/chest-xrays-indiana-university/i...,8,"The heart, pulmonary XXXX and mediastinum are ...",No acute cardiopulmonary disease.
4,/kaggle/input/chest-xrays-indiana-university/i...,10,The cardiomediastinal silhouette is within nor...,No acute cardiopulmonary process.


In [33]:
import math
type(final_df.iloc[1,2]) == str

True

In [34]:
drop_mask = []
img_path = '/kaggle/input/chest-xrays-indiana-university/images/images_normalized'
for i in range(len(final_df)):
    final_df.iloc[i,0] = os.path.join(img_path,final_df.iloc[i,0])
    if type(final_df.iloc[i, 2]) == str:
        drop_mask.append(False)
    elif type(final_df.iloc[i, 2]) == float and type(final_df.iloc[i, 3]) != float:
        final_df.iloc[i, 2] = final_df.iloc[i, 3]
        drop_mask.append(False)
    else:
        drop_mask.append(True)

# Drop the rows where the mask is True
final_df = final_df[~pd.Series(drop_mask).values]

# Reset the index if necessary
final_df.reset_index(drop=True, inplace=True)

# Print the cleaned DataFrame
print(final_df)

                                              filename   uid  \
0    /kaggle/input/chest-xrays-indiana-university/i...     3   
1    /kaggle/input/chest-xrays-indiana-university/i...     5   
2    /kaggle/input/chest-xrays-indiana-university/i...     6   
3    /kaggle/input/chest-xrays-indiana-university/i...     8   
4    /kaggle/input/chest-xrays-indiana-university/i...    10   
..                                                 ...   ...   
818  /kaggle/input/chest-xrays-indiana-university/i...  3971   
819  /kaggle/input/chest-xrays-indiana-university/i...  3975   
820  /kaggle/input/chest-xrays-indiana-university/i...  3985   
821  /kaggle/input/chest-xrays-indiana-university/i...  3989   
822  /kaggle/input/chest-xrays-indiana-university/i...  3996   

                                              findings  \
0    No displaced rib fractures, pneumothorax, or p...   
1    The cardiomediastinal silhouette and pulmonary...   
2    Heart size and mediastinal contour are within ...   

In [35]:
final_df.drop(['impression'],inplace=True,axis=1)

In [16]:
final_df.isnull().sum()

filename    0
uid         0
findings    0
dtype: int64

In [36]:
final_df.head(10)

Unnamed: 0,filename,uid,findings
0,/kaggle/input/chest-xrays-indiana-university/i...,3,"No displaced rib fractures, pneumothorax, or p..."
1,/kaggle/input/chest-xrays-indiana-university/i...,5,The cardiomediastinal silhouette and pulmonary...
2,/kaggle/input/chest-xrays-indiana-university/i...,6,Heart size and mediastinal contour are within ...
3,/kaggle/input/chest-xrays-indiana-university/i...,8,"The heart, pulmonary XXXX and mediastinum are ..."
4,/kaggle/input/chest-xrays-indiana-university/i...,10,The cardiomediastinal silhouette is within nor...
5,/kaggle/input/chest-xrays-indiana-university/i...,11,Cardiomediastinal silhouette and pulmonary vas...
6,/kaggle/input/chest-xrays-indiana-university/i...,12,Lungs are clear bilaterally. Cardiac and media...
7,/kaggle/input/chest-xrays-indiana-university/i...,15,Cardiomediastinal silhouette and pulmonary vas...
8,/kaggle/input/chest-xrays-indiana-university/i...,17,No focal areas of consolidation. No suspicious...
9,/kaggle/input/chest-xrays-indiana-university/i...,18,Heart size within normal limits. No focal alve...


In [None]:
c = 1
A_dir = '/kaggle/working/augimages'

for i in range(len(final_df)):
    original_image = cv2.imread(final_df.iloc[i, 0])
    augmented1 = aug1(image=original_image)['image']
    augmented2 = aug2(image=original_image)['image']
    
    new_img_path1 = os.path.join(A_dir, f"{os.path.basename(final_df.iloc[i, 0])}_aug1.png")
    new_img_path2 = os.path.join(A_dir, f"{os.path.basename(final_df.iloc[i, 0])}_aug2.png")
    
    imageio.imwrite(new_img_path1, augmented1)
    imageio.imwrite(new_img_path2, augmented2)
    
    # Create new rows with augmented images
    new_row1 = {'filename': new_img_path1, 'uid': final_df.iloc[i, 1], 'findings': final_df.iloc[i, 2]}
    new_row2 = {'filename': new_img_path2, 'uid': final_df.iloc[i, 1], 'findings': final_df.iloc[i, 2]}
    
    # Append the new rows in place
    final_df.loc[len(final_df)] = new_row1
    final_df.loc[len(final_df)] = new_row2

    c += 1
    if c % 50 == 0:
        print(c)


50
100
150
200


In [None]:
final_df.tail()

In [None]:
def decontracted(phrase):
  """ This function expands the contractions in the text"""


  
    # specific
  phrase = str(phrase)
  phrase = re.sub(r"won\'t", "will not", phrase)
  phrase = re.sub(r"can\'t", "can not", phrase)

    # general
  phrase = re.sub(r"n\'t", " not", phrase)
  phrase = re.sub(r"\'re", " are", phrase)
  phrase = re.sub(r"\'s", " is", phrase)
  phrase = re.sub(r"\'d", " would", phrase)
  phrase = re.sub(r"\'ll", " will", phrase)
  phrase = re.sub(r"\'t", " not", phrase)
  phrase = re.sub(r"\'ve", " have", phrase)
  phrase = re.sub(r"\'m", " am", phrase)
  phrase = re.sub('xxxx','',phrase) #occurs many times in text may be private information which isn't useful
  return str(phrase)

final_df['findings']= final_df['findings'].apply(decontracted)

In [None]:
def other_processing(phrase):
  """Other text processing mentioned below"""

  phrase = str(phrase)
  phrase = re.sub(r'xx*','',phrase) # Removing XXXX
  phrase = re.sub(r'\d','',phrase) # Removing numbers
  
  temp = ""

  for i in phrase.split(" "): #Removing 2 letter words
    if i!= 'no' or i!='ct':
      temp = temp + ' ' + i
    prev = i
  temp = re.sub(' {2,}', ' ',temp) #Replacing double space with single space
  temp = re.sub(r'\.+', ".", temp) #Replacing double . with single .
  temp = temp.lstrip() #Removing space at the beginning
  temp = temp.rstrip() #Removing space at the end
  return temp



final_df['findings']= final_df['findings'].apply(other_processing)

In [None]:
final_df.head()

In [None]:
def count_word(sent):
  """Counts number of words in given sentence"""
  sent = str(sent)
  return len(sent.split(" "))

In [None]:
num_words = pd.DataFrame(columns=['findings'])
num_words['findings'] = final_df['findings'].apply(count_word)

In [None]:
num_words.describe(percentiles = [0.25,0.75,0.95,0.99])

In [None]:
img_dir = '/kaggle/input/chest-xrays-indiana-university/images/images_normalized'
print("Findings for below image is :" , final_df.at[2,'findings'])
img = cv2.imread(os.path.join(img_dir,final_df.at[2,'filename']))
plt.imshow(cv2.cvtColor(img,cv2.COLOR_BGR2RGB))

In [None]:
print("Findings for below image is :" , final_df.at[9,'findings'])
img = cv2.imread(os.path.join(img_dir,final_df.at[9,'filename']))
plt.imshow(cv2.cvtColor(img,cv2.COLOR_BGR2RGB))

In [None]:
chex_weights = '/kaggle/input/chexnet-weights/brucechou1983_CheXNet_Keras_0.3.0_weights.h5'
chexnet = DenseNet121(weights=chex_weights,                    
                      classes = 14,input_shape=(224,224,3))

In [None]:
chexnet.summary()

In [None]:
model = Model(chexnet.input, chexnet.layers[-2].output)

In [None]:
model.summary()

In [None]:
def load_image(img_name):
  """Loads image in array format"""

  image = Image.open(img_name)
  X = np.asarray(image.convert("RGB"))
  X = np.asarray(X)
  X = preprocess_input(X)
  X = resize(X, (224,224,3))
  X = np.expand_dims(X, axis=0)
  X = np.asarray(X)
    
  return X

In [None]:
image_paths = final_df['filename'].astype(str).tolist()


In [None]:
image_features = []
for i in tqdm(range(len(image_paths))):

  #image 1
  i1 = load_image(image_paths[i])
  img_features = model.predict(i1)
  img_features = np.vstack(img_features).astype(float)



  image_features.append(img_features)



In [None]:
final_df['image_features'] =image_features

In [None]:
final_df['findings_total'] = '<start>' + ' ' + final_df.findings.astype(str) + ' ' +'<end>'
final_df['dec_ip'] = '<start>' + ' ' + final_df.findings.astype(str)  #Decoder input
final_df['dec_op'] = final_df.findings.astype(str) + ' ' +'<end>'     #dDecoder output

In [None]:
final_df.to_pickle('final_1024.pkl') 