In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import glob
import os
import librosa


In [8]:
import warnings
# Install a blanket 'ignore' filter so no warning is printed by later cells.
# NOTE(review): this also hides deprecation and pandas indexing warnings;
# consider narrowing the filter to specific categories once the noisy source is known.
warnings.filterwarnings('ignore')

## Exploring Data

In [6]:
# Load the patient -> diagnosis table. Column names are supplied explicitly via
# `names`, so every row of the file is treated as data.
# The path is assembled with os.path.join so it resolves on any OS, not only on
# Windows where a backslash is the separator.
df_diagnosis = pd.read_csv(
    os.path.join('Respiratory_Sound_Database', 'patient_diagnosis.csv'),
    names=['patient_id', 'diagnosis'],
)
df_diagnosis.head()

Unnamed: 0,patient_id,diagnosis
0,101,URTI
1,102,Healthy
2,103,Asthma
3,104,COPD
4,105,URTI


In [13]:
# Tally how many patients carry each diagnosis label (most frequent first).
df_diagnosis['diagnosis'].value_counts()#.plot(kind='bar', title='Number of patients per diagnosis')


COPD              64
Healthy           26
URTI              14
Bronchiectasis     7
Pneumonia          6
Bronchiolitis      6
LRTI               2
Asthma             1
Name: diagnosis, dtype: int64

In [17]:
# Load per-patient demographics from the space-delimited text file; column names
# are supplied explicitly via `names`.
# os.path.join keeps the path valid on non-Windows systems as well.
df_demo = pd.read_csv(
    os.path.join('Respiratory_Sound_Database', 'demographic_info.txt'),
    delimiter=' ',
    names=['patient_id', 'age', 'sex', 'adult_bmi', 'child_weight', 'child_height'],
)
df_demo.head()

Unnamed: 0,patient_id,age,sex,adult_bmi,child_weight,child_height
0,101,3.0,F,,19.0,99.0
1,102,0.75,F,,9.8,73.0
2,103,70.0,F,33.0,,
3,104,70.0,F,28.47,,
4,105,7.0,F,,32.0,135.0


In [21]:
# Join diagnosis labels with demographics on patient_id (default inner join) and
# save the combined table for future analysis.
# NOTE(review): 'patient_detaila.csv' looks like a typo for 'patient_details.csv';
# the name is left unchanged in case other code already reads this file.
df_merged = pd.merge(df_diagnosis, df_demo, on='patient_id')
df_merged.to_csv(
    os.path.join('Respiratory_Sound_Database', 'patient_detaila.csv'),
    index=False,
)

In [38]:
# Collect the .txt files that sit in the audio_and_txt_files folder and reduce
# each path to its bare file name (no directory, no extension).
# - os.path.join makes the glob pattern portable across operating systems.
# - sorted() fixes the order, because glob returns matches in arbitrary order.
text_files = sorted(
    glob.glob(os.path.join('Respiratory_Sound_Database', 'audio_and_txt_files', '*.txt'))
)
text_files = [os.path.splitext(os.path.basename(i))[0] for i in text_files]


In [41]:
# Each file name is split on '_' into five parts, which become the columns below.
df = pd.DataFrame(
    [file.split('_') for file in text_files],
    columns=['patient_id', 'recording_index', 'chest_location', 'acquisition_mode', 'recording_equipment'],
)
# Cast the id to int so it can be merged with df_diagnosis['patient_id'].
df['patient_id'] = df['patient_id'].astype(int)
# Preview goes last: only the final expression of a cell is displayed, and this
# way it reflects the integer cast above.
df.head()


In [45]:
# Attach each patient's diagnosis to every recording row (default inner join on
# patient_id) and save the result.
# os.path.join keeps the output path valid on non-Windows systems as well.
df_merged2 = pd.merge(df_diagnosis, df, on='patient_id')
df_merged2.to_csv(
    os.path.join('Respiratory_Sound_Database', 'diagnosis_details.csv'),
    index=False,
)

In [46]:
# Preview the first rows of the recordings table joined with diagnoses.
df_merged2.head()

Unnamed: 0,patient_id,diagnosis,recording_index,chest_location,acquisition_mode,recording_equipment
0,101,URTI,1b1,Al,sc,Meditron
1,101,URTI,1b1,Pr,sc,Meditron
2,102,Healthy,1b1,Ar,sc,Meditron
3,103,Asthma,2b2,Ar,mc,LittC2SE
4,104,COPD,1b1,Al,sc,Litt3200


In [48]:
# Print column dtypes, non-null counts and memory usage of the merged table.
df_merged2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 920 entries, 0 to 919
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   patient_id           920 non-null    int64 
 1   diagnosis            920 non-null    object
 2   recording_index      920 non-null    object
 3   chest_location       920 non-null    object
 4   acquisition_mode     920 non-null    object
 5   recording_equipment  920 non-null    object
dtypes: int64(1), object(5)
memory usage: 50.3+ KB


## Creating a random dataset for our prescription model

In [34]:
# Create a random dataset for the prescription model.
# Every column is drawn independently of the others, so the table carries no
# real relationship between disease, medication, dosage or frequency.
SEED = 42
# Seed NumPy's global generator so a fresh run reproduces the same table (and
# therefore the same saved CSV). Later np.random calls in the notebook follow
# the same seeded stream when cells are run in order.
np.random.seed(SEED)

n_samples=3000

data={
    'Patient ID': np.arange(1,n_samples+1),
    'Age':np.random.randint(1,100,n_samples),  # upper bound is exclusive: ages 1..99
    'Gender':np.random.choice(['Male','Female'],n_samples),
    'Smoking Status':np.random.choice(['Non-smoker','Ex-smoker','Active-smoker'],n_samples),
    'Disease':np.random.choice(['COPD','Pneumonia','Healthy','URTI','Bronchiectasis','Bronchiolitis','LRTI','Asthma'],n_samples),
    'Medication Prescribed':np.random.choice(['Drug A','Drug B','Drug C','Drug D'],n_samples),
    'Dosage':np.random.choice(['5mg','10mg','20mg','50mg'],n_samples),
    # NOTE(review): 'Frequencey' is misspelled, but later cells and the saved CSV
    # use this exact column name, so it is kept as is.
    'Frequencey':np.random.choice(['Once daily','Twice daily','Three times daily'],n_samples)
}
df=pd.DataFrame(data)
df.head(),df.shape

(   Patient ID  Age  Gender Smoking Status         Disease  \
 0           1   83  Female  Active-smoker   Bronchiolitis   
 1           2   72    Male     Non-smoker  Bronchiectasis   
 2           3   35    Male      Ex-smoker       Pneumonia   
 3           4   13    Male  Active-smoker            LRTI   
 4           5   70    Male  Active-smoker            URTI   
 
   Medication Prescribed Dosage         Frequencey  
 0                Drug D   50mg        Twice daily  
 1                Drug C   50mg         Once daily  
 2                Drug C    5mg  Three times daily  
 3                Drug C   20mg         Once daily  
 4                Drug C    5mg  Three times daily  ,
 (3000, 8))

In [35]:
# Healthy patients should not carry a prescription: overwrite the three
# treatment columns for those rows with placeholder values.
# NOTE(review): pandas.read_csv treats 'N/A' as missing by default, so these
# placeholders may come back as NaN when the saved CSV is reloaded - confirm
# that is what downstream code expects.
healthy_mask = df['Disease'].eq('Healthy')
treatment_cols = ['Medication Prescribed', 'Dosage', 'Frequencey']
df.loc[healthy_mask, treatment_cols] = ('No Medication', 'N/A', 'N/A')

In [36]:
# Spot-check the treatment columns for the first few healthy patients.
healthy_rows = df['Disease'] == 'Healthy'
df.loc[healthy_rows, ['Dosage', 'Frequencey', 'Medication Prescribed']].head()

Unnamed: 0,Dosage,Frequencey,Medication Prescribed
5,,,No Medication
7,,,No Medication
27,,,No Medication
29,,,No Medication
36,,,No Medication


In [37]:
# Redraw the age of patients with COPD or Bronchiectasis from 40..99
# (randint's upper bound is exclusive).
# NOTE(review): the name `filter` shadows the Python builtin; it is kept because
# a later cell indexes df with it.
filter = df['Disease'].isin(['COPD', 'Bronchiectasis'])
n_selected = int(filter.sum())  # number of rows matched by the mask
df.loc[filter, 'Age'] = np.random.randint(40, 100, n_selected)

In [39]:
# Show the first few ages of the rows selected by `filter`.
df.loc[filter, 'Age'].head()

1     41
14    46
18    96
21    66
22    66
Name: Age, dtype: int32

In [41]:
# Write the synthetic prescription table to CSV in the working directory,
# omitting the DataFrame index.
df.to_csv('medicine_prescription.csv',index=False)