In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

<a href="https://colab.research.google.com/github/Subhash-K45/python/blob/main/COPD/COPD.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import tensorflow as tf
from tensorflow.keras import layers,models
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
from sklearn.metrics import classification_report,accuracy_score
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

In [None]:
# Shared preprocessing / model objects used by the prediction cells further down.
# Feature scaler; it is fitted on the training split in a later cell.
scaler = StandardScaler()
# Random forest: 1000 trees, each capped at depth 2, fixed seed.
rfc = RandomForestClassifier(n_estimators=1000,max_depth = 2,random_state=42,)
# Support-vector classifier with a linear kernel.
svc = SVC(kernel = 'linear')

In [None]:
# Load the dataset into `data`, the frame every later cell works on.
# NOTE(review): absolute Kaggle input path; the notebook needs this path changed to run anywhere else.
data  = pd.read_csv('/kaggle/input/copd-student-dataset/dataset.csv')

In [None]:
# Preview the first rows of the freshly loaded frame.
data.head()

In [None]:
# List the column names available in the loaded frame.
data.columns

In [None]:
# Row counts per value of `copd` (the column later used as the prediction target)
# and per value of `COPDSEVERITY` (dropped from `data` in a later cell).
print(data['copd'].value_counts())
print(data['COPDSEVERITY'].value_counts())

# NOTE(review): this snippet has no cell marker and looks like a pasted fragment.
# `SMOTE` is not imported in the visible cells, and `X` / `y` are only assigned
# further down, so it cannot run at this position — confirm whether it is meant to execute.
# Split into training and test sets
X_egitim, X_test, y_egitim, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply SMOTE
smote = SMOTE(random_state=42)
X_egitim_yeni, y_egitim_yeni = smote.fit_resample(X_egitim, y_egitim)

# Check the class distribution after balancing
print("Dengeledikten Sonra Sınıf Dağılımı:")
print(y_egitim_yeni.value_counts())

# Train the model using the training set
# (You can pick and train a classification model here)

# Combine the balanced features and labels into one data set
denge_veri = pd.concat([X_egitim_yeni, y_egitim_yeni], axis=1)

Dropping the unwanted columns to make the results more accurate

In [None]:
# Remove five columns from `data` in place; the list stays available as `columns`.
columns = [
    'Unnamed: 0',
    'ID',
    'COPDSEVERITY',
    'MWT1',
    'MWT2',
]
data.drop(columns, axis='columns', inplace=True)

In [None]:
# (rows, columns) of `data` after the column drop.
data.shape

In [None]:
# Preview the frame again now that the unwanted columns are gone.
data.head()

In [None]:
# Remove, in place, every row whose AGE is exactly 10 or exactly 30.
for bad_age in (10, 30):
    rows_to_remove = data.index[data['AGE'] == bad_age]
    data.drop(rows_to_remove, axis=0, inplace=True)

Finding the null and duplicated values

In [None]:
# Number of missing values in each column.
data.isna().sum()

In [None]:
# Fill missing values in place with the mean of their own column.
# numeric_only=True restricts the means to numeric columns: recent pandas raises
# a TypeError from DataFrame.mean() when a non-numeric column is present instead
# of silently skipping it. Non-numeric columns are left untouched.
data.fillna(data.mean(numeric_only=True),inplace=True)

In [None]:
# Re-check the per-column missing-value counts after imputation.
data.isna().sum()

In [None]:
# Number of rows that are exact duplicates of an earlier row.
data.duplicated().sum()

In [None]:
# Summary statistics for the columns of `data`.
data.describe()

Exploratory Data Analysis

In [None]:
# Remove, in place, every row whose copd value is exactly 30 or exactly 10.
for bad_level in (30, 10):
    rows_to_remove = data.index[data['copd'] == bad_level]
    data.drop(rows_to_remove, axis=0, inplace=True)

In [None]:
# Annotated heat-map of the pairwise correlations between the columns of `data`.
corr_matrix = data.corr()
plt.figure(figsize=(12, 8))
sns.heatmap(corr_matrix, cmap='rocket', annot=True, fmt='.2f')

Age Distributions

In [None]:
# Count rows in three equal-width AGE bins.
# `age` gets one row per bin: 'Range' holds a printable "left, right" label and
# 'Age' holds the number of rows that fall in the bin.
age = data['AGE'].value_counts(bins=3).reset_index()
age.columns = ['Range','Age']
# Build each label from the interval's own bounds, rounded to 3 decimals.
# This replaces the per-cell chained assignment (which wrote strings into an
# interval column) and the hand-typed label for row 2, which was only valid
# for one particular data snapshot and row order.
age['Range'] = [f'{interval.left:.3f}, {interval.right:.3f}' for interval in age['Range']]

In [None]:
# Bar chart of the row count per AGE bin held in `age`.
# NOTE(review): the title mentions "COPD = 2", but `age` is built from the AGE
# column alone — confirm the intended title.
fig_age, ax_age = plt.subplots(figsize=(6, 5))
sns.barplot(x='Range', y='Age', data=age, palette='viridis', ax=ax_age)
ax_age.set_title('COPD = 2 For different age category')
plt.xticks(rotation=0)
plt.show()

COPD - Severity levels
(VERY SEVERE - 4
SEVERE      - 3
MODERATE    - 2
MILD        - 1)

In [None]:
# Row count for each copd value.
data['copd'].value_counts()

In [None]:
# Cross-tabulate AGE (cut into 3 equal-width bins) against copd:
# one row per age bin, one column per copd value, cells hold row counts.
age_bins = pd.cut(data['AGE'], bins=3)
counts_by_age_and_copd = data.groupby([age_bins, 'copd']).size().unstack()
df = counts_by_age_and_copd.reset_index().rename(columns={'index': 'AGE'})

In [None]:
# Reshape the age-bin / copd table to long form and draw a grouped bar chart.
df_melted = df.melt(id_vars=['AGE'], var_name='COPD', value_name='Count')
ax_copd = sns.barplot(data=df_melted, x='AGE', y='Count', hue='COPD', palette='viridis')
ax_copd.set_xlabel('Age')
ax_copd.set_ylabel('Count')
ax_copd.set_title('Bar Plot of COPD Counts within Age Ranges')
ax_copd.legend(title='COPD')
plt.show()

Smoking Patterns

How does PackHistory vary between smokers and non-smokers?

In [None]:
# One panel per distinct value of `smoking`, each plotting PackHistory for the
# rows in that group (x axis is the row index of `data`).
# The grid is sized from the data: a fixed 1x2 grid raises IndexError as soon
# as `smoking` contains more than two distinct values.
smoking_levels = data['smoking'].unique()
fig,ax = plt.subplots(1,max(1,len(smoking_levels)),figsize = (10,6))
# plt.subplots returns a bare Axes (not an array) for a single panel.
ax = np.atleast_1d(ax)
for count,i in enumerate(smoking_levels):
    ax[count].set_title(f'Smoking: {i}')
    ax[count].plot(data[data['smoking']==i]['PackHistory'])

Physical Performance

In [None]:
# Working copy of AGE / MWT1Best with a printable age-bin label in 'Age Range'.
columns = ['AGE','MWT1Best']
df = data[columns].copy()
# Non-numeric MWT1Best entries become NaN.
df['MWT1Best'] = pd.to_numeric(df['MWT1Best'], errors='coerce')
# Cut AGE into 3 equal-width bins and strip the surrounding brackets from the
# interval text in one vectorised step. The previous loop wrote through
# df['Age Range'][i] with a positional counter, which does not line up with the
# index labels once rows have been dropped from `data`, and relied on chained
# assignment. The discarded groupby(...).mean() call was removed; it had no effect.
df['Age Range'] = pd.cut(df['AGE'],bins=3).astype('str').str[1:-1]

In [None]:
# Bar plot of MWT1Best for each age-bin label in `df`.
sns.barplot(data = df,x='Age Range',y='MWT1Best',palette='viridis')

Lung Function Analysis                                                  
Are there relationships between lung metrics and diabetes?

In [None]:
# Count rows per (FVC bin, Diabetes value): one row per FVC bin, one column per
# Diabetes value, missing combinations filled with 0.
# .copy() makes `df` an independent frame, so adding the bin column is not a
# write into a slice of `data`.
df = data[['FVC','Diabetes']].copy()
df['FVC Range'] = pd.cut(df.FVC,bins=3)
df = df.groupby(['FVC Range','Diabetes']).size().unstack(fill_value=0).reset_index()
df

In [None]:
# Long form of the FVC-bin / Diabetes table, then a grouped bar chart.
df_melted = pd.melt(df, id_vars=['FVC Range'], var_name='Diabetes', value_name='Count')
# Turn each interval into text and strip its surrounding brackets in one
# vectorised step (replaces the per-row chained assignment, which is not
# guaranteed to write back into the frame).
df_melted['FVC Range'] = df_melted['FVC Range'].astype(str).str[1:-1]
sns.barplot(data=df_melted,x='FVC Range',y='Count',hue='Diabetes',palette='viridis')

Health Scores:

What is the distribution of CAT, HAD, and SGRQ scores?

In [None]:
# CAT scores: box plot of the column.
# NOTE(review): the Series is passed positionally; which parameter it binds to
# depends on the installed seaborn version — consider an explicit keyword.
sns.boxplot(data['CAT'],saturation=0.75,color='orange')

In [None]:
# Summary statistics of the CAT column.
data['CAT'].describe()

In [None]:
# Distribution of CAT: density-scaled histogram with a KDE curve.
# sns.distplot is deprecated; this histplot call is the replacement given in the
# seaborn documentation for distplot's default output.
sns.histplot(data['CAT'], kde=True, stat='density', kde_kws=dict(cut=3))

In [None]:
# Row counts of CAT split into two equal-width bins.
data['CAT'].value_counts(bins=2)

In [None]:
# Distinct values present in the gender column.
data['gender'].unique()

In [None]:
# Size of the most frequent gender value.
data['gender'].value_counts().max()

In [None]:
# Recode the gender values 10 and 30 to 1.
needs_recode = data['gender'].isin([10, 30])
data.loc[needs_recode, 'gender'] = 1

In [None]:
# Replace CAT values of 100 or more with the CAT median.
# The median on the right-hand side is computed before the assignment, so it
# still includes the values being replaced.
data.loc[data['CAT'] >=100,'CAT'] = data['CAT'].median()

In [None]:
# Count rows per (CAT bin, gender): one row per CAT bin, one column per gender value.
# .copy() makes `df` an independent frame, so adding the bin column is not a
# write into a slice of `data`.
df = data[['CAT','gender']].copy()
df['CAT Range'] = pd.cut(df['CAT'],bins=2)
res = df.groupby(['CAT Range','gender']).size().unstack().reset_index()
res

In [None]:
# Long form of the CAT-bin / gender table, then a grouped bar chart.
df_melted = pd.melt(res,id_vars= 'CAT Range',var_name = 'gender',value_name='Count')
# Interval -> text with the surrounding brackets stripped, done in one
# vectorised step instead of a per-row chained assignment.
df_melted['CAT Range'] = df_melted['CAT Range'].astype('str').str[1:-1]
sns.barplot(data = df_melted,x='CAT Range',y='Count',hue='gender',palette='viridis',saturation=0.5)

In [None]:
# Summary statistics of the HAD column.
data['HAD'].describe()

In [None]:
# Box plot of HAD before the large values are replaced.
sns.boxplot(data['HAD'])

In [None]:
# Distribution of HAD: density-scaled histogram with a KDE curve.
# sns.distplot is deprecated; this histplot call is the replacement given in the
# seaborn documentation for distplot's default output.
sns.histplot(data['HAD'], kde=True, stat='density', kde_kws=dict(cut=3))

In [None]:
# Replace HAD values of 50 or more with the HAD median.
# The assignment is restricted to the 'HAD' column: a bare boolean-mask
# assignment (data[mask] = value) overwrites EVERY column of the matching rows
# with that value, corrupting AGE, gender, copd, etc. for those rows.
data.loc[data['HAD'] >= 50, 'HAD'] = data['HAD'].median()

In [None]:
# Box plot of HAD again, after the large values were replaced.
sns.boxplot(data['HAD'])

In [None]:
# Row counts of HAD split into three equal-width bins.
data['HAD'].value_counts(bins = 3)

In [None]:
# Recode gender value 10 to 1 (the same recode is also applied in an earlier cell).
data.loc[data['gender'] == 10,'gender'] = 1

In [None]:
# Count rows per (HAD bin, gender): one row per HAD bin, one column per gender value.
# .copy() makes `df` an independent frame, so adding the bin column is not a
# write into a slice of `data`.
df = data[['HAD','gender']].copy()
print(df['gender'].unique())
df['HAD Range'] = pd.cut(df['HAD'],bins=3)
res = df.groupby(['HAD Range','gender']).size().unstack().reset_index()
res

In [None]:
# Long form of the HAD-bin / gender table, then a grouped bar chart.
df_melt = pd.melt(res,id_vars = 'HAD Range',var_name='gender',value_name='Counts')
# Interval -> text with the surrounding brackets stripped, done in one
# vectorised step instead of a per-row chained assignment.
df_melt['HAD Range'] = df_melt['HAD Range'].astype('str').str[1:-1]
sns.barplot(data=df_melt,x='HAD Range',y='Counts',hue='gender',palette='viridis')

COPD Analysis:

What is the COPD prevalence, and how does it differ by gender?

In [None]:
# Show any rows whose gender is still 10.
# Compared against the integer 10, as every other gender check in this notebook
# does; comparing the numeric column with the string '10' can never match.
data[data['gender']==10]

In [None]:
# Count rows per (gender, copd): one row per gender, one column per copd value.
df = data[['gender','copd']]
# copd value -> row count for gender == 0 (not used again in the visible cells).
gender_fem = df.loc[df['gender']==0,'copd'].value_counts().to_dict()
females_COPD  = df.groupby(['gender','copd']).size().unstack()
females_COPD.reset_index(inplace=True)
# Remove the column for copd == 10 if it exists. errors='ignore' keeps the cell
# from raising KeyError when no row carries that value.
females_COPD.drop(columns=10, errors='ignore', inplace=True)

In [None]:
# Long form of the gender / copd table, drawn as bars grouped by copd value.
res = pd.melt(females_COPD, id_vars='gender', var_name='copd', value_name='Count')
custom_palette = sns.color_palette("Paired", 3)
sns.barplot(x='gender', y='Count', hue='copd', data=res, palette=custom_palette)

In [None]:
# Show which columns (gender plus one per copd value) the table ended up with.
print(females_COPD.columns)

Prediction of COPD-Severity

In [None]:
# Drop rows whose AGE equals 10 before modelling (the same filter also appears in an earlier cell).
data.drop(data[data['AGE']==10].index,inplace=True,axis=0)

In [None]:
# Target is `copd`; features are every other column. 80/20 split, fixed seed.
y = data['copd']
X = data.drop(columns='copd')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)
# Display the shapes of the four pieces.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

In [None]:
# Fit the scaler on the training features only, then apply the same fitted
# transform to the test features.
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Distinct target values present in the training split.
y_train.unique()

In [None]:
# Per-column missing-value counts in the training features.
X_train.isna().sum()

In [None]:
# Train both classifiers on the scaled training features.
rfc_model = rfc.fit(X_train_scaled,y_train)
svc_model = svc.fit(X_train_scaled,y_train)

In [None]:
# Predict the target for the scaled test features with each classifier.
rfc_pred = rfc_model.predict(X_test_scaled)
svc_pred = svc_model.predict(X_test_scaled)

In [None]:
# Per-class precision / recall / F1 for each classifier on the test split.
# classification_report expects (y_true, y_pred); with the arguments reversed
# the precision and recall columns are swapped.
print(classification_report(y_test,rfc_pred))
print(classification_report(y_test,svc_pred))

In [None]:
# Test-set accuracy of each classifier, as a percentage.
# Arguments are given as (y_true, y_pred), and the second line is now labelled
# as the SVC result (it was previously also labelled Random Forest).
print(f'Accuracy of Random Forest Classifier  : {accuracy_score(y_test,rfc_pred)*100}%')
print(f'Accuracy of Support Vector Classifier : {accuracy_score(y_test,svc_pred)*100}%')

Building an ANN

In [None]:
# Fully connected network: four ReLU hidden layers (64, 32, 16, 8 units) and a
# 4-unit softmax output.
# The input width is taken from the scaled training matrix rather than being
# hard-coded, so the model still builds if the feature columns change.
n_features = X_train_scaled.shape[1]
model = models.Sequential([
    layers.Dense(64, activation='relu', input_shape=(n_features,)),
    layers.Dense(32, activation='relu'),
    layers.Dense(16, activation='relu'),
    layers.Dense(8, activation='relu'),
    layers.Dense(4, activation='softmax')
])


In [None]:
# Adam optimiser with sparse categorical cross-entropy (integer class labels); track accuracy.
model.compile(optimizer='adam',loss='sparse_categorical_crossentropy',metrics=['accuracy'])

In [None]:
# Shift the training labels down by one, in place.
# NOTE(review): not idempotent — running this cell twice shifts the labels twice.
y_train-=1

In [None]:
# Shift the test labels down by one, in place.
# NOTE(review): not idempotent — running this cell twice shifts the labels twice.
y_test-=1

In [None]:
# Train for 50 epochs in batches of 32. No validation data is passed, so
# `history` only records metrics measured on the training data.
history = model.fit(X_train_scaled, y_train,epochs=50,batch_size=32)

In [None]:
# Per-epoch accuracy and loss recorded during fitting.
acc = history.history['accuracy']
loss = history.history['loss']

In [None]:
# Report the last epoch's accuracy (as a percentage, 2 decimals) and loss.
# NOTE(review): these values come from the fit history, i.e. the training data,
# not from the held-out test split.
print(f'Accuracy of ANN is {round(acc[-1] * 100, 2)}%')
print(f'Loss of ANN is {loss[-1]}')

In [None]:
# Side-by-side training curves: accuracy on the left, loss on the right.
fig, ax = plt.subplots(1, 2, figsize=(12, 4))

# (axes, per-epoch values, title / y-label, extra line styling)
curve_panels = (
    (ax[0], acc, 'Accuracy', {'color': 'orange'}),
    (ax[1], loss, 'Loss', {}),
)
for panel, values, label, line_style in curve_panels:
    panel.plot(values, **line_style)
    panel.set_title(label)
    panel.set_xlabel('Epochs')
    panel.set_ylabel(label)

plt.show()