In [76]:
import pandas as pd

In [78]:
# Load the raw dataset into `df`; the relative path is resolved against the
# notebook's current working directory, so the file must be present there.
df= pd.read_csv("smoking.csv")

FileNotFoundError: [Errno 2] No such file or directory: 'smoking.csv'

In [None]:
##Categorized AST values into: 'Low level', 'Normal', 'High level', or 'Unknown'.
def classify_ast(value):
    """Label a single AST reading.

    Parameters
    ----------
    value : scalar
        The AST measurement; may be missing (NaN/None).

    Returns
    -------
    str
        'Unknown' for a missing value, 'Low level' below 5, 'Normal' from
        5 to 40 inclusive, and 'High level' above 40.
    """
    # Missing readings are reported explicitly instead of being bucketed.
    if pd.isnull(value):
        return 'Unknown'
    if value < 5:
        return 'Low level'
    # Everything left is >= 5, so only the upper bound needs checking.
    return 'Normal' if value <= 40 else 'High level'
# Derive the AST_status column by classifying each value of the AST column.
df['AST_status'] = df['AST'].apply(classify_ast)

In [None]:
##Categorized ALT values into: 'Low level', 'Normal', 'High level', or 'Unknown'.
def classify_alt(value):
    """Label a single ALT reading.

    Parameters
    ----------
    value : scalar
        The ALT measurement; may be missing (NaN/None).

    Returns
    -------
    str
        'Unknown' for a missing value, 'Low level' below 7, 'Normal' from
        7 to 56 inclusive, and 'High level' above 56.
    """
    if pd.isnull(value):
        return 'Unknown'

    low_cutoff, high_cutoff = 7, 56
    if value < low_cutoff:
        return 'Low level'
    if value > high_cutoff:
        return 'High level'
    # Neither below the lower cutoff nor above the upper one.
    return 'Normal'
# Derive the ALT_status column by classifying each value of the ALT column.
df['ALT_status'] = df['ALT'].apply(classify_alt)

In [None]:
##Classified GGT (Gtp) into: 'Normal', 'High', or 'Very High / Abnormal'.
def classify_ggt(value):
    """Label a single GGT (Gtp) reading.

    Parameters
    ----------
    value : scalar
        The GGT measurement; may be missing (NaN/None).

    Returns
    -------
    str
        'Unknown' for a missing value, 'Very High / Abnormal' at 500 or
        above, 'High' above 60, otherwise 'Normal'.
    """
    # Without this guard a NaN fails both comparisons below and would be
    # reported as 'Normal'; the AST/ALT classifiers report 'Unknown' instead.
    if pd.isnull(value):
        return 'Unknown'
    if value >= 500:
        return 'Very High / Abnormal'
    if value > 60:
        return 'High'
    return 'Normal'

# Derive the GGT_status column by classifying each value of the Gtp column.
df['GGT_status'] = df['Gtp'].apply(classify_ggt)

In [83]:
##Combined AST, ALT, and GGT status to generate a liver health interpretation.
def interpret_liver_profile(row):
    """Combine the AST, ALT and GGT status labels of one row into a summary.

    Parameters
    ----------
    row : mapping
        Must provide 'AST_status', 'ALT_status' and 'GGT_status'.

    Returns
    -------
    str
        One of 'Normal liver profile', 'Severe liver or biliary disease',
        'Alcohol-related liver disease',
        'Serious biliary disease or chronic alcohol use', or
        'Unclassified pattern' when no rule matches.
    """
    # Normalize strings so comparisons ignore case and surrounding whitespace.
    ast = str(row['AST_status']).strip().lower()
    alt = str(row['ALT_status']).strip().lower()
    ggt = str(row['GGT_status']).strip().lower()

    if ast == 'normal' and alt == 'normal' and ggt == 'normal':
        return 'Normal liver profile'
    if ast == 'high level' and alt == 'high level' and ggt == 'high':
        return 'Severe liver or biliary disease'
    if ast == 'high level' and alt == 'normal' and ggt == 'high':
        return 'Alcohol-related liver disease'
    # classify_ggt emits 'Very High / Abnormal', so an exact comparison with
    # 'very high' never matched; a prefix test accepts both spellings.
    if ggt.startswith('very high'):
        return 'Serious biliary disease or chronic alcohol use'
    return 'Unclassified pattern'

# Apply row-wise (axis=1) so each call sees the AST_status, ALT_status and
# GGT_status values of one row; those columns must already exist on df.
df['Liver_status'] = df.apply(interpret_liver_profile, axis=1)

In [84]:
##Classified serum creatinine based on gender into 'Low', 'Normal', or 'High' levels.
def classify_creatinine(row):
    """Label a row's serum creatinine using gender-specific bounds.

    Parameters
    ----------
    row : mapping
        Must provide 'serum creatinine' and 'gender'. Gender is compared
        after stripping whitespace and upper-casing, against 'M' and 'F'.

    Returns
    -------
    str
        'Normal level' inside the inclusive range (0.7-1.3 for 'M',
        0.6-1.1 for 'F'), 'low level' below it, 'high level' above it, and
        'unknown' for a missing reading or any other gender value.
    """
    reading = row['serum creatinine']
    sex = str(row['gender']).strip().upper()

    if pd.isnull(reading):
        return 'unknown'

    # Inclusive (lower, upper) bounds of the normal range per gender code.
    normal_range = {'M': (0.7, 1.3), 'F': (0.6, 1.1)}.get(sex)
    if normal_range is None:
        return 'unknown'

    lower, upper = normal_range
    if reading < lower:
        return 'low level'
    if reading > upper:
        return 'high level'
    return 'Normal level'
        
# Row-wise apply (axis=1): the classifier needs both the creatinine value and
# the gender of each row.
df['creatinine_status'] = df.apply(classify_creatinine, axis=1)

In [85]:
##Classified Urine protein levels into severity ranges or marked as 'unknown'.
def classify_urine_protein(value):
    """Label a urine-protein grade.

    Parameters
    ----------
    value : scalar
        The grade; anything `int()` accepts is allowed (fractional values
        are truncated by `int()`). May be missing.

    Returns
    -------
    str
        'Negative' for 1-2, 'Low level (proteinuria)' for 3-4,
        'High level (kidney damage)' for 5-6, and 'unknown' for missing,
        non-numeric or out-of-range values.
    """
    if pd.isnull(value):
        return 'unknown'
    try:
        grade = int(value)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures mean "not a grade"; a bare `except:` would
        # also hide KeyboardInterrupt/SystemExit and unrelated bugs.
        return 'unknown'

    if grade in (1, 2):
        return 'Negative'
    if grade in (3, 4):
        return 'Low level (proteinuria)'
    if grade in (5, 6):
        return 'High level (kidney damage)'
    return 'unknown'
        
# Derive urine_protein_status by classifying each value of 'Urine protein'.
df['urine_protein_status'] = df['Urine protein'].apply(classify_urine_protein)

In [86]:
##Derived overall kidney function by combining creatinine and urine protein statuses.
def kidney_status(row):
    """Summarize kidney function from two previously derived status columns.

    Parameters
    ----------
    row : mapping
        Must provide 'creatinine_status' and 'urine_protein_status'.

    Returns
    -------
    str
        'Normal' only when creatinine is 'Normal level' and urine protein is
        'Negative'; every other combination (including the 'unknown' labels)
        yields 'Abnormal Kidney Function'.
    """
    creatinine_ok = row['creatinine_status'] == 'Normal level'
    protein_ok = row['urine_protein_status'] == 'Negative'
    return 'Normal' if creatinine_ok and protein_ok else 'Abnormal Kidney Function'

# Row-wise apply (axis=1); relies on the creatinine_status and
# urine_protein_status columns already being present on df.
df['kidney_status'] = df.apply(kidney_status , axis=1)

In [88]:
##Calculated BMI from height(cm) and weight(kg).
# BMI = weight / height^2, with height(cm) divided by 100 to express it in
# metres before squaring. Computed column-wise, so the result is a float column.
df['BMI'] = df['weight(kg)'] / ((df['height(cm)'] / 100) ** 2)

In [89]:
##Categorized BMI into: Underweight, Normal, Overweight, Obese, or Morbidly Obese.
def classify_bmi(bmi):
    """Label a BMI value.

    Parameters
    ----------
    bmi : scalar
        The body-mass index; may be missing (NaN/None).

    Returns
    -------
    str
        'Unknown' for a missing value, 'Underweight' below 18.5,
        'Normal weight' below 25, 'Overweight' below 30, 'Obese' below 40,
        otherwise 'Morbidly Obese'.
    """
    # NaN fails every comparison and previously fell through to the final
    # 'Morbidly Obese' branch.
    if pd.isnull(bmi):
        return 'Unknown'
    # Half-open bands: BMI is a computed float, so closed ranges such as
    # 18.5-24.9 / 25.0-29.9 leave gaps (e.g. 24.95) that were also reported
    # as 'Morbidly Obese'.
    if bmi < 18.5:
        return 'Underweight'
    if bmi < 25.0:
        return 'Normal weight'
    if bmi < 30.0:
        return 'Overweight'
    if bmi < 40.0:
        return 'Obese'
    return 'Morbidly Obese'

# Derive BMI_category by classifying each value of the computed BMI column.
df['BMI_category'] = df['BMI'].apply(classify_bmi)

In [90]:
##Evaluated vision quality based on eyesight(left) and eyesight(right).
def classify_vision_score(eye_value):
    """Label the eyesight score of a single eye.

    Returns 'Normal vision' at 1.0 or above, 'Low vision' from 0.3 up to
    (but excluding) 1.0, 'Poor vision' from 0.1 up to (but excluding) 0.3,
    and 'Very poor' for anything else.
    """
    # NOTE(review): very large scores are treated as normal vision here;
    # confirm the dataset does not use a sentinel value for blindness.
    if eye_value >= 1.0:
        return 'Normal vision'
    if eye_value >= 0.3:
        return 'Low vision'
    if eye_value >= 0.1:
        return 'Poor vision'
    return 'Very poor'


def classify_overall_vision(row):
    """Combine the per-eye labels of one row into a single vision status.

    Parameters
    ----------
    row : mapping
        Must provide 'eyesight(left)' and 'eyesight(right)'.

    Returns
    -------
    str
        'Normal vision' when both eyes are normal; otherwise 'Poor vision'
        if either eye is poor, 'Very poor' if either eye is very poor, and
        'Low vision' for the remaining combinations.
    """
    left = classify_vision_score(row['eyesight(left)'])
    right = classify_vision_score(row['eyesight(right)'])
    per_eye = (left, right)

    if left == 'Normal vision' and right == 'Normal vision':
        return 'Normal vision'
    # NOTE(review): 'Poor vision' is tested before 'Very poor', so a poor eye
    # paired with a very poor eye reports 'Poor vision'. The original order
    # is kept; confirm whether the worse eye should win instead.
    if 'Poor vision' in per_eye:
        return 'Poor vision'
    # The label was previously misspelled 'Very poort', so this branch could
    # never match and very poor eyesight was reported as 'Low vision'.
    if 'Very poor' in per_eye:
        return 'Very poor'
    return 'Low vision'

# Row-wise apply (axis=1): the overall status needs both eyesight columns.
df['vision_status'] = df.apply(classify_overall_vision, axis=1)

In [93]:
##Assessed hearing status using hearing(left) and hearing(right) columns.
def classify_hearing(row):
    """Combine the two hearing codes of one row into a single label.

    Parameters
    ----------
    row : mapping
        Must provide 'hearing(left)' and 'hearing(right)'.

    Returns
    -------
    str
        'Good' when both codes equal 1, 'Poor' when both equal 2, and
        'Partial' for every other combination.
    """
    both_ears = (row['hearing(left)'], row['hearing(right)'])
    if both_ears == (1, 1):
        return 'Good'
    if both_ears == (2, 2):
        return 'Poor'
    # Mixed codes, or any code other than 1/2, end up here.
    return 'Partial'

# Row-wise apply (axis=1): the label depends on both hearing columns.
df['Hearing Status'] = df.apply(classify_hearing, axis=1)

In [94]:
##Categorized blood pressure using systolic and relaxation (diastolic) values.
def blood_pressure_category(row):
    """Label a row's blood pressure.

    Parameters
    ----------
    row : mapping
        Must provide 'systolic' and 'relaxation' (the diastolic reading).

    Returns
    -------
    str
        'Hypertension' when systolic >= 130 or diastolic >= 80; otherwise
        'Hypotension' when systolic <= 90 or diastolic <= 60; otherwise
        'Normal'. The hypertension test takes precedence when both apply.
    """
    sys_bp, dia_bp = row['systolic'], row['relaxation']

    too_high = sys_bp >= 130 or dia_bp >= 80
    if too_high:
        return 'Hypertension'

    too_low = sys_bp <= 90 or dia_bp <= 60
    return 'Hypotension' if too_low else 'Normal'

# Row-wise apply (axis=1): the category uses both the systolic and relaxation columns.
df['blood_pressure_status'] = df.apply(blood_pressure_category, axis=1)

In [95]:
##Classified fasting blood sugar into: Normal, Prediabetes, or Diabetes.
def classify_fbs(value):
    """Label a fasting blood sugar reading.

    Parameters
    ----------
    value : scalar
        The fasting blood sugar measurement; may be missing (NaN/None).

    Returns
    -------
    str
        'Unknown' for a missing value, 'Normal' below 100, 'Prediabetes'
        from 100 up to (but excluding) 126, otherwise 'Diabetes'.
    """
    # NaN fails every comparison and previously landed in 'Diabetes'.
    if pd.isnull(value):
        return 'Unknown'
    if value < 100:
        return 'Normal'
    # Half-open upper bound: identical to `<= 125` for whole numbers, but a
    # fractional reading such as 125.5 no longer jumps to 'Diabetes'.
    if value < 126:
        return 'Prediabetes'
    return 'Diabetes'

# Derive FBS_status by classifying each value of 'fasting blood sugar'.
df['FBS_status'] = df['fasting blood sugar'].apply(classify_fbs)

In [99]:
##Classified cholesterol into: 'Too Low', 'Healthy', 'Slightly High', or 'Too High'.
def classify_cholesterol(value):
    """Label a total cholesterol reading.

    Parameters
    ----------
    value : scalar
        The cholesterol measurement; may be missing (NaN/None).

    Returns
    -------
    str
        'Unknown' for a missing value, 'Too Low' below 120, 'Healthy' below
        200, 'Slightly High' below 240, otherwise 'Too High'.
    """
    # NaN fails every comparison and previously landed in 'Too High'.
    if pd.isnull(value):
        return 'Unknown'
    if value < 120:
        return 'Too Low'
    if value < 200:
        return 'Healthy'
    # Half-open upper bound: identical to `<= 239` for whole numbers, but a
    # fractional reading such as 239.5 no longer jumps to 'Too High'.
    if value < 240:
        return 'Slightly High'
    return 'Too High'

# Derive Cholesterol_status by classifying each value of the Cholesterol column.
df['Cholesterol_status'] = df['Cholesterol'].apply(classify_cholesterol)


In [100]:
##Classified hemoglobin, using gender-specific ranges, into: 'Normal', 'High', or 'Low'.
def classify_hemoglobin(row):
    """Label a row's hemoglobin using gender-specific ranges.

    Parameters
    ----------
    row : mapping
        Must provide 'hemoglobin' and 'gender'. Gender is compared after
        stripping whitespace and upper-casing, against 'M' and 'F'.

    Returns
    -------
    str
        'Normal' inside the inclusive range (13.5-17.5 for 'M', 12.0-15.5
        for 'F'), 'Low' below it, 'High' above it, and 'Unknown' for a
        missing reading or any other gender value.
    """
    hgb = row['hemoglobin']
    # str() first so a missing/non-string gender cannot raise AttributeError;
    # this mirrors the normalization used by the other gender-aware classifiers.
    gender = str(row['gender']).strip().upper()

    # Previously a NaN reading or an unrecognized gender returned None.
    if pd.isnull(hgb):
        return 'Unknown'

    normal_range = {'M': (13.5, 17.5), 'F': (12.0, 15.5)}.get(gender)
    if normal_range is None:
        return 'Unknown'

    lower, upper = normal_range
    if hgb < lower:
        return 'Low'
    if hgb > upper:
        return 'High'
    return 'Normal'

# Row-wise apply (axis=1): the classifier needs both hemoglobin and gender.
df['Hemoglobin_status'] = df.apply(classify_hemoglobin, axis=1)

In [101]:
# Relabel the 0/1 smoking flag only while the column is still numeric. A bare
# .map() is not re-runnable: on a second execution the already-relabelled
# strings match none of the keys and every value becomes NaN.
if pd.api.types.is_numeric_dtype(df['smoking']):
    df['smoking'] = df['smoking'].map({0: 'Non-smoker', 1: 'Smoker'})

In [103]:
def classify_waist_risk(row):
    """Label a row's waist circumference as a gender-specific risk level.

    Parameters
    ----------
    row : mapping
        Must provide 'waist(cm)' and 'gender'. Gender is compared after
        stripping whitespace and upper-casing, against 'M' and 'F'.

    Returns
    -------
    str
        'High risk' when the waist is strictly above the gender's cutoff
        (102 for 'M', 88 for 'F'), 'Normal' otherwise, and 'Unknown' for a
        missing measurement or any other gender value.
    """
    girth = row['waist(cm)']
    sex = str(row['gender']).strip().upper()

    if pd.isnull(girth):
        return 'Unknown'

    cutoff = {'M': 102, 'F': 88}.get(sex)
    if cutoff is None:
        return 'Unknown'
    if girth > cutoff:
        return 'High risk'
    return 'Normal'
# Row-wise apply (axis=1): the risk label depends on both waist(cm) and gender.
df['waist_risk'] = df.apply(classify_waist_risk, axis=1)

In [104]:
def classify_ldl(ldl):
    """Label an LDL reading.

    Parameters
    ----------
    ldl : scalar
        The LDL measurement; may be missing (NaN/None).

    Returns
    -------
    str
        'Unknown' for a missing value, 'Heart healthy' below 100, 'At-risk'
        from 100 up to (but excluding) 160, otherwise 'Dangerous'.
    """
    if pd.isnull(ldl):
        return 'Unknown'
    if ldl < 100:
        return 'Heart healthy'
    # Half-open upper bound: identical to `<= 159` for whole numbers, but a
    # fractional reading such as 159.5 is no longer reported as 'Unknown'.
    if ldl < 160:
        return 'At-risk'
    return 'Dangerous'

def classify_hdl(row):
    """Label a row's HDL reading using a gender-specific lower cutoff.

    Parameters
    ----------
    row : mapping
        Must provide 'HDL' and 'gender'. Gender is compared after stripping
        whitespace and upper-casing, against 'M' and 'F'.

    Returns
    -------
    str
        'Dangerous' below the gender's cutoff (40 for 'M', 50 for 'F'),
        'At-risk' from that cutoff up to (but excluding) 60,
        'Heart healthy' at 60 or above, and 'Unknown' for a missing reading
        or any other gender value.
    """
    hdl = row['HDL']
    gender = str(row['gender']).strip().upper()

    if pd.isnull(hdl):
        return 'Unknown'

    danger_below = {'M': 40, 'F': 50}.get(gender)
    if danger_below is None:
        return 'Unknown'

    if hdl < danger_below:
        return 'Dangerous'
    # Half-open upper bound: identical to `<= 59` for whole numbers, but a
    # fractional reading such as 59.5 previously matched no branch and the
    # function returned None.
    if hdl < 60:
        return 'At-risk'
    return 'Heart healthy'
# LDL is classified from its own column alone; HDL needs the row's gender as
# well, hence the row-wise apply (axis=1) for the second column.
df['LDL_status'] = df['LDL'].apply(classify_ldl)
df['HDL_status'] = df.apply(classify_hdl, axis=1)

In [105]:
def classify_triglyceride(value):
    """Label a triglyceride reading.

    Parameters
    ----------
    value : scalar
        The triglyceride measurement; may be missing (NaN/None).

    Returns
    -------
    str
        'Unknown' for a missing value, 'Normal' below 150, 'Borderline High'
        below 200, 'High' below 500, otherwise 'Very High'.
    """
    if pd.isnull(value):
        return 'Unknown'
    # Half-open bands: identical to the closed 150-199 / 200-499 ranges for
    # whole numbers, but fractional readings such as 199.5 or 499.5 are no
    # longer reported as 'Unknown'.
    if value < 150:
        return 'Normal'
    if value < 200:
        return 'Borderline High'
    if value < 500:
        return 'High'
    return 'Very High'
# Derive triglyceride_status by classifying each value of the triglyceride column.
df['triglyceride_status'] = df['triglyceride'].apply(classify_triglyceride)

In [107]:

def create_feature_list(row):
    """Collect the six measurements used by the metabolic-syndrome check.

    Parameters
    ----------
    row : mapping
        Must provide 'waist(cm)', 'triglyceride', 'HDL', 'systolic',
        'relaxation' and 'fasting blood sugar'.

    Returns
    -------
    list
        The six values, in exactly the order listed above.
    """
    # Order matters: check_metabolic_syndrome unpacks the list positionally.
    feature_columns = (
        'waist(cm)',
        'triglyceride',
        'HDL',
        'systolic',
        'relaxation',
        'fasting blood sugar',
    )
    return [row[column] for column in feature_columns]

# Store the six measurements of each row as one Python list per cell
# (row-wise apply, axis=1), producing an object-typed column.
df['features_list'] = df.apply(create_feature_list, axis=1)


def check_metabolic_syndrome(row):
    """Flag a row as meeting at least three of five metabolic criteria.

    Parameters
    ----------
    row : mapping
        Must provide 'gender' and 'features_list'; the latter is the
        six-item list [waist, triglyceride, HDL, systolic, diastolic,
        glucose] produced by create_feature_list.

    Returns
    -------
    str
        'Yes' when three or more criteria are met, otherwise 'No'.

    Notes
    -----
    Criteria: waist >= 102 (male) / >= 88 (female); triglyceride >= 150;
    HDL < 40 (male) / < 50 (female); systolic >= 130 or diastolic >= 85;
    glucose >= 100. With a missing or unrecognized gender the two
    gender-specific criteria cannot be met.
    """
    raw_gender = row['gender']
    gender = raw_gender.strip().lower() if isinstance(raw_gender, str) else 'unknown'
    # The other cells of this notebook compare gender against 'M'/'F', so
    # testing only for 'male'/'female' meant the waist and HDL criteria never
    # counted. Both spellings are accepted here.
    is_male = gender in ('m', 'male')
    is_female = gender in ('f', 'female')

    waist, triglyceride, hdl, systolic, diastolic, glucose = row['features_list']

    criteria = (
        # 1. Abdominal obesity
        (is_male and waist >= 102) or (is_female and waist >= 88),
        # 2. High triglyceride
        triglyceride >= 150,
        # 3. Low HDL
        (is_male and hdl < 40) or (is_female and hdl < 50),
        # 4. Blood pressure
        systolic >= 130 or diastolic >= 85,
        # 5. Fasting glucose
        glucose >= 100,
    )
    # bool() keeps the count a plain int even when comparisons yield numpy bools.
    criteria_met = sum(bool(met) for met in criteria)

    return 'Yes' if criteria_met >= 3 else 'No'
    
# Row-wise apply (axis=1); relies on the gender and features_list columns.
df['metabolic_syndrome'] = df.apply(check_metabolic_syndrome, axis=1)


# Preview the first rows of the feature list next to the derived flag.
print(df[['features_list', 'metabolic_syndrome']].head())



                   features_list metabolic_syndrome
0    [81.3, 82, 73, 114, 73, 94]                 No
1  [81.0, 115, 42, 119, 70, 130]                 No
2   [80.0, 182, 55, 138, 86, 89]                 No
3   [88.0, 254, 45, 100, 60, 96]                 No
4    [86.0, 74, 62, 120, 74, 80]                 No


In [74]:
# Write the enriched frame to disk without the index column.
# NOTE(review): the target name has no ".csv" extension — confirm this is intended.
df.to_csv("smoking_clean", index=False)

In [118]:
# Print a summary of the frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55692 entries, 0 to 55691
Data columns (total 48 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   ID                     55692 non-null  int64  
 1   gender                 55692 non-null  object 
 2   age                    55692 non-null  int64  
 3   height(cm)             55692 non-null  int64  
 4   weight(kg)             55692 non-null  int64  
 5   waist(cm)              55692 non-null  float64
 6   eyesight(left)         55692 non-null  float64
 7   eyesight(right)        55692 non-null  float64
 8   hearing(left)          55692 non-null  int64  
 9   hearing(right)         55692 non-null  int64  
 10  systolic               55692 non-null  int64  
 11  relaxation             55692 non-null  int64  
 12  fasting blood sugar    55692 non-null  int64  
 13  Cholesterol            55692 non-null  int64  
 14  triglyceride           55692 non-null  int64  
 15  HD