In [140]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sklearn as skl
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
import warnings
warnings.filterwarnings('ignore')

### Human Age Dataset

In [141]:
age = pd.read_csv('Human_age_pred/Train.csv')
age.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 26 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Gender                       3000 non-null   object 
 1   Height (cm)                  3000 non-null   float64
 2   Weight (kg)                  3000 non-null   float64
 3   Blood Pressure (s/d)         3000 non-null   object 
 4   Cholesterol Level (mg/dL)    3000 non-null   float64
 5   BMI                          3000 non-null   float64
 6   Blood Glucose Level (mg/dL)  3000 non-null   float64
 7   Bone Density (g/cm²)         3000 non-null   float64
 8   Vision Sharpness             3000 non-null   float64
 9   Hearing Ability (dB)         3000 non-null   float64
 10  Physical Activity Level      3000 non-null   object 
 11  Smoking Status               3000 non-null   object 
 12  Alcohol Consumption          1799 non-null   object 
 13  Diet              

In [142]:
age.duplicated().sum()

0

In [143]:
age.rename(columns={'Age (years)':'Target'},inplace=True)

In [144]:
age[['Blood Pressure (sys)','Blood Pressure (dia)']] = age['Blood Pressure (s/d)'].str.split('/',expand=True)

In [145]:
age.drop(columns=['Blood Pressure (s/d)','Family History'], inplace=True)

In [146]:
cat_data = [feat for feat in age.columns if age[feat].dtype == "O"]
num_data = [feat for feat in age.columns if age[feat].dtype != "O"]

cat_data_labels = {i : {gen: val for gen,val in zip(age[i].value_counts().keys(),age[i].value_counts())} for i in cat_data if i !='Blood Pressure (s/d)'}
for i in cat_data_labels.items():
    print(i)

('Gender', {'Female': 1511, 'Male': 1489})
('Physical Activity Level', {'Moderate': 1407, 'Low': 902, 'High': 691})
('Smoking Status', {'Former': 1181, 'Never': 1026, 'Current': 793})
('Alcohol Consumption', {'Occasional': 1057, 'Frequent': 742})
('Diet', {'Balanced': 1183, 'High-fat': 662, 'Low-carb': 605, 'Vegetarian': 550})
('Chronic Diseases', {'Hypertension': 676, 'Diabetes': 532, 'Heart Disease': 493})
('Medication Use', {'Regular': 1063, 'Occasional': 739})
('Mental Health Status', {'Good': 1073, 'Fair': 1009, 'Poor': 479, 'Excellent': 439})
('Sleep Patterns', {'Normal': 1519, 'Insomnia': 1053, 'Excessive': 428})
('Education Level', {'Undergraduate': 884, 'High School': 883, 'Postgraduate': 606})
('Income Level', {'Medium': 1223, 'Low': 916, 'High': 861})
('Blood Pressure (sys)', {'137': 85, '149': 82, '134': 79, '147': 78, '140': 76, '144': 76, '136': 73, '151': 73, '146': 70, '142': 69, '152': 68, '135': 67, '154': 65, '141': 64, '148': 64, '139': 62, '143': 61, '159': 61, '13

In [147]:
age['Gender'].replace({'Female': '0', 'Male': '1'}, inplace=True)
age['Physical Activity Level'].replace({'High': '3','Moderate': '2','Low': '1'}, inplace=True)
age['Smoking Status'].replace({'Never': '0', 'Former': '1', 'Current': '2'}, inplace=True)
age[['Gender','Physical Activity Level', 'Smoking Status']] = age[['Gender','Physical Activity Level','Smoking Status']].astype(int)
age[['Chronic Diseases','Alcohol Consumption']] = age[['Chronic Diseases','Alcohol Consumption']].fillna(value='No')
age['Medication Use'] = age['Medication Use'].fillna(value='LessOrMayBeNo')
age['Mental Health Status'].replace({'Poor': 1, 'Fair': 2, 'Good': 3, 'Excellent': 4},inplace=True)
age['Education Level'].replace({np.nan:1,'High School': 2, 'Undergraduate': 3, 'Postgraduate': 4}, inplace=True)
age['Income Level'].replace({'Medium': 2, 'Low': 1, 'High': 3}, inplace=True)

In [148]:
age[['Mental Health Status','Education Level','Income Level','Blood Pressure (sys)','Blood Pressure (dia)']] = age[['Mental Health Status','Education Level','Income Level','Blood Pressure (sys)','Blood Pressure (dia)']].astype(int)

In [149]:
age = age[['Gender', 'Height (cm)', 'Weight (kg)', 'Cholesterol Level (mg/dL)',
       'BMI', 'Blood Glucose Level (mg/dL)', 'Bone Density (g/cm²)',
       'Vision Sharpness', 'Hearing Ability (dB)', 'Physical Activity Level',
       'Smoking Status', 'Alcohol Consumption', 'Diet', 'Chronic Diseases',
       'Medication Use', 'Cognitive Function', 'Mental Health Status',
       'Sleep Patterns', 'Stress Levels', 'Pollution Exposure', 'Sun Exposure',
       'Education Level', 'Income Level', 'Blood Pressure (sys)',
       'Blood Pressure (dia)', 'Target']]

In [150]:
age.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 26 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Gender                       3000 non-null   int32  
 1   Height (cm)                  3000 non-null   float64
 2   Weight (kg)                  3000 non-null   float64
 3   Cholesterol Level (mg/dL)    3000 non-null   float64
 4   BMI                          3000 non-null   float64
 5   Blood Glucose Level (mg/dL)  3000 non-null   float64
 6   Bone Density (g/cm²)         3000 non-null   float64
 7   Vision Sharpness             3000 non-null   float64
 8   Hearing Ability (dB)         3000 non-null   float64
 9   Physical Activity Level      3000 non-null   int32  
 10  Smoking Status               3000 non-null   int32  
 11  Alcohol Consumption          3000 non-null   object 
 12  Diet                         3000 non-null   object 
 13  Chronic Diseases  

In [154]:
age.to_csv('D://files/My_ML_DL_Chatbot/data/preprocessed_data/regression/HumanAgePred.csv')

### Wine Quality Dataset

In [152]:
iris = pd.read_csv('data\classification\Original data\Breast_cancer.csv')
iris.head()
iris.shape
iris.isna().values.any()
iris.info()
credit['type_of_loan'].unique(), credit['type_of_loan'].nunique()
credit.rename(columns={'credit_score':'Target'}, inplace=True)
df = iris.drop(columns='Id')
df.duplicated().sum()
df.drop_duplicates(inplace=True)

payment_of_min_amount_mapping = {'Yes': 2,'No': 1}
credit['payment_of_min_amount'].replace(payment_of_min_amount_mapping, inplace=True)
credit[['credit_mix','payment_of_min_amount']] = credit[['credit_mix','payment_of_min_amount']].astype(int)

credit.to_csv('D://files/My_ML_DL_Chatbot/data/preprocessed_data/regression/CreditScore.csv')


https://www.kaggle.com/datasets/yasserh/wine-quality-dataset
https://www.kaggle.com/datasets/yasserh/titanic-dataset

SyntaxError: invalid syntax (4290392588.py, line 19)

### Human Age Dataset


In [178]:
age = pd.read_csv('data\classification\Original data\Human_age_pred\\train.csv')
age.head()

Unnamed: 0,Gender,Height (cm),Weight (kg),Blood Pressure (s/d),Cholesterol Level (mg/dL),BMI,Blood Glucose Level (mg/dL),Bone Density (g/cm²),Vision Sharpness,Hearing Ability (dB),...,Family History,Cognitive Function,Mental Health Status,Sleep Patterns,Stress Levels,Pollution Exposure,Sun Exposure,Education Level,Income Level,Age (years)
0,Male,171.148359,86.185197,151/109,259.465814,29.423017,157.652848,0.132868,0.2,58.786198,...,,44.059172,Good,Insomnia,2.797064,5.142344,7.108975,,Medium,89
1,Male,172.946206,79.641937,134/112,263.630292,26.626847,118.507805,0.629534,0.267312,54.63527,...,Heart Disease,45.312298,Good,Normal,9.33993,7.27272,3.918489,Undergraduate,Medium,77
2,Female,155.945488,49.167058,160/101,207.846206,20.217553,143.58755,0.473487,0.248667,54.564632,...,Hypertension,56.246991,Poor,Insomnia,9.234637,8.500386,5.393408,,Medium,70
3,Female,169.078298,56.017921,133/94,253.283779,19.59527,137.448581,1.184315,0.513818,79.722963,...,Hypertension,55.196092,Poor,Insomnia,4.693446,7.555511,2.745578,,Low,52
4,Female,163.758355,73.966304,170/106,236.119899,27.582078,145.328695,0.434562,0.306864,52.479469,...,,53.023379,Good,Normal,4.038537,9.429097,3.878435,Undergraduate,High,79


In [179]:
age.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 26 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Gender                       3000 non-null   object 
 1   Height (cm)                  3000 non-null   float64
 2   Weight (kg)                  3000 non-null   float64
 3   Blood Pressure (s/d)         3000 non-null   object 
 4   Cholesterol Level (mg/dL)    3000 non-null   float64
 5   BMI                          3000 non-null   float64
 6   Blood Glucose Level (mg/dL)  3000 non-null   float64
 7   Bone Density (g/cm²)         3000 non-null   float64
 8   Vision Sharpness             3000 non-null   float64
 9   Hearing Ability (dB)         3000 non-null   float64
 10  Physical Activity Level      3000 non-null   object 
 11  Smoking Status               3000 non-null   object 
 12  Alcohol Consumption          1799 non-null   object 
 13  Diet              