In [1]:
import pandas as pd

DATASET_PATH = r'D:\StudySpace\Nam4_KyI\cap1_ai_feature\data\final_dataset_asian.csv'

# Define BMI ranges and exercise plans for asian people
bmi_categories = {
    (0, 16): ("Severe Thinness", 1),
    (16, 17): ("Mild Thinness", 2),
    (17, 18.5): ("Moderate Thinness", 3),
    (18.5, 23): ("Normal", 4),
    (23, 27.5): ("Overweight", 5),
    (27.5, 32.5): ("Obese", 6),
    (32.5, float('inf')): ("Severe Obese", 7),  # Use infinity for the upper bound
}

def categorize_bmi(bmi):
    """Categorizes BMI based on predefined ranges."""
    for (lower, upper), (category, plan) in bmi_categories.items():
        if lower <= bmi < upper:
            return category, plan

# Load the data
df = pd.read_csv(DATASET_PATH)

# Apply the function and unpack the results
df['BMIcase'], df['Exercise Recommendation Plan'] = zip(*df['BMI'].apply(categorize_bmi))

# print(df.head())

# # Save the updated data
df.to_csv(DATASET_PATH, index=False)

print(df.head()) 

       Weight    Height        BMI  Gender  Age     BMIcase  \
0   92.085190  1.760250  29.719488  Female   59       Obese   
1   61.089124  1.595499  23.997776  Female   25  Overweight   
2   82.454037  1.816538  24.987499  Female   50  Overweight   
3  101.713306  1.790696  31.720047  Female   62       Obese   
4   99.609527  1.969726  25.673756    Male   57  Overweight   

   Exercise Recommendation Plan  
0                             6  
1                             5  
2                             5  
3                             6  
4                             5  


## Data Augmentation

In [2]:
import pandas as pd
from imblearn.over_sampling import SMOTE

DATASET_PATH = r'D:\StudySpace\Nam4_KyI\cap1_ai_feature\data\final_dataset_asian.csv'

# Load the data
df = pd.read_csv(DATASET_PATH)
X = df[['Weight', 'Height', 'Age']]
y = df['Exercise Recommendation Plan']

smote = SMOTE(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

print(pd.Series(y_resampled).value_counts())
updated_df = pd.concat([X_resampled, y_resampled], axis=1)
# Save the updated data

# df.to_csv(DATASET_PATH, index=False)

print(updated_df.describe()) 

Exercise Recommendation Plan
6    1527
5    1527
4    1527
7    1527
1    1527
3    1527
2    1527
Name: count, dtype: int64
             Weight        Height           Age  Exercise Recommendation Plan
count  10689.000000  10689.000000  10689.000000                  10689.000000
mean      73.096537      1.792563     41.347554                      4.000000
std       21.167430      0.125538     13.746923                      2.000094
min       50.000000      1.390171     18.000000                      1.000000
25%       55.956581      1.731472     29.000000                      2.000000
50%       65.728249      1.805061     41.000000                      4.000000
75%       87.356168      1.891175     53.000000                      6.000000
max      160.000000      1.990000     65.000000                      7.000000


## Oversampling for asian dataset by using SMOTE

In [3]:
import pandas as pd
from imblearn.over_sampling import SMOTE

# Đường dẫn đến tập dữ liệu gốc
DATASET_PATH = r'D:\StudySpace\Nam4_KyI\cap1_ai_feature\data\final_dataset_asian.csv'
OUTPUT_PATH = r'D:\StudySpace\Nam4_KyI\cap1_ai_feature\data\oversampled_dataset.csv'

# Tải tập dữ liệu
df = pd.read_csv(DATASET_PATH)

# Chọn các đặc trưng và nhãn mục tiêu
X = df[['Weight', 'Height', 'BMI', 'Gender', 'Age', 'BMIcase']]
y = df['Exercise Recommendation Plan']

# Transform the categorical columns into one-hot encoded columns
X_encoded = pd.get_dummies(X, columns=['Gender', 'BMIcase'], drop_first=True)

smote = SMOTE(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_encoded, y)

# Transform the resampled features back to DataFrame
X_resampled = pd.DataFrame(X_resampled, columns=X_encoded.columns)

# Process Gender column
X_resampled['Gender'] = X_resampled[['Gender_Male']].apply(lambda x: 'Male' if x[0] == 1 else 'Female', axis=1)
X_resampled.drop(columns=['Gender_Male'], inplace=True)

# Process the BMIcase columns
bmicase_columns = [col for col in X_resampled.columns if col.startswith('BMIcase_')]
X_resampled['BMIcase'] = X_resampled[bmicase_columns].idxmax(axis=1).apply(lambda x: x.split('_')[1])
X_resampled.drop(columns=bmicase_columns, inplace=True)

# Combine features and target into a single DataFrame
resampled_df = pd.concat([X_resampled, pd.Series(y_resampled, name='Exercise Recommendation Plan')], axis=1)

resampled_df.to_csv(OUTPUT_PATH, index=False)

print("Dataset after oversampling:")
print(resampled_df['Exercise Recommendation Plan'].value_counts())


  X_resampled['Gender'] = X_resampled[['Gender_Male']].apply(lambda x: 'Male' if x[0] == 1 else 'Female', axis=1)


Dataset after oversampling:
Exercise Recommendation Plan
6    1527
5    1527
4    1527
7    1527
1    1527
3    1527
2    1527
Name: count, dtype: int64


## Oversampling for european dataset by using SMOTE

In [1]:
import pandas as pd
from imblearn.over_sampling import SMOTE

# Đường dẫn đến tập dữ liệu gốc
DATASET_PATH = r'D:\StudySpace\Nam4_KyI\cap1_ai_feature\data\final_dataset.csv'
OUTPUT_PATH = r'D:\StudySpace\Nam4_KyI\cap1_ai_feature\data\balanced_dataset_european.csv'

df = pd.read_csv(DATASET_PATH)

X = df[['Weight', 'Height', 'BMI', 'Gender', 'Age', 'BMIcase']]
y = df['Exercise Recommendation Plan']

# Transform the categorical columns into one-hot encoded columns
X_encoded = pd.get_dummies(X, columns=['Gender', 'BMIcase'], drop_first=True)

smote = SMOTE(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_encoded, y)

# Transform the resampled features back to DataFrame
X_resampled = pd.DataFrame(X_resampled, columns=X_encoded.columns)

# Process Gender column
X_resampled['Gender'] = X_resampled[['Gender_Male']].apply(lambda x: 'Male' if x[0] == 1 else 'Female', axis=1)
X_resampled.drop(columns=['Gender_Male'], inplace=True)

# Process the BMIcase columns
bmicase_columns = [col for col in X_resampled.columns if col.startswith('BMIcase_')]
X_resampled['BMIcase'] = X_resampled[bmicase_columns].idxmax(axis=1).apply(lambda x: x.split('_')[1])
X_resampled.drop(columns=bmicase_columns, inplace=True)

# Combine features and target into a single DataFrame
resampled_df = pd.concat([X_resampled, pd.Series(y_resampled, name='Exercise Recommendation Plan')], axis=1)

resampled_df.to_csv(OUTPUT_PATH, index=False)

print("Dataset after oversampling:")
print(resampled_df['Exercise Recommendation Plan'].value_counts())


  X_resampled['Gender'] = X_resampled[['Gender_Male']].apply(lambda x: 'Male' if x[0] == 1 else 'Female', axis=1)


Dataset after oversampling:
Exercise Recommendation Plan
5    1467
4    1467
6    1467
7    1467
1    1467
3    1467
2    1467
Name: count, dtype: int64
