In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive



Attributes related with eating habits are:
Frequent consumption of high caloric food (FAVC), \\

Frequency of consumption of vegetables (FCVC), \\

Number of main meals (NCP), \\

Consumption of food between meals (CAEC), \\

Consumption of water daily (CH20), \\

and Consumption of alcohol (CALC). \\

Attributes related with the physical condition are: \\
Calories consumption monitoring (SCC), \\

Physical activity frequency (FAF), \\

Time using technology devices (TUE), \\

Transportation used (MTRANS), \\



In [3]:
df_ml = pd.read_csv('/content/drive/MyDrive/NUS/2022 2023 Sem 2/ST4248 Term Paper/Dataset/ObesityDataSetUCI.csv')

In [4]:
df_ml.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2111 entries, 0 to 2110
Data columns (total 17 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Gender                          2111 non-null   object 
 1   Age                             2111 non-null   float64
 2   Height                          2111 non-null   float64
 3   Weight                          2111 non-null   float64
 4   family_history_with_overweight  2111 non-null   object 
 5   FAVC                            2111 non-null   object 
 6   FCVC                            2111 non-null   float64
 7   NCP                             2111 non-null   float64
 8   CAEC                            2111 non-null   object 
 9   SMOKE                           2111 non-null   object 
 10  CH2O                            2111 non-null   float64
 11  SCC                             2111 non-null   object 
 12  FAF                             21

## Preprocessing

In [6]:
categorical_columns = ['Gender', 'family_history_with_overweight', 'FAVC', 'CAEC', 'SMOKE', 'SCC', 'CALC', 'MTRANS', 'NObeyesdad']
for col in categorical_columns:
    print(f"{col}:")
    print(df_ml[col].value_counts())
    print()

Gender:
Male      1068
Female    1043
Name: Gender, dtype: int64

family_history_with_overweight:
yes    1726
no      385
Name: family_history_with_overweight, dtype: int64

FAVC:
yes    1866
no      245
Name: FAVC, dtype: int64

CAEC:
Sometimes     1765
Frequently     242
Always          53
no              51
Name: CAEC, dtype: int64

SMOKE:
no     2067
yes      44
Name: SMOKE, dtype: int64

SCC:
no     2015
yes      96
Name: SCC, dtype: int64

CALC:
Sometimes     1401
no             639
Frequently      70
Always           1
Name: CALC, dtype: int64

MTRANS:
Public_Transportation    1580
Automobile                457
Walking                    56
Motorbike                  11
Bike                        7
Name: MTRANS, dtype: int64

NObeyesdad:
Obesity_Type_I         351
Obesity_Type_III       324
Obesity_Type_II        297
Overweight_Level_I     290
Overweight_Level_II    290
Normal_Weight          287
Insufficient_Weight    272
Name: NObeyesdad, dtype: int64



1) combine all obesity classese into Obese. combine the remaining class as not obese

In [7]:
df_ml['NObeyesdad'] = df_ml['NObeyesdad'].replace(['Obesity_Type_I', 'Obesity_Type_II', 'Obesity_Type_III'], 'Obese')
df_ml['NObeyesdad'] = df_ml['NObeyesdad'].replace(['Overweight_Level_I', 'Overweight_Level_II', 'Normal_Weight', 'Insufficient_Weight'], 'Not Obese')

Check the class distribution for the outcome variable again

In [8]:
df_ml['NObeyesdad'].value_counts()

Not Obese    1139
Obese         972
Name: NObeyesdad, dtype: int64

2) Note that the CALC alcohol consumption has Always class with only 1 variable.
1 variable is insufficient to learn about the its relation with obesity. \\

We will combine the rows together.

We can assume that respondents who answer "Frequently" or "Always" have a similar level of engagement with the behavior being measured, so combining the two categories is unlikely to introduce bias or affect the interpretation of the results.

In [9]:
# Replace "Always" with "Frequently" in the CALC column
df_ml['CALC'] = df_ml['CALC'].replace('Always', 'Frequently/Always')

# Group the "Frequently" and "Frequently/Always" categories
df_ml['CALC'] = df_ml['CALC'].replace('Frequently', 'Frequently/Always')

In [10]:
# Check the new alcohol column now
df_ml['CALC'].value_counts()

Sometimes            1401
no                    639
Frequently/Always      71
Name: CALC, dtype: int64

3) Check for duplicates row

In [17]:
# Check for duplicate rows in the DataFrame
duplicate_counts = df_ml[df_ml.duplicated()].groupby("NObeyesdad")["NObeyesdad"].count()

# Print the duplicate rows, if any
if len(duplicate_counts) > 0:
    print("Duplicate rows found!")
else:
    print("No duplicate rows found.")
print(duplicate_counts)

# Duplicates come from the majority class.

Duplicate rows found!
NObeyesdad
Not Obese    24
Name: NObeyesdad, dtype: int64


4) Drop the weight variable.  \\
Not dropping the weight variable will make the machine learning model redundant !

In [18]:
df_ml = df_ml.drop('Weight', axis = 1)

In [19]:
# Check again 
df_ml.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2111 entries, 0 to 2110
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Gender                          2111 non-null   object 
 1   Age                             2111 non-null   float64
 2   Height                          2111 non-null   float64
 3   family_history_with_overweight  2111 non-null   object 
 4   FAVC                            2111 non-null   object 
 5   FCVC                            2111 non-null   float64
 6   NCP                             2111 non-null   float64
 7   CAEC                            2111 non-null   object 
 8   SMOKE                           2111 non-null   object 
 9   CH2O                            2111 non-null   float64
 10  SCC                             2111 non-null   object 
 11  FAF                             2111 non-null   float64
 12  TUE                             21

In [20]:
df_ml.head(n = 5)

Unnamed: 0,Gender,Age,Height,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,Female,21.0,1.62,yes,no,2.0,3.0,Sometimes,no,2.0,no,0.0,1.0,no,Public_Transportation,Not Obese
1,Female,21.0,1.52,yes,no,3.0,3.0,Sometimes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation,Not Obese
2,Male,23.0,1.8,yes,no,2.0,3.0,Sometimes,no,2.0,no,2.0,1.0,Frequently/Always,Public_Transportation,Not Obese
3,Male,27.0,1.8,no,no,3.0,3.0,Sometimes,no,2.0,no,2.0,0.0,Frequently/Always,Walking,Not Obese
4,Male,22.0,1.78,no,no,2.0,1.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation,Not Obese


5) SMOTE Analysis forgets to round off the catogory for FCVC, NCP, CH20, FAF, TUE?

We shall round them off.

In [32]:
# Round them off
df_ml['FCVC'] = df_ml['FCVC'].round()
df_ml['NCP'] = df_ml['NCP'].round()
df_ml['CH2O'] = df_ml['CH2O'].round()
df_ml['FAF'] = df_ml['FAF'].round()
df_ml['TUE'] = df_ml['TUE'].round()

In [34]:
# We wil then convert them to categorical variable
df_ml[['FCVC', 'NCP', 'CH2O', 'FAF', 'TUE']] = df_ml[['FCVC',
                                                      'NCP',
                                                      'CH2O',
                                                      'FAF',
                                                      'TUE']].astype('object')

In [35]:
df_ml.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2111 entries, 0 to 2110
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Gender                          2111 non-null   object 
 1   Age                             2111 non-null   float64
 2   Height                          2111 non-null   float64
 3   family_history_with_overweight  2111 non-null   object 
 4   FAVC                            2111 non-null   object 
 5   FCVC                            2111 non-null   object 
 6   NCP                             2111 non-null   object 
 7   CAEC                            2111 non-null   object 
 8   SMOKE                           2111 non-null   object 
 9   CH2O                            2111 non-null   object 
 10  SCC                             2111 non-null   object 
 11  FAF                             2111 non-null   object 
 12  TUE                             21

6) Apply label encoder instead of one-hot encoder to reduce the dimension increase since our dataset is quite small.

In [36]:
from sklearn.preprocessing import LabelEncoder

def encode_dataframe(df):
    """
    Automatically encode a dataframe that has both continuous and categorical features.
    Uses a combination of Label Encoding and One-Hot Encoding.
    Returns the encoded dataframe.
    """
    # separate the categorical and numerical features
    categorical_features = df.select_dtypes(include=['object', "bool"]).columns
    
    # encode categorical features with Label Encoding
    label_encoder = LabelEncoder()
    for feature in categorical_features:
        df[feature] = label_encoder.fit_transform(df[feature])
    
    return df


In [37]:
df_encoded = df_ml.copy()
# Apply Label encoding
df_encoded = encode_dataframe(df_encoded)

In [39]:
# Check the dataset again
df_encoded.head(n = 5)

Unnamed: 0,Gender,Age,Height,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,0,21.0,1.62,1,0,1,2,2,0,1,0,0,1,2,3,0
1,0,21.0,1.52,1,0,2,2,2,1,2,1,3,0,1,3,0
2,1,23.0,1.8,1,0,1,2,2,0,1,0,2,1,0,3,0
3,1,27.0,1.8,0,0,2,2,2,0,1,0,2,0,0,4,0
4,1,22.0,1.78,0,0,1,0,2,0,1,0,0,0,1,3,0


In [50]:
df_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2111 entries, 0 to 2110
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Gender                          2111 non-null   int64  
 1   Age                             2111 non-null   float64
 2   Height                          2111 non-null   float64
 3   family_history_with_overweight  2111 non-null   int64  
 4   FAVC                            2111 non-null   int64  
 5   FCVC                            2111 non-null   int64  
 6   NCP                             2111 non-null   int64  
 7   CAEC                            2111 non-null   int64  
 8   SMOKE                           2111 non-null   int64  
 9   CH2O                            2111 non-null   int64  
 10  SCC                             2111 non-null   int64  
 11  FAF                             2111 non-null   int64  
 12  TUE                             21

In [101]:
# Assign X and Y
X = df_encoded[['Gender', 'Age', 'Height', 'family_history_with_overweight', 'FAVC',
        'FCVC', 'NCP', 'CAEC', 'SMOKE', 'CH2O', 'SCC', 'FAF', 'TUE', 'CALC','MTRANS']]
y = df_encoded['NObeyesdad']

6) Split into training and test dataset

In [102]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=42)

In [103]:
X_train.head(n = 5)

Unnamed: 0,Gender,Age,Height,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS
1930,0,25.982113,1.627818,1,1,2,2,2,0,0,0,0,1,1,3
1803,0,26.0,1.65632,1,1,2,2,2,0,2,0,0,0,1,3
1198,1,18.0,1.750097,1,1,2,2,2,0,2,0,1,0,1,3
15,0,22.0,1.7,1,0,2,2,0,0,1,1,2,1,1,3
816,1,23.451595,1.670227,1,1,1,2,2,0,1,0,0,2,1,3


In [104]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 634 entries, 544 to 1523
Data columns (total 15 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Gender                          634 non-null    int64  
 1   Age                             634 non-null    float64
 2   Height                          634 non-null    float64
 3   family_history_with_overweight  634 non-null    int64  
 4   FAVC                            634 non-null    int64  
 5   FCVC                            634 non-null    int64  
 6   NCP                             634 non-null    int64  
 7   CAEC                            634 non-null    int64  
 8   SMOKE                           634 non-null    int64  
 9   CH2O                            634 non-null    int64  
 10  SCC                             634 non-null    int64  
 11  FAF                             634 non-null    int64  
 12  TUE                             6

In [105]:
y_train.info()

<class 'pandas.core.series.Series'>
Int64Index: 1477 entries, 1930 to 860
Series name: NObeyesdad
Non-Null Count  Dtype
--------------  -----
1477 non-null   int64
dtypes: int64(1)
memory usage: 23.1 KB


In [106]:
y_test.info()

<class 'pandas.core.series.Series'>
Int64Index: 634 entries, 544 to 1523
Series name: NObeyesdad
Non-Null Count  Dtype
--------------  -----
634 non-null    int64
dtypes: int64(1)
memory usage: 9.9 KB


## Feature Selection Label Encoded Version

Filter into numerical and Categorical variables

In [107]:
# Select the numerical columns
num_cols = ['Age','Height']
df_num_train = X_train[num_cols]

# Select the categorical columns
cat_cols = list(set(X_train.columns) - set(num_cols))
df_cat_train = X_train[cat_cols]

Numerical variables

In [108]:
# calculate the pairwise correlation between age and height
corr = df_num_train['Age'].corr(df_num_train['Height'])

# print the correlation coefficient
print('The pairwise correlation between age and height is:', corr)

The pairwise correlation between age and height is: -0.02576323637685145


Age and Height effect caused by puberty spurt is insignificant since most people will have stop increasing their height by the time they reached 18 years old.

In [109]:
from sklearn.feature_selection import mutual_info_classif

# Compute mutual information between each feature in X and y
mutual_info = mutual_info_classif(df_num_train, y_train, random_state=42)

# Create a DataFrame to store the mutual information scores for each feature
mi_df = pd.DataFrame({'feature': num_cols, 'mi_score': mutual_info})

# Sort the features by their mutual information scores in descending order
mi_df = mi_df.sort_values(by='mi_score', ascending=False)

# Print the mutual information scores for each feature
print(mi_df)

  feature  mi_score
0     Age  0.185014
1  Height  0.135868


Categorical variables

Mutual information gain

In [110]:
from sklearn.feature_selection import mutual_info_classif

# Compute mutual information between each feature in X and y
mutual_info1 = mutual_info_classif(df_cat_train, y_train, random_state=42)

# Create a DataFrame to store the mutual information scores for each feature
mi_df1 = pd.DataFrame({'feature': cat_cols, 'mi_score': mutual_info1})

# Sort the features by their mutual information scores in descending order
mi_df1 = mi_df1.sort_values(by='mi_score', ascending=False)

# Print the mutual information scores for each feature
print(mi_df1)

                           feature  mi_score
9   family_history_with_overweight  0.120220
7                              NCP  0.059662
12                            CAEC  0.053121
10                            FAVC  0.045951
8                              SCC  0.033531
6                             FCVC  0.017746
1                             CH2O  0.009208
2                              FAF  0.008436
0                            SMOKE  0.007469
3                             CALC  0.000000
4                           MTRANS  0.000000
5                           Gender  0.000000
11                             TUE  0.000000


Chi square test

In [111]:
from sklearn.feature_selection import chi2, SelectKBest

# feature selection
def chi2_select_features(X_train, y_train):
  # Set seed to 42
  np.random.seed(42)
  fs = SelectKBest(score_func=chi2, k='all')
  fs.fit(X_train, y_train)
  return fs

def chi2_transform(X_train, X_test, fs):
  X_train_fs = fs.transform(X_train)
  X_test_fs = fs.transform(X_test)
  return X_train_fs, X_test_fs

fs = chi2_select_features(df_cat_train,y_train)

In [112]:
# Create dataframe to store results
results_df = pd.DataFrame(columns=['Feature', 'Chi-Square Score'])

# Loop through each feature and store score
for i in range(len(df_cat_train.columns)):
    feature = df_cat_train.columns[i]
    score = fs.scores_[i]
    results_df = pd.concat([results_df,
                            pd.DataFrame({'Feature': [feature],
                                                      'Chi-Square Score': [score]})])

# Sort results dataframe by Score column in descending order
results_df = results_df.sort_values(by='Chi-Square Score', ascending=False)

# Reset the index without adding the old index as a column
results_df = results_df.reset_index(drop=True)

# Print results dataframe
results_df

Unnamed: 0,Feature,Chi-Square Score
0,SCC,48.004206
1,family_history_with_overweight,44.030976
2,FAF,29.859767
3,FAVC,10.84112
4,CAEC,9.379805
5,FCVC,6.86599
6,CH2O,3.264783
7,TUE,2.460817
8,CALC,1.676816
9,NCP,1.037733


In [113]:
# We will drop the gender column from the x_train and x_test dataset
# Gender perform quite badly for the both test, Gender will be dropped
# There are some association between FCVC with binary outcome, it will not be dropped
import pandas as pd

X_train.drop("Gender", axis=1, inplace=True)
X_test.drop("Gender", axis=1, inplace=True)

In [114]:
import os
# Export this x_train and x_test dataset as x_train_label and x_test_label
X_train.to_csv(os.path.join('/content/drive/MyDrive/NUS/2022 2023 Sem 2/ST4248 Term Paper/Dataset/', 'x_train_label.csv'), index=False)
X_test.to_csv(os.path.join('/content/drive/MyDrive/NUS/2022 2023 Sem 2/ST4248 Term Paper/Dataset/', 'x_test_label.csv'), index=False)

# Export this y_train and x_test dataset as y_train_label and y_test_label respectively
y_train.to_csv(os.path.join('/content/drive/MyDrive/NUS/2022 2023 Sem 2/ST4248 Term Paper/Dataset/', 'y_train_label.csv'), index=False)
y_test.to_csv(os.path.join('/content/drive/MyDrive/NUS/2022 2023 Sem 2/ST4248 Term Paper/Dataset/', 'y_test_label.csv'), index=False)

## Preprocessing for One Hot encoded version

In [116]:
# Assign X1 and Y1
X1 = df_ml[['Gender', 'Age', 'Height', 'family_history_with_overweight', 'FAVC',
        'FCVC', 'NCP', 'CAEC', 'SMOKE', 'CH2O', 'SCC', 'FAF', 'TUE', 'CALC',
        'MTRANS']]
y1 = df_ml['NObeyesdad']

In [117]:
# get the list of categorical columns
cat_cols1 = X1.select_dtypes(include=['object']).columns.tolist()

# perform one-hot encoding on the categorical columns
X_onehot = pd.get_dummies(X1, columns=cat_cols1)

# display the first few rows of the encoded dataset
X_onehot.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2111 entries, 0 to 2110
Data columns (total 41 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   Age                                 2111 non-null   float64
 1   Height                              2111 non-null   float64
 2   Gender_Female                       2111 non-null   uint8  
 3   Gender_Male                         2111 non-null   uint8  
 4   family_history_with_overweight_no   2111 non-null   uint8  
 5   family_history_with_overweight_yes  2111 non-null   uint8  
 6   FAVC_no                             2111 non-null   uint8  
 7   FAVC_yes                            2111 non-null   uint8  
 8   FCVC_1.0                            2111 non-null   uint8  
 9   FCVC_2.0                            2111 non-null   uint8  
 10  FCVC_3.0                            2111 non-null   uint8  
 11  NCP_1.0                             2111 no

  X_onehot = pd.get_dummies(X1, columns=cat_cols1)
  X_onehot = pd.get_dummies(X1, columns=cat_cols1)
  X_onehot = pd.get_dummies(X1, columns=cat_cols1)
  X_onehot = pd.get_dummies(X1, columns=cat_cols1)
  X_onehot = pd.get_dummies(X1, columns=cat_cols1)


Split into train and test set

In [133]:
from sklearn.model_selection import train_test_split
X_train1, X_test1, y_train1, y_test1 = train_test_split(X_onehot, y1, test_size=0.3,
                                                    random_state=42)

## Feature Selection for One Hot Version

Split again into numerical and categorical variable

In [134]:
X_train1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1477 entries, 1930 to 860
Data columns (total 41 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   Age                                 1477 non-null   float64
 1   Height                              1477 non-null   float64
 2   Gender_Female                       1477 non-null   uint8  
 3   Gender_Male                         1477 non-null   uint8  
 4   family_history_with_overweight_no   1477 non-null   uint8  
 5   family_history_with_overweight_yes  1477 non-null   uint8  
 6   FAVC_no                             1477 non-null   uint8  
 7   FAVC_yes                            1477 non-null   uint8  
 8   FCVC_1.0                            1477 non-null   uint8  
 9   FCVC_2.0                            1477 non-null   uint8  
 10  FCVC_3.0                            1477 non-null   uint8  
 11  NCP_1.0                             1477 

In [135]:
# Select the numerical columns
num_cols2 = ['Age','Height']
df_num_train1 = X_train1[num_cols]

# Select the categorical columns
cat_cols2 = list(set(X_train1.columns) - set(num_cols2))
df_cat_train1 = X_train1[cat_cols2]

Numerical variables analysis

In [136]:
# calculate the pairwise correlation between age and height
corr = df_num_train1['Age'].corr(df_num_train1['Height'])

# print the correlation coefficient
print('The pairwise correlation between age and height is:', corr)

The pairwise correlation between age and height is: -0.02576323637685145


Mutual information

In [137]:
from sklearn.feature_selection import mutual_info_classif

# Compute mutual information between each feature in X and y
mutual_info2 = mutual_info_classif(df_num_train1, y_train1, random_state=42)

# Create a DataFrame to store the mutual information scores for each feature
mi_df2 = pd.DataFrame({'feature': num_cols2, 'mi_score': mutual_info2})

# Sort the features by their mutual information scores in descending order
mi_df2 = mi_df2.sort_values(by='mi_score', ascending=False)

# Print the mutual information scores for each feature
print(mi_df2)

  feature  mi_score
0     Age  0.185014
1  Height  0.135868


Categorical variables analysis

Mutual information

In [146]:
from sklearn.feature_selection import mutual_info_classif

# Compute mutual information between each feature in X and y
mutual_info3 = mutual_info_classif(df_cat_train1, y_train, random_state=42)

# Create a DataFrame to store the mutual information scores for each feature
mi_df3 = pd.DataFrame({'feature': cat_cols2, 'mi_score': mutual_info3})

# Sort the features by their mutual information scores in descending order
mi_df3 = mi_df3.sort_values(by='mi_score', ascending=False)

# Print the mutual information scores for each feature
mi_df3

Unnamed: 0,feature,mi_score
27,family_history_with_overweight_no,0.130361
13,family_history_with_overweight_yes,0.103384
16,CAEC_Sometimes,0.083168
22,CAEC_Frequently,0.071161
6,FAVC_yes,0.055995
0,FAVC_no,0.041263
21,NCP_4.0,0.040316
11,MTRANS_Public_Transportation,0.034193
34,MTRANS_Automobile,0.033622
18,SCC_yes,0.033307


Chi square analysis Something is wrong with the code.:(

In [139]:
from sklearn.feature_selection import chi2, SelectKBest

# feature selection
def chi2_select_features(X_train, y_train):
  # Set seed to 42
  np.random.seed(42)
  fs = SelectKBest(score_func=chi2, k='all')
  fs.fit(X_train, y_train)
  return fs

def chi2_transform(X_train, X_test, fs):
  X_train_fs = fs.transform(X_train)
  X_test_fs = fs.transform(X_test)
  return X_train_fs, X_test_fs

fs1 = chi2_select_features(df_cat_train1,y_train1)

In [140]:
y_train1.info()

<class 'pandas.core.series.Series'>
Int64Index: 1477 entries, 1930 to 860
Series name: NObeyesdad
Non-Null Count  Dtype 
--------------  ----- 
1477 non-null   object
dtypes: object(1)
memory usage: 23.1+ KB


In [145]:
from scipy.stats import chi2_contingency

# Empty dictionary to store results
results_dict = {}

# Loop through categorical columns
for col in cat_cols2:
    # Create contingency table
    contingency_table = pd.crosstab(df_cat_train1[col], y_train)
    # Compute chi-square test
    chi2, p, dof, expected = chi2_contingency(contingency_table)
    # Store results in dictionary
    results_dict[col] = {'chi2': chi2, 'p-value': p}

# Convert dictionary to dataframe
results_df = pd.DataFrame.from_dict(results_dict, orient='index')

# Sort by chi2 in descending order
results_df = results_df.sort_values(by='chi2', ascending=False)
# Drop p-value column
results_df = results_df.drop('p-value', axis=1)

# Display sorted dataframe
results_df

Unnamed: 0,chi2
family_history_with_overweight_no,246.072874
family_history_with_overweight_yes,246.072874
CAEC_Sometimes,198.448676
CAEC_Frequently,145.877467
FAVC_no,97.792639
FAVC_yes,97.792639
NCP_3.0,77.313783
NCP_4.0,73.17468
SCC_no,48.379385
SCC_yes,48.379385


In [142]:
# Drop the following rows SMOKE_no, FAF_1.0, MTRANS_Motorcycle, MTRANS_Bike, SMOKE_yes 
X_train1 = X_train1.drop(['SMOKE_no','FAF_1.0',
                          'MTRANS_Motorbike', 'MTRANS_Bike',
                          'SMOKE_yes'], axis=1)

In [143]:
X_test1 = X_test1.drop(['SMOKE_no','FAF_1.0',
                          'MTRANS_Motorbike', 'MTRANS_Bike',
                          'SMOKE_yes'], axis=1)

In [144]:
import os
# Export this x_train and x_test dataset as x_train_label and x_test_label
X_train1.to_csv(os.path.join('/content/drive/MyDrive/NUS/2022 2023 Sem 2/ST4248 Term Paper/Dataset/', 'x_train_onehot.csv'), index=False)
X_test1.to_csv(os.path.join('/content/drive/MyDrive/NUS/2022 2023 Sem 2/ST4248 Term Paper/Dataset/', 'x_test_onehot.csv'), index=False)

# Export this y_train and x_test dataset as y_train_label and y_test_label respectively
y_train1.to_csv(os.path.join('/content/drive/MyDrive/NUS/2022 2023 Sem 2/ST4248 Term Paper/Dataset/', 'y_train_onehot.csv'), index=False)
y_test1.to_csv(os.path.join('/content/drive/MyDrive/NUS/2022 2023 Sem 2/ST4248 Term Paper/Dataset/', 'y_test_onehot.csv'), index=False)

Version 3 of feature preprocessing