In [7]:
# Load the data
import pandas as pd
import time
df = pd.read_csv('C:\ALL\McMaster M.Eng. Study\SEP 786 - Artificial Intelligence and Machine Learning Fundamentals (First Term)\ObesityDataSet_raw_and_data_sinthetic.csv')

# Check for missing values in the dataset
df.isnull().sum()

# Fill missing values with the mean
numeric_cols = df.select_dtypes(include='number').columns
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())

# Convert categorical variables to numerical using one-hot encoding
df = pd.get_dummies(df, columns=['Gender', 'family_history_with_overweight', 'FAVC', 'CAEC', 'SMOKE', 'SCC', 'CALC', 'MTRANS'])

# Convert the target variable to numerical using label encoding
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df["NObeyesdad"] = le.fit_transform(df["NObeyesdad"])

# Split the data into train and test sets
from sklearn.model_selection import train_test_split
X = df.drop("NObeyesdad", axis=1)
y = df["NObeyesdad"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the data using StandardScaler
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Perform PCA to reduce dimensionality of feature space
pca = PCA(n_components=0.95)
start_time_pca = time.time() # Start measuring computational time for PCA

X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

end_time_pca = time.time() # End measuring computational time for PCA
pca_time = end_time_pca - start_time_pca
print("Time taken for PCA:", pca_time)


# Train a decision tree regression model on the PCA-transformed data
dtr = DecisionTreeRegressor(random_state=42)
start_time_train = time.time() # Start measuring computational time for training the model

dtr.fit(X_train_pca, y_train)

end_time_train = time.time() # End measuring computational time for training the model
train_time = end_time_train - start_time_train
print("Time taken for training:", train_time)

# Make predictions on the test set and evaluate the model
y_pred = dtr.predict(X_test_pca)
mse = mean_squared_error(y_test, y_pred)
print("Mean squared error:", mse)

Time taken for PCA: 0.0019419193267822266
Time taken for training: 0.032952070236206055
Mean squared error: 2.591016548463357


In [3]:
# Train a decision tree regression model on the scaled data
from sklearn.tree import DecisionTreeRegressor
dtr = DecisionTreeRegressor(random_state=42)
dtr.fit(X_train_scaled, y_train)

# Make predictions on the test set and convert them to binary values
y_pred = dtr.predict(X_test_scaled)
y_pred_binary = [1 if y >= 0.5 else 0 for y in y_pred]

# Evaluate the model using confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred_binary)
print("Confusion matrix:\n", cm)


Confusion matrix:
 [[55  1  0  0  0  0  0]
 [ 4 58  0  0  0  0  0]
 [ 0 78  0  0  0  0  0]
 [ 0 58  0  0  0  0  0]
 [ 0 63  0  0  0  0  0]
 [ 0 56  0  0  0  0  0]
 [ 0 50  0  0  0  0  0]]


In [8]:
# Load the data
import pandas as pd
import time
df = pd.read_csv('C:\ALL\McMaster M.Eng. Study\SEP 786 - Artificial Intelligence and Machine Learning Fundamentals (First Term)\ObesityDataSet_raw_and_data_sinthetic.csv')

# Check for missing values in the dataset
df.isnull().sum()

# Fill missing values with the mean
numeric_cols = df.select_dtypes(include='number').columns
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())

# Convert categorical variables to numerical using one-hot encoding
df = pd.get_dummies(df, columns=['Gender', 'family_history_with_overweight', 'FAVC', 'CAEC', 'SMOKE', 'SCC', 'CALC', 'MTRANS'])

# Convert the target variable to numerical using label encoding
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df["NObeyesdad"] = le.fit_transform(df["NObeyesdad"])

# Split the data into train and test sets
from sklearn.model_selection import train_test_split
X = df.drop("NObeyesdad", axis=1)
y = df["NObeyesdad"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the data using StandardScaler
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Perform LDA to reduce dimensionality of feature space
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
lda = LDA(n_components=2) # set n_components as desired
start_time_lda = time.time() # Start measuring computational time for LDA

X_train_lda = lda.fit_transform(X_train_scaled, y_train)
X_test_lda = lda.transform(X_test_scaled)

end_time_lda = time.time() # End measuring computational time for LDA
lda_time = end_time_lda - start_time_lda
print("Time taken for LDA:", lda_time)

# Train a decision tree regression model on the LDA-transformed data
from sklearn.tree import DecisionTreeRegressor
dtr = DecisionTreeRegressor(random_state=42)
start_time_train = time.time() # Start measuring computational time for training the model

dtr.fit(X_train_lda, y_train)

end_time_train = time.time() # End measuring computational time for training the model
train_time = end_time_train - start_time_train
print("Time taken for training:", train_time)

# Make predictions on the test set and evaluate the model
from sklearn.metrics import mean_squared_error
y_pred = dtr.predict(X_test_lda)
mse = mean_squared_error(y_test, y_pred)
print("Mean squared error:", mse)


Time taken for LDA: 0.006035804748535156
Time taken for training: 0.002058744430541992
Mean squared error: 0.8226950354609929


In [2]:
# Make predictions on the test set and evaluate the model
from sklearn.metrics import confusion_matrix
y_pred = lda.predict(X_test_scaled)
cm = confusion_matrix(y_test, y_pred)
print("Confusion matrix:\n", cm)

Confusion matrix:
 [[55  1  0  0  0  0  0]
 [14 38  0  0  0 10  0]
 [ 0  0 71  5  0  0  2]
 [ 0  0  0 58  0  0  0]
 [ 0  0  0  0 63  0  0]
 [ 0  2  0  0  0 47  7]
 [ 0  0  1  0  0  3 46]]
