# Multi-Class Prediction of Obesity Risk

## Imports and Reading in data

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 

In [2]:
# Load the competition CSVs; the first column of each file becomes the row index.
train = pd.read_csv('data/train.csv', index_col=0)
test = pd.read_csv('data/test.csv', index_col=0)

# Separate the target column from the training features.
X = train.drop(columns='NObeyesdad')
y = train['NObeyesdad']

# The test frame is used as the test feature matrix as-is (same object as `test`).
X_test = test

X.head()

Unnamed: 0_level_0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,Male,24.443011,1.699998,81.66995,yes,yes,2.0,2.983297,Sometimes,no,2.763573,no,0.0,0.976473,Sometimes,Public_Transportation
1,Female,18.0,1.56,57.0,yes,yes,2.0,3.0,Frequently,no,2.0,no,1.0,1.0,no,Automobile
2,Female,18.0,1.71146,50.165754,yes,yes,1.880534,1.411685,Sometimes,no,1.910378,no,0.866045,1.673584,no,Public_Transportation
3,Female,20.952737,1.71073,131.274851,yes,yes,3.0,3.0,Sometimes,no,1.674061,no,1.467863,0.780199,Sometimes,Public_Transportation
4,Male,31.641081,1.914186,93.798055,yes,yes,2.679664,1.971472,Sometimes,no,1.979848,no,1.967973,0.931721,Sometimes,Public_Transportation


## Data Cleaning and Preprocessing

In [3]:
# Count the missing entries in every feature column (all zeros means nothing to impute)
print(X.isna().sum())

Gender                            0
Age                               0
Height                            0
Weight                            0
family_history_with_overweight    0
FAVC                              0
FCVC                              0
NCP                               0
CAEC                              0
SMOKE                             0
CH2O                              0
SCC                               0
FAF                               0
TUE                               0
CALC                              0
MTRANS                            0
dtype: int64


In [4]:
# process categorical variables
# Encode the two-valued string features as 0/1 integers with one explicit
# mapping per column. A frame-wide `replace` would scan every column and leave
# the resulting dtype to pandas' inference; `map` only touches the named column
# and yields an integer column directly.
gender_codes = {'Male': 0, 'Female': 1}
yes_no_codes = {'no': 0, 'yes': 1}
boolean_columns = ['family_history_with_overweight', 'FAVC', 'SMOKE', 'SCC']
binary_codes = {'Gender': gender_codes, **{column: yes_no_codes for column in boolean_columns}}


def encode_binary_columns(frame):
    """Return a copy of `frame` with every column in `binary_codes` mapped to 0/1.

    Columns that are already numeric are skipped, so running this cell a second
    time leaves the data unchanged instead of turning the codes into NaN.
    """
    encoded = frame.copy()
    for column, codes in binary_codes.items():
        if not pd.api.types.is_numeric_dtype(encoded[column]):
            encoded[column] = encoded[column].map(codes)
    return encoded


X = encode_binary_columns(X)
X_test = encode_binary_columns(X_test)

# `map` turns any value missing from the mapping into NaN; fail here, next to
# the cause, rather than later inside the scaler or a model.
assert not X[list(binary_codes)].isnull().any().any(), "unexpected value in a binary column of X"
assert not X_test[list(binary_codes)].isnull().any().any(), "unexpected value in a binary column of X_test"

X.head()

Unnamed: 0_level_0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,0,24.443011,1.699998,81.66995,1,1,2.0,2.983297,Sometimes,0,2.763573,0,0.0,0.976473,Sometimes,Public_Transportation
1,1,18.0,1.56,57.0,1,1,2.0,3.0,Frequently,0,2.0,0,1.0,1.0,no,Automobile
2,1,18.0,1.71146,50.165754,1,1,1.880534,1.411685,Sometimes,0,1.910378,0,0.866045,1.673584,no,Public_Transportation
3,1,20.952737,1.71073,131.274851,1,1,3.0,3.0,Sometimes,0,1.674061,0,1.467863,0.780199,Sometimes,Public_Transportation
4,0,31.641081,1.914186,93.798055,1,1,2.679664,1.971472,Sometimes,0,1.979848,0,1.967973,0.931721,Sometimes,Public_Transportation


In [5]:
# Turn the target labels into integer codes. The fitted encoder is kept in
# `output_le` so integer predictions can be mapped back to label strings.
from sklearn.preprocessing import LabelEncoder

output_le = LabelEncoder().fit(y)
y = output_le.transform(y)
y

array([6, 1, 0, ..., 3, 6, 3])

In [6]:
# One-hot encode the multi-valued categorical features, then standardize all columns
from sklearn.preprocessing import StandardScaler, OneHotEncoder

cat_features = ['CAEC', 'CALC', 'MTRANS']
onehot = OneHotEncoder(handle_unknown='infrequent_if_exist')

# The encoder learns its categories from the training frame only and is then
# applied unchanged to the test frame.
encoded = onehot.fit_transform(X[cat_features]).toarray()
encoded_test = onehot.transform(X_test[cat_features]).toarray()

# Swap the string columns for their one-hot blocks, appended on the right.
# From here on X and X_test are plain NumPy arrays, not DataFrames.
X = np.hstack((X.drop(columns=cat_features).to_numpy(), encoded))
X_test = np.hstack((X_test.drop(columns=cat_features).to_numpy(), encoded_test))

# Standardize using statistics computed on the training matrix; the same
# fitted scaler is reused for the test matrix.
scaler = StandardScaler()
X = scaler.fit_transform(X)
X_test = scaler.transform(X_test)

## Generate Submission File

In [7]:
def submit(NObeyesdad, filename='submission.csv', first_id=20758):
    """Decode integer predictions and write them to ``submissions/<filename>``.

    The CSV has two columns: ``id`` (the index) and ``NObeyesdad`` (the decoded
    label strings).

    Parameters
    ----------
    NObeyesdad : array-like of int
        Encoded class predictions; decoded with ``output_le.inverse_transform``.
    filename : str
        File name (relative to the ``submissions/`` directory) to write.
    first_id : int
        ``id`` given to the first prediction; the remaining rows are numbered
        consecutively. The default is the value that used to be hard-coded here
        (presumably the first id of the test file -- verify against test.csv).
    """
    import os

    labels = output_le.inverse_transform(NObeyesdad)
    ids = np.arange(first_id, first_id + len(labels))
    # Named `submission` (not `submit`) so the local does not shadow this function.
    submission = pd.DataFrame(labels, index=ids, columns=['NObeyesdad'])

    path = "submissions/" + filename
    # to_csv does not create missing directories; make sure the target exists.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    submission.to_csv(path, index_label='id')

## Classic Machine Learning Models

### Random Forest

In [8]:
# create a random forest classifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Hold out 20% of the rows for validation. The split is seeded so that the
# scores reported for every model that reuses X_train / X_val are reproducible
# across kernel restarts (the forest itself was already seeded).
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=1)

rf_model = RandomForestClassifier(n_estimators=100, random_state=1)
rf_model.fit(X_train, y_train)

y_pred = rf_model.predict(X_val)
accuracy_score(y_val, y_pred)

0.8928227360308285

### K Nearest Neighbors

In [9]:
from sklearn.neighbors import KNeighborsClassifier

# 5-nearest-neighbour baseline: fit on the training split, score on the validation split
knn_model = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
y_pred = knn_model.predict(X_val)
accuracy_score(y_true=y_val, y_pred=y_pred)

0.7466281310211946

### Logistic Regression 

In [10]:
from sklearn.linear_model import LogisticRegression

# Logistic regression baseline; the iteration cap is raised to 10000 so the
# solver has room to converge.
lr_model = LogisticRegression(max_iter=10000).fit(X_train, y_train)
y_pred = lr_model.predict(X_val)
accuracy_score(y_true=y_val, y_pred=y_pred)

0.8670520231213873

## Neural Network

In [15]:
import torch 
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader

# Prefer the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Dataset wrapper around in-memory feature / label arrays (fed to the DataLoaders)
class MyDataset(torch.utils.data.Dataset):
    """Map-style dataset storing features as float32 and labels as int64 tensors."""

    def __init__(self, X, y):
        """Convert the feature matrix `X` and label vector `y` to tensors once, up front."""
        features = torch.tensor(X, dtype=torch.float32)
        labels = torch.tensor(y, dtype=torch.long)
        self.X, self.y = features, labels

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at position `idx`."""
        sample = self.X[idx]
        label = self.y[idx]
        return sample, label

    def __len__(self):
        """Number of samples, i.e. the length of the feature tensor."""
        return len(self.X)

# 80/20 train/validation split for the network. Seeded so that the rows the
# network trains and validates on are the same after every kernel restart.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=1)

train_dataset = MyDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=100, shuffle=True)

val_dataset = MyDataset(X_val, y_val)
# Sample order has no effect on evaluation, so validation batches are not shuffled.
val_loader = DataLoader(val_dataset, batch_size=100, shuffle=False)

# MyDataset requires a label array; zeros are passed as placeholders for the
# test features. Order is preserved (shuffle=False) so predictions line up
# with the rows of X_test.
test_dataset = MyDataset(X_test, np.zeros(len(X_test)))
test_loader = DataLoader(test_dataset, batch_size=100, shuffle=False)

In [1]:
# simple neural network
class SimpleNN(nn.Module):
    """Fully connected classifier: in_features -> 64 -> 64 -> 10 -> num_classes.

    `forward` returns raw, unnormalized class scores (logits) with one column
    per class, which is the input `nn.CrossEntropyLoss` expects; no softmax is
    applied here.

    Parameters
    ----------
    in_features : int
        Width of one input row. Defaults to 25, the value previously hard-coded.
    num_classes : int
        Number of output logits. Defaults to 7: the encoded labels shown in this
        notebook go up to 6, so a single output unit cannot be used with
        cross-entropy. Pass ``len(output_le.classes_)`` to tie it to the encoder.
    """

    def __init__(self, in_features=25, num_classes=7):
        super().__init__()
        self.fc1 = nn.Linear(in_features, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, 10)
        self.fc4 = nn.Linear(10, num_classes)

    def forward(self, x):
        """Map a ``(batch, in_features)`` float tensor to ``(batch, num_classes)`` logits."""
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        # Final layer is applied directly (no activation); it was previously
        # written as `F.self.fc4`, which raises AttributeError.
        return self.fc4(x)

# Put the model on the device selected earlier; every batch is moved to the
# same device inside the loop so parameters and inputs always live together.
model = SimpleNN().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# train the model
epochs = 100
for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        output = model(X_batch)
        loss = criterion(output, y_batch)
        loss.backward()
        optimizer.step()
        # Weight by batch size so a shorter final batch does not skew the mean.
        running_loss += loss.item() * len(X_batch)
    # Report the mean loss over the whole epoch rather than the raw tensor of
    # whichever batch happened to come last.
    print(f"epoch {epoch + 1}/{epochs} - mean train loss: {running_loss / len(train_loader.dataset):.4f}")


NameError: name 'nn' is not defined

## Submission 1

In [None]:
# train model with random forest and generate all data to submit
# The final model is refit on the full training matrix (no hold-out). The
# forest is seeded so the same submission can be regenerated exactly.
rf_model = RandomForestClassifier(n_estimators=1000, random_state=1)
rf_model.fit(X, y)
y_pred = rf_model.predict(X_test)
# submit(y_pred)