In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'playground-series-s4e2:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-competitions-data%2Fkaggle-v2%2F68479%2F7609535%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240305%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240305T093817Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D353bba699d8dc5a016af5d89c8c4173d72180ae76645cc58a0c1e4a9b508683315cfe80614c2c95aa60a8b37f10fa5d8217a39d61166f30aceaee690f866705ba4f20e872f8261fbdc7e1d72804365d767b187b2b541736fe10a5e3bd02db1559b2e0abf7d169537768fadfc94fdb6ed635fae65651e4d5be9df20ad715cc69de4245245d2103f78e25e5177db07978f4cfcbc08c017de32b555d379689a345679ce5457334d3f2bdc041a975dd524ecbf039f83147ef345dd5c22090c82d88418f851081f07d33dd0146a149c3e1795704bfd47e4fdc3d74d4d3fd6edaa197e154226c2b6795f914dd98dd3f95d88165bbcc1b9a885f12d1c49b58199bd8ceb,obesity-or-cvd-risk-classifyregressorcluster:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F4030196%2F7009925%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240305%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240305T093817Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D94b006ea5d622cd2bd3d163bf464e5a4befcaaada55b8078da7f68628ba709ae166dcd93dd85b91b5b839b0c2012e309080171a9753c8450baa55fb3cec55f0cef71da1861c348f3d3730b8fd42978b4b3d72ca6a626bceb069fe8c0efd15918e87e5bb3c263bc2ea941052ffcf768f1a9455f9d521a108cf66163e4717f41bb54b5140ff94cc7e68ca6a1b2fca9c75ce225889c5679acaaa77e71876e486f387f44dde73456227292b1dd5171820b17f26896907c249b6de229b20ea047ec3e69ba3e699d676f4a616d60edc7dcd5dd36069493faf2126475c205e59d7b3bf27c0d8207c952efc94d68b30479e1d7a41c14f940d5d181f7b46a7bc06c48be33'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every configured data source into KAGGLE_INPUT_PATH/<directory> and
# unpack it. A failure for one source is reported and the loop moves on to the
# next source.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Each entry is "<directory>:<url-encoded download URL>". Split on the first
    # colon only, so the pairing survives even if the URL part contains one.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            # The Content-Length header can be absent; only draw the progress
            # bar when the total size is known and positive.
            content_length = fileres.headers['content-length']
            total_length = int(content_length) if content_length else 0
            print(f'Downloading {directory}, {content_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                if total_length > 0:
                    done = min(50, int(50 * dl / total_length))
                    sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                else:
                    sys.stdout.write(f"\r{dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # tarfile re-opens the temp file by name, so buffered bytes must be
            # written out before extraction starts.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Use a dedicated name for the archive handle so the `tarfile`
                # module itself is not rebound inside the loop.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError as e:
        # HTTPError is handled first because it is itself an OSError subclass.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}: {e}')
    except OSError as e:
        print(f'Failed to load {download_url} to path {destination_path}: {e}')

print('Data source import complete.')


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Dataset Description

- 'id' : id
- 'Gender' : Gender
- 'Age' : Age
- 'Height' : Height, in meters
- 'Weight' : Weight, ranging from 39 to 165
- 'family_history_with_overweight' : family history with overweight
- 'FAVC' : Frequent consumption of high calorie food
- 'FCVC' : Frequency of consumption of vegetables
- 'NCP' : Number of main meals
- 'CAEC' : Consumption of food between meals
- 'SMOKE' : Smoke
- 'CH2O' : Consumption of water daily
- 'SCC' : Calories consumption monitoring
- 'FAF' : Physical activity frequency
- 'TUE' : Time using technology devices
- 'CALC' : Consumption of alcohol
- 'MTRANS' : Transportation used
- 'NObeyesdad' : (Target) Obesity


## NObeyesdad values:

- Underweight: less than 18.5
- Normal: 18.5 to 24.9
- Overweight: 25.0 to 29.9
- Obesity I: 30.0 to 34.9
- Obesity II: 35.0 to 39.9
- Obesity III: 40.0 or higher

# Table of Contents
1. [Import Libraries](#Libraries)
1. [Import Data](#Data)
1. [Exploratory Data Analysis](#EDA)
1. [Pre-Processing](#Processing)
1. [Model Building](#Modeling)

<a id='Libraries'></a>
# 1. Import Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from lightgbm import LGBMClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import f1_score
from scipy.stats import loguniform
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier

<a id='Data'></a>
# 2. Import Data

In [None]:
# Competition files (playground-series-s4e2): training set, test set and the
# sample submission that is filled in at the end of the notebook.
# `original` is a separate dataset (ObesityDataSet.csv) that is later appended
# to `train`.
train = pd.read_csv('/kaggle/input/playground-series-s4e2/train.csv')
original = pd.read_csv('/kaggle/input/obesity-or-cvd-risk-classifyregressorcluster/ObesityDataSet.csv')
test = pd.read_csv('/kaggle/input/playground-series-s4e2/test.csv')
submission = pd.read_csv('/kaggle/input/playground-series-s4e2/sample_submission.csv')

In [None]:
# Checking the shape of data
# (rows, columns) of the train frame, then of the test frame
for frame in (train, test):
    print(frame.shape)

In [None]:
# Quick Overview
# First rows of each frame, each preceded by its label
display(
    'Train', train.head(),
    'Test', test.head(),
)

## Compare Original & Train data

In [None]:
# Column names, dtypes and non-null counts of the `original` frame (ObesityDataSet.csv)
original.info()

In [None]:
# Same summary for the competition training frame, for side-by-side comparison
train.info()

In [None]:
# Same summary for the competition test frame
test.info()

In [None]:
# check for missing data
# Number of null entries per column, computed once per frame and then reported
train_missing = train.isnull().sum()
test_missing = test.isnull().sum()

print('Train:\n')
print(f'{train_missing}\n\n\n')

print('Test:\n')
print(test_missing)

In [None]:
# Check for duplicate data
# Count fully repeated rows in each frame.
# The 'id' column is left out of the comparison: it is presumably unique per
# row (TODO confirm), in which case no two rows could ever compare equal while
# it is included. errors='ignore' keeps the cell working when it is re-run
# after 'id' has been dropped further down the notebook.
print(train.drop(columns=['id'], errors='ignore').duplicated().sum())
print(test.drop(columns=['id'], errors='ignore').duplicated().sum())

<a id='EDA'></a>
# 3. Exploratory Data Analysis

In [None]:
def distribution(columnname, data=None):
    """Print the value counts of a column and draw them as a donut chart.

    Parameters
    ----------
    columnname : str
        Name of the column whose value frequencies are shown.
    data : pandas.DataFrame, optional
        Frame to read the column from. When omitted, the notebook-level
        ``train`` frame is used (looked up at call time).

    Raises
    ------
    KeyError
        If ``columnname`` is not a column of the frame.
    """
    if data is None:
        data = train

    # Look the column up before creating the figure, so a wrong column name
    # does not leave an empty figure behind.
    value_counts = data[columnname].value_counts()
    print(value_counts, '\n\n\n')

    fig, ax = plt.subplots(figsize=(12,5))
    # width=0.7 leaves a hole in the middle of the pie (donut look); the
    # percentage texts are drawn at 85% of the radius.
    ax.pie(
        value_counts, labels=value_counts.index.tolist(), autopct='%1.1f%%',
        wedgeprops=dict(width=0.7), startangle=80, pctdistance=0.85)

    fig.suptitle(columnname, fontweight='bold')

    plt.show()

## Visualize each of the columns

In [None]:
# train data: class balance of the target column 'NObeyesdad'
distribution("NObeyesdad")

In [None]:
# Gender
distribution('Gender')

In [None]:
# Family history with overweight
distribution('family_history_with_overweight')

In [None]:
# FAVC: frequent consumption of high calorie food
distribution('FAVC')

In [None]:
# CAEC: consumption of food between meals
distribution('CAEC')

In [None]:
# SMOKE: smoking
distribution('SMOKE')

In [None]:
# SCC: calories consumption monitoring
distribution('SCC')

In [None]:
# CALC: consumption of alcohol
distribution('CALC')

In [None]:
# MTRANS: transportation used
distribution('MTRANS')

<a id='Processing'></a>
# 4. Pre-processing

In [None]:
# Append the original dataset to the training data, remove the 'id' column and
# drop rows that are exact repeats.
# - ignore_index / reset_index give the combined frame a unique 0..n-1 index
#   instead of the repeated labels a plain concat of two frames produces.
# - errors='ignore' keeps the cell from raising KeyError when it is re-run
#   after 'id' has already been removed.
train = (
    pd.concat([train, original], ignore_index=True)
    .drop(columns=['id'], errors='ignore')
    .drop_duplicates()
    .reset_index(drop=True)
)
test = test.drop(columns=['id'], errors='ignore')

In [None]:
print(f'train data: {train.shape}')
print(f'test data: {test.shape}\n')

# Share of all rows (train + test) that belong to the training frame.
percentage = np.round(train.shape[0]/(train.shape[0]+test.shape[0]), 4)
# The percent format spec avoids binary floating-point noise that multiplying
# the rounded fraction by 100 can print.
print(f'train data consists of {percentage:.2%} of all observations')
print(f'test data consists of {1 - percentage:.2%} of all observations')

## Feature Engineering

In [None]:
# Separate the target from the features. `pop` removes the column from `train`
# in place, so it is guarded: re-running this cell once the column is gone
# keeps the existing `y` instead of raising KeyError.
if 'NObeyesdad' in train.columns:
    y = train.pop('NObeyesdad')
y.head()

In [None]:
# Derive both column lists from the same frame (`train`, which is what the
# preprocessing is fitted on) so that together they partition its columns;
# taking one list from `train` and the other from `test` would let a dtype
# difference between the two frames put a column in both lists or in neither.
cat_cols = train.select_dtypes(exclude=np.number).columns.tolist()
num_cols = train.select_dtypes(include=np.number).columns.tolist()

print(cat_cols)
print(num_cols)

In [None]:
#train = pd.get_dummies(train, columns=cat_cols)
#test = pd.get_dummies(test, columns=cat_cols)

In [None]:
# Hold out 30% of the rows for validation, stratified on the target so both
# parts keep the same class proportions; the seed makes the split repeatable.
split_options = dict(test_size=0.3, stratify=y, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(train, y, **split_options)

tuple(part.shape for part in (X_train, X_val, y_train, y_val))

<a id='Modeling'></a>
# 5. Model Building

## 5.1 LightGBM & RandomSearch

In [None]:
# Search space for the randomized search. The "clf__" prefix routes each entry
# to the pipeline step named "clf".
param_grid = {
    "clf__learning_rate": loguniform(0.001, 0.1),
    "clf__n_estimators": np.arange(50, 500),
    "clf__max_depth": np.arange(3, 30, 3),
}

# Scale the numeric columns and one-hot encode the categorical ones; categories
# not seen during fitting are encoded as all zeros (handle_unknown='ignore').
column_transformer = ColumnTransformer([
    ("scaler", StandardScaler(), num_cols),
    ("ohc_encoder", OneHotEncoder(handle_unknown='ignore'), cat_cols),
], remainder = "passthrough")

pipeline = Pipeline([
    ("preprocessor", column_transformer),
    # num_class is intentionally not set: the previously hard-coded value (4)
    # does not match the target (the dataset description above already lists
    # six categories), and LightGBM's scikit-learn wrapper is documented to
    # take the number of classes from the labels passed to fit().
    # verbose=-1 is LightGBM's own setting for silencing its training log.
    ("clf", LGBMClassifier(random_state=42, objective='multiclass', verbose=-1))
])


random_search = RandomizedSearchCV(
    estimator = pipeline,
    param_distributions = param_grid,
    n_iter = 3,
    scoring="f1_macro",
    cv=2,
    # scikit-learn's `verbose` is a non-negative verbosity level; 0 = silent.
    verbose=0,
    n_jobs=-1,
    # Seed the candidate sampling so a re-run draws the same hyper-parameters.
    random_state=42
)

def convert_seconds_to_hms(seconds):
    """Split a duration in seconds into hours, minutes and remaining seconds.

    Parameters
    ----------
    seconds : int or float
        Duration in seconds.

    Returns
    -------
    tuple
        ``(hours, minutes, seconds)``. Hours and minutes are whole numbers
        returned as ``int`` (so they print as "1", not "1.0", for a float
        input); the remaining seconds keep the fractional part of the input.
    """
    hours, remainder = divmod(seconds, 3600)
    minutes, remaining_seconds = divmod(remainder, 60)
    return int(hours), int(minutes), remaining_seconds

# Fit the randomized search and measure the wall-clock time it takes.
start_time = time.time()
random_search.fit(X_train, y_train)
end_time = time.time()

elapsed = end_time - start_time
hours, minutes, seconds = convert_seconds_to_hms(elapsed)

print(f"Training time: {elapsed} seconds")
print(f"{hours} hours, {minutes} minutes, {seconds} seconds")

In [None]:
# Predict the held-out validation rows with the fitted search and report the
# accuracy on them.
y_pred_val = random_search.predict(X_val)
accuracy = accuracy_score(y_true=y_val, y_pred=y_pred_val)
print("Accuracy:", accuracy)

In [None]:
# Macro-averaged F1 on the validation predictions (the same metric the search
# optimises via scoring="f1_macro").
f1_macro_val = f1_score(y_true=y_val, y_pred=y_pred_val, average='macro')
print("f1_score:", f1_macro_val)

## 5.2 Random Forest Model

In [None]:
# param_grid = {
#     "clf__n_estimators": np.arange(50, 500),
#     "clf__max_depth": np.arange(3, 30, 3),
#     "clf__min_samples_split": np.arange(2, 20),
#     "clf__min_samples_leaf": np.arange(1, 10),

# }

# column_transformer = ColumnTransformer([
#     ("scaler", StandardScaler(), num_cols),
#     ("ohc_encoder", OneHotEncoder(handle_unknown='ignore'), cat_cols),
# ], remainder = "passthrough")

# pipeline = Pipeline([
#     ("preprocessor", column_transformer),
#     ("clf", RandomForestClassifier(random_state=42))
# ])


# random_search = RandomizedSearchCV(
#     estimator = pipeline,
#     param_distributions = param_grid,
#     n_iter = 3,
#     scoring="f1_macro",
#     cv=5,
#     verbose=-1,
#     n_jobs=-1
# )

# def convert_seconds_to_hms(seconds):
#     hours = seconds // 3600
#     minutes = (seconds % 3600) // 60
#     seconds = seconds % 60
#     return hours, minutes, seconds

# start_time = time.time()
# random_search.fit(X_train, y_train)
# end_time = time.time()
# hours, minutes, seconds = convert_seconds_to_hms(end_time - start_time)

# print(f"Training time: {end_time - start_time} seconds")
# print(f"{hours} hours, {minutes} minutes, {seconds} seconds")

In [None]:
# y_pred_val = random_search.predict(X_val)
# accuracy = accuracy_score(y_val, y_pred_val)
# print("Accuracy:", accuracy)

In [None]:
# Predict the test set with `random_search`. With the Random Forest cells above
# commented out, this is the LightGBM search fitted earlier in the notebook.
preds = random_search.predict(test)
preds

In [None]:
# First id in the sample submission file
submission['id'].head(1)

In [None]:
# Overwrite the target column of the sample submission with the predictions and
# write the result to a CSV file in the current directory.
# NOTE(review): this relies on the rows of `test` being in the same order as
# the rows of the sample submission — confirm.
submission['NObeyesdad'] = preds
submission.to_csv("submission_ver_1.csv", index=False)