In [None]:
import pandas as pd
import numpy as np
import os
import sys
import argparse
import logging
from ydata_profiling import ProfileReport
from IPython.display import display, HTML
from datetime import timedelta
import lightgbm as lgb

### Ingest Data from CSV

In [None]:
# Read the raw dataset; every later cell works on (and reassigns) `df`.
raw_csv_path = '../data/raw.csv'
df = pd.read_csv(raw_csv_path)

In [None]:
# Print the frame summary (column names, non-null counts, dtypes, memory usage).
df.info()

In [None]:
# Show summary statistics for the loaded data; left as the bare last
# expression so the notebook renders the result as a table.
df.describe()

In [None]:
# Preview the first rows of the raw data as a quick sanity check on the load.
df.head()

In [None]:
# # Convert columns to numeric where appropriate
# numeric_cols = []
# df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')

# # Convert timeStamp to datetime
# df['timeStamp'] = pd.to_datetime(df['timeStamp'], errors='coerce')

# # If tripID and deviceID are categorical, you can convert them as well


In [None]:
# Basic cleaning in two steps: remove exact duplicate rows, then discard any
# row that still contains a missing value.
deduplicated = df.drop_duplicates()
df = deduplicated.dropna()

In [None]:
# Attach a synthetic timestamp to every row, drawn uniformly from the six
# months ending today, then derive calendar features from it.
#
# The draw uses a dedicated, seeded generator: with the unseeded global RNG
# the set of rows that later falls into the "last 15 days" hold-out changed on
# every run. The window itself is still anchored to today's date, so absolute
# timestamps move from day to day, but each row's relative position inside the
# window is now fixed by the seed.
TIMESTAMP_SEED = 42
timestamp_rng = np.random.default_rng(TIMESTAMP_SEED)

end_date = pd.Timestamp.today()
start_date = end_date - pd.DateOffset(months=6)
# Cut-off used further down to separate the prediction hold-out.
last_15_days = end_date - timedelta(days=15)

# `.value` is the timestamp in integer nanoseconds; sample in that space,
# convert back to datetimes and truncate to whole seconds.
df['timestamp'] = pd.to_datetime(
    timestamp_rng.uniform(start_date.value, end_date.value, size=len(df))
).floor('s')

# Calendar features derived from the synthetic timestamp.
df['hour'] = df['timestamp'].dt.hour
df['day'] = df['timestamp'].dt.day
df['weekday'] = df['timestamp'].dt.weekday
df['month'] = df['timestamp'].dt.month
df['year'] = df['timestamp'].dt.year


In [None]:
# Correlation settings handed to the profiler: the "auto" matrix is switched
# off, Pearson and Spearman are switched on.
correlation_config = {
    "auto": {"calculate": False},
    "pearson": {"calculate": True},
    "spearman": {"calculate": True},
}

# Build the profiling report for the current state of `df`.
# NOTE(review): interactions / missing_diagrams are passed as None, presumably
# to skip those report sections — confirm against the ydata-profiling docs.
report = ProfileReport(
    df,
    title='Feature Profiling',
    infer_dtypes=False,
    interactions=None,
    missing_diagrams=None,
    correlations=correlation_config,
)

In [None]:
# Render the profiling report inline by embedding its HTML in the cell output.
display(HTML(report.html))

### Fix skewness in features
Log transformation compresses the range of the data and can help reduce right skewness by bringing the long tail closer to the bulk of the data.

Use Case: Right-skewed data where all values are positive.

In [None]:
# Apply log(1 + x) to the right-skewed features in a single vectorised step.
# Not idempotent: re-running this cell transforms the columns a second time.
skewed_columns = ['Insulin', 'Leptin', 'Resistin', 'MCP.1']
df[skewed_columns] = np.log1p(df[skewed_columns])

### Clipping Outliers

In [None]:
# Winsorise the continuous features at their 1st / 99th percentiles.
#
# Only numeric feature columns are clipped. Looping over every column also
# clipped the class label, the timestamp and the integer calendar features;
# interpolated quantile bounds can turn values such as `hour` or `day` into
# fractions, and any non-numeric column would make `quantile` raise.
NO_CLIP_COLUMNS = {
    'Classification', 'Target',                           # class label
    'timestamp', 'hour', 'day', 'weekday', 'month', 'year',  # time features
}
clip_columns = [
    col for col in df.select_dtypes(include='number').columns
    if col not in NO_CLIP_COLUMNS
]

for col in clip_columns:
    # Series.quantile with two probabilities returns the two bounds in order.
    lower, upper = df[col].quantile([0.01, 0.99])
    df[col] = df[col].clip(lower, upper)

### Derive new features

In [None]:
# Interaction feature: element-wise product of BMI and HOMA.
df['BMI_HOMA'] = df['BMI'].mul(df['HOMA'])
# Ratio feature: Leptin divided by Adiponectin. Both inputs are taken as they
# stand at this point in the notebook (Leptin has already been log1p-transformed
# by the skewness cell when the notebook is run top to bottom).
df['L_A_ratio'] = df['Leptin'].div(df['Adiponectin'])

### Target Normalisation

In [None]:
# Binarise the label: values above 1.5 become 1, everything else becomes 0.
above_threshold = df['Classification'].gt(1.5)
df['Classification'] = above_threshold.astype(int)
# Expose the label under the name `Target` from here on.
df = df.rename(columns={'Classification': 'Target'})

### Keeping out data from last 15 days for Predictions 

In [None]:
# Rows stamped on or after the 15-day cut-off form the prediction set;
# they are then removed from `df` by index label.
is_recent = df['timestamp'] >= last_15_days
pred = df[is_recent]
df = df.drop(index=pred.index)

### Remove timestamp

In [None]:
# The raw timestamp has served its purpose (calendar features + hold-out
# split); remove it from both frames.
df, pred = (frame.drop(columns=['timestamp']) for frame in (df, pred))

### Export transformed data 

In [None]:
# Write the remaining data and the prediction hold-out to disk, without the
# index column.
for frame, destination in ((df, '../data/clean.csv'), (pred, '../data/pred.csv')):
    frame.to_csv(destination, index=False)