# title

In [12]:
import pandas as pd

df = pd.read_csv('tri_health_data.csv')
df.head(5)

Unnamed: 0,Date,Sleep,Activity Type,Activity Duration,TSS,Weight,HRV,Steps,Cals_burnt,Cals,Carbs,Fats,Protein,Sensation
0,22/05/2017,7.1,"Swim, Run",125.0,154,87.7,,13894.0,4600.0,3600.0,481.0,102.0,135.0,7.0
1,23/05/2017,6.3,Swim,45.0,78,,,12739.0,3459.0,3700.0,413.0,100.0,75.0,4.0
2,24/05/2017,6.7,,,0,,,3324.0,2629.0,2210.0,305.0,42.0,75.0,8.0
3,25/05/2017,7.6,"Swim, Gym",142.0,155,,,11532.0,4327.0,4580.0,392.0,89.0,175.0,7.0
4,26/05/2017,8.7,Run,68.0,60,,,10479.0,3363.0,3590.0,510.0,61.0,80.0,9.0


In [13]:
# 1. Drop the columns 'Fats', 'Carbs', and 'HRV'.
df.drop(columns=['Fats', 'Carbs', 'HRV'], inplace=True)

# 2. Forward-fill the missing values in the 'Weight' column.
df['Weight'].fillna(method='ffill', inplace=True)

# 3. Fill the missing values in the 'Protein' column with its median, or with 80 if the median is NaN.
protein_median = df['Protein'].median()
if pd.isna(protein_median):
    df['Protein'].fillna(value=80, inplace=True)
else:
    df['Protein'].fillna(value=protein_median, inplace=True)

# 4. Fill any missing values in the 'Cals' column with its 75th percentile value.
df['Cals'].fillna(value=df['Cals'].quantile(0.75), inplace=True)

# 5. Fill any missing values in the 'Sensation' column with its mean value.
df['Sensation'].fillna(value=df['Sensation'].mean(), inplace=True)

# 6. Identify and drop rows where 'Activity Duration' is NaN but 'Activity Type' is not NaN.
rows_to_drop = df[df['Activity Duration'].isna() & ~df['Activity Type'].isna()].index
rows_dropped_count = len(rows_to_drop)
df.drop(rows_to_drop, inplace=True)

rows_dropped_count

2

In [14]:
df

Unnamed: 0,Date,Sleep,Activity Type,Activity Duration,TSS,Weight,Steps,Cals_burnt,Cals,Protein,Sensation
0,22/05/2017,7.1,"Swim, Run",125.0,154,87.7,13894.0,4600.0,3600.0,135.0,7.0
1,23/05/2017,6.3,Swim,45.0,78,87.7,12739.0,3459.0,3700.0,75.0,4.0
2,24/05/2017,6.7,,,0,87.7,3324.0,2629.0,2210.0,75.0,8.0
3,25/05/2017,7.6,"Swim, Gym",142.0,155,87.7,11532.0,4327.0,4580.0,175.0,7.0
4,26/05/2017,8.7,Run,68.0,60,87.7,10479.0,3363.0,3590.0,80.0,9.0
...,...,...,...,...,...,...,...,...,...,...,...
1029,16/03/2020,8.0,Swim,60.0,60,90.6,11490.0,3414.0,3250.0,75.0,10.0
1030,17/03/2020,8.1,Swim,60.0,67,90.6,11011.0,3405.0,2870.0,75.0,8.0
1031,18/03/2020,6.6,,,0,90.6,6475.0,2091.0,1790.0,80.0,6.0
1032,19/03/2020,8.4,"Swim, Run",60.0,68,90.6,11595.0,3543.0,3780.0,80.0,9.0


## Train Test Split

In [15]:
from sklearn.model_selection import train_test_split

# Splitting the data into features and target
X = df.drop(columns=['Cals', 'Date'])  # Removing 'Date' and target column 'Cals'
y = df['Cals']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

X_train

Unnamed: 0,Sleep,Activity Type,Activity Duration,TSS,Weight,Steps,Cals_burnt,Protein,Sensation
625,8.9,"Run, Gym",108.0,66,90.1,13353.0,4873.0,80.0,8.000000
815,7.3,"Swim, Run",60.0,50,85.8,8491.0,2793.0,70.0,7.000000
634,7.6,,,0,90.1,6017.0,2178.0,75.0,7.000000
23,8.8,,,0,86.6,4785.0,2160.0,80.0,8.000000
180,8.5,Bike,180.0,163,87.5,3883.0,5361.0,80.0,9.000000
...,...,...,...,...,...,...,...,...,...
550,8.3,Run,60.0,87,87.8,13735.0,3957.0,75.0,4.000000
185,7.7,Swim,62.0,63,87.8,13888.0,3693.0,80.0,8.000000
420,6.4,"Swim, Run",60.0,64,84.2,9421.0,2883.0,75.0,5.000000
369,6.6,Bike,240.0,298,87.5,6018.0,5613.0,80.0,6.850159


In [16]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler, OneHotEncoder

# Identify numeric and categorical columns
numeric_cols = X_train.select_dtypes(include=['float64', 'int64']).columns.tolist()
categorical_cols = X_train.select_dtypes(include=['object']).columns.tolist()



# Creating transformers
transformers = [
    ('robust', RobustScaler(), numeric_cols),
    ('cat', OneHotEncoder(drop='first'), categorical_cols)
]

# Building the column transformer
preprocessor = ColumnTransformer(transformers)

# Displaying the categorical columns for reference
#categorical_cols
preprocessor

In [24]:

import pandas as pd

# 1. Fit the preprocessor to X_train
preprocessor.fit(X_train)

# 2. Transform X_train and X_test
X_train_transformed = preprocessor.transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

# 3. Convert the transformed arrays back to dataframes

# Getting feature names after transformation to set as columns in the dataframe
columns_after_transformation = (preprocessor.named_transformers_['robust'].get_feature_names_out(numeric_cols).tolist() +
                                preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_cols).tolist())

X_train_df = pd.DataFrame(X_train_transformed, columns=columns_after_transformation)
X_test_df = pd.DataFrame(X_test_transformed, columns=columns_after_transformation)

X_train_df, X_test_df


ValueError: Specifying the columns using strings is only supported for pandas DataFrames

In [21]:
import seaborn as sns

# Heatmap of pairwise correlations
correlation_matrix = X_train[numeric_cols].corr()
column_names = correlation_matrix.columns
sns.heatmap(correlation_matrix, xticklabels=column_names, yticklabels=column_names,cmap= "bwr");

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [None]:
# Instantiate the model
model = LinearRegression()

# Train the model on the Training data
model.fit(X_train, y_train)

# Score the model on the Test data
model.score(X_test,y_test)