### Import required libraries

In [1]:
import pandas  as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split, GridSearchCV

### Preprocessing

In [2]:
# List the working directory to check that the dataset archive and the
# extracted `dataset` folder are present before loading anything.
!ls

8b9aafccf7da11ee.zip  sample_submission.csv
dataset		      sample_submission_random_forest_gridsearchcv.csv
lupin.ipynb


In [3]:
!unzip 8b9aafccf7da11ee.zip

Archive:  8b9aafccf7da11ee.zip
replace dataset/sample_submission.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: ^C


In [4]:
# Load the training CSV into `df` for the exploratory cells that follow.
df = pd.read_csv('./dataset/train.csv')

In [5]:
# Preview the first rows to see which columns are present and how they look.
df.head()

Unnamed: 0,candidate_id,gender,age,height_in_cm,weight_in_lbs,left_eyesight_lvl,right_eyesight_lvl,can_hear_left_ear,can_hear_right_ear,blood_pressure_lvl1,...,bad_cholestrol_lvl,hemoglobin_lvl,urea_lvl,creatinine_lvl,liver_enzyme_lvl1,liver_enzyme_lvl2,smoking_habit,drinking_habit,residential_area,triglyceride_lvl
0,CAN_1,Male,35,170,165.35,1.0,1.0,Slightly Defective,Slightly Defective,120.0,...,126.0,17.1,1.0,1.0,21.0,35.0,Does not Smoke,Y,Town,92.0
1,CAN_2,Male,30,180,176.37,0.9,1.2,Slightly Defective,Slightly Defective,130.0,...,148.0,15.8,1.0,0.9,20.0,36.0,Chain Smoker,N,Urban,121.0
2,CAN_3,Male,40,165,165.35,1.2,1.5,Slightly Defective,Slightly Defective,120.0,...,74.0,15.8,1.0,0.9,47.0,32.0,Does not Smoke,N,Rural,104.0
3,CAN_4,Male,50,175,176.37,1.5,1.2,Slightly Defective,Slightly Defective,145.0,...,104.0,17.6,1.0,1.1,29.0,34.0,Does not Smoke,N,Town,106.0
4,CAN_5,Male,50,165,132.28,1.0,1.2,Slightly Defective,Slightly Defective,138.0,...,117.0,13.8,1.0,0.8,19.0,12.0,Does not Smoke,N,Urban,104.0


In [6]:
# Distinct values taken by the right-ear hearing column.
df['can_hear_right_ear'].unique()

array(['Slightly Defective', 'Highly Defective'], dtype=object)

In [7]:
# Summary statistics of the numeric columns; a `count` below the row total
# indicates missing values in that column.
df.describe()

Unnamed: 0,age,height_in_cm,weight_in_lbs,left_eyesight_lvl,right_eyesight_lvl,blood_pressure_lvl1,blood_pressure_lvl2,glucose_lvl,total_cholestrol,good_cholestrol_lvl,bad_cholestrol_lvl,hemoglobin_lvl,urea_lvl,creatinine_lvl,liver_enzyme_lvl1,liver_enzyme_lvl2,triglyceride_lvl
count,22400.0,22400.0,22400.0,22400.0,22400.0,22400.0,22400.0,22400.0,22400.0,22400.0,22400.0,22400.0,22400.0,22400.0,22400.0,20850.0,22400.0
mean,47.586607,162.689286,139.644081,0.982098,0.978196,122.561473,76.120491,100.578304,195.399554,57.21625,113.063616,14.239942,1.097321,0.859469,25.824598,25.636259,130.173438
std,14.167025,9.710639,27.550786,0.613311,0.5901,14.581722,9.921338,24.334669,39.042778,55.983373,47.987373,1.56357,0.445315,0.292106,14.96867,20.521723,84.361161
min,20.0,125.0,66.14,0.1,0.1,75.0,42.0,51.0,54.0,4.0,1.0,8.1,1.0,0.1,4.0,2.0,1.0
25%,35.0,155.0,121.25,0.7,0.7,112.0,70.0,88.0,169.0,46.0,89.0,13.2,1.0,0.7,19.0,15.0,73.0
50%,45.0,165.0,132.28,1.0,1.0,121.0,76.0,96.0,193.0,55.0,111.0,14.3,1.0,0.8,23.0,20.0,107.0
75%,60.0,170.0,154.32,1.2,1.2,131.0,82.0,105.0,219.0,66.0,135.0,15.4,1.0,1.0,28.0,30.0,159.0
max,85.0,200.0,286.6,9.9,9.9,230.0,160.0,468.0,1619.0,8110.0,5119.0,18.0,6.0,16.4,911.0,659.0,500.0


In [8]:
# Distinct values of the smoking column (a NaN entry here means missing data).
df['smoking_habit'].unique()

array(['Does not Smoke', 'Chain Smoker', nan, 'Occassionaly Smokes'],
      dtype=object)

In [9]:
# Distinct values of the residential-area column.
df['residential_area'].unique()

array(['Town', 'Urban', 'Rural'], dtype=object)

In [10]:
# Distinct values of the drinking column (a NaN entry here means missing data).
df['drinking_habit'].unique()

array(['Y', 'N', nan], dtype=object)

In [11]:
# Number of rows before any rows with missing values are dropped.
len(df)

22400

In [12]:
# Drop every row that has at least one missing value.
# NOTE: this mutates `df` in place, so any earlier cell re-run after this one
# will show the reduced frame. The modelling cells further down reload the CSV
# into `train_df` and do not use this frame.
df.dropna(inplace=True)

In [13]:
# Confirm that no missing values remain in `drinking_habit` after the drop.
df['drinking_habit'].isnull().sum()

0

In [14]:
# Re-check the categories of the right-ear hearing column after dropping rows.
df['can_hear_right_ear'].unique()

array(['Slightly Defective', 'Highly Defective'], dtype=object)

In [15]:
# Display the frame after the drop (the notebook renders a truncated view).
df

Unnamed: 0,candidate_id,gender,age,height_in_cm,weight_in_lbs,left_eyesight_lvl,right_eyesight_lvl,can_hear_left_ear,can_hear_right_ear,blood_pressure_lvl1,...,bad_cholestrol_lvl,hemoglobin_lvl,urea_lvl,creatinine_lvl,liver_enzyme_lvl1,liver_enzyme_lvl2,smoking_habit,drinking_habit,residential_area,triglyceride_lvl
0,CAN_1,Male,35,170,165.35,1.0,1.0,Slightly Defective,Slightly Defective,120.0,...,126.0,17.1,1.0,1.0,21.0,35.0,Does not Smoke,Y,Town,92.0
1,CAN_2,Male,30,180,176.37,0.9,1.2,Slightly Defective,Slightly Defective,130.0,...,148.0,15.8,1.0,0.9,20.0,36.0,Chain Smoker,N,Urban,121.0
2,CAN_3,Male,40,165,165.35,1.2,1.5,Slightly Defective,Slightly Defective,120.0,...,74.0,15.8,1.0,0.9,47.0,32.0,Does not Smoke,N,Rural,104.0
3,CAN_4,Male,50,175,176.37,1.5,1.2,Slightly Defective,Slightly Defective,145.0,...,104.0,17.6,1.0,1.1,29.0,34.0,Does not Smoke,N,Town,106.0
4,CAN_5,Male,50,165,132.28,1.0,1.2,Slightly Defective,Slightly Defective,138.0,...,117.0,13.8,1.0,0.8,19.0,12.0,Does not Smoke,N,Urban,104.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22395,CAN_22396,Female,50,175,110.23,0.8,1.0,Slightly Defective,Slightly Defective,100.0,...,164.0,14.7,1.0,0.6,26.0,21.0,Does not Smoke,Y,Town,52.0
22396,CAN_22397,Male,30,165,165.35,0.5,1.0,Slightly Defective,Slightly Defective,135.0,...,181.0,16.4,1.0,0.8,36.0,81.0,Chain Smoker,Y,Town,195.0
22397,CAN_22398,Female,65,150,132.28,0.8,0.8,Slightly Defective,Slightly Defective,153.0,...,180.0,13.6,1.0,0.6,22.0,14.0,Does not Smoke,N,Town,328.0
22398,CAN_22399,Male,50,170,121.25,1.0,1.0,Slightly Defective,Slightly Defective,113.0,...,189.0,16.1,1.0,0.9,23.0,12.0,Chain Smoker,N,Town,97.0


### Random forest 

In [38]:
# Random-forest baseline: load the data, impute missing numeric values, build
# the shared preprocessing, fit, validate, and predict on the test set.
# `train_df`, `test_df`, `categorical_columns`, `numerical_columns` and
# `preprocessor` are reused by the model cells further down, so their names
# must stay as they are.

# Load the data
train_df = pd.read_csv('dataset/train.csv')
test_df = pd.read_csv('dataset/test.csv')

# Handle missing numeric values by filling with the column mean.
# `numeric_only=True` restricts the mean to numeric columns. Calling `.mean()`
# on a frame that also holds string columns only worked through a deprecated
# fallback and raises TypeError on pandas >= 2.0.
train_means = train_df.mean(numeric_only=True)
train_df = train_df.fillna(train_means)
# Impute the test set with the TRAINING means so train and test go through the
# same transformation. Entries of `train_means` for columns that do not exist
# in `test_df` (the target) are simply not used.
test_df = test_df.fillna(train_means)
# NOTE(review): the means are computed on the full training frame, i.e. before
# the train/validation split below, so the validation rows contribute to them.
# NOTE(review): missing values in the string columns are not imputed here;
# presumably OneHotEncoder encodes them as their own category — confirm for
# the installed scikit-learn version.

# Define categorical and numerical columns
categorical_columns = ['gender', 'residential_area', 'can_hear_right_ear', 'can_hear_left_ear', 'drinking_habit', 'smoking_habit']
# Everything that is neither categorical, nor the identifier, nor the target.
numerical_columns = [col for col in train_df.columns if col not in categorical_columns + ['candidate_id', 'triglyceride_lvl']]

# Preprocessing for numerical data: Standardize features
numerical_transformer = StandardScaler()

# Preprocessing for categorical data: One-hot encode categorical features.
# handle_unknown='ignore' keeps prediction from failing on categories that
# were not seen while fitting.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_columns),  # Apply numerical_transformer to numerical_columns
        ('cat', categorical_transformer, categorical_columns)  # Apply categorical_transformer to categorical_columns
    ])

# Define the model
model = RandomForestRegressor(n_estimators=500, random_state=42)

# Create a pipeline that combines the preprocessor with the model
pipeline = Pipeline(steps=[('preprocessor', preprocessor),  # First, apply the preprocessor
                           ('model', model)  # Then, fit the model
                          ])

# Prepare the features and target
X = train_df.drop(columns=['candidate_id', 'triglyceride_lvl'])  # Features
y = train_df['triglyceride_lvl']  # Target

# Split the data into training and validation sets (10% held out)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.10, random_state=42)

# Check the transformed training data (the pipeline re-fits the preprocessor
# itself in the next step; this call is only here to report the shape)
X_train_transformed = preprocessor.fit_transform(X_train)
print(f"Transformed X_train shape: {X_train_transformed.shape}")

# Fit the model
pipeline.fit(X_train, y_train)

# Validate the model
y_pred = pipeline.predict(X_val)
mae = mean_absolute_error(y_val, y_pred)
print(f'Mean Absolute Error: {mae}')

# Predict on test data
test_X = test_df.drop(columns=['candidate_id'])
test_predictions = pipeline.predict(test_X)

# Prepare submission file (written to disk in the next cell)
submission_df = pd.DataFrame({
    'candidate_id': test_df['candidate_id'],
    'triglyceride_lvl': test_predictions
})

  
  import sys


Transformed X_train shape: (20160, 32)
Mean Absolute Error: 15.425348214285714


In [39]:
# Write the random-forest predictions to disk without the index column.
# The file name appears to record the run settings (500 estimators, 0.10
# validation split) and the validation MAE — update it by hand if they change.
submission_df.to_csv('sample_submission_rf_num_est_500_loss_15.42_split_0.10.csv', index=False)

### KNN

In [35]:
from sklearn.neighbors import KNeighborsRegressor

In [37]:
# K-nearest-neighbours baseline. Reuses `preprocessor`, `train_df` and
# `test_df` defined in the random-forest cell.

# Define the model (default KNeighborsRegressor settings)
model = KNeighborsRegressor()

# Create a pipeline that combines the preprocessor with the model
pipeline = Pipeline(steps=[('preprocessor', preprocessor),  # First, apply the preprocessor
                           ('model', model)  # Then, fit the model
                          ])

# Prepare the features and target
X = train_df.drop(columns=['candidate_id', 'triglyceride_lvl'])  # Features
y = train_df['triglyceride_lvl']  # Target

# Split the data into training and validation sets (10% held out)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=42)

# Check the transformed training data (reports the shape only; the pipeline
# re-fits the preprocessor itself below)
X_train_transformed = preprocessor.fit_transform(X_train)
print(f"Transformed X_train shape: {X_train_transformed.shape}")

# Fit the model
pipeline.fit(X_train, y_train)

# Validate the model
y_pred = pipeline.predict(X_val)
mae = mean_absolute_error(y_val, y_pred)
print(f'Mean Absolute Error: {mae}')

# Predict on test data
test_X = test_df.drop(columns=['candidate_id'])
test_predictions = pipeline.predict(test_X)

# Prepare submission file
submission_df = pd.DataFrame({
    'candidate_id': test_df['candidate_id'],
    'triglyceride_lvl': test_predictions
})
# Write to a KNN-specific file. The previous name,
# 'sample_submission_linear_regression.csv', was a copy-paste leftover that
# made this cell overwrite the linear-regression submission.
submission_df.to_csv('sample_submission_knn.csv', index=False)


Transformed X_train shape: (20160, 32)
Mean Absolute Error: 48.42883928571429


### Linear regression

In [17]:
# Linear-regression baseline. Reuses `preprocessor`, `train_df` and `test_df`
# defined in the random-forest cell.
model = LinearRegression()

# Chain preprocessing and the estimator so they are fitted together.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model),
])

# Feature matrix (identifier and target removed) and target column.
X = train_df.drop(['candidate_id', 'triglyceride_lvl'], axis=1)
y = train_df['triglyceride_lvl']

# Hold out 20% of the rows for validation.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the shape of the design matrix after preprocessing.
X_train_transformed = preprocessor.fit_transform(X_train)
print(f"Transformed X_train shape: {X_train_transformed.shape}")

# Train on the training split, then score the held-out split.
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_val)
mae = mean_absolute_error(y_val, y_pred)
print(f'Mean Absolute Error: {mae}')

# Predict the test set; the identifier is not a feature.
test_X = test_df.drop(['candidate_id'], axis=1)
test_predictions = pipeline.predict(test_X)

# Build the submission frame and write it without the index column.
submission_columns = {
    'candidate_id': test_df['candidate_id'],
    'triglyceride_lvl': test_predictions,
}
submission_df = pd.DataFrame(submission_columns)
submission_df.to_csv('sample_submission_linear_regression.csv', index=False)


Transformed X_train shape: (17920, 32)
Mean Absolute Error: 45.34120396205357


### Gradient boosting model

In [18]:
# Gradient-boosting baseline. Reuses `preprocessor`, `train_df` and `test_df`
# defined in the random-forest cell.
model = GradientBoostingRegressor(
    n_estimators=250,
    learning_rate=0.1,
    random_state=42,
)

# Preprocessing followed by the regressor, fitted as one unit.
pipeline_steps = [('preprocessor', preprocessor), ('model', model)]
pipeline = Pipeline(steps=pipeline_steps)

# Columns that must not be used as features: the identifier and the target.
X = train_df.drop(columns=['candidate_id', 'triglyceride_lvl'])
y = train_df['triglyceride_lvl']

# 80/20 train/validation split with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Report the shape of the preprocessed training matrix.
X_train_transformed = preprocessor.fit_transform(X_train)
print(f"Transformed X_train shape: {X_train_transformed.shape}")

# Fit on the training split.
pipeline.fit(X_train, y_train)

# Mean absolute error on the validation split.
y_pred = pipeline.predict(X_val)
mae = mean_absolute_error(y_val, y_pred)
print(f'Mean Absolute Error: {mae}')

# Predictions for the test set (identifier column removed first).
test_X = test_df.drop(columns=['candidate_id'])
test_predictions = pipeline.predict(test_X)

# Submission: candidate id next to its predicted triglyceride level.
submission_df = pd.DataFrame(
    {
        'candidate_id': test_df['candidate_id'],
        'triglyceride_lvl': test_predictions,
    }
)
submission_df.to_csv('sample_submission_gradient_boosting.csv', index=False)


Transformed X_train shape: (17920, 32)
Mean Absolute Error: 17.100529639872978


### Using GridSearchCV to choose `n_estimators` for the random forest

In [None]:
# Grid search over the number of trees of a random forest. This cell reloads
# the data and rebuilds the preprocessing, so it does not depend on the state
# left behind by the earlier model cells.

# Load the data
train_df = pd.read_csv('dataset/train.csv')
test_df = pd.read_csv('dataset/test.csv')

# Handle missing numeric values by filling with the column mean.
# `numeric_only=True` restricts the mean to numeric columns. Calling `.mean()`
# on a frame that also holds string columns only worked through a deprecated
# fallback and raises TypeError on pandas >= 2.0.
train_means = train_df.mean(numeric_only=True)
train_df = train_df.fillna(train_means)
# Impute the test set with the TRAINING means so train and test go through the
# same transformation. Entries of `train_means` for columns that do not exist
# in `test_df` (the target) are simply not used.
test_df = test_df.fillna(train_means)
# NOTE(review): the means come from the full training frame, so rows that end
# up in validation folds contribute to them.

# Define categorical and numerical columns
categorical_columns = ['gender', 'residential_area', 'can_hear_right_ear', 'can_hear_left_ear', 'drinking_habit', 'smoking_habit']
# Everything that is neither categorical, nor the identifier, nor the target.
numerical_columns = [col for col in train_df.columns if col not in categorical_columns + ['candidate_id', 'triglyceride_lvl']]

# Preprocessing for numerical data: Standardize features
numerical_transformer = StandardScaler()

# Preprocessing for categorical data: One-hot encode categorical features
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_columns),  # Apply numerical_transformer to numerical_columns
        ('cat', categorical_transformer, categorical_columns)  # Apply categorical_transformer to categorical_columns
    ])

# Define the model (n_estimators is supplied by the grid below)
model = RandomForestRegressor(random_state=42)

# Create a pipeline that combines the preprocessor with the model
pipeline = Pipeline(steps=[('preprocessor', preprocessor),  # First, apply the preprocessor
                           ('model', model)  # Then, fit the model
                          ])

# Define the parameter grid for n_estimators. The 'model__' prefix routes the
# parameter to the pipeline step named 'model'.
param_grid = {
    'model__n_estimators': [200, 250, 300, 400, 500, 550, 600]  # Define the range of values to search over
}

# Create GridSearchCV instance: 5-fold CV, scored by negated MAE (higher is
# better, so the best score is the one closest to zero)
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_absolute_error')

# Prepare the features and target
X = train_df.drop(columns=['candidate_id', 'triglyceride_lvl'])  # Features
y = train_df['triglyceride_lvl']  # Target

# Split the data into training and validation sets (10% held out)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=42)

# Fit GridSearchCV
grid_search.fit(X_train, y_train)

# Print the best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best Negative Mean Absolute Error:", grid_search.best_score_)

# Predict on validation data using the best estimator found by GridSearchCV
y_pred = grid_search.best_estimator_.predict(X_val)
mae = mean_absolute_error(y_val, y_pred)
print(f'Mean Absolute Error on Validation Set: {mae}')

# Predict on test data using the best estimator found by GridSearchCV
test_X = test_df.drop(columns=['candidate_id'])
test_predictions = grid_search.best_estimator_.predict(test_X)

  
  import sys


In [None]:
# Assemble the submission (candidate id next to its predicted triglyceride
# level) and write it to CSV without the index column.
submission_columns = {
    'candidate_id': test_df['candidate_id'],
    'triglyceride_lvl': test_predictions,
}
submission_df = pd.DataFrame(submission_columns)
submission_df.to_csv(
    'sample_submission_random_forest_gridsearchcv.csv',
    index=False,
)


In [22]:
# Sanity check: (rows, columns) of the submission frame.
submission_df.shape

(9600, 2)