In [16]:
import pandas as pd
import numpy as np
import random
import os
import gc

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression,Ridge

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [17]:
def seed_everything(seed):
    """Apply one seed to the RNG sources this notebook relies on.

    Exports ``PYTHONHASHSEED`` and seeds both NumPy's global generator and
    Python's ``random`` module with ``seed``.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    random.seed(seed)


seed_everything(42)  # fix the seed for the whole notebook

In [18]:
# Mount Google Drive at /content/drive so the data files below can be read.
# Requires the google.colab package, i.e. a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [19]:
# Folder containing train.csv, test.csv and sample_submission.csv.
# The trailing "/" is required: file paths are built by plain string concatenation.
data_dir = "/content/drive/MyDrive/Dacon_Basic/DAT/"

In [20]:
# Load the train / test tables and the submission template from data_dir.
train = pd.read_csv(f"{data_dir}train.csv")
test = pd.read_csv(f"{data_dir}test.csv")
# The first column of the template becomes its index.
sample_submission = pd.read_csv(f"{data_dir}sample_submission.csv", index_col=0)

In [21]:
# Target column first, then the feature frames without the ID (and target) columns.
train_y = train['Calories_Burned']
train_x = train.drop(columns=['ID', 'Calories_Burned'])
test_x = test.drop(columns='ID')

In [22]:
# Integer-encode the categorical columns. Each encoder is fitted on the
# training column only and then applied to both train and test.
ordinal_features = ['Weight_Status', 'Gender']
for feature in ordinal_features:
    le = LabelEncoder()
    train_x[feature] = le.fit_transform(train_x[feature])
    # Categories that occur only in test are appended to the encoder's class
    # list so that transform() accepts them.
    for label in np.unique(test_x[feature]):
        if label in le.classes_:
            continue
        le.classes_ = np.append(le.classes_, label)
    test_x[feature] = le.transform(test_x[feature])

In [25]:
# Polynomial degrees to evaluate: 1 through 9 (range end is exclusive).
degree_list= list(range(1,10))

In [26]:
def _holdout_rmse(fit_x, fit_y, hold_x, hold_y, degree):
    """Return the hold-out RMSE of a linear model on interaction features.

    Fits ``PolynomialFeatures(degree=degree, interaction_only=True)`` on
    ``fit_x``, trains a ``LinearRegression`` on the expanded ``fit_x`` /
    ``fit_y`` and scores its predictions for ``hold_x`` against ``hold_y``.
    """
    poly = PolynomialFeatures(degree=degree, interaction_only=True)
    fit_poly = pd.DataFrame(poly.fit_transform(fit_x))
    hold_poly = pd.DataFrame(poly.transform(hold_x))

    model = LinearRegression()
    model.fit(fit_poly, fit_y)

    # sqrt(MSE) rather than mean_squared_error(..., squared=False): the
    # `squared` argument is not accepted by newer scikit-learn releases.
    return float(np.sqrt(mean_squared_error(hold_y, model.predict(hold_poly))))


# Rebuild and label-encode the feature frames once; none of this depends on
# the polynomial degree, so it does not need to be repeated inside the loop.
train_x = train.drop(['ID', 'Calories_Burned'], axis = 1)
train_y = train['Calories_Burned']
test_x = test.drop('ID', axis = 1)

ordinal_features = ['Weight_Status','Gender']
for feature in ordinal_features:
    le = LabelEncoder()
    le = le.fit(train_x[feature])
    train_x[feature] = le.transform(train_x[feature])
    # Append test-only categories so transform() accepts them.
    for label in np.unique(test_x[feature]):
        if label not in le.classes_:
            le.classes_ = np.append(le.classes_, label)
    test_x[feature] = le.transform(test_x[feature])

# One fixed 90/10 row split shared by every candidate. With a fixed
# random_state the selected rows depend only on the number of rows, so
# splitting once and sub-setting columns afterwards scores the same rows as
# re-splitting for each candidate feature set.
# train_ex_y and val_y stay at notebook scope, as in the original cell.
train_ex, val_x, train_ex_y, val_y = train_test_split(
    train_x, train_y, test_size=0.1, random_state=1234, shuffle=True)

for degrees in degree_list:
    print(f"============================================degrees : {str(degrees)}==================================================")

    # Columns still in the model for this degree; train_x itself is never mutated.
    kept_cols = list(train_x.columns)
    start_MSE = _holdout_rmse(train_ex[kept_cols], train_ex_y,
                              val_x[kept_cols], val_y, degrees)

    # Greedy backward elimination: each round tries removing every remaining
    # column and keeps the single removal that lowers the hold-out RMSE most.
    # At least one column must remain for the model to be fittable.
    while len(kept_cols) > 1:
        mse = []
        for col in kept_cols:
            candidate_cols = [c for c in kept_cols if c != col]
            score = _holdout_rmse(train_ex[candidate_cols], train_ex_y,
                                  val_x[candidate_cols], val_y, degrees)
            mse.append(score)
            print(f'{col}_drop : {score}')

        Min_MSE = min(mse)
        if not Min_MSE < start_MSE:
            # No single removal improves on the current feature set.
            break

        # Actually remove the column; the first best candidate wins on ties.
        drop_col = kept_cols[mse.index(Min_MSE)]
        kept_cols.remove(drop_col)

        print("==========================================drop_col_name==========================================")
        print(f'{drop_col} : {Min_MSE}')
        print("====================================================================================")
        start_MSE = Min_MSE

Exercise_Duration_drop : 22.136613878435323
Body_Temperature(F)_drop : 12.937092834166796
BPM_drop : 14.88883499972255
Height(Feet)_drop : 11.505376652257914
Height(Remainder_Inches)_drop : 11.508768066034161
Weight(lb)_drop : 11.527936964100839
Weight_Status_drop : 11.498235360414132
Gender_drop : 11.526875305918873
Age_drop : 14.219049551880383
Exercise_Duration_drop : 14.359859449464869
Body_Temperature(F)_drop : 3.1018724106000812
BPM_drop : 11.902304765464338
Height(Feet)_drop : 2.969991784937769
Height(Remainder_Inches)_drop : 2.9707405479305655
Weight(lb)_drop : 3.050060905736325
Weight_Status_drop : 2.967707048677312
Gender_drop : 4.066523658270065
Age_drop : 10.593497719852197
Exercise_Duration_drop : 14.099868746577199
Body_Temperature(F)_drop : 0.2895638672935343
BPM_drop : 11.551631951928433
Height(Feet)_drop : 0.2872775814849116
Height(Remainder_Inches)_drop : 0.2878673980806592
Weight(lb)_drop : 0.820805182110917
Weight_Status_drop : 0.28745733063977075
Gender_drop : 3.16

In [27]:
# Re-read the raw tables so the final model starts from unmodified data.
train = pd.read_csv(f"{data_dir}train.csv")
test = pd.read_csv(f"{data_dir}test.csv")
# The first column of the template becomes its index.
sample_submission = pd.read_csv(f"{data_dir}sample_submission.csv", index_col=0)

In [28]:
# Target column first, then the feature frames without the ID (and target) columns.
train_y = train['Calories_Burned']
train_x = train.drop(columns=['ID', 'Calories_Burned'])
test_x = test.drop(columns='ID')

In [29]:
# Integer-encode the categorical columns. Each encoder is fitted on the
# training column only and then applied to both train and test.
ordinal_features = ['Weight_Status', 'Gender']
for feature in ordinal_features:
    le = LabelEncoder()
    train_x[feature] = le.fit_transform(train_x[feature])
    # Categories that occur only in test are appended to the encoder's class
    # list so that transform() accepts them.
    for label in np.unique(test_x[feature]):
        if label in le.classes_:
            continue
        le.classes_ = np.append(le.classes_, label)
    test_x[feature] = le.transform(test_x[feature])

In [30]:
# Final feature set: every column except "Height(Feet)", expanded with
# interaction terms up to degree 4.
# NOTE(review): presumably chosen from the elimination sweep output — confirm.
train_x = train_x.drop(columns="Height(Feet)")

# 90/10 hold-out split; train_x / train_y are rebound to the 90% training part.
train_x, val_x, train_y, val_y = train_test_split(train_x, train_y, test_size=0.1, random_state=1234, shuffle=True)

poly = PolynomialFeatures(degree=4, interaction_only=True)
poly.fit(train_x)

# Features after the interaction expansion.
train_poly = pd.DataFrame(poly.transform(train_x))
val_poly = pd.DataFrame(poly.transform(val_x))

# Fit against the train_y produced by the split above, so this cell no longer
# depends on a target variable left behind by an earlier cell.
LR = LinearRegression()
LR.fit(train_poly, train_y)
pred_LR = LR.predict(val_poly)

Ri = Ridge()
Ri.fit(train_poly, train_y)
pred_Ri = Ri.predict(val_poly)

# Hold-out RMSE of each model (LinearRegression first, then Ridge).
# sqrt(MSE) is used instead of squared=False, which newer scikit-learn
# releases do not accept.
print(f'{np.sqrt(mean_squared_error(val_y, pred_LR))}')
print(f'{np.sqrt(mean_squared_error(val_y, pred_Ri))}')

0.28565092811735543
0.28645823905644946


In [32]:
# Inspect the expanded training features (bare expression: rendered as the cell output).
train_poly

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,153,154,155,156,157,158,159,160,161,162
0,1.0,19.0,104.9,108.0,1.0,121.3,0.0,0.0,21.0,1993.1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,26.0,105.3,110.0,2.0,209.4,2.0,1.0,78.0,2737.8,...,17160.0,46068.0,3593304.0,1796652.0,17160.0,837.6,65332.8,32666.4,312.0,32666.4
2,1.0,25.0,105.6,103.0,6.0,160.9,2.0,1.0,37.0,2640.0,...,22866.0,33145.4,1226379.8,613189.9,7622.0,1930.8,71439.6,35719.8,444.0,11906.6
3,1.0,3.0,101.7,75.0,9.0,165.3,0.0,0.0,53.0,305.1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,20.0,105.3,103.0,6.0,138.9,0.0,0.0,51.0,2106.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6745,1.0,1.0,99.9,82.0,2.0,132.3,0.0,0.0,26.0,99.9,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6746,1.0,2.0,100.8,84.0,1.0,183.0,0.0,0.0,37.0,201.6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6747,1.0,9.0,103.6,85.0,4.0,191.8,0.0,1.0,49.0,932.4,...,16660.0,0.0,0.0,798847.0,0.0,0.0,0.0,37592.8,0.0,0.0
6748,1.0,7.0,102.7,83.0,2.0,189.6,0.0,1.0,29.0,718.9,...,4814.0,0.0,0.0,456367.2,0.0,0.0,0.0,10996.8,0.0,0.0


In [39]:
!pip install autogluon

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting autogluon
  Downloading autogluon-0.7.0-py3-none-any.whl (9.7 kB)
Collecting autogluon.tabular[all]==0.7.0
  Downloading autogluon.tabular-0.7.0-py3-none-any.whl (292 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m292.2/292.2 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting autogluon.timeseries[all]==0.7.0
  Downloading autogluon.timeseries-0.7.0-py3-none-any.whl (108 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m108.7/108.7 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting autogluon.multimodal==0.7.0
  Downloading autogluon.multimodal-0.7.0-py3-none-any.whl (331 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m331.1/331.1 kB[0m [31m30.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting autogluon.core[all]==0.7.0
  Downloading autogluon.core-0.7.0-py3-none-any.whl (218 kB)
[2K     [90m━━━━━━━━

In [33]:
# Wrap each model's validation predictions in a single-column DataFrame
# named after the model.
pred_LR = pd.DataFrame(pred_LR, columns=["pred_LR"])
pred_Ri = pd.DataFrame(pred_Ri, columns=["pred_Ri"])

In [34]:
# Both models' validation predictions side by side (one column per model).
# NOTE(review): stacking_x is not read by any later cell visible here.
stacking_x = pd.concat([pred_LR,pred_Ri],axis=1)

In [35]:
# Training table for the stacker: the two prediction columns plus the true target.
# val_y's index is reset so it lines up row-by-row with the prediction frames,
# which carry a fresh 0..n-1 index (concat with axis=1 aligns on index).
stacking_df = pd.concat([pred_LR,pred_Ri,val_y.reset_index(drop=True)],axis=1)

In [36]:
# Inspect the stacker's training table (bare expression: rendered as the cell output).
stacking_df

Unnamed: 0,pred_LR,pred_Ri,Calories_Burned
0,95.581430,95.586683,96.0
1,185.796187,185.803692,186.0
2,105.371460,105.365106,105.0
3,140.553858,140.563776,141.0
4,152.853168,152.851053,153.0
...,...,...,...
745,59.509541,59.527198,60.0
746,32.180365,32.152940,32.0
747,123.446555,123.459084,124.0
748,39.962811,39.970150,40.0


In [37]:
# Expand the test features with the already-fitted transformer, after dropping
# the same column that was removed from the training features.
poly_test_x = pd.DataFrame(poly.transform(test_x.drop(columns="Height(Feet)")))

# Base-model predictions for the test set, one named column per model.
pred_test_LR = pd.DataFrame(LR.predict(poly_test_x), columns=["pred_LR"])
pred_test_Ri = pd.DataFrame(Ri.predict(poly_test_x), columns=["pred_Ri"])

# Same column layout as the stacker's training inputs.
stacking_test_x = pd.concat([pred_test_LR, pred_test_Ri], axis=1)

In [40]:
from autogluon.tabular import TabularDataset, TabularPredictor

# Second-level model: learns Calories_Burned from the two base-model
# prediction columns in stacking_df, then predicts for the test set.
predictor = TabularPredictor(label='Calories_Burned')
predictor.fit(train_data=stacking_df)
predictions = predictor.predict(stacking_test_x)

No path specified. Models will be saved in: "AutogluonModels/ag-20230425_022029/"
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20230425_022029/"
AutoGluon Version:  0.7.0
Python Version:     3.9.16
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Sat Dec 10 16:00:40 UTC 2022
Train Data Rows:    750
Train Data Columns: 2
Label Column: Calories_Burned
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (273.0, 1.0, 90.54933, 62.97675)
	If 'regression' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    11945.39 MB

In [None]:
# Submission template, read from the same data_dir as the other input files
# instead of repeating the absolute Drive path.
sub = pd.read_csv(data_dir+'sample_submission.csv')

In [None]:
# Write the stacker's predictions, rounded to the nearest whole number, into the template.
sub["Calories_Burned"] = np.round(predictions)

In [None]:
# Save the submission file; index=False keeps the row index out of the CSV.
sub.to_csv("/content/drive/MyDrive/Dacon_Basic/OUT/samplesubmisson.csv",index=False)