In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/homework-1-dataset/sample_submission.csv
/kaggle/input/homework-1-dataset/data_description.txt
/kaggle/input/homework-1-dataset/train.csv
/kaggle/input/homework-1-dataset/test.csv


In [3]:
df = pd.read_csv('/kaggle/input/homework-1-dataset/train.csv')

# Train/Test Split and Feature Engineering

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder

X_train, X_test = train_test_split(df,test_size=0.2, random_state=42)

# dropping null values
null_ratio = X_train.isna().mean()
too_much_null_columns = null_ratio[null_ratio >= 0.5]
X_train.drop(columns = too_much_null_columns.index, inplace = True)
X_test.drop(columns = too_much_null_columns.index, inplace = True)

# filling numbers with average value
for col in X_train.select_dtypes(include=['number']):
    change_null_to_this = X_train[col].mean()
    X_train.fillna(change_null_to_this, inplace=True)
    X_test.fillna(change_null_to_this, inplace=True)

# filling categoricals with mode value
for col in X_train.select_dtypes(include=['object']):
    change_null_to_this = X_train[col].mode()
    X_train.fillna(change_null_to_this, inplace=True)
    X_test.fillna(change_null_to_this, inplace=True)

# handling categorical values
for col in X_train.select_dtypes(include=['object']):
    encoder = OrdinalEncoder()
    
    X_train[col] = X_train[col].astype(str)
    X_test[col] = X_test[col].astype(str)
    
    X_train[col] = encoder.fit_transform(X_train[[col]])
    X_test[col] = encoder.fit_transform(X_test[[col]])

In [5]:
X_train.select_dtypes(include=['object']).shape, X_test.select_dtypes(include=['object']).shape

((1168, 0), (292, 0))

# Normal Training

In [6]:
Y_train = X_train.pop('SalePrice')
Y_test = X_test.pop('SalePrice')

In [7]:
X_train.shape, Y_train.shape, X_test.shape, Y_test.shape

((1168, 75), (1168,), (292, 75), (292,))

In [8]:
from sklearn.tree import DecisionTreeRegressor

model = DecisionTreeRegressor(
    max_depth=10,
    min_samples_split=25,
    min_samples_leaf=7,
    random_state=42
)
model.fit(X_train, Y_train)

prob_train = model.predict(X_train)
prob_test = model.predict(X_test)

In [9]:
from sklearn.metrics import *

print("Train:")
r2 = r2_score(Y_train, prob_train)
print(f"R2 score: {r2}")
rmse = mean_squared_error(Y_train, prob_train, squared=False)
print(f"RMSE score: {rmse}")
mae = mean_absolute_error(Y_train, prob_train)
print(f"MAE score: {mae}")

print()
print("Test: ")

r2 = r2_score(Y_test, prob_test)
print(f"R2 score: {r2}")
rmse = mean_squared_error(Y_test, prob_test, squared=False)
print(f"RMSE score: {rmse}")
mae = mean_absolute_error(Y_test, prob_test)
print(f"MAE score: {mae}")

Train:
R2 score: 0.892104434974062
RMSE score: 25368.258382297576
MAE score: 16569.710859214472

Test: 
R2 score: 0.8332468522717298
RMSE score: 35763.82304999141
MAE score: 23929.70959915453


# using k-fold cross validation

In [13]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler

pipeline = Pipeline(steps=[
    ('regressor', DecisionTreeRegressor(random_state=42))
])

scalers = [
    StandardScaler(),
    None
]

kfold = KFold(n_splits=5, shuffle=True, random_state=42)

param_grid = {
    'regressor__max_depth': [5, 10, None],
    'regressor__min_samples_split': [5, 15, 25],
    'regressor__min_samples_leaf': [3, 5, 7]
}

In [14]:
grid_search = GridSearchCV(
    pipeline,
    param_grid=param_grid,
    cv=kfold,
    scoring='neg_root_mean_squared_error',
    verbose=2,
    return_train_score=True
)

In [18]:
grid_search.fit(X_train, Y_train)

Fitting 5 folds for each of 27 candidates, totalling 135 fits
[CV] END regressor__max_depth=5, regressor__min_samples_leaf=3, regressor__min_samples_split=5; total time=   0.0s
[CV] END regressor__max_depth=5, regressor__min_samples_leaf=3, regressor__min_samples_split=5; total time=   0.0s
[CV] END regressor__max_depth=5, regressor__min_samples_leaf=3, regressor__min_samples_split=5; total time=   0.0s
[CV] END regressor__max_depth=5, regressor__min_samples_leaf=3, regressor__min_samples_split=5; total time=   0.0s
[CV] END regressor__max_depth=5, regressor__min_samples_leaf=3, regressor__min_samples_split=5; total time=   0.0s
[CV] END regressor__max_depth=5, regressor__min_samples_leaf=3, regressor__min_samples_split=15; total time=   0.0s
[CV] END regressor__max_depth=5, regressor__min_samples_leaf=3, regressor__min_samples_split=15; total time=   0.0s
[CV] END regressor__max_depth=5, regressor__min_samples_leaf=3, regressor__min_samples_split=15; total time=   0.0s
[CV] END regres

In [19]:
results = pd.DataFrame(grid_search.cv_results_)
results = results.sort_values('mean_test_score')

In [20]:
print(results[[
    'mean_train_score',
    'mean_test_score'
]])


    mean_train_score  mean_test_score
0      -28569.521127    -41831.554262
9      -15126.089058    -41331.687757
1      -29418.834382    -41191.663487
18     -14019.814819    -41149.545872
2      -30883.666521    -40919.661194
5      -31038.062216    -39336.926729
10     -20593.217260    -39159.035859
11     -25121.037869    -39132.344495
19     -20280.319117    -39121.323390
4      -29446.642904    -39034.768270
20     -24992.552973    -39027.073090
8      -31774.358314    -38967.280777
7      -30622.171305    -38936.267357
6      -30574.601658    -38930.469633
3      -29126.206796    -38778.830399
16     -24478.002341    -37337.759345
23     -25858.622750    -37323.662076
25     -24359.977947    -37316.707227
14     -25935.129089    -37296.463296
24     -24219.948915    -37265.918631
15     -24344.624281    -37253.288994
13     -22268.673240    -37169.797883
22     -22078.116881    -37131.879467
12     -20537.726007    -37115.764851
21     -20166.491579    -37097.372142
17     -2732

In [21]:
best_pipeline = grid_search.best_estimator_

In [22]:
prob_train = best_pipeline.predict(X_train)
prob_test = best_pipeline.predict(X_test)

In [23]:
from sklearn.metrics import *

print("Train:")
r2 = r2_score(Y_train, prob_train)
print(f"R2 score: {r2}")
rmse = mean_squared_error(Y_train, prob_train, squared=False)
print(f"RMSE score: {rmse}")
mae = mean_absolute_error(Y_train, prob_train)
print(f"MAE score: {mae}")

print()
print("Test: ")

r2 = r2_score(Y_test, prob_test)
print(f"R2 score: {r2}")
rmse = mean_squared_error(Y_test, prob_test, squared=False)
print(f"RMSE score: {rmse}")
mae = mean_absolute_error(Y_test, prob_test)
print(f"MAE score: {mae}")

Train:
R2 score: 0.892718983546822
RMSE score: 25295.909302256132
MAE score: 16435.683391016515

Test: 
R2 score: 0.8326041930982204
RMSE score: 35832.67286502578
MAE score: 24015.856336318928


In [24]:
!pip install mlflow dagshub
import mlflow
import dagshub

dagshub.init(repo_owner='Givi-Modebadze', repo_name='my-first-repo', mlflow=True)

experiment = "my second experimnet"
run_name = "Decision Tree with cleaned features"
mlflow.set_experiment(experiment)

Collecting mlflow
  Downloading mlflow-2.21.3-py3-none-any.whl.metadata (30 kB)
Collecting dagshub
  Downloading dagshub-0.5.9-py3-none-any.whl.metadata (12 kB)
Collecting mlflow-skinny==2.21.3 (from mlflow)
  Downloading mlflow_skinny-2.21.3-py3-none-any.whl.metadata (31 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==2.21.3->mlflow)
  Downloading databricks_sdk-0.49.0-py3-none-any.whl.metadata (38 kB)
Collecting fastapi<1 (from mlflow-skinny==2.21.3->mlflow)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn<1 (from mlflow-skinny==2.21.3->mlflow)
  Downloading uvicorn-0.34.0-py3-none-any.whl.metadata (6.5 kB)
Collecting appdirs>=1.4.4 (from dagshub)
  Downloading appdirs-1.4.4-py2.py3-none-any.whl.metadata (9.0 kB)
Collecting dacite~=1



Open the following link in your browser to authorize the client:
https://dagshub.com/login/oauth/authorize?state=8a839f13-b467-46be-b545-a95d29fe23b4&client_id=32b60ba385aa7cecf24046d8195a71c07dd345d9657977863b52e7748e0f0f28&middleman_request_id=210b5b139e4973abb23317db591189da9ed7a4e4221a9987f7b4da388ec56e6c




Output()

<Experiment: artifact_location='mlflow-artifacts:/058f098621a44d7096faef4e4c751472', creation_time=1744222875012, experiment_id='3', last_update_time=1744222875012, lifecycle_stage='active', name='my second experimnet', tags={}>

In [25]:
X_train.shape

(1168, 75)

In [402]:
mlflow.start_run(run_name=run_name)
mlflow.log_param("model_type", "DecisionTreeRegressor")
mlflow.log_param("random_state", 42)
mlflow.log_param("N-features", 75)
mlflow.log_param("cv_folds", kfold.n_splits)

mlflow.log_metrics({
    'test_r2': r2,
    'test_rmse': rmse,
    'test_mae': mae
})

r2 = r2_score(Y_train, prob_train)
rmse = mean_squared_error(Y_train, prob_train, squared=False)
mae = mean_absolute_error(Y_train, prob_train)

mlflow.log_metrics({
    'train_r2': r2,
    'train_rmse': rmse,
    'train_mae': mae
})

mlflow.log_param("best_params", grid_search.best_params_)
mlflow.log_metric("best_cv_score", grid_search.best_score_)

mlflow.sklearn.log_model(best_pipeline, "DecisionTreeRegressor")

mlflow.end_run()



🏃 View run Decision Tree with cleaned features at: https://dagshub.com/Givi-Modebadze/my-first-repo.mlflow/#/experiments/3/runs/40ff03f6458e466d80038f7b49ac88af
🧪 View experiment at: https://dagshub.com/Givi-Modebadze/my-first-repo.mlflow/#/experiments/3
