In [None]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
from pathlib import Path 

In [None]:
# Absolute location of the cleaned training CSV, resolved relative to the
# directory the notebook is executed from.
PROCESSED_DATA = (Path("..") / "data" / "processed" / "cleaned_train.csv").resolve()

In [None]:
# Load the CSV at PROCESSED_DATA into a DataFrame.
df = pd.read_csv(filepath_or_buffer=PROCESSED_DATA)

In [None]:
# Every column except the last one is used as a model input.
features = df.columns[:-1].tolist()

# Split the inputs by dtype: numeric columns vs. everything else.
num_cols = df[features].select_dtypes(include="number").columns.tolist()
cat_cols = df[features].select_dtypes(exclude="number").columns.tolist()

In [None]:
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.model_selection import cross_val_score 
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor
from sklearn.metrics import root_mean_squared_error

In [None]:
# Baseline preprocessing: one-hot encode the non-numeric columns (categories not
# seen during fit are ignored, dense output) and scale the numeric columns
# without centring them.
processor = ColumnTransformer(
    transformers=[
        (
            "categorical",
            OneHotEncoder(sparse_output=False, handle_unknown="ignore"),
            cat_cols,
        ),
        (
            "numerical",
            StandardScaler(with_mean=False),
            num_cols,
        ),
    ]
)

In [None]:
# Baseline model: preprocessing followed by an XGBRegressor with default settings.
pipeline = Pipeline(
    steps=[
        ("processor", processor),
        ("predictor", XGBRegressor()),
    ]
)

In [None]:
# Inputs are the selected feature columns; the target is the frame's last column.
X = df[features]
y = df[df.columns[-1]]

In [None]:
# 4-fold cross-validation of the baseline pipeline. The scorer returns the
# *negated* RMSE, so values closer to zero are better.
scores = cross_val_score(
    estimator=pipeline,
    X=X,
    y=y,
    scoring="neg_root_mean_squared_error",
    cv=4,
    n_jobs=3,
)
scores

In [None]:
# Fit the baseline pipeline on the full data set.
pipeline.fit(X, y=y)

In [None]:
# RMSE on the same rows the pipeline was fitted on (a training error, not a
# held-out estimate).
root_mean_squared_error(y_true=y, y_pred=pipeline.predict(X))

In [None]:
from sklearn.model_selection import train_test_split
import mlflow
import mlflow.sklearn as mlsk

In [None]:
# Hyper-parameters passed to XGBRegressor via keyword expansion.
params = dict(
    base_score=0.4,
    objective="reg:tweedie",
    colsample_bylevel=0.9,
    colsample_bytree=0.9,
    early_stopping_rounds=50,
    eval_metric=["rmse", "rmsle"],
    gamma=0.005,
    learning_rate=0.05,
    max_depth=8,
    min_child_weight=0.6,
    n_estimators=5000,
    n_jobs=-1,
    random_state=0,
    reg_alpha=0.005,
    reg_lambda=0.005,
    subsample=0.9,
)

In [None]:
# Local directory used as the MLflow file store; created on first run and
# left untouched if it already exists.
TRACKING_PATH = (Path("..") / "mlruns").resolve()
TRACKING_PATH.mkdir(exist_ok=True)

In [None]:
# Point MLflow at the local file store. Path.as_uri() builds a well-formed
# file URI on every platform (e.g. file:///C:/... on Windows) and
# percent-escapes special characters, which a hand-built "file://" + path
# string does not. TRACKING_PATH is already absolute, as as_uri() requires.
mlflow.set_tracking_uri(TRACKING_PATH.as_uri())

In [None]:
# Select the experiment that subsequent runs are recorded under.
mlflow.set_experiment("XGBRegressor")


In [None]:
# Hold out 20% of the rows for validation; the seed is fixed so the split is repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    random_state=0,
    test_size=0.2,
)

In [None]:
# Preprocessing switches, kept as named variables so they can be logged with the run.
# NOTE(review): `space_output` is what gets passed to OneHotEncoder(sparse_output=...);
# the name looks like a typo but is kept in case other cells read it.
space_output=False
with_mean=False
with_std=True

# Hyper-parameters passed to XGBRegressor via keyword expansion.
params={
    "base_score": 0.4,
    "objective": "reg:tweedie",
    "colsample_bylevel": 0.9,
    "colsample_bytree": 0.9,
    "early_stopping_rounds": 50,
    "eval_metric": [
               "rmse",
               "rmsle"
    ],
    "gamma": 0.005,
    "learning_rate": 0.05,
    "max_depth": 8,
    "min_child_weight": 0.6,
    "n_estimators": 5000,
    "n_jobs": -1,
    "random_state": 0,
    "reg_alpha": 0.005,
    "reg_lambda": 0.005,
    "subsample": 0.9}

with mlflow.start_run(run_name="xgb_rg"):
    # Record the configuration next to the metrics so the run can be reproduced
    # and compared from the tracking UI.
    mlflow.log_params(params)
    mlflow.log_params({
        "sparse_output": space_output,
        "with_mean": with_mean,
        "with_std": with_std,
    })

    processor = ColumnTransformer([
        ("categorical",OneHotEncoder(handle_unknown="ignore",sparse_output=space_output),cat_cols),
        ("numerical",StandardScaler(with_mean=with_mean,with_std=with_std),num_cols),
    ])
    pipeline=Pipeline([
        ('processor',processor),
        ('predictor',XGBRegressor(**params)),
    ])

    # XGBoost's eval_set bypasses the pipeline, so it must already be in the
    # transformed feature space. Call arguments are evaluated *before*
    # Pipeline.fit runs, so the processor has to be fitted up front (on the
    # training split only); transforming with an unfitted ColumnTransformer
    # raises NotFittedError. Pipeline.fit then refits it on the same data.
    processor.fit(X_train,y_train)
    X_val_processed = processor.transform(X_val)
    pipeline.fit(X_train,y_train,predictor__eval_set=[(X_val_processed,y_val)])

    predictor = pipeline.named_steps['predictor']
    best_iteration = predictor.best_iteration

    # With several eval metrics XGBoost early-stops on the LAST one, so
    # `best_score` here would be the RMSLE. Read each metric from the recorded
    # validation history at the best iteration so every value is logged under
    # its own name ("rmse", "rmsle").
    validation_history = predictor.evals_result()["validation_0"]
    for metric_name, metric_values in validation_history.items():
        mlflow.log_metric(metric_name,metric_values[best_iteration])
    # best_iteration is a zero-based boosting-round index.
    mlflow.log_metric("best_n_estimator",best_iteration)

    # Feature importances, paired with the post-transform column names.
    feature_names = pipeline.named_steps['processor'].get_feature_names_out()
    importances = predictor.feature_importances_
    fi_df = pd.DataFrame({
        "feature": feature_names,
        "importance": importances
    }).sort_values("importance", ascending=False)
    csv_path='features_importance.csv'
    fi_df.to_csv(csv_path)
    mlflow.log_artifact(csv_path)

    # DataFrame.plot opens a new figure unless it is given an Axes, which left
    # the saved figure blank. Draw on an explicit Axes so the saved `fig` is the
    # one holding the chart.
    fig, ax = plt.subplots(figsize=(10,6))
    fi_df.plot(kind='barh', x='feature', y='importance', legend=False, ax=ax)
    ax.invert_yaxis()
    ax.set_title(" Feature Importances")
    # Lay out after the title is set so it is included in the spacing.
    fig.tight_layout()
    fig.savefig("feature_importances.png")
    mlflow.log_artifact("feature_importances.png")
    

    