In [None]:
import matplotlib.pyplot as plt
import mlflow
import mlflow.sklearn
import mlflow.xgboost
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

In [None]:
# Load the preprocessed rental listings. A forward-slash path is used so the
# notebook also runs on Linux/macOS (it is accepted on Windows as well).
data = pd.read_excel('data/flats_to_rent_wue_preprocessed_0407.xlsx')

# Keep only rows that have the target (Object_price) and the two mandatory
# features, then renumber the remaining rows 0..n-1. Done as one chained,
# non-in-place expression so re-running the cell always gives the same frame.
data = (
    data
    .dropna(subset=['ConstructionYear', 'Object_price', 'Rooms'])
    .reset_index(drop=True)
)
# NOTE: Title, Object_currency, Url, EstateType and DistributionType are NOT
# dropped here (the corresponding drops were disabled), so they remain in X.

# Target vector and feature matrix.
y = data['Object_price']
X = data.drop('Object_price', axis=1)

In [None]:
# Hold out 20 % of the rows for testing, then split the remaining rows again
# with the same ratio and seed to obtain a validation set.
split_options = {'train_size': 0.8, 'random_state': 0}
train_X, test_X, train_y, test_y = train_test_split(X, y, **split_options)
train_X, val_X, train_y, val_y = train_test_split(train_X, train_y, **split_options)

# Group the feature columns by their pandas dtype:
#   float64 -> num_cols, object -> cat_cols, int64 -> bin_cols
# (bin_cols presumably holds binary flag columns - TODO confirm).
column_dtypes = train_X.dtypes
num_cols = [name for name, dtype in column_dtypes.items() if dtype == 'float64']
cat_cols = [name for name, dtype in column_dtypes.items() if dtype == 'object']
bin_cols = [name for name, dtype in column_dtypes.items() if dtype == 'int64']

In [None]:
# One-hot encode the categorical columns. Categories that only occur in the
# validation/test splits are encoded as all-zero rows (handle_unknown='ignore').
encoder = OneHotEncoder(handle_unknown='ignore')

print(train_X[cat_cols])


def _assemble_features(frame, one_hot):
    """Stack int64 columns, float64 columns and the dense one-hot block, in that order."""
    return np.concatenate([frame[bin_cols], frame[num_cols], one_hot], axis=1)


# The encoder is fitted on the training split only and then applied to all splits.
encoder.fit(train_X[cat_cols])
X_train_categorical_xgb = encoder.transform(train_X[cat_cols]).toarray()
X_valid_categorical_xgb = encoder.transform(val_X[cat_cols]).toarray()
X_test_categorical_xgb = encoder.transform(test_X[cat_cols]).toarray()

# Final model inputs are plain NumPy arrays (column names are lost here).
X_train_processed = _assemble_features(train_X, X_train_categorical_xgb)
X_valid_processed = _assemble_features(val_X, X_valid_categorical_xgb)
X_test_processed = _assemble_features(test_X, X_test_categorical_xgb)

In [None]:
# Point MLflow at the tracking server that should receive the runs logged below.
tracking_uri = "http://localhost:5000"
mlflow.set_tracking_uri(tracking_uri)

In [None]:
# End the currently active MLflow run, if there is one - presumably a safeguard
# against a run left open by an earlier, interrupted cell execution.
mlflow.end_run()

# Basic Baseline


In [None]:
# Reference price levels per square metre (presumably regional market
# averages - TODO confirm source and currency). Only the rent value is used
# in this cell.
avg_price_per_sqm_buy = 4070.62
avg_price_per_sqm_rent = 11.21

# Naive baseline: living area times the average rent per square metre.
# The area is read from the validation DataFrame (val_X) because
# X_valid_processed is a plain NumPy array and cannot be indexed by column
# name. Assumes 'living_room' is the living area in square metres - TODO confirm.
baseline_preds = val_X['living_room'] * avg_price_per_sqm_rent

# Evaluate the baseline on the validation split.
baseline_mae = mean_absolute_error(val_y, baseline_preds)
baseline_r2 = r2_score(val_y, baseline_preds)
baseline_mse = mean_squared_error(val_y, baseline_preds)

# Log the baseline metrics to MLflow in a dedicated run.
with mlflow.start_run(run_name="baseline"):
    mlflow.log_metric("mse", baseline_mse)
    mlflow.log_metric("mae", baseline_mae)
    mlflow.log_metric("r2", baseline_r2)

print(f"Baseline Mae: {baseline_mae}")
print(f"Baseline R2 Score: {baseline_r2}")


# XGBoost

In [None]:
# Let MLflow record XGBoost parameters and per-iteration evaluation metrics.
mlflow.xgboost.autolog()

# Only regression metrics are evaluated. 'logloss' (a classification metric)
# was removed: XGBoost uses the LAST entry of eval_metric for early stopping,
# so early stopping is now driven by the validation MAE.
model = xgb.XGBRegressor(eval_metric=['rmse', 'mae'], early_stopping_rounds=7)

# The context manager guarantees the run is closed even if training fails.
with mlflow.start_run(run_name="xgboost"):
    model.fit(X=X_train_processed,
              y=train_y,
              eval_set=[(X_valid_processed, val_y)],
              verbose=True)
    mlflow.xgboost.log_model(model, "xgboost")

    # NOTE: this "mae" is computed on the held-out TEST split, whereas the
    # baseline run logs its metrics on the validation split.
    preds = model.predict(X_test_processed)
    mae = mean_absolute_error(test_y, preds)
    mlflow.log_metric("mae", mae)

# Linear Regression


In [None]:
# Show the dtype of every feature column in the training split.
train_X.dtypes


In [None]:
    # Train a plain scikit-learn LinearRegression on all processed features and
    # log the model plus its validation metrics to MLflow.

    # Close a run that may still be open from an earlier cell execution.
    mlflow.end_run()

    # The context manager guarantees the run is closed even if a step fails.
    with mlflow.start_run(
        run_name="linear-regression",
        description="Basic linear Regression with scikit-learn. All features used.",
    ):
        model = LinearRegression()
        model.fit(X_train_processed, train_y)
        mlflow.sklearn.log_model(model, "linear-regression-model")
        preds = model.predict(X_valid_processed)

        # Validation metrics, each computed once.
        mae = mean_absolute_error(val_y, preds)
        r2 = r2_score(val_y, preds)
        mse = mean_squared_error(val_y, preds)

        # Same metric keys as the baseline run so the runs can be compared
        # side by side in the MLflow UI.
        mlflow.log_metric("mse", mse)
        mlflow.log_metric("mae", mae)
        mlflow.log_metric("r2", r2)

    print(f"Mae:{mae}")
    # Kept for any later cell that reads `score`; it is the validation MAE.
    score = mae

In [None]:
# End the currently active MLflow run, if there is one, before the next cell
# opens a new run.
mlflow.end_run()

In [None]:
# Train a Ridge regression, evaluate it on the test split and log parameters,
# metrics, the model and an actuals-vs-predictions plot to one MLflow run.
model_save_path = "model"
alpha = 0.03  # Ridge regularisation strength

mlflow.sklearn.autolog()

with mlflow.start_run() as run:
    # alpha is a hyper-parameter, so it is logged as a param, not a metric.
    mlflow.log_param('alpha', alpha)

    # Create, fit, and test the scikit-learn Ridge regression model.
    model = Ridge(alpha=alpha)
    model.fit(X_train_processed, train_y)
    # Predict on the processed test matrix (same column layout as the training
    # matrix), not on the raw test_X DataFrame.
    preds = model.predict(X_test_processed)

    # Log mean squared error
    mse = mean_squared_error(test_y, preds)
    print('Mean Squared Error is', mse)
    mlflow.log_metric('mse', mse)

    # Store the fitted model as a run artifact.
    mlflow.sklearn.log_model(model, model_save_path)

    # Plot actuals vs predictions, ordered by actual price. The target is
    # converted to a NumPy array so that idx is applied positionally and not
    # as index labels of the shuffled Series.
    actuals = test_y.to_numpy()
    idx = np.argsort(actuals)
    fig, ax = plt.subplots()
    ax.plot(actuals[idx], preds[idx])
    ax.set(title="Actuals vs predictions (test split)",
           xlabel="Actual price",
           ylabel="Predicted price")
    fig.savefig("actuals_vs_predictions.png")
    mlflow.log_artifact("actuals_vs_predictions.png")