In [1]:
import pandas as pd
import os
from pathlib import Path
from dotenv import load_dotenv
from supabase import create_client



dotenv_path = Path('.env')
load_dotenv(dotenv_path=dotenv_path)

url: str = os.getenv('SUPABASE_URL')
key: str = os.getenv('SUPABASE_KEY')


def init():
    supabase = create_client(url, key)
    return supabase

def read_data():      
      supabase = init()
      response = supabase.table('muaban').select("*").execute()
      df_muaban = pd.DataFrame(response.data)
      response = supabase.table('mogi').select("*").execute()
      df_mogi = pd.DataFrame(response.data)
      response= supabase.table('rongbay').select("*").execute()
      df_rongbay = pd.DataFrame(response.data)
      df_concatenated  = pd.concat([df_mogi, df_muaban, df_rongbay], ignore_index=True)
      df_concatenated['id'] = range(1, len(df_concatenated) + 1)
      return df_concatenated
      
df = read_data()

## Preprocessing


In [2]:
import json
from random import randint
import random


import numpy as np
np.random.seed(42)



with open('dags/location.json', 'r', encoding='utf-8') as file:
    location = json.load(file)

def get_district_name_by_ward(location, ward_name):
    for district in location["district"]:
        if ward_name in district["wards"]:
            return district["name"]
    return None 

def get_ward_by_street(location, street_name):
    # Duyệt qua các district trong location
    for district in location["district"]:
        if street_name in district["streets"]:
            index = district["streets"].index(street_name)
            if index < len(district["wards"]):
                return district["wards"][index]
    return None  

def get_street_by_ward(location, ward):
    for district in location["district"]:
        if ward in district["wards"]:
            index = len(district['streets'])
            return district["streets"][randint(0, index-1)]
        else: 
            return district["streets"][randint(0, len(district['streets'])-1)]
    return None

def get_random_ward():
    district = random.choice(location["district"])
    if district['wards']:
        return np.random.choice(district['wards'])
    else:
        return "Lò Đúc"
def get_random_street():
    district = random.choice(location["district"])
    if district['streets']:
        return np.random.choice(district['streets'])
    else:
        return "Phố Lò Đúc"
def get_random_district():
    return np.random.choice(location["district"])['name']



for index, row in df.iterrows():
# get district by ward if none return random district
    if row['district'] == '' or pd.isnull(row['district']):
        if row['street'] == '':
            # get random district 
            row['district'] = get_random_district()
        else: 
            ward = row['ward']
            df.at[index, 'district'] = get_district_name_by_ward(location, ward)

# get ward by street if none return random ward
    if row['ward'] == '' or pd.isnull(row['ward']):
        if row['street'] == '':
            # get random ward 
            row['ward'] = get_random_ward()
        else:
            street = row['street']
            df.at[index, 'ward'] = get_ward_by_street(location, street)

# get street by ward if none return random street
    if row['street'] == '' or pd.isnull(row['street']):
        if row['ward'] == '':
            row['street'] = get_random_street()
        else: 
            ward = row['ward']
            df.at[index, 'street'] = get_street_by_ward(location, ward)



    if row['direction'] == '':
        df.at[index, 'direction'] = 0

    # if row['price'] <= 0.5:
    #     df = df.drop(index)

    # if row['area'] == 0:
    #     df = df.drop(index)
missing_values = df.isnull().sum()
missing_values
# drop rows with missing values



id                  0
created_at          0
price               0
area                0
street              0
ward              139
district           66
post_date           0
num_bedroom         0
num_diningroom      0
num_kitchen         0
num_toilet          0
num_floor           0
current_floor       0
direction           0
street_width        0
dtype: int64

In [None]:
missing_values = df.isnull().sum()
missing_values

In [None]:
import numpy as np
np.random.seed(1)
random_district = np.random.choice(location["district"])
print(random.choice(random_district['wards']))

In [None]:
df.to_csv('data.csv', index=False)
df.head()

In [None]:
missing_values = df.isnull().sum()
missing_values
#

In [None]:
from scipy import stats
data = pd.read_csv('data.csv')
data['price_zscore'] = stats.zscore(data['price'])
data['area_zscore'] = stats.zscore(data['area'])
price_outliers_zscore = data[(data['price_zscore'].abs() > 0.3)]
area_outliers_zscore = data[(data['area_zscore'].abs() > 1.36)]
outliers_zscore = pd.concat([price_outliers_zscore, area_outliers_zscore]).drop_duplicates()
outliers_zscore.head()
# Remove outliers
data = data.drop(outliers_zscore.index)
# data = data.to_csv('data_pre.csv', index=False)

# in ra các data có giá trị price thấp nhất và cao nhất tại cột area = 20 
# data[data['area'] <= 10].sort_values(by='price', ascending=False).head(1)
# data[data['price_zscore'].abs() >= 0.2].sort_values(by='area', ascending=True).head(20)
#in ra các district có giá trị null

# cho các district nhận giá trị ngẫu nhiên 
data['district'] = data['district'].fillna(np.random.choice(location["district"]))

In [None]:
data[data['district'].isnull()].head(1)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
import mlflow
from time import strftime

# Drop unnecessary columns
data = data.drop(columns=['id', 'created_at', 'post_date', 'current_floor', 'num_floor', 'direction', 'street_width', 'price_zscore', 'area_zscore'])

# Identify categorical and numerical columns
categorical_cols = ['street', 'ward', 'district']
numerical_cols = data.drop(columns=['price'] + categorical_cols).columns.tolist()

# Preprocessing for numerical data
numerical_transformer = SimpleImputer(strategy='mean')

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Define the models to be evaluated
models = {
    # "LinearRegression": LinearRegression(),
    # "Lasso": Lasso(),
    # "Ridge": Ridge(),
    # "ElasticNet": ElasticNet(),
    "RandomForestRegressor": RandomForestRegressor()
}

# Split the data into training and testing sets
X = data.drop(columns='price')
y = data['price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Iterate over each model, train it, and evaluate its performance
for model_name, model in models.items():
    clf = Pipeline(steps=[('preprocessor', preprocessor),
                          ('model', model)])
    
    # Train the model
    clf.fit(X_train, y_train)
    
    # Store the model with a timestamp
    time = strftime("%Y-%m-%d %H:%M:%S")
    mlflow.sklearn.log_model(clf, f"models/{model_name}_{time}")
    
    # Preprocess test data and get predictions
    y_pred = clf.predict(X_test)
    
    # Evaluate the model using mse
    mse = mean_absolute_error(y_test, y_pred)
    print(f"{model_name} MAE: {mse:.2f}")
    
    # Log the mse metric
    mlflow.log_metric("mse", mse)
    
    # Log the model parameters
    mlflow.sklearn.log_model(clf, f"models/{model_name}_{time}")
    
    # Log the model performance
    mlflow.log_metric("mse", mse)
    
    # Log the model parameters
    mlflow.sklearn.log_model(clf, f"models/{model_name}_{time}")
    
    # Log the model performance
    mlflow.log_metric("mse", mse)
    
    # Log the model parameters
    mlflow.sklearn.log_model(clf, f"models/{model_name}_{time}")
    
    # Log the model performance
    mlflow.log_metric("mse", mse)



In [None]:
X_test.head()
# y_test.head(1)




In [None]:
y_test.head()


In [None]:
y_pred[:5]

In [None]:
import mlflow



#load model 
# model = mlflow.sklearn.load_model("mlruns/0/247ce27ff3ea4d619c4122354155f93c/artifacts/models/RandomForestRegressor_2024-05-25 08:20:05")
model = mlflow.sklearn.load_model("mlruns/0/247ce27ff3ea4d619c4122354155f93c/artifacts/models/RandomForestRegressor_2024-05-25 08:51:02")
#make example test
# tạo một dữ liệu test mới khong có trong tệp dữ liệ 

example ={
  "area": 30,
  "street": "Đại La",
  "ward":  "Trương Định",
  "district": "Hai Bà Trưng",
  "num_bedroom": 0,
  "num_diningroom": 0,
  "num_kitchen": 0,
  "num_toilet": 0
}
example = pd.DataFrame(example, index=[0])


model.predict(example)



