In [None]:
# Import necessary libraries
import os
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline
import mlflow
import mlflow.sklearn

In [None]:
# Load raw data
# Both CSVs are read with pandas defaults from paths relative to the
# notebook's working directory; the frames are not modified in this cell.
train_df = pd.read_csv('data/raw/Train.csv')
test_df = pd.read_csv('data/raw/Test.csv')

In [None]:
# Visualize missing data in the training dataset
def visualize_missing_data(df):
    """Show two figures describing where values are missing in ``df``.

    The first figure is a bar chart with one bar per column giving the number
    of null entries in that column. The second is a heatmap of the boolean
    null mask (rows x columns), drawn without a colour bar.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. It is only read, never modified.

    Returns
    -------
    None
        The figures are displayed with ``plt.show()``.
    """
    # The same boolean mask feeds both plots, so compute it once.
    null_mask = df.isnull()

    # Figure 1: number of missing values per column.
    _, bar_ax = plt.subplots(figsize=(10, 6))
    null_mask.sum().plot(kind='bar', ax=bar_ax)
    bar_ax.set_title('Missing Data in Each Column')
    bar_ax.set_xlabel('Columns')
    bar_ax.set_ylabel('Number of Missing Values')
    plt.show()

    # Figure 2: cell-level view of the null mask.
    _, heat_ax = plt.subplots(figsize=(14, 10))
    sns.heatmap(null_mask, cbar=False, cmap='viridis', ax=heat_ax)
    heat_ax.set_title('Heatmap of Missing Data')
    plt.show()

# Inspect missingness in the raw training frame.
visualize_missing_data(train_df)

In [None]:
# Filter data by missing values (keeping columns with <= 25% missing data)
def filter_dataframe_by_missing_data(df, threshold=25):
    """Return ``df`` restricted to columns whose missing share is within ``threshold``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter. It is not modified.
    threshold : int or float, default 25
        Maximum percentage (0-100) of null entries a column may have and still
        be kept. The comparison is inclusive.

    Returns
    -------
    pandas.DataFrame
        The kept columns, in their original order.
    """
    # Percentage of null entries per column.
    pct_missing = df.isnull().sum().div(len(df)).mul(100)
    # A NaN percentage (which arises for a frame with no rows) compares False
    # and therefore drops the column.
    kept_columns = [name for name, pct in pct_missing.items() if pct <= threshold]
    return df[kept_columns]

# Decide which columns to keep from the training data only, then apply that
# same selection to the test data. Filtering the two frames independently can
# keep a column in one frame and drop it from the other, which breaks the
# later steps that index the test frame with column lists derived from train.
filtered_train_df = filter_dataframe_by_missing_data(train_df)
# Columns kept for train that do not exist in the test file at all are skipped
# here rather than raising.
filtered_test_df = test_df[[col for col in filtered_train_df.columns if col in test_df.columns]]

In [None]:
# Impute missing values using a combination of simple and advanced methods.
# This cell defines the label encoder used by the imputation steps in the cells below.
class SafeLabelEncoder:
    """Label encoder that maps labels not seen during ``fit`` to ``-1``.

    Wraps scikit-learn's ``LabelEncoder``. Known labels receive the code the
    wrapped encoder assigns them (their position in ``le.classes_``); any other
    label is encoded as ``-1`` instead of raising.
    """

    def __init__(self):
        # Wrapped scikit-learn encoder; fitted by fit().
        self.le = LabelEncoder()
        # Set of labels seen by fit(); None until fit() has been called.
        self.classes_ = None
        # label -> integer code, built once in fit() so transform() needs a
        # single dict lookup per element.
        self._code_by_label = None

    def fit(self, y):
        """Learn the label set from ``y`` and return ``self``."""
        self.le.fit(y)
        self.classes_ = set(self.le.classes_)
        self._code_by_label = self._build_lookup()
        return self

    def _build_lookup(self):
        """Map each fitted label to its position in ``le.classes_``."""
        return {label: code for code, label in enumerate(self.le.classes_)}

    def transform(self, y):
        """Encode ``y``; labels unseen during ``fit`` become ``-1``.

        Returns a NumPy array with one code per element of ``y``.

        Raises
        ------
        ValueError
            If called before ``fit``.
        """
        if self.classes_ is None:
            raise ValueError("SafeLabelEncoder must be fitted before calling transform().")

        # getattr covers instances created before the lookup attribute existed
        # (e.g. restored from an older pickle); rebuild it lazily in that case.
        lookup = getattr(self, '_code_by_label', None)
        if lookup is None:
            lookup = self._code_by_label = self._build_lookup()

        # One dict lookup per element replaces one full LabelEncoder.transform
        # call per element, which repeated its validation work for every row.
        return np.array([lookup.get(label, -1) for label in y])

    def fit_transform(self, y):
        """Fit on ``y`` and return its encoding."""
        self.fit(y)
        return self.transform(y)

In [None]:
def create_imputation_models(filtered_train, num_rows=None):
    """Fit the objects used to impute missing values.

    Works on a copy of the first ``num_rows`` rows of ``filtered_train``:

    1. Columns with dtype ``float64`` and more than 5 distinct values are
       treated as numeric; every other column is treated as categorical.
    2. A most-frequent ``SimpleImputer`` is fitted on the categorical columns.
    3. One ``SafeLabelEncoder`` per categorical column is fitted on the imputed
       values, and the working copy is encoded with it.
    4. For each numeric column that has missing values, a
       ``RandomForestRegressor`` is fitted on the rows where that column is
       present, using all other columns as features.

    NOTE(review): the feature set in step 4 is every other column of
    ``filtered_train``, so any label column present in the frame is used as a
    feature too, and other numeric columns are passed with their missing
    values still in place - confirm both are intended.

    Parameters
    ----------
    filtered_train : pandas.DataFrame
        Training frame. It is not modified.
    num_rows : int, optional
        Number of leading rows to fit on. Defaults to all rows.

    Returns
    -------
    tuple
        ``(categorical_imputer, label_encoders, regressors, categorical_cols,
        numeric_cols)`` where ``label_encoders`` and ``regressors`` are dicts
        keyed by column name.
    """
    row_limit = len(filtered_train) if num_rows is None else num_rows
    work = filtered_train.copy().iloc[:row_limit]

    # Partition the columns in their original order.
    numeric_cols = []
    categorical_cols = []
    for name in work.columns:
        is_continuous = work[name].dtype == 'float64' and work[name].nunique() > 5
        if is_continuous:
            numeric_cols.append(name)
        else:
            categorical_cols.append(name)

    # Fill categorical gaps with each column's most frequent value.
    categorical_imputer = SimpleImputer(strategy='most_frequent')
    work[categorical_cols] = categorical_imputer.fit_transform(work[categorical_cols])

    # Fit one encoder per categorical column, then encode the working copy so
    # the regressors below receive integer codes.
    label_encoders = {}
    for name in categorical_cols:
        label_encoders[name] = SafeLabelEncoder().fit(work[name])
    for name in categorical_cols:
        work[name] = label_encoders[name].transform(work[name])

    # Only numeric columns that actually have gaps get a regressor.
    regressors = {}
    for name in numeric_cols:
        observed = work[name].notnull()
        if observed.all():
            continue
        known_rows = work[observed]
        forest = RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=42)
        forest.fit(known_rows.drop(columns=[name]), known_rows[name])
        regressors[name] = forest

    return categorical_imputer, label_encoders, regressors, categorical_cols, numeric_cols

In [None]:
# Fit the imputer, label encoders and regressors on the filtered training frame.
categorical_imputer, label_encoders, regressors, categorical_cols, numeric_cols = create_imputation_models(filtered_train_df)

In [None]:
# Impute missing data in a dataframe (applied to both the training and test sets below)
def impute_data(df, categorical_imputer, label_encoders, regressors, categorical_cols, numeric_cols, num_rows=None):
    """Impute and encode ``df`` using previously fitted objects.

    Categorical columns are filled with ``categorical_imputer`` and then
    replaced by the codes from their ``label_encoders`` entry. Each numeric
    column with missing values is filled with predictions from its
    ``regressors`` entry, using all other columns of the frame as features.
    Numeric columns are processed in the order given, so later columns see the
    values already filled for earlier ones.

    If a numeric column has missing values but no fitted regressor (the column
    had no gaps when the regressors were built), the gaps are filled with the
    median of the observed values in ``df`` and a warning is emitted, instead
    of failing with ``KeyError``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. It is not modified; a copy is returned.
    categorical_imputer : object
        Fitted imputer exposing ``transform``.
    label_encoders : dict
        Column name -> fitted encoder exposing ``transform``.
    regressors : dict
        Column name -> fitted regressor exposing ``predict``.
    categorical_cols, numeric_cols : list
        Column names to treat as categorical / numeric. Every name must be a
        column of ``df``.
    num_rows : int, optional
        Number of leading rows to process. Defaults to all rows.

    Returns
    -------
    pandas.DataFrame
        Imputed, encoded copy of the first ``num_rows`` rows of ``df``.
    """
    if num_rows is None:
        num_rows = len(df)

    df = df.iloc[:num_rows].copy(deep=True)

    df[categorical_cols] = categorical_imputer.transform(df[categorical_cols])
    for col in categorical_cols:
        df[col] = label_encoders[col].transform(df[col])

    for col in numeric_cols:
        missing_mask = df[col].isnull()
        if not missing_mask.any():
            continue

        regressor = regressors.get(col)
        if regressor is None:
            # No model was trained for this column; fall back to a simple
            # statistic rather than aborting the whole imputation.
            warnings.warn(
                f"No imputation regressor available for column {col!r}; "
                "filling missing values with the column median."
            )
            df.loc[missing_mask, col] = df[col].median()
            continue

        # Features are every other column of the rows that need a value.
        features = df.loc[missing_mask].drop(columns=[col])
        df.loc[missing_mask, col] = regressor.predict(features)

    return df

In [None]:
# Apply the objects fitted on the training frame to both frames.
# NOTE(review): impute_data indexes the frame with every name in
# categorical_cols and numeric_cols, which were derived from the training
# frame. If the test frame lacks any of them (for example a label column),
# the second call raises KeyError - confirm the test file has these columns.
imputed_train_df = impute_data(filtered_train_df, categorical_imputer, label_encoders, regressors, categorical_cols, numeric_cols)
imputed_test_df = impute_data(filtered_test_df, categorical_imputer, label_encoders, regressors, categorical_cols, numeric_cols)

In [None]:
# Train a decision tree model
def train_model(train_df, selected_features, target_col='target'):
    """Fit a scaled decision-tree regressor and log validation metrics to MLflow.

    The data is split 80/20 into training and validation parts with a fixed
    seed. A ``StandardScaler`` + ``DecisionTreeRegressor`` pipeline is fitted on
    the training part and scored on the validation part. RMSE, MAE and R^2 are
    logged as MLflow metrics under the experiment
    ``decision-tree-regressor-experiment`` and the fitted pipeline is logged as
    a model. Logging goes through the module-level ``mlflow`` API; this
    function does not open or close a run explicitly.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Frame containing the feature columns and the target column.
    selected_features : list
        Names of the feature columns.
    target_col : str, default 'target'
        Name of the column to predict.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The fitted pipeline.
    """
    X = train_df[selected_features]
    y = train_df[target_col]

    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    model = Pipeline(steps=[
        ('scaler', StandardScaler()),
        ('regressor', DecisionTreeRegressor(random_state=42))
    ])

    mlflow.set_experiment("decision-tree-regressor-experiment")

    model.fit(X_train, y_train)

    y_pred_val = model.predict(X_val)
    # Take the square root explicitly: the `squared=False` argument of
    # mean_squared_error was deprecated and later removed from scikit-learn,
    # whereas this form works on every release.
    rmse_val = float(np.sqrt(mean_squared_error(y_val, y_pred_val)))
    mae_val = mean_absolute_error(y_val, y_pred_val)
    r2_val = r2_score(y_val, y_pred_val)

    mlflow.log_metric("rmse", rmse_val)
    mlflow.log_metric("mae", mae_val)
    mlflow.log_metric("r2", r2_val)
    mlflow.sklearn.log_model(model, "Decision Tree Regression Model")

    print(f"Validation RMSE: {rmse_val}")
    return model

In [None]:
# Use every column of the imputed training frame except 'target' as a feature.
selected_features = [col for col in imputed_train_df.columns if col != 'target']
trained_model = train_model(imputed_train_df, selected_features)

In [None]:
# Test the model and log results
def test_model(model, test_df, selected_features, output_path='data/processed/test_predictions.csv'):
    """Predict on ``test_df``, write the predictions to CSV and log the file to MLflow.

    The CSV has two columns: ``id`` (taken from ``test_df['id']``) and
    ``predicted`` (the model output, in row order).

    NOTE(review): ``id`` is read from the frame passed in. If that frame has
    been through ``impute_data`` and ``id`` was treated as a categorical
    column, it holds encoder codes rather than the original identifiers -
    confirm against the caller.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict``.
    test_df : pandas.DataFrame
        Frame containing ``selected_features`` and an ``id`` column.
    selected_features : list
        Names of the feature columns passed to the model.
    output_path : str, default 'data/processed/test_predictions.csv'
        Where the predictions CSV is written.

    Returns
    -------
    None
    """
    X_test = test_df[selected_features]
    y_pred_test = model.predict(X_test)

    results = pd.DataFrame({
        'id': test_df['id'],
        'predicted': y_pred_test
    })

    # to_csv does not create missing directories, so make sure the target
    # folder exists before writing.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    results.to_csv(output_path, index=False)
    mlflow.log_artifact(output_path)

In [None]:
# Predict on the imputed test frame; test_model writes the predictions CSV and logs it to MLflow.
test_model(trained_model, imputed_test_df, selected_features)