# Rental Price Prediction Pipeline

In [None]:
# Install the third-party packages this notebook imports.
!pip install xgboost lightgbm scikit-learn pandas numpy

In [None]:
import os
import time

import numpy as np
import pandas as pd
import xgboost as xgb
from lightgbm import LGBMRegressor
from sklearn.base import clone
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
# IterativeImputer is experimental: this enabling import must run before
# IterativeImputer can be imported from sklearn.impute.
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import SimpleImputer, IterativeImputer, KNNImputer
from sklearn.model_selection import cross_val_score, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
class DummyImputer:
    """Pass-through stand-in for an imputer: hands its input back untouched.

    Exposes the same ``fit_transform`` / ``transform`` pair the pipeline calls
    on real imputers, so the "no imputation" case needs no special handling.
    """

    def transform(self, X):
        """Return ``X`` itself, unchanged (no copy is made)."""
        return X

    def fit_transform(self, X):
        """Return ``X`` itself, unchanged; there is nothing to fit."""
        return self.transform(X)

class ImputationMethods:
    """Factory for the imputer pairs that the pipeline compares.

    Each imputation method is a dict with a ``'numeric'`` and a
    ``'categorical'`` imputer. Every call builds fresh, unfitted objects.
    """

    def __init__(self, random_state=42):
        # Seed forwarded to the iterative imputer and its forest estimator.
        self.random_state = random_state

    def get_imputers(self):
        """Return a dict mapping method name to its numeric/categorical imputer pair."""
        imputers = {}
        imputers['no_imputation'] = {
            'numeric': DummyImputer(),
            'categorical': DummyImputer(),
        }
        imputers['simple_mean'] = self._create_simple_mean_imputer()
        imputers['advanced_iterative'] = self._create_advanced_iterative_imputer()
        return imputers

    def _create_simple_mean_imputer(self):
        """Mean for numeric columns, most frequent value for categorical ones."""
        numeric = SimpleImputer(strategy='mean')
        categorical = SimpleImputer(strategy='most_frequent')
        return {'numeric': numeric, 'categorical': categorical}

    def _create_advanced_iterative_imputer(self):
        """Random-forest-driven iterative imputation for numeric columns.

        Categorical columns still use the most frequent value.
        """
        seed = self.random_state
        forest = RandomForestRegressor(n_estimators=100, random_state=seed)
        numeric = IterativeImputer(estimator=forest, random_state=seed)
        categorical = SimpleImputer(strategy='most_frequent')
        return {'numeric': numeric, 'categorical': categorical}

In [None]:
def get_models(random_state=42):
    """Build the base regressors, keyed by name.

    Args:
        random_state: Seed passed to both estimators.

    Returns:
        Dict with an ``'xgboost'`` XGBRegressor and a ``'lightgbm'``
        LGBMRegressor, both unfitted.
    """
    # Settings the two boosters have in common.
    shared = {
        'n_estimators': 500,
        'learning_rate': 0.05,
        'random_state': random_state,
    }
    xgboost_model = xgb.XGBRegressor(max_depth=7, subsample=0.8, **shared)
    lightgbm_model = LGBMRegressor(num_leaves=31, **shared)
    return {'xgboost': xgboost_model, 'lightgbm': lightgbm_model}

def get_optimized_models(random_state=42):
    """Build the regressor registered as ``'xgboost_optimized'``.

    Args:
        random_state: Seed passed to the estimator.

    Returns:
        Dict with a single unfitted XGBRegressor under ``'xgboost_optimized'``.
    """
    params = {
        'n_estimators': 500,
        'max_depth': 8,
        'learning_rate': 0.05,
        'subsample': 0.9,
        'random_state': random_state,
    }
    return {'xgboost_optimized': xgb.XGBRegressor(**params)}

In [None]:
class ModelPipeline:
    """Evaluate every imputation-method / model combination on one dataset.

    Args:
        imputation_methods: Dict mapping a method name to a dict holding a
            ``'numeric'`` and a ``'categorical'`` imputer (objects exposing
            ``fit_transform`` and ``transform``).
        prediction_models: Dict mapping a model name to an unfitted estimator.
        random_state: Stored on the instance; not used by the methods below.

    Attributes:
        results: Output of the most recent ``run_pipeline`` call ({} before).
    """

    def __init__(self, imputation_methods, prediction_models, random_state=42):
        self.random_state = random_state
        self.imputation_methods = imputation_methods
        self.prediction_models = prediction_models
        self.results = {}

    @staticmethod
    def _apply_imputer(imputer, X_part, columns, index, fit_imputer):
        """Run ``imputer`` on ``X_part`` and wrap the output in a labelled DataFrame."""
        transform = imputer.fit_transform if fit_imputer else imputer.transform
        return pd.DataFrame(transform(X_part), columns=columns, index=index)

    def impute_data(self, X, numeric_features, categorical_features, imputer_dict, fit_imputer=True):
        """Impute the numeric and categorical columns of ``X``.

        Columns whose name starts with ``'quarter_'`` are passed through
        unimputed and appended last. Any other column of ``X`` that is in
        neither feature list is dropped.

        Args:
            X: Input DataFrame.
            numeric_features: Column names given to ``imputer_dict['numeric']``.
            categorical_features: Column names given to
                ``imputer_dict['categorical']``.
            imputer_dict: Dict with ``'numeric'`` and ``'categorical'`` imputers.
            fit_imputer: If True call ``fit_transform`` (training data),
                otherwise ``transform`` with the previously fitted state.

        Returns:
            DataFrame indexed like ``X`` with columns ordered numeric,
            categorical, then ``quarter_*``.
        """
        encoded_cols = [col for col in X.columns if col.startswith('quarter_')]

        # Only real column groups are collected: concatenating an empty
        # placeholder frame would force an index union with an empty index.
        parts = []
        feature_groups = (('numeric', numeric_features), ('categorical', categorical_features))
        for kind, features in feature_groups:
            if len(features) > 0:
                parts.append(
                    self._apply_imputer(
                        imputer_dict[kind], X[features].copy(), features, X.index, fit_imputer
                    )
                )
        if encoded_cols:
            parts.append(X[encoded_cols].copy())

        if not parts:
            # No features selected at all: keep the rows, return no columns.
            return pd.DataFrame(index=X.index)
        return pd.concat(parts, axis=1)

    def run_pipeline(self, X_train, y_train, X_test, numeric_features, categorical_features):
        """Cross-validate and fit each imputation/model pair, then predict on the test set.

        NOTE: each imputer is fitted on the full training set before
        cross-validation, so validation folds have already been seen by the
        imputer when the CV scores are computed.

        Args:
            X_train: Training features.
            y_train: Training target.
            X_test: Test features.
            numeric_features: Numeric column names to impute.
            categorical_features: Categorical column names to impute.

        Returns:
            Dict keyed by ``"<imputation>_<model>"``. Each value holds
            ``cv_scores`` (per-fold RMSE), ``cv_rmse_mean``, ``cv_rmse_std``,
            ``test_predictions`` and the fitted ``model`` pipeline. The same
            dict is stored on ``self.results``.
        """
        results = {}

        for imp_name, imputer in self.imputation_methods.items():
            print(f"\nProcessing with {imp_name} imputation...")

            X_train_imputed = self.impute_data(X_train, numeric_features, categorical_features, imputer, True)
            X_test_imputed = self.impute_data(X_test, numeric_features, categorical_features, imputer, False)

            for model_name, model in self.prediction_models.items():
                key = f"{imp_name}_{model_name}"
                print(f"Training {key}...")

                # Pipeline does not clone its steps. Without clone() every
                # imputation method would refit the same estimator object in
                # place, and all stored pipelines for this model would end up
                # sharing the state fitted on the last imputation's data.
                pipeline = Pipeline([
                    ('scaler', StandardScaler()),
                    ('model', clone(model))
                ])

                cv_scores = cross_val_score(pipeline, X_train_imputed, y_train,
                                            cv=5, scoring='neg_root_mean_squared_error')
                # The scorer yields negated RMSE; flip the sign for reporting.
                rmse_scores = -cv_scores

                pipeline.fit(X_train_imputed, y_train)
                test_predictions = pipeline.predict(X_test_imputed)

                results[key] = {
                    'cv_scores': rmse_scores,
                    'cv_rmse_mean': rmse_scores.mean(),
                    'cv_rmse_std': rmse_scores.std(),
                    'test_predictions': test_predictions,
                    'model': pipeline
                }

                print(f"Completed {key} - CV RMSE: {rmse_scores.mean():.4f} (+/- {rmse_scores.std()*2:.4f})")

        self.results = results
        return results

In [None]:
def load_data(train_path, test_path):
    """Read the training and test CSV files and print their missing-value counts.

    Args:
        train_path: Path to the training CSV.
        test_path: Path to the test CSV.

    Returns:
        Tuple ``(train_df, test_df)`` of the loaded DataFrames.
    """
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)

    for label, frame in (("Training", train_df), ("Test", test_df)):
        missing_counts = frame.isnull().sum()
        total_rows = len(frame)

        print(f"\n{label} Dataset Missing Value Statistics:")
        # Only columns that actually contain gaps are reported.
        for column, count in missing_counts[missing_counts > 0].items():
            share = (count / total_rows) * 100
            print(f"{column}: {count} missing values ({share:.2f}%)")

    return train_df, test_df

def prepare_data(df, target_column=None):
    """Split a frame into features and target and list its numeric feature columns.

    Args:
        df: Input DataFrame.
        target_column: Name of the target column. If it is falsy or not
            present in ``df``, no target is extracted.

    Returns:
        Tuple ``(X, y, numeric_features, categorical_features)``. ``X`` is
        ``df`` itself when no target is extracted, and ``y`` is then None.
        ``numeric_features`` lists the int64/float64 columns of ``X`` whose
        name does not start with ``'quarter_'``. ``categorical_features`` is
        always an empty list.
    """
    if not target_column or target_column not in df.columns:
        X, y = df, None
    else:
        y = df[target_column]
        X = df.drop(columns=[target_column])

    numeric_features = [
        name
        for name in X.select_dtypes(include=['int64', 'float64']).columns
        if not name.startswith('quarter_')
    ]

    return X, y, numeric_features, []

def save_results(results, output_dir='predictions'):
    """Write per-approach test predictions and a CV comparison table to disk.

    For every entry of ``results`` a ``<approach>_test_predictions.csv`` file
    is written with a 1-based ``ID`` column and a ``TARGET`` column. A summary
    sorted by ascending mean CV RMSE is written to ``approach_comparison.csv``
    and printed.

    Args:
        results: Dict mapping approach name to a dict with the keys
            ``'test_predictions'``, ``'cv_rmse_mean'`` and ``'cv_rmse_std'``.
            May be empty, in which case only a header-only summary is written.
        output_dir: Directory for the output files; created if missing.
    """
    os.makedirs(output_dir, exist_ok=True)

    summary_columns = ['approach', 'cv_rmse_mean', 'cv_rmse_std']
    summary_data = []
    for approach_name, result in results.items():
        predictions = result['test_predictions']
        test_predictions_df = pd.DataFrame({
            # IDs are positional: 1..n in prediction order.
            'ID': range(1, len(predictions) + 1),
            'TARGET': predictions
        })
        test_predictions_df.to_csv(
            os.path.join(output_dir, f'{approach_name}_test_predictions.csv'), index=False
        )

        summary_data.append({
            'approach': approach_name,
            'cv_rmse_mean': result['cv_rmse_mean'],
            'cv_rmse_std': result['cv_rmse_std']
        })

    # Explicit columns keep sort_values from raising KeyError when there are
    # no results (an empty list would otherwise give a column-less frame).
    summary_df = pd.DataFrame(summary_data, columns=summary_columns)
    summary_df = summary_df.sort_values('cv_rmse_mean')
    summary_df.to_csv(os.path.join(output_dir, 'approach_comparison.csv'), index=False)

    print("\nApproach Comparison:")
    print(summary_df.to_string(index=False))

In [None]:
def process_quarters(train_df, test_df):
    """One-hot encode the ``'quarter'`` column consistently across both frames.

    Both frames end up with the same ``quarter_*`` columns: a category seen in
    only one frame is added to the other as an all-zero column. The
    alphabetically first dummy column is dropped, and the original
    ``'quarter'`` column is removed.

    Args:
        train_df: Training DataFrame containing a ``'quarter'`` column.
        test_df: Test DataFrame containing a ``'quarter'`` column.

    Returns:
        Tuple ``(train_df, test_df)`` of new frames; the inputs are not modified.
    """
    train_dummies = pd.get_dummies(train_df['quarter'], prefix='quarter')
    test_dummies = pd.get_dummies(test_df['quarter'], prefix='quarter')

    # Union of the dummy columns seen in either frame, in sorted order.
    all_columns = sorted(set(train_dummies.columns) | set(test_dummies.columns))
    for dummies in (train_dummies, test_dummies):
        for column in all_columns:
            if column not in dummies.columns:
                dummies[column] = 0

    # Skip the first dummy column; the remaining ones are appended.
    kept_columns = all_columns[1:]

    encoded_frames = []
    for frame, dummies in ((train_df, train_dummies), (test_df, test_dummies)):
        without_quarter = frame.drop(columns=['quarter'])
        encoded_frames.append(pd.concat([without_quarter, dummies[kept_columns]], axis=1))

    return encoded_frames[0], encoded_frames[1]

In [None]:
# Main execution: load the data, encode quarters, evaluate every
# imputation/model combination and write the predictions to disk.
train_df, test_df = load_data("pzn-rent-train-processed.csv", "pzn-rent-test-processed.csv")
train_df, test_df = process_quarters(train_df, test_df)

# The feature lists come from the training frame; the test frame is only split.
X_train, y_train, numeric_features, categorical_features = prepare_data(train_df, 'price')
X_test, _, _, _ = prepare_data(test_df)

# Imputation strategies to compare, picked by name from one factory call.
imputer = ImputationMethods()
available_imputers = imputer.get_imputers()
imputation_methods = {
    name: available_imputers[name]
    for name in ('no_imputation', 'simple_mean', 'advanced_iterative')
}

base_models = get_models()
optimized_models = get_optimized_models()

# Models to compare: both base regressors plus the 'xgboost_optimized' entry.
selected_models = {name: base_models[name] for name in ('xgboost', 'lightgbm')}
selected_models['xgboost_optimized'] = optimized_models['xgboost_optimized']

pipeline = ModelPipeline(imputation_methods, selected_models)
results = pipeline.run_pipeline(X_train, y_train, X_test, numeric_features, categorical_features)

save_results(results)