In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Step 1: Load Data
def load_data(file_path):
    """Read a CSV file into a DataFrame.

    Args:
        file_path: Path (or any other source accepted by
            ``pandas.read_csv``) of the CSV to load.

    Returns:
        The ``pandas.DataFrame`` produced by ``pandas.read_csv``.
    """
    return pd.read_csv(file_path)

# Step 2: Clean 'total_sqft'
def clean_total_sqft(value):
    """Convert one raw ``total_sqft`` entry into a float.

    Three kinds of input are handled:

    * Real numbers (``int``, ``float``, NumPy integer/floating scalars) are
      returned as ``float`` unchanged.
    * Text containing a hyphen, e.g. ``"1000 - 1500"``, is treated as a range
      and the part after the last hyphen (the upper bound) is parsed.
    * Any other text is stripped down to its digits and dots before parsing,
      so ``"1,200"`` becomes ``1200.0``.

    Args:
        value: Raw cell value; may be a string, a number, ``None`` or NaN.

    Returns:
        The parsed value as ``float``, or ``np.nan`` when the remaining text
        is not a valid number (e.g. ``"34.46Sq. Meter"`` or ``None``).
    """
    # Values that are already numeric need no text cleaning. Sending them
    # through str() would mangle exponent notation (1e+20 -> "120"), and the
    # range branch used to call .split() on the float itself, raising an
    # uncaught AttributeError for negatives and small exponents.
    # bool is excluded so True/False keep falling through to the text path.
    if isinstance(value, (int, float, np.integer, np.floating)) and not isinstance(value, bool):
        return float(value)

    text = str(value)
    try:
        if '-' in text:
            # Range such as "1000 - 1500": keep the upper bound.
            return float(text.split('-')[-1].strip())
        return float(''.join(char for char in text if char.isdigit() or char == '.'))
    except ValueError:
        return np.nan

# Step 3: Preprocess Data
def preprocess_data(data):
    """Clean the raw listings and encode their categorical columns.

    Steps, in order:

    1. Parse ``total_sqft`` with ``clean_total_sqft``.
    2. Drop every row that has a missing value in any column.
    3. Drop rows whose ``total_sqft`` is not strictly positive.
    4. Add ``price_per_sqft`` = ``price`` / ``total_sqft``.
    5. Label-encode ``area_type``, ``availability``, ``location``, ``size``
       and ``society`` as integer codes.

    The input frame is not modified; a new frame is returned.

    Args:
        data: DataFrame containing at least ``total_sqft``, ``price`` and the
            five categorical columns listed above.

    Returns:
        A new DataFrame with the cleaned, encoded columns plus
        ``price_per_sqft``. The fitted encoders are not retained.
    """
    # assign() builds a new frame, so the caller's DataFrame keeps its raw
    # 'total_sqft' column instead of being overwritten as a side effect.
    cleaned = data.assign(total_sqft=data['total_sqft'].apply(clean_total_sqft))

    # Handle missing values
    cleaned = cleaned.dropna()

    # A zero area would turn the ratio below into inf/NaN, which the later
    # scaling step cannot handle. The explicit copy() makes the filtered
    # frame independent, so the column assignments that follow are not
    # chained assignments on a slice.
    cleaned = cleaned[cleaned['total_sqft'] > 0].copy()

    # Create price per sqft feature
    cleaned['price_per_sqft'] = cleaned['price'] / cleaned['total_sqft']

    # Encode categorical variables (one independent encoder per column)
    for column in ['area_type', 'availability', 'location', 'size', 'society']:
        cleaned[column] = LabelEncoder().fit_transform(cleaned[column])

    return cleaned

# Step 4: Feature Engineering
def feature_engineering(data):
    """Standardize the numeric feature columns of ``data`` in place.

    ``total_sqft``, ``bath``, ``balcony`` and ``price_per_sqft`` are replaced
    by the output of a freshly fitted ``StandardScaler``.

    Args:
        data: DataFrame containing the four numeric columns above. It is
            modified in place.

    Returns:
        The same DataFrame object, with the four columns standardized.
    """
    columns_to_scale = ['total_sqft', 'bath', 'balcony', 'price_per_sqft']

    # Fit on, and transform, the full frame that was passed in.
    standardized = StandardScaler().fit_transform(data[columns_to_scale])
    data[columns_to_scale] = standardized

    return data

# Step 5: Model Training and Evaluation
def train_and_evaluate_model(data):
    """Tune a random forest on ``data`` and report its hold-out RMSE.

    The frame is split 80/20 (``random_state=42``). A
    ``RandomForestRegressor`` is tuned on the training part with a 3-fold
    ``GridSearchCV`` scored by negative mean squared error, and the best
    estimator is evaluated on the held-out 20%.

    Args:
        data: Preprocessed DataFrame with a ``price`` target column. ``ID``
            and ``price_per_sqft`` columns, if present, are excluded from
            the features.

    Returns:
        Tuple ``(best_model, rmse)``: the refitted best estimator and its
        root mean squared error on the test split.

    Raises:
        KeyError: If ``data`` has no ``price`` column.
    """
    # Separate features and target
    y = data['price']

    # 'price_per_sqft' is computed from the target itself. Together with
    # 'total_sqft' it lets the model reconstruct 'price', which inflates the
    # score and cannot be computed for unseen listings. 'ID' is only a row
    # identifier. Both are dropped tolerantly so frames without them work.
    X = data.drop(columns=['price']).drop(
        columns=['ID', 'price_per_sqft'], errors='ignore'
    )

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Hyperparameter tuning using GridSearchCV
    param_grid = {
        'n_estimators': [100, 200],
        'max_depth': [10, 20, None],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2],
    }
    grid_search = GridSearchCV(
        RandomForestRegressor(random_state=42),
        param_grid,
        cv=3,
        scoring='neg_mean_squared_error',
        n_jobs=-1,
    )
    grid_search.fit(X_train, y_train)

    # Best model found by the search
    best_model = grid_search.best_estimator_

    # RMSE on the held-out test split
    y_pred = best_model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    return best_model, rmse

# Main Execution
if __name__ == "__main__":
    # Location of the training CSV; point this at the real dataset.
    file_path = "train.csv"  # Replace with actual file path

    # Load the raw rows, clean and encode them, then standardize the
    # numeric columns.
    data = feature_engineering(preprocess_data(load_data(file_path)))

    # Tune the random forest and score it on the held-out split.
    best_model, rmse = train_and_evaluate_model(data)

    print("Best Model:", best_model)
    print(f"Root Mean Squared Error (RMSE): {rmse}")


In [None]:
# Label-encode every object-dtype column of the training frame and apply the
# same label -> code mapping to the test frame. The fitted encoders are kept
# in `label_encoders`, keyed by column name.
# NOTE(review): `processed_train_data` and `processed_test_data` are defined
# outside this cell; this assumes the test frame has every categorical column
# found in the training frame. Confirm against the cell that builds them.
label_encoders = {}
categorical_cols = processed_train_data.select_dtypes(include=['object']).columns
for col in categorical_cols:
    print("for te col: ", col)
    le = LabelEncoder()
    processed_train_data[col] = le.fit_transform(processed_train_data[col].astype(str))

    # le.transform() raises ValueError for any label that did not occur in
    # the training column. Look the codes up instead and give unseen labels
    # the sentinel -1. Labels seen in training get exactly the code
    # le.transform() would return, since classes_ is indexed by code.
    # -1 is not a valid input to le.inverse_transform().
    code_by_label = {label: code for code, label in enumerate(le.classes_)}
    processed_test_data[col] = (
        processed_test_data[col]
        .astype(str)
        .map(code_by_label)
        .fillna(-1)
        .astype('int64')
    )
    label_encoders[col] = le