# **AMES HOUSE PRICE PREDICTION SKLEARN PIPELINE**

## **Imports**

In [None]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

## **Data**

In [None]:
# Load the Ames data set from a CSV file in the current working directory
df = pd.read_csv("ames.csv")

# Preview the first rows (shown as the cell output)
df.head()

In [None]:
# Print a summary of the columns (dtypes and non-null counts)
df.info()

In [None]:
# Create Train / Test Split
# The first TRAIN_SIZE rows are used for training and all remaining rows for
# testing. Both slices share the same boundary so that every row lands in
# exactly one set (a test slice starting at 2001 would silently drop row 2000).
# NOTE(review): the split is positional, not shuffled - confirm the rows of
# the CSV are not ordered in a way that biases the evaluation.
TRAIN_SIZE = 2000

X_train = df.iloc[:TRAIN_SIZE].copy()
X_test = df.iloc[TRAIN_SIZE:].copy()

y_train = df['Sale_Price'].iloc[:TRAIN_SIZE]
y_test = df['Sale_Price'].iloc[TRAIN_SIZE:]

# Drop Sale Price From X Data Sets (the target must not leak into the features)
X_train.drop(columns=['Sale_Price'], inplace=True)
X_test.drop(columns=['Sale_Price'], inplace=True)

# Print Out The Set Sizes
print("Train: " + str(len(y_train)))
print("Test: " + str(len(y_test)))

In [None]:
# Inspect the training target (shown as the cell output)
y_train

## **Training Pipeline**

In [None]:
# Split the feature columns by dtype: int64/float64 columns are handled as
# numeric features, object/category columns as categorical features
numeric_columns = X_train.select_dtypes(include=['int64', 'float64']).columns
categorical_columns = X_train.select_dtypes(include=['object', 'category']).columns

numeric_features = list(numeric_columns)
categorical_features = list(categorical_columns)

print("Numeric:", numeric_features)
print("Categoric:", categorical_features)

### Preprocessing Pipeline

In [None]:
# Create Preprocessing Pipeline

# Numeric Features: fill missing values with the column mean, then standardize
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Categorical Features: fill missing values with the most frequent category,
# then one-hot encode. Categories unseen during fit are ignored at transform
# time instead of raising an error.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Create Preprocessing Transformer Pipeline
# Each transformer is applied only to its own list of columns.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Create Full Training Pipeline
# Sale_Price is a continuous target, so the default estimator is a regressor.
# The step keeps the name 'classifier' because the grid search parameter keys
# ('classifier', 'classifier__...') refer to it by that name.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestRegressor())
])

### Grid Search

In [None]:
# Models That Should Be Evaluated & Hyperparameter Grid For GridSearch
# Sale_Price is a continuous target, so the candidate models are regressors.
# Each dict swaps the pipeline's final step (named 'classifier') for one model
# and lists the hyperparameter values to try for that model.

param_grid = [
    {
        'classifier': [RandomForestRegressor()],
        'classifier__n_estimators': [100, 200],
        'classifier__max_depth': [5, 10]
    },
    {
        'classifier': [GradientBoostingRegressor()],
        'classifier__n_estimators': [100, 200],
        'classifier__learning_rate': [0.01, 0.1],
        'classifier__max_depth': [3, 5]
    }
]

# 3-fold cross-validation; with no explicit `scoring`, the estimator's own
# score method is used (R^2 for regressors)
grid_search = GridSearchCV(pipeline, param_grid, cv=3)

In [None]:
# Run the grid search on the training data
grid_search.fit(X_train, y_train)

In [None]:
# Show the best parameter combination found by the search (cell output)
grid_search.best_params_

In [None]:
# Report the best cross-validated parameter combination per estimator type
results = pd.DataFrame(grid_search.cv_results_)

# Take the estimator classes from the search results themselves rather than
# from a hard-coded list: a class that was never part of the grid would yield
# an empty selection and make idxmin() raise, and a class missing from the
# list would never be reported.
estimator_types = [type(estimator) for estimator in results['param_classifier']]

for classifier in dict.fromkeys(estimator_types):  # unique, in first-seen order
    is_current_type = [estimator_type is classifier for estimator_type in estimator_types]
    classifier_results = results[is_current_type]

    # rank_test_score == 1 is the best candidate, so the minimum rank wins
    best_row = classifier_results.loc[classifier_results['rank_test_score'].idxmin()]

    # Parameters that do not apply to this estimator are stored as NaN (not
    # None) in the DataFrame, so they have to be filtered with pd.notna
    best_params = best_row.loc[[
        param for param in results.columns
        if param.startswith('param_') and pd.notna(best_row[param])
    ]]

    print(f"Best parameters for {classifier.__name__}:")
    print(best_params)
    print()

    

In [None]:
# Evaluate the refitted best model from the grid search on the held-out test
# set. The metrics compare the true test targets against the model's
# predictions for X_test (not against the training targets).
y_pred = grid_search.predict(X_test)

# Sale_Price is continuous, so regression metrics are reported
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred) ** 0.5  # root of the MSE
r2 = r2_score(y_test, y_pred)

report = (
    f"MAE:  {mae:,.2f}\n"
    f"RMSE: {rmse:,.2f}\n"
    f"R^2:  {r2:.4f}"
)
print(report)