# Model Training

In [1]:
import pandas as pd

df = pd.read_csv('housing_price_dataset.csv')
print(df.head())
print(df.info())


   SquareFeet  Bedrooms  Bathrooms Neighborhood  YearBuilt          Price
0        2126         4          1        Rural       1969  215355.283618
1        2459         3          2        Rural       1980  195014.221626
2        1860         2          1       Suburb       1970  306891.012076
3        2294         2          1        Urban       1996  206786.787153
4        2130         5          2       Suburb       2001  272436.239065
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   SquareFeet    50000 non-null  int64  
 1   Bedrooms      50000 non-null  int64  
 2   Bathrooms     50000 non-null  int64  
 3   Neighborhood  50000 non-null  object 
 4   YearBuilt     50000 non-null  int64  
 5   Price         50000 non-null  float64
dtypes: float64(1), int64(4), object(1)
memory usage: 2.3+ MB
None


# Logistic Regression 

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset
df = pd.read_csv('housing_price_dataset.csv')

# --- 1. Feature Engineering ---
# Create a new 'Age' feature
current_year = 2024 # You can set this to the year the data was collected
df['Age'] = current_year - df['YearBuilt']

# Create a 'Rooms' interaction feature
df['TotalRooms'] = df['Bedrooms'] + df['Bathrooms']

# One-hot encode the 'Neighborhood' column
df = pd.get_dummies(df, columns=['Neighborhood'], drop_first=True)

# Create a binary target variable for classification
median_price = df['Price'].median()
df['Is_Expensive'] = (df['Price'] > median_price).astype(int)

# Define features (X) and target (y)
# We add our new features and drop the original 'YearBuilt'
features = ['SquareFeet', 'Bedrooms', 'Bathrooms', 'Age', 'TotalRooms'] + [col for col in df.columns if 'Neighborhood_' in col]
X = df[features]
y = df['Is_Expensive']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

# --- 2. Feature Scaling ---
# Initialize the StandardScaler
scaler = StandardScaler()

# Fit the scaler on the training data and transform both the training and test data
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test) # Only transform the test data

# --- 3. Hyperparameter Tuning with GridSearchCV ---
# Define the model
log_model = LogisticRegression(max_iter=5000)

# Define the grid of parameters to search
# We'll test different values for the regularization parameter 'C'
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}

# Create the GridSearchCV object
# cv=5 means 5-fold cross-validation
grid_search = GridSearchCV(log_model, param_grid, cv=5, scoring='accuracy')

# Fit the grid search to the scaled training data
grid_search.fit(X_train_scaled, y_train)

# Get the best estimator (the model with the best parameters)
best_model = grid_search.best_estimator_

# --- 4. Evaluation ---
# Make predictions with the best model
y_pred = best_model.predict(X_test_scaled)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(f"Best Hyperparameters: {grid_search.best_params_}")
print(f"Improved Accuracy: {accuracy:.4f}\n")
print("Classification Report:")
print(class_report)

Best Hyperparameters: {'C': 0.001}
Improved Accuracy: 0.8025

Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.80      0.80      9980
           1       0.80      0.80      0.80     10020

    accuracy                           0.80     20000
   macro avg       0.80      0.80      0.80     20000
weighted avg       0.80      0.80      0.80     20000



# Decision Tree Classifier

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
import io

# 1. Load and Prepare the Data
# Correctly loads the data from the uploaded file
df = pd.read_csv("housing_price_dataset.csv")

# 2. Feature Engineering
current_year = pd.to_datetime('today').year
df['Age'] = current_year - df['YearBuilt']
df['TotalRooms'] = df['Bedrooms'] + df['Bathrooms']
df = pd.get_dummies(df, columns=['Neighborhood'], drop_first=True)

# 3. Define Target and Features for a Classification Task
median_price = df['Price'].median()
df['Is_Expensive'] = (df['Price'] > median_price).astype(int)
features = ['SquareFeet', 'Bedrooms', 'Bathrooms', 'Age', 'TotalRooms'] + [col for col in df.columns if 'Neighborhood_' in col]
X = df[features]
y = df['Is_Expensive']

# 4. Split the Data into Training and Testing Sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# 5. Hyperparameter Tuning with GridSearchCV
print("Starting hyperparameter tuning for Decision Tree...")
# Define the grid of parameters to search
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [10, 15, 20, 25],
    'min_samples_split': [2, 10, 20],
    'min_samples_leaf': [1, 5, 10]
}
# Create and configure the GridSearchCV object
dt_classifier = DecisionTreeClassifier(random_state=42)
grid_search = GridSearchCV(estimator=dt_classifier, param_grid=param_grid,
                           cv=5, n_jobs=-1, scoring='accuracy', verbose=1)
# Find the best model
grid_search.fit(X_train, y_train)

# 6. Get the Best Model and Evaluate its Performance
best_dt_model = grid_search.best_estimator_
y_pred = best_dt_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, digits=4)

# --- 7. Print the Results ---
print("\n--- Hyperparameter Tuning Complete ---")
print(f"Best Parameters Found: {grid_search.best_params_}")
print(f"Best Cross-Validation Accuracy: {grid_search.best_score_:.4f}")
print("\n--- Final Model Performance on Test Data ---")
print(f"Test Set Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(report)

Starting hyperparameter tuning for Decision Tree...
Fitting 5 folds for each of 72 candidates, totalling 360 fits

--- Hyperparameter Tuning Complete ---
Best Parameters Found: {'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 5, 'min_samples_split': 20}
Best Cross-Validation Accuracy: 0.7924

--- Final Model Performance on Test Data ---
Test Set Accuracy: 0.7904

Classification Report:
              precision    recall  f1-score   support

           0     0.7923    0.7872    0.7897      5000
           1     0.7886    0.7936    0.7911      5000

    accuracy                         0.7904     10000
   macro avg     0.7904    0.7904    0.7904     10000
weighted avg     0.7904    0.7904    0.7904     10000



# Random Forest Classifier

In [4]:
df.columns

Index(['SquareFeet', 'Bedrooms', 'Bathrooms', 'YearBuilt', 'Price', 'Age',
       'TotalRooms', 'Neighborhood_Suburb', 'Neighborhood_Urban',
       'Is_Expensive'],
      dtype='object')

In [5]:
df_encoded = pd.get_dummies(df, drop_first=True)


In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder

df=pd.read_csv('housing_price_dataset.csv')
# Define features and target


label_encoders = {}
for col in ['Neighborhood']:
    le = LabelEncoder()
    df[col] = le.fit_transform(df['Neighborhood'])
    label_encoders[col] = le


X = df.drop('Neighborhood', axis=1)  # Replace 'price' with your target column
y = df['Neighborhood']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestRegressor()
model.fit(X_train, y_train)


# Predict and evaluate
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)

print(f'Random Forest MSE: {mse:.2f}')



Random Forest MSE: 0.71


# SVM (Support Vector Machine)

In [7]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [8]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [9]:
from sklearn.svm import SVR

svm = SVR(kernel='rbf', C=100, gamma=0.1)
svm.fit(X_train_scaled, y_train)


In [10]:
from sklearn.metrics import mean_squared_error, r2_score

y_pred = svm.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)


print(f'SVM R² Score: {r2:.2f}')
print(f'SVM RMSE: {mse**0.5:.2f}')



SVM R² Score: -0.01
SVM RMSE: 0.82


In [None]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'C': [1, 10, 100],
    'gamma': [0.01, 0.1, 1],
    'kernel': ['rbf', 'linear']
}

grid_search = GridSearchCV(SVR(), param_grid, cv=5, scoring='r2', n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

print("Best Parameters:", grid_search.best_params_)
