<a href="https://colab.research.google.com/github/Mithun-mondol/Python/blob/main/Assignment_05_Data_Preprocessing_and_Pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [29]:
#Load the Data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import mean_squared_error, r2_score


# TODO: Load the housing data
from google.colab import drive    # import drive
drive.mount('/content/drive')     # connect to drive

import warnings
warnings.filterwarnings('ignore') # ignore waring

housing = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Data/Housing.csv')

# TODO: How many houses are there?
print(f"Number of houses: {len(housing)}")  # show number of rows in housing dataset

# TODO: Show first 3 houses
# Your code here

# Basic information of housing dataset
print("\nBasic information of housing dataset:\n")
print("Dataset Shape: ", housing.shape)
print("\nDataset first 3 houses:\n ", housing.head(3))
print("\nDataset Description:\n ", housing.describe())
print("\nDataset Info:")
print(housing.info())

print("\nChecking missing values in the dataset:")
print(housing.isnull().sum())

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Number of houses: 545

Basic information of housing dataset:

Dataset Shape:  (545, 13)

Dataset first 3 houses:
        price  area  bedrooms  bathrooms  stories mainroad guestroom basement  \
0  13300000  7420         4          2        3      yes        no       no   
1  12250000  8960         4          4        4      yes        no       no   
2  12250000  9960         3          2        2      yes        no      yes   

  hotwaterheating airconditioning  parking prefarea furnishingstatus  
0              no             yes        2      yes        furnished  
1              no             yes        3       no        furnished  
2              no              no        2      yes   semi-furnished  

Dataset Description:
                price          area    bedrooms   bathrooms     stories  \
count  5.450000e+02    545.000000  545.000000  545.000000 

In [30]:
# Look at Columns
# TODO: Show all column names
print("Columns:")
# Your code here
print(housing.columns.tolist())   # Show all column name as a list
# TODO: Show data types of columns
print("\nData types:")
print(housing.dtypes)     # Show each columns data types

Columns:
['price', 'area', 'bedrooms', 'bathrooms', 'stories', 'mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'parking', 'prefarea', 'furnishingstatus']

Data types:
price                int64
area                 int64
bedrooms             int64
bathrooms            int64
stories              int64
mainroad            object
guestroom           object
basement            object
hotwaterheating     object
airconditioning     object
parking              int64
prefarea            object
furnishingstatus    object
dtype: object


# Label Encoding

In [31]:
#Convert Yes/No to 1/0
# Check what's in our data
print("Current mainroad values:\n", housing['mainroad'].head())
# Shows: yes, no, yes, yes, no

# Models need numbers, not text!
# We need: yes → 1, no → 0

# Creating a Label Encoder
label_encoder = LabelEncoder()

# Fit and transform the 'mainroad' column
housing['mainroad'] = label_encoder.fit_transform(housing['mainroad'])

# Fit and transform all the columns that have contains yes/no
for column in ['mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'prefarea']:
    housing[column] = label_encoder.fit_transform(housing[column])

# Fit and transform all the columns that have contains yes/no through Loop
for col in housing.columns:
    if housing[col].dtype == 'object' and set(
                                              housing[col]
                                              .dropna()
                                              .str.lower()
                                              .unique()
                                              ).issubset({'yes', 'no'}):
        housing[col] = label_encoder.fit_transform(housing[col])

# Fit and transform the 'furnishingstatus' column
housing_encoded = pd.get_dummies(housing['furnishingstatus'], drop_first = False)

# Concatenate the encoded 'furnishingstatus' columns with the original DataFrame
housing = pd.concat([housing, housing_encoded], axis=1)

# Drop the original 'furnishingstatus' column
housing.drop('furnishingstatus', axis=1, inplace=True)

# Check the Encoded data
print("\nEncoded mainroad values:\n", housing.head())


Current mainroad values:
 0    yes
1    yes
2    yes
3    yes
4    yes
Name: mainroad, dtype: object

Encoded mainroad values:
       price  area  bedrooms  bathrooms  stories  mainroad  guestroom  \
0  13300000  7420         4          2        3         1          0   
1  12250000  8960         4          4        4         1          0   
2  12250000  9960         3          2        2         1          0   
3  12215000  7500         4          2        2         1          0   
4  11410000  7420         4          1        2         1          1   

   basement  hotwaterheating  airconditioning  parking  prefarea  furnished  \
0         0                0                1        2         1       True   
1         0                0                1        3         0       True   
2         1                0                0        2         1      False   
3         1                0                1        3         1       True   
4         1                0                

# Train Data

In [32]:
# Split into Train and Test
# TODO: Split - 80% train, 20% test

# Define X-Axis Data
X = housing.drop('price', axis=1)

# Define Y-Axis Data
y = housing['price']

# Train/Test split into (80/20)
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2,    # 20% data for test
                                                    random_state=42   # Same data used every time
                                                    )
print(f"Traning data shape: {X_train.shape}")
print(f"Testing data shape: {X_test.shape}")

Traning data shape: (436, 14)
Testing data shape: (109, 14)


In [33]:
#Apply StandardScaler
# Area is huge (1650-13300), Bedrooms is small (1-6)!
#Your Code here

# Create a Scaler
scaler = StandardScaler()

# Learn from training data and scale it
X_train_scaled = scaler.fit_transform(X_train)

# Scale test data (just trandform)
X_test_scaled = scaler.transform(X_test)

# Data scaled
#print("Scaled training data:\n", X_train_scaled[:5])
#print("\nScaled test data:\n", X_test_scaled[:5])

In [34]:
#train and test model
#your code here

# Create a model
model = LogisticRegression(max_iter = 1000)

# Train the model
model.fit(X_train_scaled, y_train)

# Model trained
print("Model trained!")

# Check accuracy on train data
train_score = model.score(X_train_scaled, y_train)
print(f"Train accuracy: {train_score:.1%}")

# Check accuracy on test data
test_score = model.score(X_test_scaled, y_test)
print(f"Test accuracy: {test_score:.1%}")

# Do cross-validation of the Model
cv_scores = cross_val_score(
                            model,            # model
                            X_train_scaled,   # trained data
                            y_train,          # trained labels
                            cv=10              # 10 mini tests
                            )
print(f"Cross-validation scores: {cv_scores}")
for i in range(len(cv_scores)):
    print(f"Test {i+1}: {cv_scores[i]:.1%}")

print(f"Mean cross-validation score: {np.mean(cv_scores):.1%}")
print(f"this means our model is {cv_scores.mean():.1%} accurate!!!")

Model trained!
Train accuracy: 42.2%
Test accuracy: 1.8%
Cross-validation scores: [0.02272727 0.02272727 0.         0.02272727 0.02272727 0.04545455
 0.04651163 0.04651163 0.         0.04651163]
Test 1: 2.3%
Test 2: 2.3%
Test 3: 0.0%
Test 4: 2.3%
Test 5: 2.3%
Test 6: 4.5%
Test 7: 4.7%
Test 8: 4.7%
Test 9: 0.0%
Test 10: 4.7%
Mean cross-validation score: 2.8%
this means our model is 2.8% accurate!!!


In [35]:
import joblib
# Predict a House Price
# New house details:
# area=5000, bedrooms=3, bathrooms=2, stories=2,
# mainroad=yes, guestroom=no, basement=yes,
# hotwaterheating=no, airconditioning=yes,
# parking=2, prefarea=yes, furnishingstatus=furnished

#your code here
housing = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Data/Housing.csv')

# Binary yes/no → 1/0 with a FIXED map
yn_cols = ['mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'prefarea']
for col in yn_cols:
  housing[col] = housing[col].astype(str).str.strip().str.lower().map({'yes': 1, 'no': 0})

# One-hot for furnishingstatus
housing = pd.get_dummies(housing, columns=['furnishingstatus'], drop_first=False)

# Split features/target
x = housing.drop('price', axis=1)
y = housing['price']

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Scale numeric features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train a REGRESSION model
model = LinearRegression()
model.fit(X_train_scaled, y_train)

print("Train R^2: ", model.score(X_train_scaled, y_train))
print('Test R^2: ', model.score(X_test_scaled, y_test))

# Save model, scaler, and the training column order (crucial!)
joblib.dump(model, 'housing_model.joblib')
joblib.dump(scaler, 'scaler.joblib')
joblib.dump(x.columns.tolist(), 'features_cols.joblib')

########### ------------------ ####################

# --- Load artifacts ---
model = joblib.load('housing_model.joblib')
scaler = joblib.load('scaler.joblib')
features_cols = joblib.load('features_cols.joblib')

# New house details:
new_house = {
    "area": 5000,
    "bedrooms": 3,
    "bathrooms": 2,
    "stories": 2,
    "mainroad": "yes",
    "guestroom": "no",
    "basement": "yes",
    "hotwaterheating": "no",
    "airconditioning": "yes",
    "parking": 2,
    "prefarea": "yes",
    "furnishingstatus": "furnished",
}

# Create a dataframe with new data
new_house = pd.DataFrame([new_house])

# Scale the new data
# Creating a Label Encoder
label_encoder = LabelEncoder()

# 1) Apply the SAME fixed yes/no mapping
for col in yn_cols:
  new_house[col] = new_house[col].astype(str).str.strip().str.lower().map({'yes': 1, 'no': 0})

# One-hot furnishingstatus, then REINDEX to training columns
new_house = pd.get_dummies(new_house, columns=['furnishingstatus'], drop_first=False)
new_house = new_house.reindex(columns=features_cols, fill_value=0)

# Scale with the TRAINED scaler
new_house_scaled = scaler.transform(new_house)

# Predict the price of the new house
predicted_price = model.predict(new_house)[0]
print(f"Predicted price for the new house: {predicted_price:.2f}")

Train R^2:  0.6859438988560158
Test R^2:  0.6529242642153174
Predicted price for the new house: 2605794840.16


# **Using a single Pipeline**

In [36]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib

# Load Dataset
house = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Data/Housing.csv')

# Split features data and label or target column
x = house.drop(columns = 'price')
y = house['price']

# Split column data type
binary_cols = ['mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'prefarea']
categorical_cols = ['furnishingstatus']
numeric_cols = x.columns.difference(binary_cols + categorical_cols).tolist()

# Function to map yes/no → 1/0 (vectorized over DataFrame slice)
def yes_no_array(X):
    # Convert yes/no to 1/0 elementwise
    X = np.char.lower(np.array(X, dtype=str))
    return np.where(X == "yes", 1, np.where(X == "no", 0, np.nan))

binary_encoder = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("yesno_map", FunctionTransformer(yes_no_array)),
])

# Create pipeline for Categorical Columns
categorical_encoder = Pipeline(
    steps = [
        ('imputer', SimpleImputer(strategy = 'most_frequent')),                 # Replace missing value with most frequent value
        ('onehot', OneHotEncoder(drop = 'first', handle_unknown = 'ignore'))    # Convert categorical data into number
    ]
)

# Create pipeline for Numeric Columns
numeric_encoder = Pipeline(
    steps = [
        ('imputer', SimpleImputer(strategy='mean')),    # Replace missing value with mean value
        ('scaler', StandardScaler())                    # standardizes numeric columns
    ]
)

# Preprocessing data
preprocessor = ColumnTransformer(
    transformers = [
        ('binary', binary_encoder, binary_cols),
        ('categorical', categorical_encoder, categorical_cols),
        ('numeric', numeric_encoder, numeric_cols)
    ],
    remainder = 'drop'
)

# Create Model
model = LinearRegression()

pipe = Pipeline(
    steps = [
        ('preprocessor', preprocessor),
        ('model', model)
    ]
)

# Split data
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 42)

# Train model
pipe.fit(X_train, y_train)

# Evaluate model
predict = pipe.predict(X_test)
print('MAE: ', mean_absolute_error(y_test, predict))
print('MSE: ', mean_squared_error(y_test, predict))
print('R^2: ', r2_score(y_test, predict))
print("Train R^2: ", pipe.score(X_train, y_train))
print('Test R^2: ', pipe.score(X_test, y_test))


# Save model
joblib.dump(pipe, 'house_model.pkl')

new_house = {
    "area": 5000, "bedrooms": 3, "bathrooms": 2, "stories": 2,
    "mainroad": "yes", "guestroom": "no", "basement": "yes",
    "hotwaterheating": "no", "airconditioning": "yes",
    "parking": 2, "prefarea": "yes", "furnishingstatus": "furnished",
}

new_house = pd.DataFrame([new_house])
pipe = joblib.load('house_model.pkl')
predicted_price = pipe.predict(new_house)[0]
print(f"\nPredicted price for the new house: {predicted_price:.2f}")

MAE:  970043.4039201641
MSE:  1754318687330.6667
R^2:  0.6529242642153178
Train R^2:  0.6859438988560158
Test R^2:  0.6529242642153178

Predicted price for the new house: 7303227.50
