In [18]:
import pandas as pd 
dataset=pd.read_csv("housing.csv")
dataset.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [19]:
numerical_cols = dataset.select_dtypes(include='number').columns
categorical_cols = dataset.select_dtypes(include='object').columns

print(numerical_cols)
print(categorical_cols)

Index(['price', 'area', 'bedrooms', 'bathrooms', 'stories', 'parking'], dtype='object')
Index(['mainroad', 'guestroom', 'basement', 'hotwaterheating',
       'airconditioning', 'prefarea', 'furnishingstatus'],
      dtype='object')


# **Data Preprocessing**

In [20]:
from sklearn.preprocessing import StandardScaler,OrdinalEncoder
scaler = StandardScaler()
ordinal_encoder = OrdinalEncoder()

In [21]:
dataset[numerical_cols] = scaler.fit_transform(dataset[numerical_cols])
dataset[categorical_cols] = ordinal_encoder.fit_transform(dataset[categorical_cols])
dataset.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,4.566365,1.046726,1.403419,1.421812,1.378217,1.0,0.0,0.0,0.0,1.0,1.517692,1.0,0.0
1,4.004484,1.75701,1.403419,5.405809,2.532024,1.0,0.0,0.0,0.0,1.0,2.679409,0.0,0.0
2,4.004484,2.218232,0.047278,1.421812,0.22441,1.0,0.0,1.0,0.0,0.0,1.517692,1.0,1.0
3,3.985755,1.083624,1.403419,1.421812,0.22441,1.0,0.0,1.0,0.0,1.0,2.679409,1.0,0.0
4,3.554979,1.046726,1.403419,-0.570187,0.22441,1.0,1.0,1.0,0.0,1.0,1.517692,0.0,0.0


In [22]:
target_var = 'prefarea'
dataset[target_var].value_counts()

prefarea
0.0    417
1.0    128
Name: count, dtype: int64

In [23]:
X = dataset.drop(columns=target_var)
y = dataset[target_var]

X.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,furnishingstatus
0,4.566365,1.046726,1.403419,1.421812,1.378217,1.0,0.0,0.0,0.0,1.0,1.517692,0.0
1,4.004484,1.75701,1.403419,5.405809,2.532024,1.0,0.0,0.0,0.0,1.0,2.679409,0.0
2,4.004484,2.218232,0.047278,1.421812,0.22441,1.0,0.0,1.0,0.0,0.0,1.517692,1.0
3,3.985755,1.083624,1.403419,1.421812,0.22441,1.0,0.0,1.0,0.0,1.0,2.679409,0.0
4,3.554979,1.046726,1.403419,-0.570187,0.22441,1.0,1.0,1.0,0.0,1.0,1.517692,0.0


In [24]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42
)

In [25]:
# pip install imblearn

In [28]:
pd.Series(y_train).value_counts()

prefarea
0.0    334
1.0    102
Name: count, dtype: int64

In [29]:
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

print("Resampled class distribution:", pd.Series(y_train).value_counts())

Resampled class distribution: prefarea
0.0    334
1.0    334
Name: count, dtype: int64


In [30]:
from sklearn.tree import DecisionTreeClassifier

decision_tree = DecisionTreeClassifier(random_state=42)
decision_tree.fit(X_train, y_train)

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [31]:
y_pred = decision_tree.predict(X_test)
print(y_pred[:10])

[0. 1. 0. 0. 0. 1. 0. 0. 0. 0.]


In [32]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))

0.6880733944954128


In [33]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report

""" Initialize model """
knn_model = KNeighborsClassifier()

""" Train the model """
knn_model.fit(X_train, y_train)

""" Classify the instances """
y_pred = knn_model.predict(X_test)

""" Evaluate the models' performance """
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

""" Print accuracy and classification report """
print(f"KNN Classifier Accuracy: {accuracy}")
print("Classification Report:\n", report)

KNN Classifier Accuracy: 0.6238532110091743
Classification Report:
               precision    recall  f1-score   support

         0.0       0.83      0.64      0.72        83
         1.0       0.33      0.58      0.42        26

    accuracy                           0.62       109
   macro avg       0.58      0.61      0.57       109
weighted avg       0.71      0.62      0.65       109



In [34]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

""" Initialize model """
svm_model = SVC(probability=True, random_state=42)

""" Train the model """
svm_model.fit(X_train, y_train)

""" Classify the instances """
y_pred = svm_model.predict(X_test)

""" Evaluate the models' performance """
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

""" Print accuracy and classification report """
print(f"SVM Classifier Accuracy: {accuracy}")
print("Classification Report:\n", report)

SVM Classifier Accuracy: 0.6788990825688074
Classification Report:
               precision    recall  f1-score   support

         0.0       0.82      0.73      0.78        83
         1.0       0.37      0.50      0.43        26

    accuracy                           0.68       109
   macro avg       0.60      0.62      0.60       109
weighted avg       0.72      0.68      0.69       109



# **Ensemble learning**

In [35]:
""" Boosting
Boosting => Models are trained sequentially

Suppose,
decision_1.fit(X_train, y_train)
decision_2.fit(incorrect_predicted_rows)
decision_3.fit(incorrect_predicted_rows)
"""


from sklearn.ensemble import AdaBoostClassifier

""" Initialize model """
boosting_model = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=3,
    random_state=42
)

""" Train the model """
boosting_model.fit(X_train, y_train)

""" Classify the instances """
y_pred = boosting_model.predict(X_test)

""" Evaluate the models' performance """
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

""" Print accuracy and classification report """
print(f"Boosting Classifier Accuracy: {accuracy}")
print("Classification Report:\n", report)

Boosting Classifier Accuracy: 0.6972477064220184
Classification Report:
               precision    recall  f1-score   support

         0.0       0.85      0.73      0.79        83
         1.0       0.41      0.58      0.48        26

    accuracy                           0.70       109
   macro avg       0.63      0.66      0.63       109
weighted avg       0.74      0.70      0.71       109



In [37]:
""" Bagging
Models are learned idividually
single decision tree: accuracy: 71%

X_train
y_train

multiple decision tree:
   decision_tree_1.fit(X_train_1, y_train_1)
   decision_tree_2.fit(X_train_2, y_train_2)
   ...
   decision_tree_N.fit(X_train_N, y_train_N)

   predict(x):
      y_pred_1 = decision_tree_1.predict(x)
      y_pred_2 = decision_tree_2.predict(x)
      ...

      return the majority prediction
"""


from sklearn.ensemble import BaggingClassifier

""" Initialize model """
bagging_model = BaggingClassifier(estimator=DecisionTreeClassifier(), n_estimators=10, random_state=42)

""" Train the model """
bagging_model.fit(X_train, y_train)

""" Classify the instances """
y_pred = bagging_model.predict(X_test)

""" Evaluate the models' performance """
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

""" Print accuracy and classification report """
print(f"Bagging Classifier Accuracy: {accuracy}")
print("Classification Report:\n", report)

Bagging Classifier Accuracy: 0.8165137614678899
Classification Report:
               precision    recall  f1-score   support

         0.0       0.86      0.90      0.88        83
         1.0       0.64      0.54      0.58        26

    accuracy                           0.82       109
   macro avg       0.75      0.72      0.73       109
weighted avg       0.81      0.82      0.81       109



In [38]:
"""
Stacking is an ensemble learning technique that combines predictions from multiple base models (of different types) to improve performance.
"""
from sklearn.ensemble import StackingClassifier, RandomForestClassifier

estimators = [
    ('decision_tree', DecisionTreeClassifier()),
    ('knn', KNeighborsClassifier()),
    ('svm', SVC(probability=True, random_state=42))
]

""" Initialize model """
stacking_model = StackingClassifier(estimators=estimators, final_estimator=RandomForestClassifier())

""" Train the model """
stacking_model.fit(X_train, y_train)

""" Classify the instances """
y_pred = stacking_model.predict(X_test)

""" Evaluate the models' performance """
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

""" Print accuracy and classification report """
print(f"Bagging Classifier Accuracy: {accuracy}")
print("Classification Report:\n", report)

Bagging Classifier Accuracy: 0.6880733944954128
Classification Report:
               precision    recall  f1-score   support

         0.0       0.81      0.77      0.79        83
         1.0       0.37      0.42      0.39        26

    accuracy                           0.69       109
   macro avg       0.59      0.60      0.59       109
weighted avg       0.70      0.69      0.70       109



In [39]:
""" Voting """

from sklearn.ensemble import VotingClassifier

""" Initialize model """

voting_model = VotingClassifier(
    estimators=[
        ('decision_tree', DecisionTreeClassifier()),
        ('knn', KNeighborsClassifier()),
        ('svm', SVC(probability=True))],
    voting='soft'  # Use 'hard' for majority voting
)

""" Train the model """
voting_model.fit(X_train, y_train)

""" Classify the instances """
y_pred = voting_model.predict(X_test)

""" Evaluate the models' performance """
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

""" Print accuracy and classification report """
print(f"Voting Classifier Accuracy: {accuracy}")
print("Classification Report:\n", report)

Voting Classifier Accuracy: 0.7247706422018348
Classification Report:
               precision    recall  f1-score   support

         0.0       0.84      0.78      0.81        83
         1.0       0.44      0.54      0.48        26

    accuracy                           0.72       109
   macro avg       0.64      0.66      0.65       109
weighted avg       0.75      0.72      0.73       109

