In [31]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.io as pio
import seaborn as sns
from sklearn.model_selection import train_test_split
import plotly.graph_objects as go
import plotly.express as px
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import ConfusionMatrixDisplay


loading the dataset using pandas library

In [2]:
df=pd.read_csv(r"C:\Users\Allan\OneDrive\Desktop\House Price Prediction Dataset.csv")


Exploratory Data Analysis

In [3]:
#checking the columns and rows of our dataset
df.shape

(2000, 10)

In [4]:
#checking the first 5 rows of our dataset to understand how it looks like.
df.head()

Unnamed: 0,Id,Area,Bedrooms,Bathrooms,Floors,YearBuilt,Location,Condition,Garage,Price
0,1,1360,5,4,3,1970,Downtown,Excellent,No,149919
1,2,4272,5,4,3,1958,Downtown,Excellent,No,424998
2,3,3592,2,2,3,1938,Downtown,Good,No,266746
3,4,966,4,2,2,1902,Suburban,Fair,Yes,244020
4,5,4926,1,4,2,1975,Downtown,Fair,Yes,636056


Great! There is a mix of categorical and numerical values in this dataset. Note that we can not pass categorical variables directly to our machine learning model. We will have to encode them before model training. Let us go ahead and check the data types of our attributes.

In [5]:
df.dtypes

Id            int64
Area          int64
Bedrooms      int64
Bathrooms     int64
Floors        int64
YearBuilt     int64
Location     object
Condition    object
Garage       object
Price         int64
dtype: object



3 of the attributes have a data type of object which signifies that they are categorical in nature while the rest of them are either float or int which can be directly passed during the model training.

In [6]:
df.isna().sum()

Id           0
Area         0
Bedrooms     0
Bathrooms    0
Floors       0
YearBuilt    0
Location     0
Condition    0
Garage       0
Price        0
dtype: int64

It seems we do not have null values in our dataset

In [7]:
df= df.select_dtypes(include=['object'])
df.columns

Index(['Location', 'Condition', 'Garage'], dtype='object')

In [8]:
for cols in df.columns:
    print(cols,'-', len(df[cols].unique()),'Labels')

Location - 4 Labels
Condition - 4 Labels
Garage - 2 Labels


Since it is a single CSV file, it is better to split our dataset into train and test so that we can keep aside the test dataset for calculating the accuracy in later stages. We are using a 70-30 ratio for the train:test. The random_state variables ensure that these instances are picked randomly to minimize any bias or skewness.

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Location   2000 non-null   object
 1   Condition  2000 non-null   object
 2   Garage     2000 non-null   object
dtypes: object(3)
memory usage: 47.0+ KB


In [10]:
df.duplicated().sum()

np.int64(1968)

when have one duplicate in our dataset, lets drop it.

In [11]:
df=df.drop_duplicates()
df

Unnamed: 0,Location,Condition,Garage
0,Downtown,Excellent,No
2,Downtown,Good,No
3,Suburban,Fair,Yes
4,Downtown,Fair,Yes
5,Urban,Poor,No
6,Rural,Poor,Yes
7,Suburban,Good,Yes
9,Downtown,Poor,No
10,Suburban,Poor,Yes
11,Urban,Excellent,Yes


In [12]:
df.duplicated().sum()

np.int64(0)

In [13]:
#getting some statistical measures about the dataset
df.describe()

Unnamed: 0,Location,Condition,Garage
count,32,32,32
unique,4,4,2
top,Downtown,Excellent,No
freq,8,8,16


In [14]:
train, test = train_test_split(df,test_size=0.3,random_state= 1234)

In [18]:
print(train.columns)

Index(['Location', 'Condition', 'Garage'], dtype='object')


In [24]:
X = train[['Location', 'Condition', 'Garage']]


In [25]:
labels = [x for x in train.Location.value_counts().index]
values = train.Location.value_counts()

Distribution of Data by Location

In [35]:
pio.renderers.default = "browser"

# Your plot code
fig = go.Figure(data=[go.Pie(labels=labels, values=values, hole=.3)])

fig.update_layout(
    title_text="Distribution of data by Location (in %)"
)
fig.update_traces()
fig.show()

Distribution of data by Condition

In [37]:

fig = px.histogram(
    df,
    x="Condition",
    hover_data=df.columns,
    title="Distribution of Data by Condition",
    color="Condition",  # Colors each bar by 'Condition' category
    color_discrete_sequence=px.colors.qualitative.Pastel  # Use a pleasant color scheme
)

# Update layout for a cleaner appearance
fig.update_layout(
    title={
        'text': "Distribution of Data by Condition",
        'x': 0.5,  # Center the title
        'xanchor': 'center'
    },
    xaxis_title="Condition",
    yaxis_title="Count",
    font=dict(
        family="Arial",
        size=14,
        color="black"
    ),
    plot_bgcolor="rgba(0, 0, 0, 0)",  # Transparent background
    paper_bgcolor="rgba(0, 0, 0, 0)",
)

# Show the figure
fig.show()

Distribution of Data by Garage

In [39]:
# Create the box plot with improved styling
fig = px.bar(
    df,
    x="Garage",
    color="Garage",  # Color by 'Garage' category
    hover_data=df.columns,
    title="Distribution of Data by Garage",
    color_discrete_sequence=px.colors.qualitative.Pastel  # Use a pleasant color scheme
)

# Update layout for a cleaner appearance
fig.update_layout(
    title={
        'text': "Distribution of Data by Garage",
        'x': 0.5,  # Center the title
        'xanchor': 'center'
    },
    xaxis_title="Garage",
    yaxis_title="Values",  # Generic y-axis title
    font=dict(
        family="Arial",
        size=14,
        color="black"
    ),
    plot_bgcolor="rgba(0, 0, 0, 0)",  # Transparent background
    paper_bgcolor="rgba(0, 0, 0, 0)",
)

# Show the figure
fig.show()