In [1]:
# Location of the processed dataset.
# NOTE(review): the load cell reads this path with pd.read_pickle even though the
# name ends in .csv — confirm the on-disk format.
DATA_PATH = "../data/Processed/V3Processed.csv"
# Directory that the figN.write_html(...) cells export their HTML figures into.
FIG_DIR = "../report/Figures"


In [2]:
import pandas as pd 
import numpy as np 
import os 
import seaborn as sns 
import plotly.express as px 
import plotly.graph_objects as go
import plotly.figure_factory as ff
from sklearn.impute import KNNImputer
from datasist.structdata import detect_outliers

In [3]:
# NOTE(review): DATA_PATH ends in .csv but is loaded with read_pickle; the recorded
# output suggests this works, so the file is presumably a pickle — confirm and
# consider renaming it. Unpickling can execute arbitrary code: only load trusted files.
df = pd.read_pickle(DATA_PATH)
df.shape

(7629, 12)

In [4]:
df.sample(3)

Unnamed: 0,year,locality,estimated value ($),sale price ($),property,residential,num_rooms,num_bathrooms,carpet_area_sq_ft,face,month_name,tax_value
8525,2021,Fairfield,306670.0,398000.0,Single Family,Detached House,3,1,967.0,South,May,4073.51209
4939,2016,Fairfield,595910.0,840000.0,Single Family,Detached House,3,1,970.0,South,August,11325.3756
3371,2014,Unknown,155740.0,56500.0,Two Family,Duplex,4,2,1422.0,South,May,763.212735


# Univariate Analysis

## What is the distribution of sale prices?

In [5]:
# Box plot of the sale prices; there is no x variable, so the x-axis title is blanked.
fig1 = px.box(
    data_frame=df,
    y='sale price ($)',
    title='Distribution of Sale prices',
).update_layout(xaxis_title='', yaxis_title='Sale price ($)')
fig1.show()

In [6]:
# Row labels that datasist flags as outliers for the sale price column.
# NOTE(review): the second argument (0) is presumably datasist's minimum-outlier-count
# threshold — confirm against the datasist documentation.
otlr_idx = detect_outliers(df , 0 , ['sale price ($)'])
len(otlr_idx)

762

In [7]:
fig1.write_html(os.path.join(FIG_DIR, 'Sale Price.html'))

In [8]:
# IQR fences (1.5 * IQR beyond the quartiles) on the sale price column.
sale_price = df['sale price ($)']
Q1, Q3 = sale_price.quantile(0.25), sale_price.quantile(0.75)
IQR = Q3 - Q1

lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
is_outlier = (sale_price < lower_bound) | (sale_price > upper_bound)
outliers = df[is_outlier]

# Explicit copy of the rows that fall outside the fences.
outliers_df = outliers.copy()

In [9]:
outliers_df

Unnamed: 0,year,locality,estimated value ($),sale price ($),property,residential,num_rooms,num_bathrooms,carpet_area_sq_ft,face,month_name,tax_value
14,2009,Greenwich,2095100.0,2860000.0,Single Family,Detached House,3,1,963.0,East,January,29342.255800
31,2009,Greenwich,2215010.0,2925000.0,Single Family,Detached House,3,2,1065.0,West,January,30009.125250
44,2009,Greenwich,875700.0,1372500.0,Single Family,Detached House,3,1,1079.0,East,January,14081.204925
45,2009,Greenwich,1452500.0,2000000.0,Single Family,Detached House,3,2,973.0,North,January,20519.060000
54,2009,Greenwich,1152270.0,1750000.0,Single Family,Detached House,3,3,1040.0,South,January,17954.177500
...,...,...,...,...,...,...,...,...,...,...,...,...
9968,2022,Greenwich,1074290.0,1700000.0,Single Family,Detached House,3,1,934.0,South,September,17067.643000
9972,2022,Fairfield,1424920.0,3300000.0,Single Family,Detached House,3,2,1033.0,West,September,33131.307000
9980,2022,Unknown,1069740.0,1605000.0,Single Family,Detached House,3,2,969.0,East,September,16113.862950
9988,2022,Stamford,700000.0,1575000.0,Single Family,Detached House,3,1,1073.0,North,September,15812.669250


In [10]:
outliers_df['year'].unique()

array([2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019,
       2020, 2021, 2022], dtype=int64)

In [11]:
# Box plot before any treatment, kept for comparison with the handled version below.
fig_original = px.box(df, y='sale price ($)', title='Original Distribution of Sale Prices')
fig_original.show()

median_price = df['sale price ($)'].median()

# Sale prices strictly above this cap are treated as extreme values.
# (The previous comment said 8.5M, which disagreed with the 7.5M threshold actually applied.)
sale_price_cap = 7500000

# Replace the extreme prices with the median. Series.mask is the vectorized
# equivalent of the former row-wise apply/lambda; NaN compares False against the
# cap and is therefore left untouched, exactly as before.
# NOTE: this overwrites the column in df, so re-running the cell recomputes the
# median on already-capped data.
df['sale price ($)'] = df['sale price ($)'].mask(df['sale price ($)'] > sale_price_cap, median_price)

fig_handled = px.box(df, y='sale price ($)', title='Distribution of Sale Prices after Handling Outliers')
fig_handled.show()

In [12]:
fig_handled.write_html(os.path.join(FIG_DIR, 'Sale Price Handling.html'))

In [13]:
# Recompute the IQR fences on the sale price column after the extreme prices
# were replaced by the median.
Q1, Q3 = df['sale price ($)'].quantile([0.25, 0.75])
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
outliers = df.loc[df['sale price ($)'].lt(lower_bound) | df['sale price ($)'].gt(upper_bound)]

outliers_df = outliers.copy()

In [14]:
outliers_df

Unnamed: 0,year,locality,estimated value ($),sale price ($),property,residential,num_rooms,num_bathrooms,carpet_area_sq_ft,face,month_name,tax_value
14,2009,Greenwich,2095100.0,2860000.0,Single Family,Detached House,3,1,963.0,East,January,29342.255800
31,2009,Greenwich,2215010.0,2925000.0,Single Family,Detached House,3,2,1065.0,West,January,30009.125250
44,2009,Greenwich,875700.0,1372500.0,Single Family,Detached House,3,1,1079.0,East,January,14081.204925
45,2009,Greenwich,1452500.0,2000000.0,Single Family,Detached House,3,2,973.0,North,January,20519.060000
54,2009,Greenwich,1152270.0,1750000.0,Single Family,Detached House,3,3,1040.0,South,January,17954.177500
...,...,...,...,...,...,...,...,...,...,...,...,...
9968,2022,Greenwich,1074290.0,1700000.0,Single Family,Detached House,3,1,934.0,South,September,17067.643000
9972,2022,Fairfield,1424920.0,3300000.0,Single Family,Detached House,3,2,1033.0,West,September,33131.307000
9980,2022,Unknown,1069740.0,1605000.0,Single Family,Detached House,3,2,969.0,East,September,16113.862950
9988,2022,Stamford,700000.0,1575000.0,Single Family,Detached House,3,1,1073.0,North,September,15812.669250


In [15]:
outliers_df['locality'].value_counts()

locality
Greenwich        465
Fairfield         98
Unknown           82
Norwalk           48
Stamford          46
West Hartford      3
Bridgeport         1
Name: count, dtype: int64

## What is the distribution of locality ?

In [16]:
# Number of sales per locality, with the bars sorted by ascending total.
locality_labels = df['locality'].astype(str)
fig2 = px.histogram(
    data_frame=df,
    x=locality_labels,
    histfunc='count',
    text_auto=True,
)
fig2.update_xaxes(categoryorder='total ascending')
fig2.show()

In [17]:
fig2.write_html(os.path.join(FIG_DIR, 'Locality.html'))

In [18]:
# Create a new DataFrame with selected columns
sub_df = df[['locality', 'sale price ($)', 'carpet_area_sq_ft']].copy()
sub_df

Unnamed: 0,locality,sale price ($),carpet_area_sq_ft
0,Waterbury,185000.0,996.0
1,Unknown,152000.0,935.0
2,Waterbury,105000.0,951.0
4,Bridgeport,272900.0,971.0
7,Norwalk,409000.0,1004.0
...,...,...,...
9994,Unknown,415000.0,1014.0
9995,Unknown,215000.0,2114.0
9997,West Hartford,225635.0,922.0
9998,Unknown,2625000.0,977.0


In [19]:
# Mean sale price and mean carpet area for every locality.
avg_values = (
    sub_df
    .groupby('locality')[['sale price ($)', 'carpet_area_sq_ft']]
    .mean()
    .reset_index()
)

# Bar height shows the mean sale price; bar colour encodes the mean carpet area.
bar_labels = {'locality': 'Locality', 'sale price ($)': 'Average Sale Price ($)', 'carpet_area_sq_ft': 'Carpet Area (sq ft)'}
fig = px.bar(
    avg_values,
    x='locality',
    y='sale price ($)',
    color='carpet_area_sq_ft',
    title='Average Sale Price and Carpet Area by Locality',
    labels=bar_labels,
    barmode='group',
    text_auto=True,
)
fig.show()

## What is the distribution of year ?

In [20]:
# Number of recorded sales per year, with a small gap between the bars.
fig3 = px.histogram(df, x='year', text_auto=True).update_layout(bargap=0.2)
fig3.show()

In [21]:
fig3.write_html(os.path.join(FIG_DIR, 'Year.html'))

## What is the distribution of property ?

In [22]:
# Number of sales per property type (values cast to str before plotting).
property_labels = df['property'].astype(str)
fig4 = px.histogram(data_frame=df, x=property_labels, text_auto=True)
fig4.show()

In [23]:
fig4.write_html(os.path.join(FIG_DIR, 'Property.html'))

## What is the distribution of Number Of Bathrooms ?

In [24]:
# Count of properties per number of bathrooms.
# The title previously read "Histogram of Number of Rooms", which mislabelled this
# num_bathrooms chart (and the exported 'Number Of Bathrooms.html').
fig5 = px.histogram(df, x='num_bathrooms', title='Histogram of Number of Bathrooms', text_auto=True).update_xaxes(categoryorder='total ascending')
fig5.update_layout(bargap=0.2)
fig5.show()

In [25]:
otlr_idx = detect_outliers(df , 0 , ['num_bathrooms'])

In [26]:
len(otlr_idx)

51

In [27]:
# Distinct property types among the rows flagged as bathroom-count outliers.
df.loc[otlr_idx, 'property'].unique()

array(['Four Family'], dtype=object)

In [28]:
fig5.write_html(os.path.join(FIG_DIR, 'Number Of Bathrooms.html'))

## What is the distribution of residential ?

In [29]:
df['residential'].value_counts(normalize=True)*100

residential
Detached House    82.920435
Duplex            10.119282
Triplex            5.937869
Fourplex           1.022414
Name: proportion, dtype: float64

In [30]:
# Number of sales per residential type, bars sorted by ascending total.
fig6 = px.histogram(data_frame=df, x='residential', histfunc='count', text_auto=True)
fig6.update_xaxes(categoryorder='total ascending')
fig6.show()

In [31]:
fig6.write_html(os.path.join(FIG_DIR, 'Residential.html'))

## What is the distribution of Number Of Rooms ?

In [32]:
# Share of sales for each num_rooms value.
fig7 = px.pie(df, names='num_rooms')
fig7.show()

In [33]:
fig7.write_html(os.path.join(FIG_DIR, 'Number Of Rooms.html'))

## What is the distribution of face ?

In [34]:
# Share of sales for each face value.
fig8 = px.pie(df, names='face')
fig8.show()

In [35]:
fig8.write_html(os.path.join(FIG_DIR, 'Face.html'))

## What is the distribution of tax value ?


In [36]:
# Box plot of tax_value; there is no x variable, so the x-axis title is blanked.
fig9 = px.box(
    data_frame=df,
    y='tax_value',
    title='Distribution of tax_value',
).update_layout(xaxis_title='', yaxis_title='tax_value')
fig9.show()

In [37]:
# IQR fences (1.5 * IQR beyond the quartiles) on the tax_value column.
tax_value = df['tax_value']
Q1 = tax_value.quantile(0.25)
Q3 = tax_value.quantile(0.75)
IQR = Q3 - Q1

lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
outliers = df[tax_value.lt(lower_bound) | tax_value.gt(upper_bound)]

# Explicit copy of the rows that fall outside the fences.
outliers_df = outliers.copy()

In [38]:
outliers_df

Unnamed: 0,year,locality,estimated value ($),sale price ($),property,residential,num_rooms,num_bathrooms,carpet_area_sq_ft,face,month_name,tax_value
14,2009,Greenwich,2095100.0,2860000.0,Single Family,Detached House,3,1,963.0,East,January,29342.25580
31,2009,Greenwich,2215010.0,2925000.0,Single Family,Detached House,3,2,1065.0,West,January,30009.12525
45,2009,Greenwich,1452500.0,2000000.0,Single Family,Detached House,3,2,973.0,North,January,20519.06000
54,2009,Greenwich,1152270.0,1750000.0,Single Family,Detached House,3,3,1040.0,South,January,17954.17750
63,2009,Norwalk,350000.0,1600000.0,Single Family,Detached House,3,2,904.0,East,January,16415.24800
...,...,...,...,...,...,...,...,...,...,...,...,...
9968,2022,Greenwich,1074290.0,1700000.0,Single Family,Detached House,3,1,934.0,South,September,17067.64300
9972,2022,Fairfield,1424920.0,3300000.0,Single Family,Detached House,3,2,1033.0,West,September,33131.30700
9980,2022,Unknown,1069740.0,1605000.0,Single Family,Detached House,3,2,969.0,East,September,16113.86295
9988,2022,Stamford,700000.0,1575000.0,Single Family,Detached House,3,1,1073.0,North,September,15812.66925


In [39]:
outliers_df['locality'].value_counts()

locality
Greenwich        475
Fairfield        102
Unknown           82
Stamford          49
Norwalk           46
West Hartford      3
Bridgeport         1
Name: count, dtype: int64

In [40]:
# Mean tax value per locality among the tax-value outliers only, sorted ascending.
locality_mean = (
    outliers_df
    .groupby('locality')['tax_value']
    .mean()
    .sort_values()
)
locality_mean_df = locality_mean.reset_index()

fig9_1 = px.bar(
    locality_mean_df,
    x='locality',
    y='tax_value',
    title='tax value Mean in Each Locality',
    labels={'locality': 'Locality', 'tax_value': 'Mean tax value'},
    color='tax_value',
    text_auto=True,
    color_continuous_scale='Viridis',
)
fig9_1.show()

In [41]:
fig9.write_html(os.path.join(FIG_DIR, 'tax value.html'))

## What is the distribution of Month Name ?


In [42]:
# Disabled cell: a per-year faceted variant of the month histogram.
# Nothing here executes; delete it or re-enable it deliberately.
# fig5 = px.histogram(
#     df, 
#     x='month_name', 
#     title='Histogram of Month Name for Each Year',
#     text_auto=True,
# facet_col='year',
# )

# fig5.update_layout(bargap=0.2)  

# fig5.show()

In [43]:
# Number of sales per month name.
# NOTE(review): this rebinds fig5, which earlier held the bathrooms histogram;
# re-running the 'Number Of Bathrooms.html' export after this cell would write this chart instead.
fig5 = px.histogram(
    data_frame=df,
    x='month_name',
    title='Histogram of Month Name',
    text_auto=True,
).update_layout(bargap=0.2)
fig5.show()

# Bivariate Analysis

In [44]:
# Pairwise correlations between the numeric columns, rendered as an annotated heatmap.
numeric_corr = df.select_dtypes('number').corr()
px.imshow(numeric_corr, text_auto=True)

## Distribution for sale price and estimated value ?

In [45]:
# Sale price against estimated value; marker colour also follows the estimated value.
fig10 = px.scatter(
    data_frame=df,
    x='estimated value ($)',
    y='sale price ($)',
    color='estimated value ($)',
    title='House prices in the DataFrame',
    opacity=0.8,
).update_layout(xaxis_title='Estimated Value', yaxis_title='Sale price')
fig10.show()


In [46]:
fig10.write_html(os.path.join(FIG_DIR, 'Estimated Value And Sale Price Using scatter Plot.html'))

- After seeing this plot, I noticed that there were Estimated Values of $0$, so I removed them (the next cell drops every row whose estimated value is below 3,499).

In [47]:
# Remove rows whose estimated value is below 3,499 (this covers the zero estimates
# noticed in the scatter plot above).
# - The bare sort_values() expression that used to precede this line was discarded
#   (it was not the cell's last expression), so it has been dropped.
# - A boolean mask replaces drop(<index labels>): it removes exactly the matching rows
#   even if index labels were duplicated. Rows with a NaN estimate are kept, as before.
# NOTE: this rebinds df, so cells above show the pre-filter row count.
df = df[~(df['estimated value ($)'] < 3499)].copy()

In [48]:
df['estimated value ($)'].sort_values(ascending=True)

287         3500.0
513         3500.0
414         9100.0
5274       10650.0
234        11830.0
           ...    
1517    10258290.0
3841    13523370.0
4808    13523370.0
5301    13793465.0
431     21119910.0
Name: estimated value ($), Length: 7625, dtype: float64

- handling outliers

In [49]:
# Keep only rows whose estimated value is at most 8.5M for the plots below.
df_filtered = df.loc[df['estimated value ($)'].le(8500000)]

# Same scatter as before, drawn on the filtered rows.
fig_filtered = px.scatter(
    data_frame=df_filtered,
    x='estimated value ($)',
    y='sale price ($)',
    color='estimated value ($)',
    title='House prices in the DataFrame (Filtered)',
    opacity=0.8,
).update_layout(xaxis_title='Estimated Value', yaxis_title='Sale Price')

fig_filtered.show()

In [50]:
fig_filtered.write_html(os.path.join(FIG_DIR, 'Relationship between Estimated Value and Sale Price Using scatter Plot.html'))

In [51]:
# Yearly means of sale price and estimated value on the filtered rows.
grouped_df = (
    df_filtered
    .groupby('year')[['sale price ($)', 'estimated value ($)']]
    .mean()
    .reset_index()
)

# One line per metric: sale price in blue, estimated value in orange.
fig10_1 = go.Figure(data=[
    go.Scatter(
        x=grouped_df['year'],
        y=grouped_df['sale price ($)'],
        mode='lines+markers',
        name='Sale price ($)',
        line=dict(color='blue'),
    ),
    go.Scatter(
        x=grouped_df['year'],
        y=grouped_df['estimated value ($)'],
        mode='lines+markers',
        name='Estimated Value ($)',
        line=dict(color='orange'),
    ),
])
fig10_1.update_layout(
    title='Relationship between Estimated Value and Sale Price',
    xaxis_title='Year',
    yaxis_title='Value ($)',
    legend_title='Legend',
)
fig10_1.show()


In [52]:
fig10_1.write_html(os.path.join(FIG_DIR, 'Relationship between Estimated Value and Sale Price Using Line Plot.html'))

##  How does the number of rooms in a property affect its sale price?

In [53]:
# Average sale price for each num_rooms value (values cast to str before plotting).
room_labels = df['num_rooms'].astype(str)
fig11 = px.histogram(
    data_frame=df,
    x=room_labels,
    y='sale price ($)',
    histfunc='avg',
    title='Relationship between Number of Rooms and Sale Price',
    text_auto=True,
)
fig11.show()

In [54]:
fig11.write_html(os.path.join(FIG_DIR, 'Relationship between Number Of Rooms and Sale Price.html'))

## Analysis of the Relationship Between Face Features and Sale Prices ?

In [55]:
# Mean sale price per face value, with the value printed on each bar.
avg_sale_price = (
    df
    .groupby('face')['sale price ($)']
    .mean()
    .reset_index()
)
fig12 = px.bar(
    avg_sale_price,
    x='face',
    y='sale price ($)',
    text='sale price ($)',
).update_layout(title='Average Sale Price per Face')

fig12.show()

In [56]:
fig12.write_html(os.path.join(FIG_DIR, 'Relationship between Face and Sale Price.html'))

## Exploring Sale Prices of Residential Properties ?

In [57]:
# Mean sale price per residential type as a table. The chart below does not consume
# it: the histogram aggregates df itself through histfunc='avg'.
avg_sale_price = df.groupby('residential')['sale price ($)'].mean().reset_index()

# Average sale price per residential type, one colour per type.
# The former color_discrete_map={'House': ..., 'Apartment': ...} was removed: neither key
# is a value of the 'residential' column (Detached House / Duplex / Triplex / Fourplex),
# so it never affected the colours and only suggested a mapping that did not exist.
fig13=px.histogram(df, x='residential', y='sale price ($)',
                   title='Average Sale Price by Residential Type',
                   labels={'residential': 'Residential Type', 'sale price ($)': 'Average Sale Price ($)'},
                   color='residential',text_auto=True ,
                   histfunc='avg', 
                   hover_data={'sale price ($)': ':.2f'}  
                   )
fig13.show()

In [58]:
fig13.write_html(os.path.join(FIG_DIR, 'Relationship between Residential Type and Sale Price.html'))

In [59]:
df['residential'].value_counts()

residential
Detached House    6323
Duplex             772
Triplex            452
Fourplex            78
Name: count, dtype: int64

## Examining the Relationship Between Sale Prices and Year of Transaction ?

In [60]:
# Mean sale price for each year, drawn as a line.
yearly_sale_price = (
    df
    .groupby('year')['sale price ($)']
    .mean()
    .reset_index()
)
fig14 = px.line(
    yearly_sale_price,
    x='year',
    y='sale price ($)',
    title='Average Sale Price per Year',
    labels={'year': 'Year', 'sale price ($)': 'Average Sale Price ($)'},
)
fig14.show()

In [61]:
fig14.write_html(os.path.join(FIG_DIR, 'Relationship between Year and Sale Price Using Line Plot.html'))

In [62]:
# Sale price aggregated per year (px.histogram's default aggregation), coloured by year.
year_labels = df['year'].astype(str)
fig15 = px.histogram(
    data_frame=df,
    x=year_labels,
    y='sale price ($)',
    title='Relation Between Sale Price and Year',
    color='year',
)
fig15.show()

In [63]:
fig15.write_html(os.path.join(FIG_DIR, 'Relationship between Year and Sale Price Using histogram.html'))

In [64]:
# Rows belonging to the three years singled out as "top" (2009, 2019, 2022).
top_3_year = df[df['year'].isin([2009, 2019, 2022])]
top_3_year

Unnamed: 0,year,locality,estimated value ($),sale price ($),property,residential,num_rooms,num_bathrooms,carpet_area_sq_ft,face,month_name,tax_value
0,2009,Waterbury,111440.0,185000.0,Single Family,Detached House,3,3,996.0,South,January,1898.013050
1,2009,Unknown,73080.0,152000.0,Single Family,Detached House,3,1,935.0,North,January,1559.448560
2,2009,Waterbury,50540.0,105000.0,Single Family,Detached House,3,1,951.0,East,January,1077.250650
4,2009,Bridgeport,4775276.0,272900.0,Single Family,Detached House,3,1,971.0,East,January,2799.825737
7,2009,Norwalk,172620.0,409000.0,Single Family,Detached House,3,3,1004.0,South,January,4196.147770
...,...,...,...,...,...,...,...,...,...,...,...,...
9994,2022,Unknown,216420.0,415000.0,Single Family,Detached House,3,3,1014.0,North,September,4166.512850
9995,2022,Unknown,155550.0,215000.0,Three Family,Triplex,6,5,2114.0,North,September,2158.554850
9997,2022,West Hartford,124740.0,225635.0,Single Family,Detached House,3,1,922.0,North,September,2265.328017
9998,2022,Unknown,1455230.0,2625000.0,Single Family,Detached House,3,2,977.0,North,September,26354.448750


In [65]:
# Sales per locality within the top-3 years, bars sorted by ascending total.
top_locality_labels = top_3_year['locality'].astype(str)
fig15_1 = px.histogram(
    data_frame=top_3_year,
    x=top_locality_labels,
    histfunc='count',
    text_auto=True,
)
fig15_1.update_xaxes(categoryorder='total ascending')
fig15_1.show()

In [66]:
# Sales per residential type within the top-3 years, bars sorted by ascending total.
fig15_11 = px.histogram(data_frame=top_3_year, x='residential', histfunc='count', text_auto=True)
fig15_11.update_xaxes(categoryorder='total ascending')
fig15_11.show()

In [67]:
# Rows belonging to the three years singled out as "least" (2013, 2018, 2011).
least_3_year = df[df['year'].isin([2013, 2018, 2011])]
least_3_year

Unnamed: 0,year,locality,estimated value ($),sale price ($),property,residential,num_rooms,num_bathrooms,carpet_area_sq_ft,face,month_name,tax_value
1309,2011,Waterbury,143020.0,177000.0,Single Family,Detached House,3,3,1083.0,East,January,1814.30841
1311,2011,Waterbury,124550.0,11000.0,Three Family,Triplex,6,4,1918.0,East,January,112.75363
1312,2011,Fairfield,353570.0,400000.0,Single Family,Detached House,3,2,972.0,South,January,4100.13200
1313,2011,Unknown,611400.0,825000.0,Single Family,Detached House,3,3,1041.0,South,January,8456.52225
1314,2011,Unknown,720650.0,1015000.0,Single Family,Detached House,3,3,1093.0,North,January,10404.08495
...,...,...,...,...,...,...,...,...,...,...,...,...
6684,2018,Fairfield,403480.0,548000.0,Single Family,Detached House,3,1,941.0,South,December,7794.24784
6685,2018,Waterbury,102320.0,180000.0,Single Family,Detached House,3,1,995.0,North,December,2560.15440
6686,2018,Bridgeport,158420.0,180000.0,Single Family,Detached House,3,2,1047.0,North,December,2560.15440
6687,2018,Fairfield,551250.0,589000.0,Single Family,Detached House,3,2,1082.0,East,December,8377.39412


In [68]:
# Sales per locality within the least-3 years, bars sorted by ascending total.
least_locality_labels = least_3_year['locality'].astype(str)
fig15_2 = px.histogram(
    data_frame=least_3_year,
    x=least_locality_labels,
    histfunc='count',
    text_auto=True,
)
fig15_2.update_xaxes(categoryorder='total ascending')
fig15_2.show()

In [69]:
# Sales per residential type within the least-3 years, bars sorted by ascending total.
fig15_22 = px.histogram(data_frame=least_3_year, x='residential', histfunc='count', text_auto=True)
fig15_22.update_xaxes(categoryorder='total ascending')
fig15_22.show()

- After these visuals, I see that the years in top_3_year had many more sales than the years in least_3_year.

## Analysis of Sale Prices Across Different Localities ?

In [70]:
# Mean sale price per locality, sorted ascending so the bars grow left to right.
locality_mean = (
    df
    .groupby('locality')['sale price ($)']
    .mean()
    .sort_values()
)
locality_mean_df = locality_mean.reset_index()

fig16 = px.bar(
    locality_mean_df,
    x='locality',
    y='sale price ($)',
    title='Sale price Mean in Each Locality',
    labels={'locality': 'Locality', 'sale price ($)': 'Mean Sale price ($)'},
    color='sale price ($)',
    color_continuous_scale='Viridis',
)
fig16.show()



In [71]:
fig16.write_html(os.path.join(FIG_DIR, 'Relationship between Locality and Sale Price.html'))

In [72]:
df['locality'].unique()

array(['Waterbury', 'Unknown', 'Bridgeport', 'Norwalk', 'Greenwich',
       'Fairfield', 'West Hartford', 'Stamford'], dtype=object)


- Waterbury: Known as "The Brass City," it was historically a major center for brassware manufacturing.

- Bridgeport: The largest city in Connecticut, it has a rich industrial history and is located along Long Island Sound.

- Norwalk: A coastal city that includes Norwalk Harbor and is part of the Greater New York metropolitan area.

- Greenwich: Known for its affluent neighborhoods, it is a part of the New York metropolitan area and home to many financial services firms.

- Fairfield: A town known for its excellent public school system, beaches, and suburban residential communities.

- West Hartford: An affluent suburb of Hartford, known for its high quality of life, good schools, and vibrant downtown area.

- Stamford: One of the largest cities in Connecticut, it is a significant financial district and part of the Greater New York metropolitan area.

- واتربيري: المعروفة باسم "المدينة النحاسية"، كانت تاريخياً مركزاً رئيسياً لصناعة الأواني النحاسية.

- بريدجبورت: أكبر مدينة في ولاية كونيتيكت، ولها تاريخ صناعي غني، وتقع على طول لونغ آيلاند ساوند.

- نورووك: مدينة ساحلية تضم ميناء نورووك وهي جزء من منطقة نيويورك الكبرى.

- غرينتش: تشتهر بأحيائها الثرية، وهي جزء من منطقة نيويورك الحضرية وموطن للعديد من شركات الخدمات المالية.

- فيرفيلد: مدينة معروفة بنظام المدارس العامة الممتاز، والشواطئ، والمجتمعات السكنية في الضواحي.

- ويست هارتفورد: إحدى ضواحي هارتفورد الثرية، المعروفة بجودة الحياة العالية والمدارس الجيدة ومنطقة وسط المدينة النابضة بالحياة.

- ستامفورد: إحدى أكبر المدن في ولاية كونيتيكت، وهي منطقة مالية مهمة وجزء من منطقة نيويورك الكبرى.


## Analysis of face Across Different property ?

In [73]:
# Number of rows for every (face, property) combination.
face_property_groups = df.groupby(['face', 'property'])
df_face_property = face_property_groups.size().reset_index(name='count')

# Grouped bars: one cluster per face, one bar per property type.
fig17 = px.histogram(
    df_face_property,
    x='face',
    y='count',
    color='property',
    barmode='group',
    text_auto=True,
    title='Distribution of Property Types by Face',
    labels={'face': 'Face', 'count': 'Count', 'property': 'Property Type'},
)
fig17.show()


In [74]:
fig17.write_html(os.path.join(FIG_DIR, 'Relationship between Face and Property.html'))

- This indicates that the face orientation (East, North, South, West) does not significantly impact the type of properties in the dataset.


## Relationship between Locality and Residential Status ?

In [75]:
# Count of sales per locality, split into grouped bars by residential type.
fig29 = px.histogram(
    data_frame=df,
    x='locality',
    color='residential',
    barmode='group',
    title='Relationship between Locality and Residential Status',
    labels={'locality': 'Locality', 'residential': 'Residential Status'},
)

fig29.show()

In [76]:
fig29.write_html(os.path.join(FIG_DIR, 'Relationship between Locality and Residential Status.html'))

In [77]:
# Multivariate view: mean sale price for every (locality, residential type) pair.
avg_values = (
    df
    .groupby(['locality', 'residential'])
    .agg({'sale price ($)': 'mean'})
    .reset_index()
)
fig26 = px.bar(
    avg_values,
    x='locality',
    y='sale price ($)',
    color='residential',
    barmode='group',
    title='Average Sale Price by Locality and Residential Type',
    labels={'sale price ($)': 'Average Sale price ($)', 'locality': 'Locality', 'residential': 'Residential Type'},
)
fig26.show()

## Analysis of Carpet Area by locality Type ?

In [78]:
# Mean carpet area per locality, bars sorted by ascending total.
avg_carpet_area = (
    df
    .groupby('locality')['carpet_area_sq_ft']
    .mean()
    .reset_index()
)
fig18 = px.bar(
    avg_carpet_area,
    x='locality',
    y='carpet_area_sq_ft',
    title='Average Carpet Area by Locality',
    labels={'locality': 'Locality', 'carpet_area_sq_ft': 'Average Carpet Area (sq ft)'},
).update_xaxes(categoryorder='total ascending')
fig18.show()

In [79]:
fig18.write_html(os.path.join(FIG_DIR, 'Relationship between Locality and Carpet Area.html'))

In [80]:
# Multivariate view: mean sale price per locality, coloured by mean carpet area.
avg_values = (
    df
    .groupby('locality')
    .agg({'sale price ($)': 'mean', 'carpet_area_sq_ft': 'mean'})
    .reset_index()
)
fig18_1_kwargs = dict(
    x='locality',
    y='sale price ($)',
    color='carpet_area_sq_ft',
    title='Average Sale Price and Carpet Area by Locality',
    labels={'locality': 'Locality', 'sale price ($)': 'Average Sale Price ($)', 'carpet_area_sq_ft': 'Carpet Area (sq ft)'},
    barmode='group',
)
fig18_1 = px.bar(avg_values, **fig18_1_kwargs)
fig18_1.show()

## Analysis of Carpet Area by Residential Type ?

In [81]:

# Mean carpet area per residential type, sorted ascending.
carpet_res_avg = (
    df
    .groupby('residential')['carpet_area_sq_ft']
    .mean()
    .sort_values()
)
carpet_res_avg_df = carpet_res_avg.reset_index()

fig19 = px.bar(
    carpet_res_avg_df,
    x='residential',
    y='carpet_area_sq_ft',
    color='residential',
    title='Average Carpet Area by Residential Type',
    labels={'residential': 'Residential Type', 'carpet_area_sq_ft': 'Average Carpet Area (sq ft)'},
)
fig19.show()



In [82]:
fig19.write_html(os.path.join(FIG_DIR, 'Relationship between Residential and Carpet Area.html'))

## Relationship between Property Type and Residential Status

In [83]:
# Count of sales per property type, split into grouped bars by residential type.
fig20 = px.histogram(
    data_frame=df,
    x='property',
    color='residential',
    barmode='group',
    title='Relationship between Property Type and Residential Status',
    labels={'property': 'Property Type', 'residential': 'Residential Status'},
)

fig20.show()

In [84]:
fig20.write_html(os.path.join(FIG_DIR, 'Relationship between Property Type and Residential Status.html'))

## Relationship between Tax Value and Residential Status

In [85]:
# Mean tax value per residential type, with the value printed on each bar.
avg_tax_value = (
    df
    .groupby('residential')['tax_value']
    .mean()
    .reset_index()
)
fig21 = px.bar(
    avg_tax_value,
    x='residential',
    y='tax_value',
    text='tax_value',
    title='Average Tax Value by Residential Category',
    labels={'residential': 'Residential Category', 'tax_value': 'Average Tax Value ($)'},
)
fig21.show()

In [86]:
df['residential'].value_counts()

residential
Detached House    6323
Duplex             772
Triplex            452
Fourplex            78
Name: count, dtype: int64

In [87]:
fig21.write_html(os.path.join(FIG_DIR, 'Relationship between Tax Value and Residential Status.html'))

## Analysis of Tax Value by locality Type ?

In [88]:
# Mean tax value per locality over the whole dataset, sorted ascending.
# This cell previously aggregated outliers_df — only the IQR tax-value outliers computed
# in the univariate section, before df was filtered — so it merely repeated the
# outlier-only chart (fig9_1) instead of describing every sale. It now uses df, like the
# other per-locality charts in this bivariate section.
locality_mean = df.groupby('locality')['tax_value'].mean().sort_values()
locality_mean_df = locality_mean.reset_index()
fig22=px.bar(locality_mean_df, x='locality', y='tax_value', 
             title='tax value Mean in Each Locality',
             labels={'locality': 'Locality', 'tax_value': 'Mean tax value'},
             color='tax_value',
             color_continuous_scale='Viridis')
fig22.show()

In [89]:
fig22.write_html(os.path.join(FIG_DIR, 'Relationship between Tax Value and locality Type.html'))

# Multivariate Analysis

## How do the number of rooms and the number of bathrooms together affect the sale price ?

In [90]:
# Average sale price per num_rooms value, one grouped bar per num_bathrooms value.
rooms_as_text = df['num_rooms'].astype(str)
fig24 = px.histogram(
    data_frame=df,
    x=rooms_as_text,
    y='sale price ($)',
    color='num_bathrooms',
    histfunc='avg',  # mean of the sale price within each bar
    hover_data=['carpet_area_sq_ft'],
    title='Number of Rooms and Bathrooms vs Sale price',
    labels={'num_rooms': 'Number of Rooms', 'sale price ($)': 'Sale Price ($)', 'num_bathrooms': 'Number of Bathrooms'},
)

fig24.update_layout(barmode='group')
fig24.show()


In [91]:
fig24.write_html(os.path.join(FIG_DIR, 'Relationship between Number of Rooms and Bathrooms vs Sale price.html'))

- Note: the chart above plots the *average* sale price (`histfunc='avg'`), while the observations below are phrased in terms of the *sum* of sale prices; re-check them against the current chart.

- Concentration of Sale Prices:

  - The majority of the sale prices are concentrated around properties with 3 rooms and a varying number of bathrooms.
  - Properties with 3 rooms and 3 bathrooms contribute the highest sum of sale prices, followed by those with 1 and 2 bathrooms.

- Limited Contribution from Higher Room Counts:

  - Properties with more than 3 rooms (4 to 8 rooms) contribute very little to the overall sum of sale prices. This suggests that such properties are either less common or less frequently sold.

- Impact of Number of Bathrooms:

  - The sum of sale prices increases with the number of bathrooms for properties with 3 rooms, indicating that properties with more bathrooms tend to have higher sale prices.
  - For properties with more than 3 rooms, the variation in the number of bathrooms does not significantly affect the sum of sale prices, possibly due to the low frequency of such properties.

- Market Demand:

  - The market demand appears to be highest for properties with 3 rooms and a range of bathroom counts, particularly those with 3 bathrooms. This could indicate a preference for mid-sized properties with adequate bathroom facilities.

## How does locality affect the sale price, considering the carpet area ?

In [92]:
# Mean sale price and mean carpet area for every locality.
avg_values = (
    df
    .groupby('locality')
    .agg({'sale price ($)': 'mean', 'carpet_area_sq_ft': 'mean'})
    .reset_index()
)

# Bar height shows the mean sale price; bar colour encodes the mean carpet area.
fig25 = px.bar(
    avg_values,
    x='locality',
    y='sale price ($)',
    color='carpet_area_sq_ft',
    barmode='group',
    title='Average Sale Price and Carpet Area by Locality',
    labels={'locality': 'Locality', 'sale price ($)': 'Average Sale Price ($)', 'carpet_area_sq_ft': 'Carpet Area (sq ft)'},
)
fig25.show()

In [93]:
fig25.write_html(os.path.join(FIG_DIR, 'Relationship between Sale Price and Carpet Area by Locality.html'))

- High Sale Price in Greenwich:

  - Dominance of Greenwich: Greenwich has the highest average sale price, significantly surpassing $2 million. This is consistent with previous insights, indicating a high demand and premium property market in this locality.

- Variation in Carpet Area:

  - Color Coding for Carpet Area: The color coding represents the carpet area in square feet, with darker colors indicating smaller areas and lighter colors representing larger areas.
  - Greenwich and Fairfield: These localities show darker colors, indicating relatively smaller carpet areas despite high sale prices, which could reflect a premium price per square foot in these areas.
  - Bridgeport and Waterbury: These localities have lighter colors, suggesting larger carpet areas at much lower sale prices, indicating more affordable housing options with more space.

- Moderate Pricing in Fairfield, Norwalk, and Stamford:

  - Fairfield: This locality has a relatively high average sale price, though significantly lower than Greenwich. The color indicates a larger carpet area compared to Greenwich, suggesting more space for the price.
  - Norwalk and Stamford: These localities show moderate average sale prices with darker colors, indicating smaller carpet areas. This suggests a higher price per square foot, but still more affordable compared to Greenwich.

- Affordable Markets in Bridgeport and Waterbury:

  - Bridgeport: Shows the lowest average sale price with a relatively larger carpet area, indicating very affordable housing with good space.
  - Waterbury: Also displays low average sale prices with larger carpet areas, reinforcing it as an affordable market with spacious properties.

- Unknown Locality:

  - Moderate Sale Price and Carpet Area: The 'Unknown' category shows moderate average sale prices and a mid-range carpet area, indicating a balanced market.

- West Hartford:

  - Low Pricing with Smaller Area: West Hartford shows lower average sale prices with smaller carpet areas, suggesting affordability with less space.

- Conclusions:

  - Premium Market in Greenwich: Greenwich remains the most expensive locality with high prices despite smaller carpet areas, reflecting a premium market.
  - Moderate Markets: Fairfield, Norwalk, and Stamford offer moderate prices with varying carpet areas, indicating a mix of affordability and space.
  - Affordable Options: Bridgeport and Waterbury provide the most affordable housing options with larger carpet areas, ideal for buyers seeking more space at a lower cost.
  - Balanced Market: The 'Unknown' locality shows balanced pricing and space, representing a stable market.

- Recommendations:

  - Investment Focus: Investors might prioritize Greenwich for premium returns but should be mindful of the smaller carpet areas. Fairfield and Stamford also present good opportunities with relatively high prices and decent space.
  - Affordable Housing: Buyers seeking affordable and spacious housing should focus on Bridgeport and Waterbury.


## How do locality and residential status together affect the sale price?

In [94]:
# Mean sale price for every (locality, residential type) pair, flattened back
# into ordinary columns so Plotly can address them by name.
avg_values = (
    df.groupby(['locality', 'residential'])['sale price ($)']
    .mean()
    .reset_index()
)

# Grouped bars: one cluster per locality, one bar per residential type.
fig26 = px.bar(
    avg_values,
    x='locality',
    y='sale price ($)',
    color='residential',
    barmode='group',
    title='Average Sale Price by Locality and Residential Type',
    labels={
        'locality': 'Locality',
        'residential': 'Residential Type',
        'sale price ($)': 'Average Sale price ($)',
    },
)
fig26.show()

In [95]:
# Save the interactive chart as a standalone HTML file under FIG_DIR.
fig26.write_html(
    os.path.join(
        FIG_DIR,
        'Relationship between Sale Price by Locality and Residential Type.html',
    )
)

Significant Price Variation in Greenwich:

- Greenwich Dominance: Greenwich stands out with significantly higher average sale prices, particularly for triplexes, which exceed \$2 million. This indicates a high demand and premium pricing in this locality.
- Expensive Properties: Detached houses and fourplexes in Greenwich also command high prices, indicating that the overall property market in Greenwich is premium across different residential types.
High Prices in Fairfield and Stamford:

- Fairfield: This locality shows a higher average sale price for triplexes, closely following Greenwich, indicating it as another high-value area, especially for multi-family properties.
- Stamford: Stamford has relatively high average sale prices for both detached houses and triplexes, suggesting a robust market for these types of properties.
Moderate Pricing in Other Localities:

- Norwalk and West Hartford: These areas exhibit moderate average sale prices across all residential types, with detached houses and triplexes slightly higher. This suggests a stable market without extreme pricing.
- Bridgeport and Waterbury: These localities have lower average sale prices across all residential types, indicating more affordable markets.
Variation by Residential Type:

- Detached Houses: Generally, detached houses show moderate to high average sale prices across most localities, with significant peaks in Greenwich and Stamford.
- Duplexes and Fourplexes: These types have moderate pricing across localities, with less variation compared to detached houses and triplexes.
- Triplexes: Triplexes show the highest variability, with extreme highs in Greenwich and significant values in Fairfield and Stamford, suggesting that triplexes are highly valued in certain localities.
Unknown Locality:

- Consistent Moderate Pricing: The 'Unknown' category shows moderate average sale prices for all residential types, indicating a consistent but unremarkable market.

High Price in Greenwich:

- Greenwich Dominance: Greenwich stands out with significantly higher average sale prices, especially for triplexes, which exceed \$2 million. This indicates a high demand and premium pricing in this locality.
- Expensive Properties: Detached houses and fourplexes in Greenwich also command high prices, showing that the overall property market in Greenwich is premium across different residential types.
Fairfield and Stamford:

- Fairfield: This locality shows a higher average sale price for triplexes, closely following Greenwich, indicating it as another high-value area, especially for multi-family properties.
- Stamford: Stamford has relatively high average sale prices for both detached houses and triplexes, suggesting a robust market for these types of properties.
Moderate Pricing in Other Localities:

- Norwalk and West Hartford: These areas exhibit moderate average sale prices across all residential types, with detached houses and triplexes slightly higher. This suggests a stable market without extreme pricing.
- Bridgeport and Waterbury: These localities have lower average sale prices across all residential types, indicating more affordable markets.
Variation by Residential Type:

- Detached Houses: Generally, detached houses show moderate to high average sale prices across most localities, with significant peaks in Greenwich and Stamford.
- Duplexes and Fourplexes: These types have moderate pricing across localities, with less variation compared to detached houses and triplexes.
- Triplexes: Triplexes show the highest variability, with extreme highs in Greenwich and significant values in Fairfield and Stamford, suggesting that triplexes are highly valued in certain localities.
Unknown Locality:

- Consistent Moderate Pricing: The 'Unknown' category shows moderate average sale prices for all residential types, indicating a consistent but unremarkable market.
Conclusions:
- Premium Markets: Greenwich, followed by Fairfield and Stamford, are premium markets with high average sale prices, especially for triplexes and detached houses.
- Stable Markets: Norwalk and West Hartford offer stable markets with moderate pricing, suggesting steady demand and supply.
- Affordable Markets: Bridgeport and Waterbury are more affordable, catering to buyers seeking lower-priced properties.

## Relationship between Sale Price, Property Type, and Residential Status

In [96]:
# Average sale price per property type, drawn as side-by-side bars split by
# residential status. histfunc='avg' makes each bar the mean of
# 'sale price ($)' instead of a row count.
fig27 = px.histogram(
    df,
    x='property',
    y='sale price ($)',
    color='residential',
    histfunc='avg',
    barmode='group',
    title='Relationship between Sale Price, Property Type, and Residential Status',
    labels={
        'property': 'Property Type',
        'sale price ($)': 'Sale Price ($)',
        'residential': 'Residential Status',
    },
)
fig27.show()

In [97]:
# Save the interactive chart as a standalone HTML file under FIG_DIR.
fig27.write_html(
    os.path.join(
        FIG_DIR,
        'Relationship between Sale Price, Property Type, and Residential Status.html',
    )
)

## Relationship between Sale Price, Tax Value, and Year

In [98]:
# Yearly means of sale price and tax value, one row per year.
grouped_df = df.groupby('year')[['sale price ($)', 'tax_value']].mean().reset_index()
fig23 = go.Figure()

# Average sale price on the primary (left) y-axis.
fig23.add_trace(go.Scatter(
    x=grouped_df['year'],
    y=grouped_df['sale price ($)'],
    mode='lines+markers',
    name='Sale price ($)',
    line=dict(color='blue'),
))

# Average tax value on its own secondary (right) y-axis. In the sample rows the
# tax values are far smaller than the sale prices, so on a shared axis this
# line is squashed near zero and its trend over the years is unreadable.
fig23.add_trace(go.Scatter(
    x=grouped_df['year'],
    y=grouped_df['tax_value'],
    mode='lines+markers',
    name='tax_value',
    line=dict(color='orange'),
    yaxis='y2',
))
fig23.update_layout(
    title='Relationship between tax_value and Sale Price',
    xaxis_title='Year',
    yaxis=dict(title='Average sale price ($)'),
    # Overlay the second axis on the same plot area; hide its grid so the two
    # sets of grid lines do not clash.
    yaxis2=dict(
        title='Average tax value ($)',
        overlaying='y',
        side='right',
        showgrid=False,
    ),
    legend_title='Legend',
    # Shift the legend right so it clears the secondary axis labels.
    legend_x=1.08,
)
fig23.show()

In [99]:
# Save the interactive chart as a standalone HTML file under FIG_DIR.
fig23.write_html(
    os.path.join(
        FIG_DIR,
        'Relationship between Sale Price and Tax Value and year.html',
    )
)