In [67]:
#Importing libraries
import pandas as pd
import numpy as np 

import seaborn as sns 
import matplotlib.pylab as plt
%matplotlib inline

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

from scipy.stats import norm
from scipy import stats

In [68]:

# Read the tab-separated housing file, then preview the first five rows of
# its first five columns.
data = pd.read_table('Estate_housing.tsv')
display(data.head(5).iloc[:, :5])

Unnamed: 0,Order,PID,MS SubClass,MS Zoning,Lot Frontage
0,1,526301100,20,RL,141.0
1,1,526301100,20,RL,141.0
2,2,526350040,20,RH,80.0
3,3,526351010,20,RL,81.0
4,4,526353030,20,RL,93.0


In [None]:
# Print a summary of the frame (columns, non-null counts and dtypes).
data.info() 

In [None]:
# Descriptive statistics for the target column.
data.loc[:, "SalePrice"].describe()

In [None]:
# Pearson correlation of every float64/int64 column with the target, SalePrice.
numerical_in_data = data.select_dtypes(include = ['float64', 'int64'])
numerical_in_data_corr = numerical_in_data.corr()['SalePrice']

# Keep the features whose absolute correlation exceeds 0.5, i.e. strong
# positive OR negative relationships. SalePrice is dropped first: its
# correlation with itself is always 1.0, so leaving it in would inflate the
# count below and make the target show up as one of its own "features".
top_features = (
    numerical_in_data_corr
    .drop(labels='SalePrice')
    .loc[lambda corr: corr.abs() > 0.5]
    .sort_values(ascending=False)
)
print(f"There is {len(top_features)} strongly correlated values with SalePrice:\n{top_features}")

In [None]:
# Visual inspection: scatter SalePrice against the numeric columns,
# five columns per pairplot figure.
numeric_columns = numerical_in_data.columns
for start in range(0, len(numeric_columns), 5):
    column_chunk = numeric_columns[start:start + 5]
    sns.pairplot(data=numerical_in_data, x_vars=column_chunk, y_vars=['SalePrice'])

In [None]:

# Distribution of the raw sale prices: 50-bin histogram with a KDE overlay.
initial_price_plot = sns.histplot(
    data['SalePrice'],
    bins=50,
    kde=True,
)

In [None]:
# Report the skewness of the raw SalePrice column, as computed by Series.skew().
print(f"Skewness of Price: {data['SalePrice'].skew()}")

In [None]:
# Apply the natural logarithm to SalePrice (a common remedy for a skewed target).
log_transformed = data['SalePrice'].pipe(np.log)


In [None]:
# Histogram with KDE overlay of the log-transformed sale prices.
Saleprice_plot = sns.histplot(data=log_transformed, kde=True)

In [None]:
# Skewness of the log-transformed prices, printed with %f formatting.
log_skewness = log_transformed.skew()
print("Skewness: %f" % log_skewness)

In [None]:
# True when no index label is repeated, i.e. the index holds no duplicates.
not data.index.has_duplicates

In [None]:
# Rows whose SalePrice already occurred in an earlier row
# (the first occurrence of each price is not flagged).
repeated_price_mask = data.duplicated(subset=['SalePrice'])
duplicate = data.loc[repeated_price_mask]
duplicate

In [None]:
# Remove fully duplicated rows.
# drop_duplicates() returns a NEW frame and leaves `data` untouched, so any
# step that should run on de-duplicated rows has to use `removed`.
removed = data.drop_duplicates()
print(f"Dropped {len(data) - len(removed)} duplicated row(s); {len(removed)} rows remain.")

# Preview the first rows only instead of rendering the whole frame.
removed.head()

In [None]:
# Count missing values per column and chart the 20 columns with the most gaps.
total = data.isna().sum().sort_values(ascending=False)
total_select = total.head(20)

# Draw through the Axes returned by pandas rather than the pyplot state machine.
missing_ax = total_select.plot(kind="bar", figsize=(8, 6), fontsize=10)
missing_ax.set_xlabel("Columns", fontsize=20)
missing_ax.set_ylabel("Count", fontsize=20)
missing_ax.set_title("Total Missing Values", fontsize=20)

In [None]:
# Listwise deletion: discard every row that has at least one missing value.
data_dropped = data.dropna(axis=0, how='any')
print(f"Number of rows after dropping missing values: {len(data_dropped)}")


In [None]:
# Alternative to dropping rows: replace every missing value with the constant 0.
data_filled_constant = data.fillna(value=0)
print(f"Number of rows after filling missing values with constant: {len(data_filled_constant)}")


In [None]:
# Feature scaling, variant 1: min-max normalisation of the numeric columns
# (MinMaxScaler's default target range is [0, 1]).
min_max_scaler = MinMaxScaler()
normalized_data = min_max_scaler.fit(numerical_in_data).transform(numerical_in_data)
normalized_data

In [None]:
# Feature scaling, variant 2: z-score standardisation of the numeric columns.
standard_scaler = StandardScaler()
standardized_data = standard_scaler.fit(numerical_in_data).transform(numerical_in_data)
standardized_data


In [None]:
# Handling outliers: start with a univariate box plot of the sale price.
sns.boxplot(data=data, x='SalePrice')



In [None]:
# One box plot per column listed in top_features, each in its own 10x5 figure.
for col in top_features.index:
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.boxplot(x=data[col], ax=ax)
    ax.set_title(f'Box Plot of {col}')
    plt.show()

In [None]:
# One box plot for every numeric column, each in its own 10x5 figure.
# Iterating a DataFrame yields its column labels.
for col in numerical_in_data:
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.boxplot(x=data[col], ax=ax)
    ax.set_title(f'Box Plot of {col}')
    plt.show()

In [None]:
# Collect the labels held in top_features as a plain list and print them.
correlated_columns = list(top_features.index)
print("Columns with Pearson correlation greater than 0.5:", correlated_columns)

In [None]:
# Inspect the most extreme records: sort by sale price, largest first, and
# keep the top 3 rows.
# NOTE(review): the previous sort key, 'Engine (cc)', looks like a leftover
# from a different dataset. SalePrice is the column this notebook analyses
# for outliers -- confirm that it is the intended key.
data.sort_values(by='SalePrice', ascending=False).head(3)