In [3]:
import zipfile
import os

# Locate the house-prices archive and the directory to unpack it into.
# Both locations can be overridden through environment variables so the
# notebook is not tied to a single machine; the defaults are the original paths.
zip_file_path = os.environ.get(
    'HOUSE_PRICES_ZIP',
    r'C:\Users\Mantosh\Downloads\house-prices-advanced-regression-techniques.zip',
)
extraction_dir = os.environ.get('HOUSE_PRICES_DIR', '/mnt/data/house-prices-data/')

# Create the extraction directory if it doesn't exist
os.makedirs(extraction_dir, exist_ok=True)

# Extract the zip file
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extraction_dir)

# List the files in the extraction directory. os.listdir returns entries in
# arbitrary order, so sort them to keep the displayed listing reproducible.
extracted_files = sorted(os.listdir(extraction_dir))
extracted_files


['data_description.txt', 'sample_submission.csv', 'test.csv', 'train.csv']

In [5]:
# Load the extracted CSV files
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
# Build the paths of the two CSV files inside the extraction directory
train_data_path, test_data_path = (
    os.path.join(extraction_dir, csv_name) for csv_name in ('train.csv', 'test.csv')
)

# Read each CSV into its own DataFrame
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in (train_data_path, test_data_path)
)

# Preview the first rows of the training frame
train_data.head()


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [11]:
# Handling Missing Values
# Re-read the raw training data so this cell always starts from the same
# input. Every step below rewrites `train_data`, so running the cell a second
# time on its own output would feed already-standardised values into log1p
# (anything below -1 becomes NaN and numpy emits a RuntimeWarning).
train_data = pd.read_csv(train_data_path)

# Assign the filled Series back to the column. Calling
# `train_data[col].fillna(..., inplace=True)` is chained assignment: it acts on
# an intermediate object, is deprecated in pandas 2.x, and does not update the
# frame once Copy-on-Write is enabled.
train_data['LotFrontage'] = train_data['LotFrontage'].fillna(train_data['LotFrontage'].mean())
train_data['MasVnrArea'] = train_data['MasVnrArea'].fillna(0)
train_data['GarageYrBlt'] = train_data['GarageYrBlt'].fillna(train_data['GarageYrBlt'].mode()[0])

# Categorical gaps are filled with the most frequent value of each column
for column in ['MasVnrType', 'Electrical', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']:
    train_data[column] = train_data[column].fillna(train_data[column].mode()[0])

# Drop columns if they exist (errors='ignore' skips names that are absent)
columns_to_drop = ['Alley', 'PoolQC', 'Fence', 'MiscFeature']
train_data = train_data.drop(columns=columns_to_drop, errors='ignore')

# Identifying categorical variables
categorical_cols = train_data.select_dtypes(include=['object']).columns
print("Categorical columns:", categorical_cols)

# Encoding categorical variables using one-hot encoding
train_data = pd.get_dummies(train_data, columns=categorical_cols, drop_first=True)

# Log transformation of skewed numerical features
train_data['GrLivArea'] = np.log1p(train_data['GrLivArea'])
train_data['SalePrice'] = np.log1p(train_data['SalePrice'])  # Also log-transform the target variable

# Feature scaling for numerical features
from sklearn.preprocessing import StandardScaler

num_features = ['LotFrontage', 'LotArea', 'GrLivArea', 'MasVnrArea', 'GarageYrBlt']
scaler = StandardScaler()
train_data[num_features] = scaler.fit_transform(train_data[num_features])


Categorical columns: Index(['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Condition1', 'Condition2', 'BldgType',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC',
       'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu',
       'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive',
       'SaleType', 'SaleCondition'],
      dtype='object')


  result = getattr(ufunc, method)(*inputs, **kwargs)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Pairwise correlations between every column of the prepared frame
corr_matrix = train_data.corr()

# Draw the full matrix as an annotated heatmap on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", ax=ax)
plt.show()

# Keep the columns whose absolute correlation with SalePrice exceeds the
# threshold; SalePrice itself is removed from the list afterwards
correlation_threshold = 0.5
target_corr = corr_matrix['SalePrice'].abs()
high_corr_features = target_corr[target_corr > correlation_threshold].index.tolist()
high_corr_features.remove('SalePrice')
print("Highly correlated features:", high_corr_features)


In [None]:
from sklearn.model_selection import train_test_split

# Target column and the feature matrix restricted to the selected features
y = train_data['SalePrice']
X = train_data.loc[:, high_corr_features]

# Hold out 20% of the rows for validation, with a fixed seed for repeatability
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)


In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Create the linear model and fit it on the training split in one step
# (fit returns the estimator itself)
model = LinearRegression().fit(X_train, y_train)

# Predict the target for the held-out validation rows
y_pred = model.predict(X_val)

# Score the predictions: mean squared error, then its square root
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)

print(f'Root Mean Squared Error: {rmse}')
