In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Importing required packages

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder, RobustScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
pd.set_option('display.max_columns', None)
import warnings
warnings.filterwarnings('ignore')

## Reading the Data

In [None]:
# Load the training and test CSV files from the Kaggle input directory.
train_df = pd.read_csv(r"/kaggle/input/home-data-for-ml-course/train.csv")
test_df = pd.read_csv(r"/kaggle/input/home-data-for-ml-course/test.csv")

In [None]:
train_df.head()

In [None]:
train_df.info()

In [None]:
test_df.info()

## Calculating the missing percentage of all features

In [None]:
def missing_df(df):
    """Summarise missing values per column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        Indexed by the column names of ``df`` with two columns:
        ``count`` (number of null entries) and ``percentage``
        (null entries as a percentage of the number of rows).
    """
    summary = df.isnull().sum().to_frame(name='count')
    n_rows = len(df)
    summary['percentage'] = summary['count'] / n_rows * 100
    return summary

In [None]:
# Training columns whose share of missing values exceeds 40 %.
missing_train = missing_df(train_df)
max_null_train = missing_train.index[missing_train['percentage'] > 40]
max_null_train

In [None]:
# Test columns whose share of missing values exceeds 40 %.
missing_test = missing_df(test_df)
max_null_test = missing_test.index[missing_test['percentage'] > 40]
max_null_test

> Columns with more than 40% missing values are dropped from the dataset

In [None]:
# Drop the sparse columns identified on the TRAINING data from both frames.
# Using one column list keeps the train and test feature sets identical even if
# the two files cross the 40 % threshold for different columns.
# Re-assigning (instead of inplace=True) together with errors='ignore' makes the
# cell safe to re-run: columns that are already gone are simply skipped.
train_df = train_df.drop(columns = max_null_train, errors = 'ignore')
test_df = test_df.drop(columns = max_null_train, errors = 'ignore')

In [None]:
train_df.shape, test_df.shape

In [None]:
# Names of the remaining columns that still contain at least one null value.
train_null = train_df.columns[train_df.isnull().any()].to_list()
test_null = test_df.columns[test_df.isnull().any()].to_list()
train_null, test_null

In [None]:
missing_train.loc[train_null]

In [None]:
missing_test.loc[test_null]

In [None]:
def seperate_null_features(df, null_list):
    """Split the given column names by dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains every column named in ``null_list``.
    null_list : iterable of str
        Column names to classify.

    Returns
    -------
    tuple[list, list]
        ``(numerical, categorical)`` where ``categorical`` holds the columns
        whose dtype compares equal to ``'O'`` (object) and ``numerical`` holds
        all others; the input order is preserved in both lists.
    """
    numerical, categorical = [], []
    for col in null_list:
        target = categorical if df[col].dtype == 'O' else numerical
        target.append(col)
    return numerical, categorical

In [None]:
numerical_train_null, categorical_train_null = seperate_null_features(train_df, train_null)
numerical_train_null, categorical_train_null

In [None]:
numerical_test_null, categorical_test_null = seperate_null_features(test_df, test_null)
numerical_test_null, categorical_test_null

## Handling missing values

#### Handling Numerical Features

> Training Dataset

In [None]:
# Impute every numerical training column that has nulls with its own mean.
for feature in numerical_train_null:
    mean_value = train_df[feature].mean()
    # Only int64/float64 columns are filled; any other non-object dtype is left as is.
    if train_df[feature].dtype in ['int64', 'float64']:
        train_df[feature] = train_df[feature].fillna(mean_value)
    # NOTE: the message below is printed even when the dtype guard skipped the fill.
    print(f"Filled null values of columns {feature} with mean value {mean_value}")

In [None]:
train_df[numerical_train_null].isnull().sum()

> Testing Dataset

In [None]:
# Impute numerical test columns with the mean of the corresponding TRAINING
# column, so no statistic is computed from the test set.
for feature in numerical_test_null:
    mean_value = train_df[feature].mean()
    # Only int64/float64 columns are filled; any other non-object dtype is left as is.
    if test_df[feature].dtype in ['int64', 'float64']:
        test_df[feature] = test_df[feature].fillna(mean_value)
    # NOTE: the message below is printed even when the dtype guard skipped the fill.
    print(f"Filled null values of columns {feature} with mean value {mean_value}")

In [None]:
test_df[numerical_test_null].isnull().sum()

#### Handling Categorical Features

> Training Dataset

In [None]:
# Categorical nulls in the training set: 'Electrical' gets its most frequent
# value, every other column gets the literal category 'None'.
for feature in categorical_train_null:
    if feature == 'Electrical':
        train_df['Electrical'] = train_df['Electrical'].fillna(train_df['Electrical'].mode()[0])
        print(f"Filled null values of columns {feature} with mode value {train_df[feature].mode()[0]}")
    else:
        train_df[feature] = train_df[feature].fillna('None')
        print(f"Filled null values of columns {feature} with {'None'}")

In [None]:
train_df[categorical_train_null].isnull().sum()

> Testing Dataset

In [None]:
# Categorical nulls in the test set: 'Electrical' gets the most frequent value
# of the TRAINING column, every other column gets the literal category 'None'.
for feature in categorical_test_null:
    if feature == 'Electrical':
        # The result must be written to test_df; assigning it to train_df would
        # replace the training column with (index-aligned) test values.
        electrical_mode = train_df['Electrical'].mode()[0]
        test_df['Electrical'] = test_df['Electrical'].fillna(electrical_mode)
        print(f"Filled null values of columns {feature} with mode value {electrical_mode}")
    else:
        test_df[feature] = test_df[feature].fillna('None')
        print(f"Filled null values of columns {feature} with {'None'}")

In [None]:
# Verify the columns that were actually imputed in the test set
# (categorical_test_null, not the training list).
test_df[categorical_test_null].isnull().sum()

## Outlier Detection

In [None]:
# Numeric training columns whose absolute skewness is greater than 1.
skew_full = train_df.skew(numeric_only = True)
skew = skew_full[skew_full.abs() > 1]
skew_col = skew.index
skew_col

In [None]:
# Box plot of every highly skewed feature, two panels per row.
# The number of rows is derived from the number of features instead of being
# hard-coded, so the cell does not raise IndexError when more than 20 columns
# are skewed.
n_rows = max(1, -(-len(skew_col) // 2))  # ceiling division
fig, axes = plt.subplots(n_rows, 2, figsize = (12, n_rows * 4))
axes = axes.flatten()
for idx, col in enumerate(skew_col):
    ax = axes[idx]
    sns.boxplot(data = train_df, x = col, ax = ax)
    ax.set_title(col)
# Hide panels that did not receive a feature.
for ax in axes[len(skew_col):]:
    ax.set_visible(False)
plt.tight_layout()

In [None]:
# Histogram (with KDE) of every highly skewed feature, two panels per row.
# The number of rows is derived from the number of features instead of being
# hard-coded, so the cell does not raise IndexError when more than 20 columns
# are skewed.
n_rows = max(1, -(-len(skew_col) // 2))  # ceiling division
fig, axes = plt.subplots(n_rows, 2, figsize = (12, n_rows * 4))
axes = axes.flatten()
for idx, col in enumerate(skew_col):
    ax = axes[idx]
    sns.histplot(data = train_df, x = col, bins = 42, ax = ax, kde = True)
    ax.set_title(col)
# Hide panels that did not receive a feature.
for ax in axes[len(skew_col):]:
    ax.set_visible(False)
plt.tight_layout()


In [None]:
# Scatter plot of every highly skewed feature against SalePrice, two panels per row.
# The number of rows is derived from the number of features instead of being
# hard-coded, so the cell does not raise IndexError when more than 20 columns
# are skewed.
n_rows = max(1, -(-len(skew_col) // 2))  # ceiling division
fig, axes = plt.subplots(n_rows, 2, figsize = (12, n_rows * 4))
axes = axes.flatten()
for idx, col in enumerate(skew_col):
    ax = axes[idx]
    sns.scatterplot(data = train_df, x = col, y = 'SalePrice', ax = ax)
    ax.set_title(col)
# Hide panels that did not receive a feature.
for ax in axes[len(skew_col):]:
    ax.set_visible(False)
plt.tight_layout()

In [None]:
# Correlation of each highly skewed feature with SalePrice, shown as a single-column heatmap.
# NOTE(review): selecting [['SalePrice']] only works while SalePrice itself is part of
# skew.index (i.e. its |skew| > 1); otherwise this raises KeyError — confirm for the data used.
sns.heatmap(data = train_df[skew.index].corr()[['SalePrice']], annot = True, fmt = '.1f')

In [None]:
def limit(df, feature):
    """Compute the 1.5 * IQR outlier fences of one column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column.
    feature : str
        Name of the column to analyse.

    Returns
    -------
    tuple
        ``(upper_limit, lower_limit)`` — note the order: upper fence first.
    """
    column = df[feature]
    q1, q3 = column.quantile(0.25), column.quantile(0.75)
    iqr = q3 - q1
    return q3 + 1.5*iqr, q1 - 1.5*iqr

In [None]:
def capping(df, col):
    """Clip one column of ``df`` in place to its 1.5 * IQR fences.

    The fences come from ``limit(df, col)`` and are printed before clipping.
    When the column's first and third quartile are equal, both fences coincide
    and every value of the column is replaced by that single value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; ``df[col]`` is overwritten.
    col : str
        Name of the column to clip.

    Returns
    -------
    None
    """
    upper_bound, lower_bound = limit(df, col)
    print(f'Feature: {col}\nUpper Bound: {upper_bound}\tLower Bound: {lower_bound}\n')
    df[col] = df[col].clip(lower = lower_bound, upper = upper_bound)
    

In [None]:
# Cap outliers of the skewed PREDICTORS only.
# - The target 'SalePrice' is skipped: clipping it would change the values the
#   models are trained and validated against.
# - The fences computed on the training data are applied to the test set too,
#   so both sets go through the same transformation.
for i in skew_col:
    if i == 'SalePrice':
        continue
    # Fences are taken from the training column before it is clipped.
    upper_bound, lower_bound = limit(train_df, i)
    capping(train_df, i)
    if i in test_df.columns:
        test_df[i] = np.clip(test_df[i], lower_bound, upper_bound)

In [None]:
train_df[skew_col].skew()

In [None]:
# Box plot of every highly skewed feature after capping, two panels per row.
# The number of rows is derived from the number of features instead of being
# hard-coded, so the cell does not raise IndexError when more than 20 columns
# are skewed.
n_rows = max(1, -(-len(skew_col) // 2))  # ceiling division
fig, axes = plt.subplots(n_rows, 2, figsize = (12, n_rows * 4))
axes = axes.flatten()
for idx, col in enumerate(skew_col):
    ax = axes[idx]
    sns.boxplot(data = train_df, x = col,ax = ax)
    ax.set_title(col)
# Hide panels that did not receive a feature.
for ax in axes[len(skew_col):]:
    ax.set_visible(False)
plt.tight_layout()


In [None]:
# Correlation of each highly skewed feature with SalePrice after capping.
# NOTE(review): relies on 'SalePrice' being one of the columns in skew_col — confirm.
sns.heatmap(data = train_df[skew_col].corr()[['SalePrice']], annot = True, fmt = '.1f')

## Encoding categorical variables

In [None]:
def seperate_features(df):
    """Split all columns of ``df`` by dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are classified.

    Returns
    -------
    tuple[list, list]
        ``(numerical, categorical)`` where ``categorical`` holds the columns
        whose dtype compares equal to ``'O'`` (object) and ``numerical`` holds
        all others; column order is preserved in both lists.
    """
    numerical, categorical = [], []
    for col in df.columns:
        target = categorical if df[col].dtype == 'O' else numerical
        target.append(col)
    return numerical, categorical

In [None]:
# All object-dtype (categorical) columns of the training frame, in column order.
categorical_col = [col for col in train_df.columns if train_df[col].dtype == 'O']
# Alternative that yields an Index instead of a list:
# categorical_col = train_df.select_dtypes(include = 'O').columns
categorical_col

In [None]:
train_df.describe(include = 'O').T

In [None]:
# train_df[categorical_col].nunique()
for col in categorical_col:
    print(col, ':', train_df[col].unique())

In [None]:
ordinal_col = ['LotShape',
               'LandSlope',
               'ExterQual',
               'ExterCond',
               'BsmtQual',
               'BsmtCond',
               'BsmtExposure',
               'BsmtFinType1',
               'BsmtFinType2',
               'HeatingQC',
               'GarageFinish',
               'GarageQual',
               'GarageCond',
               'PavedDrive']

In [None]:
# Split the non-ordinal categorical columns into binary (exactly two distinct
# training values) and nominal (everything else).
# Iterating over categorical_col (a list) instead of a set difference keeps the
# column order deterministic; set iteration order for strings changes between
# interpreter runs, which would reorder the one-hot features built from
# nominal_col and make downstream results non-reproducible.
binary_col = []
nominal_col = []

ordinal_lookup = set(ordinal_col)
for col in categorical_col:
    if col in ordinal_lookup:
        continue
    unique = train_df[col].nunique()
    if unique == 2:
        binary_col.append(col)
    else:
        nominal_col.append(col)

In [None]:
assert sorted(categorical_col) == sorted(ordinal_col + binary_col + nominal_col)

In [None]:
print(f"Nominal Features: {nominal_col}\n \nBinominal Features : {binary_col} \n \nOrdinal Features: {ordinal_col}")

In [None]:
# Category order for every ordinal column. The lists are passed to OrdinalEncoder
# in the ordinal-encoding cell, so the position of a category in its list is the
# integer it is encoded to (first entry -> 0).
# 'None' is the placeholder written by the categorical imputation step.
ordinal_mappings = {
    'LotShape': ['IR3', 'IR2', 'IR1', 'Reg'],
    'LandSlope': ['Sev', 'Mod', 'Gtl'],
    'ExterQual': ['Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'ExterCond': ['Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'BsmtQual': ['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'BsmtCond': ['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'BsmtExposure': ['None', 'No', 'Mn', 'Av', 'Gd'],
    'BsmtFinType1': ['None', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ'],
    'BsmtFinType2': ['None', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ'],
    'HeatingQC': ['Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'GarageFinish': ['None', 'Unf', 'RFn', 'Fin'],
    'GarageQual': ['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'GarageCond': ['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'PavedDrive': ['N', 'P', 'Y']
}

In [None]:
# Binary encoding.
# The encoder is fitted on the training column only and the SAME label -> code
# mapping is applied to the test column. Re-fitting on the test data (as a second
# fit_transform would do) can assign different codes to the same label, or the
# same code to different labels, whenever the two sets contain different values.
for col in binary_col:
    label = LabelEncoder()
    train_df[col] = label.fit_transform(train_df[col])

    code_of = {category: code for code, category in enumerate(label.classes_)}
    # Test labels never seen in training (e.g. the 'None' placeholder from
    # imputation) fall back to the most frequent training code instead of
    # raising, which LabelEncoder.transform would do.
    most_common_code = train_df[col].mode()[0]
    test_df[col] = test_df[col].map(code_of).fillna(most_common_code).astype(int)

In [None]:
train_df[binary_col]

In [None]:
test_df[binary_col]

In [None]:
# ordinal encoding
ordinal = OrdinalEncoder(categories=[ordinal_mappings[col] for col in ordinal_col])
ordinal.fit(train_df[ordinal_col])
ordinal_train_data = ordinal.transform(train_df[ordinal_col])

In [None]:
ordinal_test_data = ordinal.transform(test_df[ordinal_col])

In [None]:
train_df[ordinal_col] = ordinal_train_data.astype(int)
test_df[ordinal_col] = ordinal_test_data.astype(int)
train_df[ordinal_col]

In [None]:
train_df[ordinal_col]

In [None]:
test_df[ordinal_col]

In [None]:
# Nominal (one-hot) encoding, fitted on the training data only.
# handle_unknown='ignore' encodes categories that only occur in the test set as
# all zeros; drop='first' removes one dummy column per feature.
nominal = OneHotEncoder(sparse_output=False, handle_unknown ='ignore', drop = 'first')
nominal.fit(train_df[nominal_col])
nominal_feature_names = nominal.get_feature_names_out().tolist()

encoded_train_nominal = nominal.transform(train_df[nominal_col])
encoded_test_nominal = nominal.transform(test_df[nominal_col])

# Carry over the index of the source frames: the encoded frames are later joined
# column-wise with the remaining columns, and pd.concat(axis=1) aligns on the
# index. A fresh RangeIndex would only line up by accident.
encoded_train_nominal_df = pd.DataFrame(data = encoded_train_nominal,
                                        columns = nominal_feature_names,
                                        index = train_df.index)
encoded_test_nominal_df = pd.DataFrame(data = encoded_test_nominal,
                                       columns = nominal_feature_names,
                                       index = test_df.index)
encoded_test_nominal_df.head()

In [None]:
nominal.feature_names_in_.tolist()

In [None]:
# Replace the raw nominal columns (and the 'Id' identifier) by their one-hot
# encoded counterparts to obtain the final modelling frames.
columns_to_remove = nominal_col + ['Id']
train_df_copy = train_df.drop(columns = columns_to_remove).copy()
test_df_copy = test_df.drop(columns = columns_to_remove).copy()
final_train_df = pd.concat([train_df_copy, encoded_train_nominal_df], axis = 1)
final_test_df = pd.concat([test_df_copy, encoded_test_nominal_df], axis = 1)
final_train_df

## Scaling

In [None]:
# Fit the scaler on the training features only; the target 'SalePrice' is excluded.
scaler = RobustScaler()
scaler.fit(final_train_df.drop(columns = 'SalePrice'))

In [None]:
# Apply the fitted scaler and KEEP the result. scaler.transform returns a new
# array; without assigning it back, every later cell keeps using unscaled data.
# Columns are selected via scaler.feature_names_in_ so that both frames are
# passed in exactly the column order the scaler was fitted on.
# NOTE: this cell overwrites final_train_df / final_test_df, so re-running it
# without re-running the previous cells would scale the data a second time.
feature_cols = list(scaler.feature_names_in_)

scaled_train_features = pd.DataFrame(scaler.transform(final_train_df[feature_cols]),
                                     columns = feature_cols,
                                     index = final_train_df.index)
final_train_df = pd.concat([scaled_train_features, final_train_df[['SalePrice']]], axis = 1)

final_test_df = pd.DataFrame(scaler.transform(final_test_df[feature_cols]),
                             columns = feature_cols,
                             index = final_test_df.index)
final_test_df.head()

## Feature Selection

In [None]:
# Rank features with a Random Forest and keep the top_n most important ones.
# NOTE(review): the ranking uses the full training frame, i.e. before the
# train/validation split — the validation score is therefore slightly optimistic.
feature_matrix = final_train_df.drop(columns = 'SalePrice')

model = RandomForestRegressor(n_estimators=100, random_state = 42)
model.fit(feature_matrix, final_train_df['SalePrice'])

# Get feature importance (one value per column of feature_matrix)
importance = model.feature_importances_

# Sort features by importance, most important first
indices = np.argsort(importance)[::-1]
# The indices refer to feature_matrix, which has no 'SalePrice' column. Looking
# them up in final_train_df.columns (which still contains the target) would shift
# the names and could select 'SalePrice' itself as a feature.
important_features = feature_matrix.columns[indices]

top_n = 100
selected_important_features = important_features[:top_n]
print(f"Top {top_n} Most Important Features:")
print(selected_important_features)



In [None]:
# Independent copies of both frames restricted to the selected features.
# NOTE(review): sel_train / sel_test are not referenced by any later cell visible here.
sel_train = final_train_df[selected_important_features].copy()
sel_test = final_test_df[selected_important_features].copy()

## Model Fit

#### Splitting data into training and validation sets


In [None]:
# Build the feature matrix / target and hold out 20 % of the rows for validation.
X = final_train_df[selected_important_features]
y = final_train_df['SalePrice']

# random_state is fixed so the split is identical on every run
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

> **Linear Regression**

In [None]:
# Create and train the Linear Regression model
lr = LinearRegression()
lr.fit(X_train, y_train)

# Evaluate on the validation set
y_val_pred_lr = lr.predict(X_val)  # Predictions for validation data
mse_lr = mean_squared_error(y_val, y_val_pred_lr)
print("Mean Squared Error on Validation Set:", mse_lr)

# Predict on the test data, using the same selected features as X
X_test = final_test_df[selected_important_features]  # Test set must have been preprocessed in the same way as X
y_test_pred_lr = lr.predict(X_test)
print("Test Predictions:", y_test_pred_lr[:10])  # Display first 10 predictions


> **Random Forest**

In [None]:
# Create and train the Random Forest model (fixed seed for reproducibility)
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)

# Evaluate on the validation set
y_val_pred_rf = rf.predict(X_val)  # Predictions for validation data
mse_rf = mean_squared_error(y_val, y_val_pred_rf)
print("Mean Squared Error on Validation Set:", mse_rf)

# Predict on the test data, using the same selected features as X
X_test = final_test_df[selected_important_features]  # Test set must have been preprocessed in the same way as X
y_test_pred_rf = rf.predict(X_test)
print("Test Predictions:", y_test_pred_rf[:10])  # Display first 10 predictions


In [118]:
# Build the submission file from the Random Forest predictions.
# 'Id' must be the identifier column of the test set. final_test_df no longer
# contains it, and its index is only the row position (0..n-1), so the ids are
# read from test_df, which still holds the 'Id' column.
submission = pd.DataFrame({
    'Id': test_df['Id'].to_numpy(),
    'SalePrice': y_test_pred_rf
})

submission.to_csv('submission.csv', index=False)
