In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
df=pd.read_csv('train.csv')

In [3]:
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
df.shape

(1460, 81)

In [5]:
features_with_nullvalues=[features for features in df.columns if df[features].isnull().sum()>=1] 
#defining a variable which holds all the features which have at least one null value

In [6]:
numerical_features=[feature for feature in df.columns if df[feature].dtypes!='O']

In [7]:
year_feature = [feature for feature in numerical_features if 'Yr' in feature or 'Year' in feature]

year_feature

['YearBuilt', 'YearRemodAdd', 'GarageYrBlt', 'YrSold']

In [8]:
discrete_feature=[feature for feature in numerical_features if len(df[feature].unique())<25 and feature not in year_feature+['Id']]

In [9]:
continuous_feature=[feature for feature in numerical_features if feature not in discrete_feature+year_feature+['Id']]

In [10]:
dataset=df

In [11]:
## First lets handle Categorical features which are missing
features_nan=[feature for feature in dataset.columns if dataset[feature].isnull().sum()>=1 and dataset[feature].dtypes=='O']

In [12]:
def replace(dataset,array):
    data=dataset.copy()
    data[array]=data[array].fillna('Missing')
    return data
dataset=replace(dataset,features_nan)
dataset[features_nan].isnull().sum()
#converted the nan values with a new placeholder

Alley           0
MasVnrType      0
BsmtQual        0
BsmtCond        0
BsmtExposure    0
BsmtFinType1    0
BsmtFinType2    0
Electrical      0
FireplaceQu     0
GarageType      0
GarageFinish    0
GarageQual      0
GarageCond      0
PoolQC          0
Fence           0
MiscFeature     0
dtype: int64

In [13]:
numerical_with_nan=[feature for feature in dataset.columns if dataset[feature].isnull().sum()>=1 and dataset[feature].dtypes!='O']

In [14]:
for feature in numerical_with_nan:
    ## We will replace by using median since there are outliers
    median_value=dataset[feature].median()
    
    ## create a new feature to capture nan values
    dataset[feature+'nan']=np.where(dataset[feature].isnull(),1,0)
    dataset[feature].fillna(median_value,inplace=True)
    
dataset[numerical_with_nan].isnull().sum()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset[feature].fillna(median_value,inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset[feature].fillna(median_value,inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are settin

LotFrontage    0
MasVnrArea     0
GarageYrBlt    0
dtype: int64

In [15]:
## Temporal Variables (Date Time Variables)

for feature in ['YearBuilt','YearRemodAdd','GarageYrBlt']:
       
    dataset[feature]=dataset['YrSold']-dataset[feature]

In [16]:
data=df.copy()
categorical_features=[feature for feature in df.columns if data[feature].dtypes=='O']
categorical_features

['MSZoning',
 'Street',
 'Alley',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'Electrical',
 'KitchenQual',
 'Functional',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PavedDrive',
 'PoolQC',
 'Fence',
 'MiscFeature',
 'SaleType',
 'SaleCondition']

In [17]:
for feature in categorical_features:
    labels_ordered=dataset.groupby([feature])['SalePrice'].mean().sort_values().index
    labels_ordered={k:i for i,k in enumerate(labels_ordered,0)}
    dataset[feature]=dataset[feature].map(labels_ordered)

In [18]:
import numpy as np
num_features=['LotFrontage', 'LotArea', '1stFlrSF', 'GrLivArea', 'SalePrice']
#performing log normal distribution
for feature in num_features:
    dataset[feature]=np.log(dataset[feature])

In [19]:
dataset.shape

(1460, 84)

In [20]:
scaling_feature=[feature for feature in dataset.columns if feature not in ['Id','SalePerice'] ]
len(scaling_feature)
#why 83 not 82 dont know!!

83

In [21]:
feature_scale=[feature for feature in dataset.columns if feature not in ['Id','SalePrice']]

from sklearn.preprocessing import MinMaxScaler
scaler=MinMaxScaler()
scaler.fit(dataset[feature_scale])

In [22]:
scaler.transform(dataset[feature_scale])

array([[0.23529412, 0.75      , 0.41820812, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.75      , 0.49506375, ..., 0.        , 0.        ,
        0.        ],
       [0.23529412, 0.75      , 0.434909  , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.29411765, 0.75      , 0.42385922, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.75      , 0.434909  , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.75      , 0.47117546, ..., 0.        , 0.        ,
        0.        ]])

In [23]:
# transform the train and test set, and add on the Id and SalePrice variables
data = pd.concat([dataset[['Id', 'SalePrice']].reset_index(drop=True),
                    pd.DataFrame(scaler.transform(dataset[feature_scale]), columns=feature_scale)],
                    axis=1)

In [24]:
data.to_csv('test.csv',index=False)

In [25]:
import pandas as pd

# Load train and test datasets
train_data = pd.read_csv('X_train.csv')
test_data = pd.read_csv('test.csv')

In [26]:
## Capture the dependent feature
y_train=train_data[['SalePrice']]

In [27]:
## drop dependent feature from dataset
X_train=train_data.drop(['Id','SalePrice'],axis=1)

In [28]:
x_test=test_data.drop(['Id','SalePrice'],axis=1)

In [29]:
## Capture the dependent feature
y_test=test_data[['SalePrice']]

In [30]:
from sklearn.feature_selection import mutual_info_regression

mi_scores = mutual_info_regression(X_train, y_train, random_state=0)
mi_scores = pd.Series(mi_scores, index=X_train.columns).sort_values(ascending=False)
print(mi_scores)

  y = column_or_1d(y, warn=True)


OverallQual     0.567455
Neighborhood    0.493416
GrLivArea       0.479443
GarageArea      0.366313
GarageCars      0.362555
                  ...   
Condition2      0.000564
PoolQC          0.000000
MoSold          0.000000
PoolArea        0.000000
Utilities       0.000000
Length: 82, dtype: float64


In [31]:
print(mi_scores.mean)

<bound method Series.mean of OverallQual     0.567455
Neighborhood    0.493416
GrLivArea       0.479443
GarageArea      0.366313
GarageCars      0.362555
                  ...   
Condition2      0.000564
PoolQC          0.000000
MoSold          0.000000
PoolArea        0.000000
Utilities       0.000000
Length: 82, dtype: float64>


In [83]:
selected_features = mi_scores[mi_scores > 0.05].index  # Adjust threshold as needed
X_train_selected = X_train[selected_features]
X_test_selected = x_test[selected_features]

In [85]:
from sklearn.linear_model import LinearRegression

In [87]:
model = LinearRegression()
model.fit(X_train_selected, y_train)

In [89]:
if y_test is not None:
    y_pred = model.predict(X_test_selected)
    from sklearn.metrics import r2_score, mean_squared_error
    print(f"R^2 Score: {r2_score(y_test, y_pred)}")
    print(f"Mean Squared Error: {mean_squared_error(y_test, y_pred)}")
else:
    print("Test data does not have a target column for evaluation.")


R^2 Score: 0.9012413156381301
Mean Squared Error: 0.015747319726229794


### This is the prediction after feature selection

In [92]:
import joblib

# Save the model
joblib.dump(model, 'house_price_model.joblib')

# Load the model
loaded_model = joblib.load('house_price_model.joblib')


In [96]:
X_train_selected


Unnamed: 0,OverallQual,Neighborhood,GrLivArea,GarageArea,GarageCars,TotalBsmtSF,BsmtQual,KitchenQual,YearBuilt,ExterQual,...,LotShape,SaleCondition,HouseStyle,BedroomAbvGr,GarageCond,BsmtExposure,CentralAir,GarageQual,Electrical,GarageYrBltnan
0,0.666667,0.636364,0.577712,0.386460,0.50,0.140098,0.75,0.666667,0.036765,0.666667,...,0.000000,0.75,1.0,0.375,1.0,0.25,1.0,0.666667,1.000000,0.0
1,0.555556,0.500000,0.470245,0.324401,0.50,0.206547,0.75,0.333333,0.227941,0.333333,...,0.000000,0.75,0.6,0.375,1.0,1.00,1.0,0.666667,1.000000,0.0
2,0.666667,0.636364,0.593095,0.428773,0.50,0.150573,0.75,0.666667,0.051471,0.666667,...,0.333333,0.75,1.0,0.375,1.0,0.50,1.0,0.666667,1.000000,0.0
3,0.666667,0.727273,0.579157,0.452750,0.75,0.123732,0.50,0.666667,0.669118,0.333333,...,0.333333,0.00,1.0,0.375,1.0,0.25,1.0,0.666667,1.000000,0.0
4,0.777778,1.000000,0.666523,0.589563,0.75,0.187398,0.75,0.666667,0.058824,0.666667,...,0.333333,0.75,1.0,0.500,1.0,0.75,1.0,0.666667,1.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,0.555556,0.590909,0.564433,0.324401,0.50,0.155974,0.75,0.333333,0.058824,0.333333,...,0.000000,0.75,1.0,0.375,1.0,0.25,1.0,0.666667,1.000000,0.0
1456,0.555556,0.545455,0.645810,0.352609,0.50,0.252373,0.75,0.333333,0.235294,0.333333,...,0.000000,0.75,0.6,0.375,1.0,0.25,1.0,0.666667,1.000000,0.0
1457,0.666667,0.727273,0.688669,0.177715,0.25,0.188543,0.50,0.666667,0.507353,1.000000,...,0.000000,0.75,1.0,0.500,1.0,0.25,1.0,0.666667,1.000000,0.0
1458,0.444444,0.363636,0.414497,0.169252,0.25,0.176432,0.50,0.666667,0.441176,0.333333,...,0.000000,0.75,0.6,0.250,1.0,0.50,1.0,0.666667,0.666667,0.0
