In [465]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [467]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

In [469]:
train_data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [471]:
#determine total number of columns with no data in training dataset
train_data.isnull().sum()

Id                 0
MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
                ... 
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
SalePrice          0
Length: 81, dtype: int64

In [473]:
#determine total number of columns with no data in test dataset
test_data.isnull().sum()
test_data_ids = test_data['Id']
test_data = test_data.drop('Id', axis=1)

In [475]:
#differentiate normal from abnormal sale condition explicitly
train_data['SaleCondition'] = train_data['SaleCondition'].apply(lambda x: 1 if x == 'Normal' else 0)
test_data['SaleCondition'] = test_data['SaleCondition'].apply(lambda x: 1 if x == 'Normal' else 0)

In [477]:
#fix the missing column
train_data['LotFrontage'] = train_data['LotFrontage'].fillna(np.mean(train_data['LotFrontage']))
test_data['LotFrontage'] = test_data['LotFrontage'].fillna(np.mean(test_data['LotFrontage']))

In [479]:
test_data.MSZoning.fillna(value='RL', axis=0, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_data.MSZoning.fillna(value='RL', axis=0, inplace=True)


In [481]:
test_data.isnull().sum()

MSSubClass       0
MSZoning         0
LotFrontage      0
LotArea          0
Street           0
                ..
MiscVal          0
MoSold           0
YrSold           0
SaleType         1
SaleCondition    0
Length: 79, dtype: int64

In [483]:
test_data.SaleType.fillna(value='WD', axis=0, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_data.SaleType.fillna(value='WD', axis=0, inplace=True)


In [485]:
test_data.isnull().sum()
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1460 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [487]:
# Function to drop stuff
def drop_columns(df, column):
    for col in column:
        df = df.drop(col, axis=1)
    return df
column = ['Alley', 'MasVnrType', 'PoolQC', 'FireplaceQu', 'Fence', 'MiscFeature', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'GarageFinish', 'GarageYrBlt', 'GarageCond']
train_data = drop_columns(train_data, column)
test_data = drop_columns(test_data, column)

In [489]:
columns_with_null = train_data.columns[train_data.isnull().any()].tolist()
print(columns_with_null)

['MasVnrArea', 'Electrical', 'GarageType', 'GarageQual']


In [491]:
train_data['MasVnrArea'].value_counts()
train_data['MasVnrArea'] = train_data['MasVnrArea'].fillna(np.mean(train_data['MasVnrArea']))
test_data['MasVnrArea'] = test_data['MasVnrArea'].fillna(np.mean(test_data['MasVnrArea']))


In [493]:
train_data.Electrical.value_counts()
train_data.Electrical.fillna(value='SBrkr', axis=0, inplace=True)
test_data.Electrical.fillna(value='SBrkr', axis=0, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_data.Electrical.fillna(value='SBrkr', axis=0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_data.Electrical.fillna(value='SBrkr', axis=0, inplace=True)


In [495]:
train_data.GarageType.value_counts()
train_data.GarageType.fillna(value='Attchd', axis=0, inplace=True)
test_data.GarageType.fillna(value='Attchd', axis=0, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_data.GarageType.fillna(value='Attchd', axis=0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_data.GarageType.fillna(value='Attchd', axis=0, inplace=True)


In [497]:
train_data.GarageQual.value_counts()
train_data.GarageQual.fillna(value='TA', axis=0, inplace=True)
test_data.GarageQual.fillna(value='TA', axis=0, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_data.GarageQual.fillna(value='TA', axis=0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_data.GarageQual.fillna(value='TA', axis=0, inplace=True)


In [499]:
# Lets split our data first before encoding
X = train_data.drop(['SalePrice', 'Id'], axis=1)
y = train_data['SalePrice']

In [501]:

# Function to fill all null values
def fill_nulls(df):
    for column in df.columns:
        if df[column].dtype in ['int64', 'float64']:
            # Fill numeric columns with mean
            df[column].fillna(df[column].mean(), inplace=True)
        elif df[column].dtype == 'object':
            # Fill categorical columns with the mode
            df[column].fillna(df[column].mode()[0], inplace=True)
    return df
test_data = fill_nulls(test_data)
test_data.info()
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 65 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1459 non-null   int64  
 1   MSZoning       1459 non-null   object 
 2   LotFrontage    1459 non-null   float64
 3   LotArea        1459 non-null   int64  
 4   Street         1459 non-null   object 
 5   LotShape       1459 non-null   object 
 6   LandContour    1459 non-null   object 
 7   Utilities      1459 non-null   object 
 8   LotConfig      1459 non-null   object 
 9   LandSlope      1459 non-null   object 
 10  Neighborhood   1459 non-null   object 
 11  Condition1     1459 non-null   object 
 12  Condition2     1459 non-null   object 
 13  BldgType       1459 non-null   object 
 14  HouseStyle     1459 non-null   object 
 15  OverallQual    1459 non-null   int64  
 16  OverallCond    1459 non-null   int64  
 17  YearBuilt      1459 non-null   int64  
 18  YearRemo

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(df[column].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(df[column].mode()[0], inplace=True)


In [503]:
# Concatenate dataframes in a row-wise manner
final_df = pd.concat([X, test_data], axis=0)

In [507]:
final_df.shape

(2919, 65)

In [547]:
# Lets encode categorical features
def encode_categorical(df): 
    """
    Encodes categorical columns within the given dataframe using one-hot encoding

    Parameters:
    - df: the dataframe to be manipulated

    Returns:
    df_encoded - dataframe with encoded categorical columns
    
    """
    # Work on a copy of the dataframe
    df = df.copy()
    
    # Identify categorical columns
    categorical_columns = df.select_dtypes(include=['object']).columns

    encoded_df = pd.get_dummies(df, columns=categorical_columns, drop_first=False)            
            

    return encoded_df
main_df = encode_categorical(final_df)
main_df.shape

(2919, 228)

In [557]:
main_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2919 entries, 0 to 1458
Columns: 228 entries, MSSubClass to SaleType_WD
dtypes: bool(192), float64(10), int64(26)
memory usage: 1.4 MB


In [559]:
# Split into equal parts
df_Train = main_df.iloc[:1460,:]
df_Test = main_df.iloc[1460:,:]

df_Test.shape

(1459, 228)

In [561]:
# Import random forest regressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Split the data
X_train, X_test, y_train, y_test = train_test_split(df_Train, y, test_size=0.33, random_state=42)

# Initialize the model
rf = RandomForestRegressor(n_estimators=200, random_state=42)

# Fit the model
rf.fit(X_train, y_train)

# Predict
y_hat = rf.predict(X_test)

# Mean squared error
mse = mean_squared_error(y_test, y_hat)
print(f"MSE: {mse:.2f}")

#r2 score
r2 = r2_score(y_test, y_hat)
print(f"R2: {r2:.2f}")

MSE: 967944670.95
R2: 0.87


In [563]:
Prices = rf.predict(df_Test)

In [565]:
data = pd.DataFrame({'Id':test_data_ids.values, 
                     'Price':Prices
    
                    })

In [569]:
# Save to CSV file
data.to_csv('HousePriceSubmission.csv', index=False)