In [2]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
# Load the training data; the path is resolved relative to the working directory.
df_train = pd.read_csv("train.csv")

# Heading and preview are emitted on separate lines via sep="\n".
print("First 5 rows of the dataset:", df_train.head(), sep="\n")

# DataFrame.info() prints its own report, so only the heading goes through print().
print("\nDataset Info:")
df_train.info()

# Descriptive statistics as produced by describe() with its default settings.
print("\nSummary Statistics:", df_train.describe(), sep="\n")

First 5 rows of the dataset:
   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0   1          60       RL         65.0     8450   Pave   NaN      Reg   
1   2          20       RL         80.0     9600   Pave   NaN      Reg   
2   3          60       RL         68.0    11250   Pave   NaN      IR1   
3   4          70       RL         60.0     9550   Pave   NaN      IR1   
4   5          60       RL         84.0    14260   Pave   NaN      IR1   

  LandContour Utilities  ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold  \
0         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
1         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      5   
2         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      9   
3         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
4         Lvl    AllPub  ...        0    NaN   NaN         NaN       0     12   

  YrSold  SaleType  SaleCondition  Sale

In [1]:
import pandas as pd
import numpy as np

# Dtypes treated as categorical. Listing 'string' and 'category' next to
# 'object' keeps text columns selected even when pandas does not store them
# as plain object dtype.
CATEGORICAL_DTYPES = ['object', 'string', 'category']


def impute_missing_values(df, numerical_cols, categorical_cols):
    """Return a copy of ``df`` with missing values filled column by column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. It is copied first and never modified.
    numerical_cols : iterable of str
        Columns whose missing entries are replaced by the column median.
    categorical_cols : iterable of str
        Columns whose missing entries are replaced by the most frequent value
        (the first one returned by ``Series.mode()`` when there are ties).

    Returns
    -------
    pandas.DataFrame
        The imputed copy. A column without any observed value has no median
        or mode, so it is left unchanged and a "Skipped" line is printed
        instead of a "Filled" line.
    """
    filled = df.copy()

    for col in numerical_cols:
        median_val = filled[col].median()
        if pd.isna(median_val):
            # Every entry is missing: filling with NaN would be a silent no-op.
            print(f"Skipped numerical column '{col}': no observed values to compute a median.")
            continue
        filled[col] = filled[col].fillna(median_val)
        print(f"Filled numerical column '{col}' with median: {median_val}")

    for col in categorical_cols:
        modes = filled[col].mode()  # Series of the most frequent value(s); empty if all are missing
        if modes.empty:
            # Indexing an empty mode Series would raise, so report and move on.
            print(f"Skipped categorical column '{col}': no observed values to compute a mode.")
            continue
        mode_val = modes.iloc[0]
        filled[col] = filled[col].fillna(mode_val)
        print(f"Filled categorical column '{col}' with mode: '{mode_val}'")

    return filled


try:
    df_train = pd.read_csv('train.csv')
except FileNotFoundError:
    print("Error: train.csv not found. Using a dummy dataframe for demonstration.")
    # Small stand-in frame so the rest of the cell can still be demonstrated.
    df_train = pd.DataFrame({
        'LotFrontage': [65.0, 80.0, 68.0, 60.0, 84.0, np.nan],
        'Alley': [np.nan, np.nan, np.nan, 'Pave', np.nan, 'Grvl'],
        'MasVnrType': ['BrkFace', 'None', 'BrkFace', 'None', 'BrkFace', np.nan],
        'MasVnrArea': [196.0, 0.0, 162.0, 0.0, 350.0, np.nan],
        'BsmtQual': ['Gd', 'Gd', 'Gd', 'TA', 'Gd', np.nan],
        'Electrical': ['SBrkr', 'SBrkr', 'SBrkr', 'SBrkr', 'SBrkr', np.nan],
    })

# Columns that contain at least one missing value, split by how they are imputed.
missing_cols = df_train.columns[df_train.isnull().any()].tolist()
print(f"--- Found {len(missing_cols)} columns with missing values ---")

numerical_missing_cols = df_train[missing_cols].select_dtypes(include=np.number).columns
categorical_missing_cols = df_train[missing_cols].select_dtypes(include=CATEGORICAL_DTYPES).columns

print("\n--- Imputing Missing Values ---")

# NOTE(review): mode imputation treats every NaN as "value unknown". For
# columns such as 'Alley', NaN may instead mean "feature absent" -- check the
# dataset's data description and consider an explicit category for those.
df_train = impute_missing_values(df_train, numerical_missing_cols, categorical_missing_cols)

# Final check: anything still missing (skipped or unsupported dtype) is reported.
print("\n" + "="*40 + "\n")
remaining_missing = df_train.isnull().sum().sum()
if remaining_missing == 0:
    print("✅ Success! All missing values have been handled.")
else:
    print(f" Warning! There are still {remaining_missing} missing values left.")

--- Found 19 columns with missing values ---

--- Imputing Missing Values ---
Filled numerical column 'LotFrontage' with median: 69.0
Filled numerical column 'MasVnrArea' with median: 0.0
Filled numerical column 'GarageYrBlt' with median: 1980.0
Filled categorical column 'Alley' with mode: 'Grvl'
Filled categorical column 'MasVnrType' with mode: 'BrkFace'
Filled categorical column 'BsmtQual' with mode: 'TA'
Filled categorical column 'BsmtCond' with mode: 'TA'
Filled categorical column 'BsmtExposure' with mode: 'No'
Filled categorical column 'BsmtFinType1' with mode: 'Unf'
Filled categorical column 'BsmtFinType2' with mode: 'Unf'
Filled categorical column 'Electrical' with mode: 'SBrkr'
Filled categorical column 'FireplaceQu' with mode: 'Gd'
Filled categorical column 'GarageType' with mode: 'Attchd'
Filled categorical column 'GarageFinish' with mode: 'Unf'
Filled categorical column 'GarageQual' with mode: 'TA'
Filled categorical column 'GarageCond' with mode: 'TA'
Filled categorical col

In [2]:
import pandas as pd

# Small hand-made frame used to illustrate one-hot encoding.
# NOTE(review): this rebinds `df_train`; if a real dataset was loaded under
# that name earlier in the session it is replaced by this toy frame.
df_train = pd.DataFrame({
    'Neighborhood': ['CollgCr', 'Veenker', 'CollgCr', 'Crawfor'],
    'HouseStyle': ['2Story', '1Story', '2Story', '2Story'],
    'SalePrice': [208500, 181500, 223500, 140000]
})
print("--- Before One-Hot Encoding ---")
print("Shape of DataFrame:", df_train.shape)
print(df_train.head())

# Select every text-like column, not only plain `object` ones: text may be
# stored with a dedicated string dtype or as `category`, and an empty
# selection would make get_dummies encode nothing without any error.
categorical_cols = df_train.select_dtypes(include=['object', 'string', 'category']).columns

# drop_first=True removes the first level of each encoded column, so k
# categories become k-1 indicator columns.
df_train_encoded = pd.get_dummies(df_train, columns=categorical_cols, drop_first=True)
print("\n--- After One-Hot Encoding ---")
print("Shape of DataFrame:", df_train_encoded.shape)
print(df_train_encoded.head())

--- Before One-Hot Encoding ---
Shape of DataFrame: (4, 3)
  Neighborhood HouseStyle  SalePrice
0      CollgCr     2Story     208500
1      Veenker     1Story     181500
2      CollgCr     2Story     223500
3      Crawfor     2Story     140000

--- After One-Hot Encoding ---
Shape of DataFrame: (4, 4)
   SalePrice  Neighborhood_Crawfor  Neighborhood_Veenker  HouseStyle_2Story
0     208500                 False                 False               True
1     181500                 False                  True              False
2     223500                 False                 False               True
3     140000                  True                 False               True


In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

num_samples = 100

# Seeded generator so the synthetic data -- and therefore the split, the RMSE
# and the sample table below -- are identical on every run.
rng = np.random.default_rng(42)

# Synthetic stand-in data: every column is drawn independently, so the
# features carry no real signal about 'SalePrice'.
df_train_encoded = pd.DataFrame({
    'OverallQual': rng.integers(1, 11, num_samples),
    'GrLivArea': rng.normal(1500, 400, num_samples),
    'TotalBsmtSF': rng.normal(1000, 300, num_samples),
    'HouseAge': rng.integers(0, 100, num_samples),
    'TotalBath': rng.uniform(1, 4, num_samples),
    'Neighborhood_OldTown': rng.integers(0, 2, num_samples),
    'Neighborhood_CollgCr': rng.integers(0, 2, num_samples),
    'SalePrice': rng.normal(180000, 50000, num_samples)
})

# log1p is undefined for values <= -1; fail here with a clear message rather
# than passing NaN targets on to the model.
assert (df_train_encoded['SalePrice'] > 0).all(), "SalePrice must be positive before the log transform"

# Features are every column except the target.
X = df_train_encoded.drop('SalePrice', axis=1)

# The model is trained on log(1 + price); predictions are mapped back below.
y = np.log1p(df_train_encoded['SalePrice'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training set shape: {X_train.shape}")
print(f"Testing set shape: {X_test.shape}")

print("\nTraining the Random Forest model...")
rf_model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)

# Fit the model to the training data
rf_model.fit(X_train, y_train)
print("Model training complete.")

y_pred_log = rf_model.predict(X_test)

# Undo the log1p transform so errors are expressed in the original price units.
y_pred = np.expm1(y_pred_log)
y_test_orig = np.expm1(y_test)

rmse = np.sqrt(mean_squared_error(y_test_orig, y_pred))

print("\n--- Model Evaluation ---")
print(f"Root Mean Squared Error (RMSE): ${rmse:,.2f}")

# Side-by-side table of actual vs. predicted prices; error = actual - predicted.
predictions_df = pd.DataFrame({'Actual Price': y_test_orig, 'Predicted Price': y_pred})
predictions_df['Prediction Error'] = predictions_df['Actual Price'] - predictions_df['Predicted Price']
print("\nSample Predictions:")
print(predictions_df.head())


Training set shape: (80, 7)
Testing set shape: (20, 7)

Training the Random Forest model...
Model training complete.

--- Model Evaluation ---
Root Mean Squared Error (RMSE): $43,650.23

Sample Predictions:
     Actual Price  Predicted Price  Prediction Error
83  181171.000082    167578.176084      13592.823998
53  122827.781061    139111.921062     -16284.140001
70  155119.387598     96685.639188      58433.748411
45  186026.278567    174918.641925      11107.636642
44   94116.884179    152049.408142     -57932.523964
