In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score



In [18]:
train_data = pd.read_csv(r"C:\Users\mubee\Downloads\Gen ai task 2\train.csv")  
test_data = pd.read_csv(r"C:\Users\mubee\Downloads\Gen ai task 2\test.csv")    

# Display the first five rows of the training data
print(train_data.head())

   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0   1          60       RL         65.0     8450   Pave   NaN      Reg   
1   2          20       RL         80.0     9600   Pave   NaN      Reg   
2   3          60       RL         68.0    11250   Pave   NaN      IR1   
3   4          70       RL         60.0     9550   Pave   NaN      IR1   
4   5          60       RL         84.0    14260   Pave   NaN      IR1   

  LandContour Utilities  ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold  \
0         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
1         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      5   
2         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      9   
3         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
4         Lvl    AllPub  ...        0    NaN   NaN         NaN       0     12   

  YrSold  SaleType  SaleCondition  SalePrice  
0   2008        WD   

In [19]:
# Check for missing values
print(train_data.isnull().sum())
print(test_data.isnull().sum())


Id                 0
MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
                ... 
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
SalePrice          0
Length: 81, dtype: int64
Id                 0
MSSubClass         0
MSZoning           4
LotFrontage      227
LotArea            0
                ... 
MiscVal            0
MoSold             0
YrSold             0
SaleType           1
SaleCondition      0
Length: 80, dtype: int64


In [20]:
#data preprocessing: handling missing values
columns_with_na_train_dropped = train_data.dropna(axis=1)
columns_with_na_test_dropped= test_data.dropna(axis=1)
columns_with_na_train_dropped.head()
columns_with_na_test_dropped.head()

Unnamed: 0,Id,MSSubClass,LotArea,Street,LotShape,LandContour,LotConfig,LandSlope,Neighborhood,Condition1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleCondition
0,1461,20,11622,Pave,Reg,Lvl,Inside,Gtl,NAmes,Feedr,...,140,0,0,0,120,0,0,6,2010,Normal
1,1462,20,14267,Pave,IR1,Lvl,Corner,Gtl,NAmes,Norm,...,393,36,0,0,0,0,12500,6,2010,Normal
2,1463,60,13830,Pave,IR1,Lvl,Inside,Gtl,Gilbert,Norm,...,212,34,0,0,0,0,0,3,2010,Normal
3,1464,60,9978,Pave,IR1,Lvl,Inside,Gtl,Gilbert,Norm,...,360,36,0,0,0,0,0,6,2010,Normal
4,1465,120,5005,Pave,IR1,HLS,Inside,Gtl,StoneBr,Norm,...,0,82,0,0,144,0,0,1,2010,Normal


In [21]:
# encode categorical variables
train_data = pd.get_dummies(train_data, drop_first=True)
test_data = pd.get_dummies(test_data, drop_first=True)

# Align train and test data 
train_data, test_data = train_data.align(test_data, join='left', axis=1, fill_value=0)


In [22]:
# Display the columns of the train_data DataFrame
print(train_data.columns)


X = train_data.drop('SalePrice', axis=1)  # Adjust to the actual target column name
y = train_data['SalePrice']  # Use the actual target column name here


Index(['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual',
       'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1',
       ...
       'SaleType_ConLI', 'SaleType_ConLw', 'SaleType_New', 'SaleType_Oth',
       'SaleType_WD', 'SaleCondition_AdjLand', 'SaleCondition_Alloca',
       'SaleCondition_Family', 'SaleCondition_Normal',
       'SaleCondition_Partial'],
      dtype='object', length=246)


In [23]:
# Splitting the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [24]:
from sklearn.impute import SimpleImputer
import pandas as pd

# Create an imputer for numerical data
imputer = SimpleImputer(strategy='mean') 

# Fit the imputer on the training data and transform both training and validation data
X_train_imputed = imputer.fit_transform(X_train)
X_val_imputed = imputer.transform(X_val)


X_train_imputed = pd.DataFrame(X_train_imputed, columns=X_train.columns)
X_val_imputed = pd.DataFrame(X_val_imputed, columns=X_val.columns)

# Initialize the models 
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(),
    'Random Forest': RandomForestRegressor(),
    'Support Vector Regressor': SVR()
}

# Check if imputation worked
print(X_train_imputed.isnull().sum().sum())  # Should print 0
print(X_val_imputed.isnull().sum().sum())    # Should print 0


# Train and evaluate each model
for name, model in models.items():
    model.fit(X_train_imputed, y_train)  # Or use the dropped version if applicable
    y_pred = model.predict(X_val_imputed)  # Or use the dropped version if applicable

    # Evaluate the model
    mse = mean_squared_error(y_val, y_pred)
    r2 = r2_score(y_val, y_pred)
    print(f"{name}:\nMean Squared Error: {mse:.2f}, R^2 Score: {r2:.2f}\n")


0
0
Linear Regression:
Mean Squared Error: 2429160460.80, R^2 Score: 0.68

Decision Tree:
Mean Squared Error: 1772419612.90, R^2 Score: 0.77

Random Forest:
Mean Squared Error: 849467529.54, R^2 Score: 0.89

Support Vector Regressor:
Mean Squared Error: 7859366571.61, R^2 Score: -0.02

