# Import important libraries

In [None]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Load the training data

In [None]:
# Read the labelled training set from the Colab working directory.
train_csv_path = '/content/train.csv'
trainData = pd.read_csv(train_csv_path)

# Preview the first five rows as the cell output.
trainData.head(n=5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


# Preprocess the data

# Separate features and target value

In [None]:
# Feature matrix: the training frame minus four excluded columns and minus the
# target. 'SalePrice' must not remain among the features -- it is the value
# being predicted, so leaving it in would let the model read the answer.
# The name is upper-case `X` because every later cell refers to `X`.
X = trainData.drop(columns=['GrLivArea', 'BedroomAbvGr', 'FullBath', 'HalfBath', 'SalePrice'])

# Target vector: the sale price of each training row.
y = trainData['SalePrice']

# Lower-case alias kept so any code that still refers to `x` keeps working.
x = X

# Identify numerical and categorical columns

In [None]:
# Partition the feature columns by dtype: numeric columns go to the numeric
# pipeline, string (object) columns go to the categorical pipeline.
num_features = list(X.select_dtypes(include=[np.number]).columns)
cat_features = list(X.select_dtypes(include=['object']).columns)

# Define preprocessing pipelines

In [None]:
# Numeric columns: fill missing values with the column median, then standardise.
num_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
num_pipeline = Pipeline(steps=num_steps)

# Categorical columns: fill missing values with the most frequent label, then
# one-hot encode. handle_unknown='ignore' makes the encoder tolerate labels at
# transform time that it did not see during fit instead of raising.
cat_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
cat_pipeline = Pipeline(steps=cat_steps)

# Combine preprocessing pipelines

In [None]:
# Route each column group through its own pipeline: the numeric columns through
# num_pipeline and the categorical columns through cat_pipeline.
column_routes = [
    ('num', num_pipeline, num_features),
    ('cat', cat_pipeline, cat_features),
]
preprocessor = ColumnTransformer(transformers=column_routes)

# Apply the preprocessing steps to the dataset

In [None]:
# Split the data into training and validation sets BEFORE fitting the
# preprocessor. Fitting on all of X first would let the validation rows
# influence the imputation values, scaling statistics and one-hot categories,
# which makes the validation score optimistic.
X_train_raw, X_val_raw, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the preprocessing from the training rows only, then apply it unchanged
# to the validation rows.
X_train = preprocessor.fit_transform(X_train_raw)
X_val = preprocessor.transform(X_val_raw)

# The complete feature matrix in the same encoding, kept under its original name.
X_preprocessed = preprocessor.transform(X)

# Train the AI model (linear regression)

In [None]:
# LinearRegression, mean_squared_error and r2_score are imported in the import
# cell at the top of the notebook, together with every other dependency.

# Linear regression baseline fitted on the training split.
model = LinearRegression()
model.fit(X_train, y_train)

# Predictions on both splits; the evaluation cell scores these.
y_pred_train = model.predict(X_train)
y_pred_val = model.predict(X_val)

# Evaluate the model

In [None]:
# Mean squared error on each split (stored for inspection, not printed).
mse_val = mean_squared_error(y_true=y_val, y_pred=y_pred_val)
mse_train = mean_squared_error(y_true=y_train, y_pred=y_pred_train)

# R^2 score on the validation rows; this is the one number the cell prints.
r2 = r2_score(y_true=y_val, y_pred=y_pred_val)
print(r2)

0.8861074935016936


# Test the AI model

# Load the input data from the file

In [None]:
# Read the test set from the Colab working directory.
test_csv_path = '/content/test.csv'
testData = pd.read_csv(test_csv_path)

# Preview the first five rows as the cell output.
testData.head(n=5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [None]:
# Test-set feature frame: the same four columns are removed as for training.
# A 'SalePrice' column is removed as well when present -- the prediction cell
# writes one into testData, and re-running this cell afterwards must not turn
# those predictions into an input feature.
z = (
    testData
    .drop(columns=['GrLivArea', 'BedroomAbvGr', 'FullBath', 'HalfBath'])
    .drop(columns=['SalePrice'], errors='ignore')
)

# Column groups are taken from the test frame, so the transformer only ever
# asks for columns that the test file actually has.
num_features = z.select_dtypes(include=[np.number]).columns.tolist()
cat_features = z.select_dtypes(include=['object']).columns.tolist()

# New transformer over those columns. It reuses num_pipeline / cat_pipeline from
# the "Define preprocessing pipelines" cell instead of re-declaring identical
# copies; ColumnTransformer fits clones, so the shared objects stay unfitted.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, num_features),
        ('cat', cat_pipeline, cat_features)
    ])

# Learn imputation values, scaling statistics and one-hot categories from the
# TRAINING rows and apply them unchanged to the test rows. Fitting on `z` itself
# would encode the test set with its own statistics and categories, producing
# columns that do not line up with what the model is trained on.
preprocessor.fit(trainData[num_features + cat_features])
z_preprocessed = preprocessor.transform(z)

# Make predictions using the trained model

In [None]:
# Diagnostic: for every categorical column, record the labels that occur in only
# one of the two files. Labels seen only in the test file are unknown to the
# encoder once it is fitted on the training rows, so they are worth inspecting.
# Missing values are left out of the comparison.
category_mismatches = {}
for col in cat_features:
    train_unique = set(trainData[col].dropna().unique())
    test_unique = set(testData[col].dropna().unique())

    diff_train = train_unique - test_unique
    diff_test = test_unique - train_unique

    if diff_train or diff_test:
        category_mismatches[col] = {'train_only': diff_train, 'test_only': diff_test}

print(f"{len(category_mismatches)} of {len(cat_features)} categorical columns "
      "have labels that appear in only one of the two files")

# Fit the preprocessor on the training rows, restricted to exactly the columns
# it was configured with, then encode both frames with that single fit.
feature_cols = num_features + cat_features
train_features = trainData[feature_cols]
preprocessor.fit(train_features)

z_preprocessed = preprocessor.transform(z)
train_data_preprocessed = preprocessor.transform(train_features)

# Refit the model on every training row (not just the earlier training split)
# using the encoding above, so its coefficients match z_preprocessed's columns.
model.fit(train_data_preprocessed, trainData['SalePrice'])

# Store the predictions next to the test rows they belong to.
testData['SalePrice'] = model.predict(z_preprocessed)

# Save the predictions into a new file

In [None]:
# Write the test frame, including the SalePrice column added by the prediction
# cell, to an Excel workbook without the DataFrame index.
output_path = '/content/output_file.xlsx'
testData.to_excel(output_path, index=False)