# Importing all required libraries

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error

# Importing the dataset

In [2]:
# Load the training data from the working directory.
df = pd.read_csv(
    'train.csv',
)


# Understanding the Dataframe

In [3]:
# Print each column's dtype and non-null count to see where values are missing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

# Data Cleaning
* Removing columns that have more than 50% null values

In [4]:
# Columns this notebook discards for having too many null values.
sparse_cols = ['Alley', 'MasVnrType', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature']

# Rebind instead of mutating in place, and skip labels that are already gone,
# so re-running this cell is a no-op rather than a KeyError.
# `columns=` already implies the axis, so `axis=1` is not needed.
df = df.drop(columns=sparse_cols, errors='ignore')

In [5]:
# Re-inspect dtypes and non-null counts after the column drop above.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 75 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   LotShape       1460 non-null   object 
 7   LandContour    1460 non-null   object 
 8   Utilities      1460 non-null   object 
 9   LotConfig      1460 non-null   object 
 10  LandSlope      1460 non-null   object 
 11  Neighborhood   1460 non-null   object 
 12  Condition1     1460 non-null   object 
 13  Condition2     1460 non-null   object 
 14  BldgType       1460 non-null   object 
 15  HouseStyle     1460 non-null   object 
 16  OverallQual    1460 non-null   int64  
 17  OverallCond    1460 non-null   int64  
 18  YearBuil

# Converting the object columns to categorical columns
# Assigning Features and Target

In [6]:
# Find the text (object-dtype) predictors; the target is excluded from the search.
predictors_before_cast = df.drop(columns='SalePrice')
cat_cols = predictors_before_cast.select_dtypes(include=['object']).columns

# Cast those columns to pandas' categorical dtype directly on df.
df[cat_cols] = df[cat_cols].astype('category')

# Build features and target only after the cast so x carries the categorical dtypes.
# NOTE(review): 'Id' is still part of x and is therefore used as a model feature - confirm this is intended.
y = df['SalePrice']
x = df.drop(columns='SalePrice')

# Splitting the data for train and test

In [7]:
# Keep 80% of the rows for training; the fixed seed makes the split repeatable.
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    train_size=0.8,
    random_state=42,
)

# Model creation
### `enable_categorical=True` lets XGBoost use `category`-dtype columns directly, without LabelEncoder or OneHotEncoder

In [8]:
# Default-parameter regressor; only categorical support is switched on.
xgb = XGBRegressor(
    enable_categorical=True,
)

# Training the model

In [9]:
# Fit on the training split; the fitted estimator is the cell's displayed value.
xgb.fit(X=xtrain, y=ytrain)

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,True


# Checking the model score

In [10]:
# Score the model on both splits and display the pair as (train, test).
train_score = xgb.score(xtrain, ytrain)
test_score = xgb.score(xtest, ytest)
(train_score, test_score)

(0.9999234080314636, 0.9014621376991272)

In [11]:
# Mean absolute error of the predictions on the held-out split.
holdout_predictions = xgb.predict(xtest)
mean_absolute_error(ytest, holdout_predictions)

17089.32421875

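### From the R² score (what `score` reports for a regressor) and the mean absolute error we can conclude that the model has trained well and fits the held-out data reasonably; note, however, that the gap between the train score (≈1.00) and the test score (≈0.90) indicates some overfitting
### As the test score is good, we can use the model to predict on new data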

# Importing test data for prediction

In [12]:
# Load the unlabeled data the model will predict on.
test = pd.read_csv(
    'test.csv',
)

# Converting object type columns to categorical

In [13]:
# Text columns of the prediction data that need a categorical dtype.
cat_colst = test.select_dtypes(include=['object']).columns

for col in cat_colst:
    if col in xtrain.columns and isinstance(xtrain[col].dtype, pd.CategoricalDtype):
        # Reuse the training column's CategoricalDtype so each label keeps the
        # same integer code the model was fitted with. A bare astype('category')
        # would build the categories from test.csv alone, and the codes shift
        # whenever a label is present in only one of the two files.
        # Labels unseen during training become NaN (missing).
        test[col] = test[col].astype(xtrain[col].dtype)
    else:
        # Column is not a categorical training feature; keep the plain cast.
        test[col] = test[col].astype('category')

### Removing the columns we removed from the train data on the basis of null values

In [14]:
# Same columns that were removed from the training frame.
sparse_test_cols = ['Alley', 'Fence', 'FireplaceQu', 'MasVnrType', 'MiscFeature', 'PoolQC']

# Rebind instead of mutating in place, and skip labels that are already gone,
# so re-running this cell is a no-op rather than a KeyError.
# `columns=` already implies the axis, so `axis=1` is not needed.
test = test.drop(columns=sparse_test_cols, errors='ignore')

# Creating a new SalesPrice column to store the predicted values for the test data

In [15]:
# Predict from exactly the columns the model was trained on, in training order.
# Passing the whole frame breaks on a re-run, because by then `test` also
# holds the 'SalesPrice' column written below.
test['SalesPrice'] = xgb.predict(test[xtrain.columns])

In [16]:
# Keep the identifier and the prediction, exposing the prediction under the
# target's real name ('SalePrice') in the exported file.
res = test[['Id', 'SalesPrice']].rename(columns={'SalesPrice': 'SalePrice'})

# index=False stops the RangeIndex being written as an unnamed extra column.
res.to_csv('predicted.csv', index=False)

# Converting Id and SalePrice to a dataframe and converting that to a csv file