In [None]:
import pandas as pd
import numpy as np
import pickle

from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


In [None]:
# Load the dataset from the working directory; later cells read and mutate `df`.
df = pd.read_csv("data.csv")
# Preview the first rows to sanity-check the parse.
df.head()

In [None]:
# Preview the last rows of the frame.
df.tail()

In [None]:
# Unpack the shape once, then report it in both the sentence and the raw tuple form
n_rows, n_cols = df.shape
print(f"Number of rows {n_rows}, Number of columns {n_cols}")
print((n_rows, n_cols))

In [None]:
# List every column name, one per line, under a banner
print("-- Attributes in Data --")
for column_name in df.columns:
    print(column_name)

In [None]:
# Non-null value count per column.
df.count()

In [None]:
# Distinct values present in the `city` column.
df['city'].unique()

In [None]:
# Count distinct values per column first, then print them under the banner
unique_counts = df.nunique()
print("-- Number of Unique Values in Data --")
print(unique_counts)

In [None]:
# Tally missing entries per column before any imputation
null_counts = df.isna().sum()
print("-- Number of Null Values in Data --")
print(null_counts)

In [None]:

# Summary statistics; the banner is printed, the table is shown as the cell output.
print("-- Details of Data --")
df.describe()

In [None]:
# Column dtypes and non-null counts for the frame as loaded.
print("-- Insights of Data --")
df.info()

In [None]:
# Per-column null counts. NOTE(review): this repeats the earlier null-count cell and
# `df` has not been modified in between, so the output is the same.
print("-- Number of Null Values in Data --")
print(df.isnull().sum())

In [None]:
def fillNaObjMode(col, frame=None):
    """Fill missing values in the given columns with each column's most frequent value.

    Parameters
    ----------
    col : str or iterable of str
        Column name(s) to impute. A single name may be passed as a plain string.
    frame : pandas.DataFrame, optional
        Frame to modify in place. Defaults to the notebook-level ``df``.

    Notes
    -----
    A column with no non-null values has no mode; it is left unchanged
    instead of raising on the empty ``mode()`` result.
    """
    target = df if frame is None else frame
    # A bare string would otherwise be iterated character by character
    names = [col] if isinstance(col, str) else col
    for name in names:
        modes = target[name].mode()
        # mode() ignores NaN by default, so an all-null column yields an empty result
        if modes.empty:
            continue
        target[name] = target[name].fillna(modes.iloc[0])

# Text columns: impute missing entries with the most frequent value of each column.
columns = ['street', 'city', 'statezip', 'country']
fillNaObjMode(columns)

In [None]:
def fillNaMean(col, frame=None):
    """Fill missing values in the given numeric columns with each column's mean.

    Parameters
    ----------
    col : str or iterable of str
        Column name(s) to impute. A single name may be passed as a plain string.
    frame : pandas.DataFrame, optional
        Frame to modify in place. Defaults to the notebook-level ``df``.
    """
    target = df if frame is None else frame
    # A bare string would otherwise be iterated character by character
    names = [col] if isinstance(col, str) else col
    for name in names:
        # mean() skips NaN by default, so the fill value comes from observed entries only
        target[name] = target[name].fillna(target[name].mean())

# Impute missing entries in these columns with the column mean.
# NOTE(review): `price` is used as the prediction target later in this notebook;
# mean-imputing a target fabricates labels. Consider dropping rows with a
# missing price instead. Confirm intent.
columns = ['price','sqft_living','sqft_lot','sqft_above','sqft_basement']
fillNaMean(columns)

In [None]:
def fillNaMode(col, frame=None):
    """Fill missing values in the given columns with each column's most frequent value.

    Parameters
    ----------
    col : str or iterable of str
        Column name(s) to impute. A single name may be passed as a plain string.
    frame : pandas.DataFrame, optional
        Frame to modify in place. Defaults to the notebook-level ``df``.

    Notes
    -----
    A column with no non-null values has no mode; it is left unchanged
    instead of raising on the empty ``mode()`` result.
    """
    target = df if frame is None else frame
    # A bare string would otherwise be iterated character by character
    names = [col] if isinstance(col, str) else col
    for name in names:
        modes = target[name].mode()
        # mode() ignores NaN by default, so an all-null column yields an empty result
        if modes.empty:
            continue
        target[name] = target[name].fillna(modes.iloc[0])

# Impute missing entries in these columns with the most frequent value of each column.
columns = ['bedrooms','bathrooms','floors','waterfront','view','yr_built']
fillNaMode(columns)

In [None]:
# Remove the columns that are not carried forward as features.
# A single non-inplace drop with errors='ignore' keeps this cell re-runnable:
# the original in-place drops raise KeyError on a second execution because the
# columns are already gone.
df = df.drop(columns=['date', 'street'], errors='ignore')

In [None]:
# Preview the frame after `date` and `street` were removed.
df.head()

In [None]:
# Re-check per-column null counts after the imputation and column-drop cells.
print(df.isnull().sum())

In [None]:
def changetoint64(col, frame=None):
    """Cast the given columns to ``int64`` in place.

    Parameters
    ----------
    col : str or iterable of str
        Column name(s) to cast. A single name may be passed as a plain string.
    frame : pandas.DataFrame, optional
        Frame to modify in place. Defaults to the notebook-level ``df``.

    Notes
    -----
    Casting floats with ``astype('int64')`` discards the fractional part, and
    the cast raises if a column still contains missing values.
    """
    target = df if frame is None else frame
    # A bare string would otherwise be iterated character by character
    names = [col] if isinstance(col, str) else col
    for name in names:
        target[name] = target[name].astype('int64')
        
# Cast these columns to int64.
# NOTE(review): if `bathrooms`, `floors` or `price` hold fractional values,
# this cast discards the fractional part. Confirm that is acceptable.
columns =[
    'price','bedrooms','bathrooms','sqft_living','sqft_lot',
    'floors','waterfront','view','sqft_above','sqft_basement',
    'yr_built']
changetoint64(columns)

In [None]:
# Re-inspect dtypes and non-null counts after the int64 casts.
print("-- Insights of Data --")
df.info()

In [None]:
# Distinct `city` labels that will be label-encoded below.
df['city'].unique()

In [None]:
# Distinct `statezip` labels that will be label-encoded below.
df['statezip'].unique()

In [None]:
# Distinct `country` labels that will be label-encoded below.
df['country'].unique()

In [None]:
# Work on a copy: the encoding cells read labels from `df` and write codes to `df_encoded`.
df_encoded = df.copy()

In [None]:
def encodeCols(cols):
    """Label-encode each named column of ``df`` and store the codes in ``df_encoded``.

    For every column a fresh ``LabelEncoder`` is fitted on that column's
    distinct values in ``df``; the whole column is then transformed and
    written to the same column of ``df_encoded``. The fitted encoders are
    not kept.
    """
    for column_name in cols:
        # Fit on the distinct labels only, then map the full column to integer codes
        encoder = LabelEncoder().fit(df[column_name].unique())
        df_encoded[column_name] = encoder.transform(df[column_name])

# Categorical text columns to convert to integer codes in `df_encoded`.
columns = ['city','statezip','country']
encodeCols(columns)

In [None]:
# Distinct country labels, held as a one-column frame
country = pd.DataFrame({'country': df['country'].unique()})

# Create the named encoder for `country` and fit it on those labels in one step
# (fit returns the encoder itself)
country_label_encoder = LabelEncoder().fit(np.ravel(country))

# Echo the fitted encoder as the cell output
country_label_encoder

In [None]:
# Write the country codes produced by the named encoder.
# NOTE(review): `encodeCols` already encoded `country` from the same `df` column,
# so this is expected to produce the same codes.
df_encoded['country'] = country_label_encoder.transform(df['country']) 

In [None]:
# Inspect dtypes after encoding. City, statezip and country are now integer codes;
# the remaining columns are expected to be numeric as well, so verify in the output.
df_encoded.info()

In [None]:
# Persist the encoded frame next to the notebook, with a header row and without the index
df_encoded.to_csv('encoded-data.csv', header=True, index=False)

In [None]:
# Ordered 80/20 split: with shuffle=False the row order is kept, so the last 20% of rows form the test set.
traindata, testdata = train_test_split(df_encoded, test_size=0.2, shuffle=False)

In [None]:
# First two rows of the training partition.
traindata.head(2)

In [None]:
# First two rows of the test partition.
testdata.head(2)

In [None]:
# Target vector first, then the feature matrix with the target column removed
y = df_encoded['price']
X = df_encoded.drop(columns='price')

In [None]:
# Select the features by name rather than by position: slicing from column 1
# silently assumes `price` is the first column, while the other cells of this
# notebook separate the target by name.
train_x = traindata.drop(columns='price')
train_x.head()

In [None]:
# Select the target by name rather than by position: column 0 is only `price`
# as long as the column order of the source file never changes.
train_y = traindata['price']
train_y.head()

In [None]:
# Select the features by name rather than by position: slicing from column 1
# silently assumes `price` is the first column, while the other cells of this
# notebook separate the target by name.
test_x = testdata.drop(columns='price')
test_x.head()

In [None]:
# Select the target by name rather than by position: column 0 is only `price`
# as long as the column order of the source file never changes.
test_y = testdata['price']
test_y.head()

In [None]:
# Shuffled 80/20 split with a fixed seed so the partition is reproducible
train_data, test_data = train_test_split(
    df_encoded, test_size=0.2, random_state=42, shuffle=True
)

# Separate the `price` target from the feature columns of each partition
train_x, train_y = train_data.drop(columns='price'), train_data['price']
test_x, test_y = test_data.drop(columns='price'), test_data['price']

# Configure the gradient-boosting regressor and fit it in one step
# (fit returns the estimator itself)
model_gbr = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
).fit(train_x, train_y)

# Predict on the held-out partition and score the predictions
predictions = model_gbr.predict(test_x)
mae = mean_absolute_error(test_y, predictions)
mse = mean_squared_error(test_y, predictions)
r2 = r2_score(test_y, predictions)

# Report all three metrics rounded to two decimals
print("Gradient Boosting Regressor Model Performance:")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"R-squared Score: {r2:.2f}")
