In [16]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder,LabelEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
import joblib



In [17]:

# Step 1: Data Preparation
data = pd.read_excel("OlxDanedowgrania_2.xlsx")  # Replace "your_dataset.csv" with the actual file name


In [18]:
# Remove rows with string characters in 'floor', 'rooms', or 'area' columns

data.shape
data = data[~data['floor'].apply(lambda x: isinstance(x, str))]
data = data[~data['rooms'].apply(lambda x: isinstance(x, str))]
data = data[~data['area'].apply(lambda x: isinstance(x, str))]
data = data[~data['price'].apply(lambda x: isinstance(x, str))]

# #Reset the row index
data = data.reset_index(drop=True)
data.shape





(62635, 20)

In [19]:
data= data.drop(['Unnamed: 16',  'Unnamed: 17' , 'Unnamed: 18', 'Unnamed: 19' ], axis=1)
data=data.drop([ 'price_per_meter','offer_title','month' , 'year' ,'population'   ,'longitude','latitude'],axis=1)

In [20]:
data=data.drop_duplicates()
data=data.dropna()



In [21]:
#Pre Processing
data['rooms'] = data['rooms'].astype(int)
data['floor'] = data['floor'].astype(int)
data['price'] = data['price'].astype(int)
data['area'] = data['area'].astype(float)


data.head()

Unnamed: 0,price,offer_type,floor,area,rooms,offer_type_of_building,market,city_name,voivodeship
0,240000,Private,1,27.0,1,Housing Block,primary,Bolesławiec,Lower Silesia
1,250000,Private,1,35.0,1,Housing Block,primary,Jelcz-Laskowice,Lower Silesia
2,259000,Estate Agency,2,25.0,1,Housing Block,primary,Wrocław,Lower Silesia
3,269000,Private,3,26.18,1,Apartment Building,primary,Wrocław,Lower Silesia
4,258000,Estate Agency,3,26.0,1,Housing Block,primary,Wrocław,Lower Silesia


In [22]:
# Define a function to calculate IQR and detect outliers
def detect_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    upper_bound = Q3 + 1.5 * IQR
    lower_bound = Q1 - 1.5 * IQR
    outliers = df[(df[column] > upper_bound) | (df[column] < lower_bound)]
    return outliers

In [23]:
# Detect outliers in 'mileage'
price_outliers = detect_outliers(data, 'price')
print(f"Number of outliers in 'price': {len(price_outliers)}")

Number of outliers in 'price': 2787


In [24]:
data.isnull().sum()

price                     0
offer_type                0
floor                     0
area                      0
rooms                     0
offer_type_of_building    0
market                    0
city_name                 0
voivodeship               0
dtype: int64

In [26]:
import seaborn as sns
import matplotlib as plt
correlation = data.corr()['price'].sort_values(ascending = False).to_frame().head(10)

cmap = sns.light_palette("cyan", as_cmap = True)

s = correlation.style.background_gradient(cmap = cmap)

ValueError: could not convert string to float: 'Private'

In [None]:
# Remove outliers from the dataset
data = data[~data.index.isin(price_outliers.index)]

In [None]:


# Normalize numerical columns
numerical_cols = ['rooms', 'floor','area']
scaler = StandardScaler()
data[numerical_cols] = scaler.fit_transform(data[numerical_cols])
joblib.dump(scaler, "houses_stdscaler.joblib")


['houses_stdscaler.joblib']

In [None]:
# Convert categorical columns to string type
categorical_cols = ['offer_type', 'offer_type_of_building', 'city_name', 'voivodeship','market']

# Step 3: Feature Selection and Encoding
X = data[['floor', 'area', 'rooms']+categorical_cols ]
y = data['price']

# Perform one-hot encoding for categorical variables except 'market'
encoder = OneHotEncoder(sparse=False)
X_encoded = pd.DataFrame(encoder.fit_transform(X[categorical_cols]))
X_encoded.columns = encoder.get_feature_names_out(categorical_cols)

# Reset the indices of X and X_encoded
X = X.reset_index(drop=True)
X_encoded = X_encoded.reset_index(drop=True)

# Concatenate X and X_encoded
X_encoded = pd.concat([X.drop(categorical_cols, axis=1), X_encoded], axis=1)

print(X_encoded)
# Save the encoder
joblib.dump(encoder, "houses_encoder.joblib")


# Step 5: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)




          floor      area     rooms  offer_type_Estate Agency  \
0     -0.650437 -0.020178 -1.823066                       0.0   
1     -0.650437 -0.017477 -1.823066                       0.0   
2     -0.182783 -0.020854 -1.823066                       1.0   
3      0.284870 -0.020455 -1.823066                       0.0   
4      0.284870 -0.020516 -1.823066                       1.0   
...         ...       ...       ...                       ...   
55983 -0.650437 -0.010048  1.891986                       0.0   
55984  0.752524 -0.006333  1.891986                       0.0   
55985 -1.118090  0.011902  1.891986                       0.0   
55986  0.752524 -0.002281  1.891986                       0.0   
55987  0.284870  0.006128  1.891986                       0.0   

       offer_type_Private  offer_type_of_building_Apartment Building  \
0                     1.0                                        0.0   
1                     1.0                                        0.0   
2  

In [None]:
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
import numpy as np
# Step 6: Model Selection
model = GradientBoostingRegressor(random_state=42)

# Step 7: Define the hyperparameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.5],
    'max_depth': [3, 4, 5]
}

# Step 7: Define the hyperparameter distributions
param_distributions = {
    'n_estimators': np.arange(100, 501, 100),
    'learning_rate': [0.01, 0.1, 0.5],
    'max_depth': [3, 4, 5]
}

# Step 8: Perform random search
random_search = RandomizedSearchCV(model, param_distributions, cv=5, n_iter=10, random_state=42)
random_search.fit(X_train, y_train)

# Step 9: Retrieve the best model
best_model = random_search.best_estimator_

# Step 10: Model Evaluation
y_pred = best_model.predict(X_test)


print("Best Parameters:", random_search.best_params_)
print("Testing Score:", best_model.score(X_test, y_test))

# Calculate RMSE
rmse = mean_squared_error(y_test, y_pred, squared=False)
print("Root Mean Squared Error:", rmse)

# Calculate MAE
mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error:", mae)

Best Parameters: {'n_estimators': 400, 'max_depth': 5, 'learning_rate': 0.5}
Testing Score: 0.7084844874571001
Root Mean Squared Error: 76234.81954410978
Mean Absolute Error: 54464.53044819664


In [None]:
# Save the encoder
joblib.dump(best_model, "houses_gb_model.joblib")
# model = GradientBoostingRegressor(learning_rate= 0.5, max_depth= 5, n_estimators= 300,random_state=42)
# model.fit(X_train, y_train)

# y_pred = model.predict(X_test)

# # Calculate RMSE
# rmse = mean_squared_error(y_test, y_pred, squared=False)
# print("Root Mean Squared Error:", rmse)

# # Calculate MAE
# mae = mean_absolute_error(y_test, y_pred)
# print("Mean Absolute Error:", mae)


['houses_gb_model.joblib']

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder,LabelEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
import joblib

#Prediction on a sample data object
# Convert categorical columns to string type
categorical_cols = ['offer_type', 'offer_type_of_building', 'city_name', 'voivodeship','market']

#Using inference on given data
new_data = pd.DataFrame({
    'offer_type':['Private'], 
    'offer_type_of_building':['Housing Block'],
    'city_name':['Bolesławiec'],
    'voivodeship':['Lower Silesia'],
    'market':['primary'],
    'floor':[1],
    'area':[27], 
    'rooms':[1]
})

scaler = joblib.load('houses_stdscaler.joblib')
new_data_scaled = scaler.fit_transform(new_data[['rooms', 'floor','area']]) # Apply to numerical columns
new_data_scaled = pd.DataFrame(new_data_scaled, columns=['floor', 'area', 'rooms'])

df1=pd.get_dummies(new_data, columns=categorical_cols)
# print(df1)
# Load the saved OneHotEncoder
transformer=joblib.load("houses_encoder.joblib")

# Apply the encoder to the new data
new_data_encoded = pd.DataFrame(encoder.transform(new_data[categorical_cols]))
new_data_encoded = pd.DataFrame(new_data_encoded, columns=encoder.get_feature_names_out(categorical_cols))


# Replace values in new_data_encoded with values from df1
new_data_encoded.update(df1)


# Concatenate the encoded categorical variables with the scaled numerical columns
final_data = pd.concat([new_data_scaled, new_data_encoded], axis=1)


# Replace NaN values with 0
final_data = final_data.fillna(0)

model=joblib.load('houses_gb_model.joblib')

#The predicition
predictions=model.predict(final_data)

print(predictions)

NameError: name 'encoder' is not defined