## IMPORT LIBRARIES

In [2]:
# Important Libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import joblib
import timeit

# Encoders
from category_encoders import BinaryEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Scalers
from sklearn.preprocessing import RobustScaler


# Models 
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import VotingRegressor

# Traning 
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate 

## READ DATA

In [3]:
df=pd.read_csv('traning.csv')
df

Unnamed: 0,CondoProject,District,Stories,Year_Built,Nr_of_rms,Fin_sqft,Units,Bdrms,Fbath,Hbath,Lotsize,Sale_price,Usage,Category
0,0,6,2.0,1880,0,1840.0,1,0,0,0,12750.0,15900,Commercial,Institutional
1,0,3,2.0,1876,0,6377.0,1,5,3,1,11840.0,850000,Commercial,mansion
2,0,10,1.0,1954,0,5022.0,1,0,0,0,9700.0,119000,Commercial,Institutional
3,0,1,2.0,1955,0,6420.0,1,0,0,0,8792.0,210000,Commercial,Retail
4,0,15,2.0,1909,0,5956.0,1,0,0,0,4840.0,48500,Commercial,Retail
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34195,0,6,1.0,1949,0,4800.0,1,0,0,0,9000.0,165000,Commercial,Storage
34196,0,1,0.0,1953,0,7800.0,1,0,0,0,7800.0,3900,Commercial,Land
34197,0,13,0.0,1953,0,152286.0,1,0,0,0,152286.0,100000,Commercial,Land
34198,0,5,0.0,1953,0,8925.0,1,0,0,0,8925.0,362400,Residential,Land


## MODELS

In [12]:
#splitting x , y data
x=df.drop('Sale_price',axis=1)
y=df['Sale_price']

#Encoding categorical columns
Encoder = ColumnTransformer(
    transformers=[
        ('Encoder_onehot', OneHotEncoder(drop='first'), ['Usage']),
        ('Encoder_bin', BinaryEncoder(), ['Category'])
    ],
    remainder='passthrough'
)

#adding models to test
models=[]
models.append(('LR',LinearRegression()))
models.append(('KNN',KNeighborsRegressor()))
models.append(('DECT',DecisionTreeRegressor()))
models.append(('RF',RandomForestRegressor()))
models.append(("XG" , XGBRegressor()))

# chossing scaler
scalers=("RobustScaler",RobustScaler())

# model tester loop
for model in models:   
    start= timeit.default_timer()
    steps= list()
    steps.append(("encoder",Encoder))
    steps.append(scalers)
    steps.append(model)
    # pipe line and cross_validate 
    pipeline= Pipeline(steps=steps)
    scores= cross_validate(pipeline, x, y, cv= 5, scoring= ["r2", "neg_mean_squared_error", "neg_mean_absolute_error"],
                           return_train_score=True)
    print(model[0])
    print("-"*25)
    print("Train R2:", scores["train_r2"].mean())
    print("Test R2:", scores["test_r2"].mean())
    print("-"*25)
    print("Train MSE:", -scores["train_neg_mean_squared_error"].mean())
    print("Test MSE:", -scores["test_neg_mean_squared_error"].mean())
    print("-"*25)
    print("Train MAE:", -scores["train_neg_mean_absolute_error"].mean())
    print("Test MAE:", -scores["test_neg_mean_absolute_error"].mean())
    print("-"*25)
    stop= timeit.default_timer()
    print("Run Time", stop - start)
    print("="*50)
    
    

LR
-------------------------
Train R2: 0.33907679772719773
Test R2: 0.2817842944786813
-------------------------
Train MSE: 22179529440.286377
Test MSE: 24092708813.148293
-------------------------
Train MAE: 79459.53418284064
Test MAE: 82119.98467191207
-------------------------
Run Time 0.3394392002373934
KNN
-------------------------
Train R2: 0.7846113700188087
Test R2: 0.6538703946899089
-------------------------
Train MSE: 7230935855.103651
Test MSE: 12082981507.86554
-------------------------
Train MAE: 38905.496372807014
Test MAE: 50647.644602339176
-------------------------
Run Time 3.086574399843812
DECT
-------------------------
Train R2: 0.9803440733727516
Test R2: 0.5822757975194348
-------------------------
Train MSE: 666248395.7052855
Test MSE: 15230747660.365042
-------------------------
Train MAE: 6407.6384256833935
Test MAE: 52968.26114976154
-------------------------
Run Time 0.8922688001766801
RF
-------------------------
Train R2: 0.9529284871711795
Test R2: 0.7497

### NOTE : KNN , RF, XGBRegressor is the most effective Algoritmes

## OPTIMIZATION ALGORITHMS

### KNN OPTIMIZER 

In [13]:
#splitting x , y data
x=df.drop('Sale_price',axis=1)
y=df['Sale_price']

#Encoding categorical columns
Encoder = ColumnTransformer(
    transformers=[
        ('Encoder_onehot', OneHotEncoder(drop='first'), ['Usage']),
        ('Encoder_bin', BinaryEncoder(), ['Category'])
    ],
    remainder='passthrough'
)

# lists to store resultes of hyperparameter changing 
K_number=list()
train_score=list()
test_score=list()

# for loop for each hyperparameter to test differance outcomes
for K in range(1,30,2):
    start= timeit.default_timer()
    steps= list()
    steps.append(("encoder",Encoder))
    steps.append(("RobustScaler",RobustScaler()))
    steps.append(('KNN',KNeighborsRegressor(n_neighbors=K)))
    # pipe line and cross_validate 
    pipeline= Pipeline(steps=steps)
    scores= cross_validate(pipeline, x, y, cv= 5, scoring= ["r2", "neg_mean_squared_error", "neg_mean_absolute_error"],
                           return_train_score=True)

    K_number.append(K)
    train_score.append(scores["train_r2"].mean())
    test_score.append(scores["test_r2"].mean())
    print(f'KNN K_number={K}')
    print("-"*25)
    print("Train R2:", scores["train_r2"].mean())
    print("Test R2:", scores["test_r2"].mean())
    print("-"*25)
    print("Train MSE:", -scores["train_neg_mean_squared_error"].mean())
    print("Test MSE:", -scores["test_neg_mean_squared_error"].mean())
    print("-"*25)
    print("Train MAE:", -scores["train_neg_mean_absolute_error"].mean())
    print("Test MAE:", -scores["test_neg_mean_absolute_error"].mean())
    print("-"*25)
    print("Run Time", stop - start)
    print("="*50)
    stop= timeit.default_timer()

KNN K_number=1
-------------------------
Train R2: 0.9603141590789825
Test R2: 0.5630395993001918
-------------------------
Train MSE: 1345231766.6817176
Test MSE: 15929572185.755527
-------------------------
Train MAE: 7464.7700804093565
Test MAE: 56318.17850877193
-------------------------
Run Time -0.009109800215810537
KNN K_number=3
-------------------------
Train R2: 0.8339049446041168
Test R2: 0.6403668102543537
-------------------------
Train MSE: 5569051554.773558
Test MSE: 12700383723.601007
-------------------------
Train MAE: 33855.67040692008
Test MAE: 51475.62666666666
-------------------------
Run Time -8.00006091594696e-07
KNN K_number=5
-------------------------
Train R2: 0.7846113700188087
Test R2: 0.6538703946899089
-------------------------
Train MSE: 7230935855.103651
Test MSE: 12082981507.86554
-------------------------
Train MAE: 38905.496372807014
Test MAE: 50647.644602339176
-------------------------
Run Time -4.00003045797348e-07
KNN K_number=7
----------------

KeyboardInterrupt: 

In [None]:
# graphing K vakue with train_score and test_score 
plt.figure(figsize=(12, 8))
plt.plot(K_number, train_score, label='Train Score', marker='o', color='blue')
plt.plot(K_number, test_score, label='Test Score', marker='o', color='red')
plt.title('KNN: Train vs. Test Accuracy for Different K Values')
plt.xlabel('Number of Neighbors (K)')
plt.ylabel('Accuracy Score')
plt.xticks(K_number)
plt.legend()
plt.grid(True)
plt.show()

In [None]:
## K = 7 is the best K for KNN ALgorithm

### Decision Tree

In [None]:
#splitting x , y data
x=df.drop('Sale_price',axis=1)
y=df['Sale_price']

#Encoding categorical columns
Encoder = ColumnTransformer(
    transformers=[
        ('Encoder_onehot', OneHotEncoder(drop='first'), ['Usage']),
        ('Encoder_bin', BinaryEncoder(), ['Category'])
    ],
    remainder='passthrough'
)

# lists to store resultes of hyperparameter changing 
depth_number=list()
node_number=list()
leaf_number=list()
train_score=list()
test_score=list()

# for loop for each hyperparameter to test differance outcomes
for depth in range(1,100 , 20):
    for leaf in range (1,200 , 20):
        for node in range (2,100,20):
            start= timeit.default_timer()
            steps= list()
            steps.append(("encoder",Encoder))
            steps.append(("RobustScaler",RobustScaler()))
            steps.append(('DECT',DecisionTreeRegressor(
                max_depth=depth,
                min_samples_leaf=leaf,
                max_leaf_nodes=node,
            )))
            # pipe line and cross_validate 
            pipeline= Pipeline(steps=steps)
            scores= cross_validate(pipeline, x, y, cv= 5, scoring= ["r2", "neg_mean_squared_error", "neg_mean_absolute_error"],
                                   return_train_score=True)
            
            node_number.append(node)
            depth_number.append(depth)
            leaf_number.append(leaf)
            train_score.append(scores["train_r2"].mean())
            test_score.append(scores["test_r2"].mean())
            print(f'DECT depth_number={depth}')
            print(f'DECT node_number={leaf}')
            print(f'DECT leaf_number={node}')
            print("-"*25)
            print("Train R2:", scores["train_r2"].mean())
            print("Test R2:", scores["test_r2"].mean())
            print("-"*25)
            print("Run Time", stop - start)
            print("="*50)
            stop= timeit.default_timer()


In [None]:
#convert typerparamters lists to dataframe
results_dict = {
    'depth_number': depth_number,      
    'node_number': node_number,        
    'leaf_number': leaf_number,       
    'train_score': train_score,        
    'test_score': test_score,}
DF=pd.DataFrame(results_dict)
DF

In [None]:
# find the max Test Score and its values
DF[DF['test_score']==DF['test_score'].max()]

In [None]:
## For Decision Tree :
##       max_depth=21,
##       min_samples_leaf=82,
##       max_leaf_nodes=21,
## is the Best Configrations for IT


### Random Forest

In [None]:
#splitting x , y data
x=df.drop('Sale_price',axis=1)
y=df['Sale_price']

#Encoding categorical columns
Encoder = ColumnTransformer(
    transformers=[
        ('Encoder_onehot', OneHotEncoder(drop='first'), ['Usage']),
        ('Encoder_bin', BinaryEncoder(), ['Category'])
    ],
    remainder='passthrough'
)

# lists to store resultes of hyperparameter changing 
n_estimators=list()
max_depth=list()
min_samples_split=list()
min_samples_leaf=list()

train_score=list()
test_score=list()

# for loop for each hyperparameter to test differance outcomes
for n in range(1,100 , 50):
    for depth in range (1,100 , 50):
        for split in range (2,100,50):
            for leaf in range (2,100,50):
                start= timeit.default_timer()
                steps= list()
                steps.append(("encoder",Encoder))
                steps.append(("RobustScaler",RobustScaler()))
                steps.append(('RF',RandomForestRegressor(
                    n_estimators=n,
                    max_depth=depth,
                    min_samples_leaf=leaf,
                    max_leaf_nodes=node,
                )))
                # pipe line and cross_validate 
                pipeline= Pipeline(steps=steps)
                scores= cross_validate(pipeline, x, y, cv= 5, scoring= ["r2", "neg_mean_squared_error", "neg_mean_absolute_error"],
                                       return_train_score=True)
                
                n_estimators.append(n)
                max_depth.append(depth)
                min_samples_split.append(split)
                min_samples_leaf.append(leaf)
                train_score.append(scores["train_r2"].mean())
                test_score.append(scores["test_r2"].mean())
                print(f'RF n_estimators={n}')
                print(f'RF max_depth={depth}')
                print(f'RF min_samples_split={split}')
                print(f'RF min_samples_leaf={leaf}')
                print("-"*25)
                print("Train R2:", scores["train_r2"].mean())
                print("Test R2:", scores["test_r2"].mean())
                print("-"*25)
                print("Run Time", stop - start)
                print("="*50)
                stop= timeit.default_timer()

In [None]:
#convert typerparamters lists to dataframe
results_dict = {
    'n_estimators': n_estimators,     
    'max_depth': max_depth,       
    'min_samples_split': min_samples_split,      
    'min_samples_leaf': min_samples_leaf,        
    'train_score': train_score,
    'test_score': test_score,          
}
DF=pd.DataFrame(results_dict)


In [None]:
# find the max Test Score and its values
DF[DF['test_score']==DF['test_score'].max()]

In [None]:
## For Random Forest :
##       n_estimators=51,
##       max_depth=51,
##       min_samples_split=2,
##       min_samples_leaf=2,
## is the Best Configrations for IT

### XGBRegressor 

In [None]:
#splitting x , y data
x=df.drop('Sale_price',axis=1)
y=df['Sale_price']

#Encoding categorical columns
Encoder = ColumnTransformer(
    transformers=[
        ('Encoder_onehot', OneHotEncoder(drop='first'), ['Usage']),
        ('Encoder_bin', BinaryEncoder(), ['Category'])
    ],
    remainder='passthrough'
)
# lists to store resultes of hyperparameter changing 
n_estimators=list()
learning_rate=[0.01, 0.05,0.1,1]
max_leaves=list()
learning_rate_list=list()
train_score=list()
test_score=list()

# for loop for each hyperparameter to test differance outcomes
for n in range(1,1001 , 200):
    for Lr in learning_rate:
        for leave in range(1,101 , 20):
            start= timeit.default_timer()
            steps= list()
            steps.append(("encoder",Encoder))
            steps.append(("RobustScaler",RobustScaler()))
            steps.append(('XG',XGBRegressor(
                n_estimators=n,
                learning_rate=Lr,
                max_leaves= leave,
            )))
            # pipe line and cross_validate 
            pipeline= Pipeline(steps=steps)
            scores= cross_validate(pipeline, x, y, cv= 5, scoring= ["r2", "neg_mean_squared_error", "neg_mean_absolute_error"],
                                   return_train_score=True)
            
            n_estimators.append(n)
            learning_rate_list.append(Lr)
            max_leaves.append(leave)
            train_score.append(scores["train_r2"].mean())
            test_score.append(scores["test_r2"].mean())
            print(f'XGB n_estimators={n}')
            print(f'XGB learning_rate={Lr}')
            print(f'XGB max_leaves={leave}')
    
            print("-"*25)
            print("Train R2:", scores["train_r2"].mean())
            print("Test R2:", scores["test_r2"].mean())
            print("-"*25)
            print("Run Time", stop - start)
            print("="*50)
            stop= timeit.default_timer()

In [None]:
#convert typerparamters lists to dataframe
results_dict = {
    'n_estimators': n_estimators,      # List of tree depths if you tracked them
    'learning_rate':learning_rate_list,        # List of leaf number  
    'max_leaves':max_leaves,        # List of leaf number  
    'train_score': train_score,        # List of training scores
    'test_score': test_score,          # List of test scores
}
DF=pd.DataFrame(results_dict)


In [None]:
# find the max Test Score and its values
DF[DF['test_score']==DF['test_score'].max()]

In [None]:
## For XGB :
##       n_estimators=801,
##       learning_rate=0.05,
##       max_leaves=81,
## is the Best Configrations for IT

In [None]:
### KNN // test_score : 0.623
### Decision Tree // test_score : 0.625
### Random Forest //  test_score : 0.675
### XGB //test_score : 0.738

## Deploing model

### NOTE : THUS BEST ALGORITHM IS XGB 

### Train Final Model

In [4]:
#splitting x , y data
x=df.drop('Sale_price',axis=1)
y=df['Sale_price']

#Encoding categorical columns
Encoder = ColumnTransformer(
    transformers=[
        ('Encoder_onehot', OneHotEncoder(drop='first'), ['Usage']),
        ('Encoder_bin', BinaryEncoder(), ['Category'])],
    remainder='passthrough')


steps= list()
steps.append(("encoder",Encoder))
steps.append(("RobustScaler",RobustScaler()))
steps.append(('XG',XGBRegressor(
    n_estimators=801,
    learning_rate=0.05,
    max_leaves= 81,
)))

# pipe line and cross_validate 
pipeline= Pipeline(steps=steps)

pipeline.fit(x,y)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [31]:
for i in df.columns:
    joblib.dump(df[i].unique(),f'{i}.List')

In [5]:
joblib.dump(Encoder,'pipeline_Pre.h5')

['pipeline_Pre.h5']