# Model training

In [1]:
import pandas as pd
import geopandas as gpd
from pysal.model import spreg
import pickle
import string
import re
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer



## Wczytanie danych

In [2]:
# Load the pickled lamination offers. The context manager closes the file
# handle deterministically instead of leaving it open until garbage collection.
# NOTE(review): pickle can execute arbitrary code while loading; only open
# files produced by a trusted step of this project.
with open('data.pkl', 'rb') as pickle_file:
    df_laminacja = pickle.load(pickle_file)
df_laminacja.head(2)

Unnamed: 0,Description,Duration,Location,Price,geometry
0,Laminacja rzęs /farbka ANASTASIA,1g,"Aleje Jerozolimskie 107, 02-011, Warszawa",168.0,POINT (20.9949283 52.225992)
1,Laminacja rzęs /farbka Liza,1g,"Aleje Jerozolimskie 107, 02-011, Warszawa",160.0,POINT (20.9949283 52.225992)


In [3]:
# Load the pickled hybrid-manicure offers. The context manager closes the file
# handle deterministically instead of leaving it open until garbage collection.
# NOTE(review): pickle can execute arbitrary code while loading; only open
# files produced by a trusted step of this project.
with open('data_hybrid.pkl', 'rb') as pickle_file:
    df_hybrydy = pickle.load(pickle_file)
df_hybrydy.head(2)

Unnamed: 0,Description,Duration,Location,Price,geometry
0,Manicure hybrydowy,2g,"Generała Władysława Sikorskiego 14, 09-410, Płock",120.0,POINT (19.7551731 52.5378894)
1,Manicure hybrydowy+ baby boomer,3g,"Generała Władysława Sikorskiego 14, 09-410, Płock",170.0,POINT (19.7551731 52.5378894)


## Preprocessing danych

In [4]:
def preprocess_text(text):
    """Normalise a free-text service description for bag-of-words features.

    Steps: lower-case, turn the separators '+', '/' and the fire emoji into
    spaces, drop ASCII punctuation, keep only letters and spaces, and finally
    collapse runs of spaces and trim the ends.

    The whitespace clean-up runs last on purpose: removing digits or other
    non-letter characters can itself create double or dangling spaces, which
    an earlier collapse would miss.

    Parameters
    ----------
    text : str
        Raw description.

    Returns
    -------
    str
        Lower-cased description made of letters separated by single spaces.
    """
    text = text.lower()
    # These characters separate words, so they must become spaces before the
    # punctuation removal below would glue the neighbouring words together.
    for separator in ('+', '/', '🔥'):
        text = text.replace(separator, ' ')
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = ''.join(ch for ch in text if ch.isalpha() or ch == ' ')
    return re.sub(' +', ' ', text).strip()

In [5]:
def convert_to_minutes(duration):
    """Convert a duration label such as '1g 30min' into whole minutes.

    The part before 'g' is read as hours and the number before 'min' as
    minutes; a label containing neither marker yields 0.

    Values that are not strings are treated as already-converted minutes and
    returned as ``int``. This keeps the preprocessing cell re-runnable: the
    column is overwritten in place, so a second run receives integers.

    Parameters
    ----------
    duration : str or number
        Label like '2g', '45min', '1g 30min', or an already numeric value.

    Returns
    -------
    int
        Duration in minutes.

    Raises
    ------
    ValueError
        If the hour or minute part is not an integer, or if the label
        contains more than one 'g'.
    """
    if not isinstance(duration, str):
        return int(duration)

    total_minutes = 0
    if 'g' in duration:
        hours, rest = duration.split('g')
        total_minutes += int(hours) * 60
        # Only the remainder after the hour marker can still hold minutes.
        duration = rest
    if 'min' in duration:
        minutes = duration.replace('min', '').strip()
        total_minutes += int(minutes)
    return total_minutes

In [6]:
# Clean the description text and turn the duration label into minutes.
for column, transform in (('Description', preprocess_text), ('Duration', convert_to_minutes)):
    df_laminacja[column] = df_laminacja[column].map(transform)
df_laminacja.head(2)

Unnamed: 0,Description,Duration,Location,Price,geometry
0,laminacja rzęs farbka anastasia,60,"Aleje Jerozolimskie 107, 02-011, Warszawa",168.0,POINT (20.9949283 52.225992)
1,laminacja rzęs farbka liza,60,"Aleje Jerozolimskie 107, 02-011, Warszawa",160.0,POINT (20.9949283 52.225992)


In [7]:
# Clean the description text and turn the duration label into minutes.
for column, transform in (('Description', preprocess_text), ('Duration', convert_to_minutes)):
    df_hybrydy[column] = df_hybrydy[column].map(transform)
df_hybrydy.head(2)

Unnamed: 0,Description,Duration,Location,Price,geometry
0,manicure hybrydowy,120,"Generała Władysława Sikorskiego 14, 09-410, Płock",120.0,POINT (19.7551731 52.5378894)
1,manicure hybrydowy baby boomer,180,"Generała Władysława Sikorskiego 14, 09-410, Płock",170.0,POINT (19.7551731 52.5378894)


## Wczytanie geo danych

In [8]:
# Boundary layers used for the spatial joins (county and voivodeship level,
# judging by the file names).
poland = gpd.read_file('powiaty-max.geojson')
poland_woj = gpd.read_file('wojewodztwa-max.geojson')

# Promote both offer tables to GeoDataFrames and declare their coordinates as
# EPSG:4326 (set_crs only labels the data, it does not reproject).
gdf_laminacja = gpd.GeoDataFrame(df_laminacja, geometry="geometry").set_crs('epsg:4326')
gdf_hybrydy = gpd.GeoDataFrame(df_hybrydy, geometry="geometry").set_crs('epsg:4326')

In [9]:
# Give the region-name column the same label in both boundary layers.
poland_woj = poland_woj.rename(columns={'nazwa': 'District'})
poland = poland.rename(columns={'nazwa': 'District'})


def _attach_district(points, regions):
    """Left-join each point to the region containing it and drop join bookkeeping columns."""
    joined = gpd.sjoin(points, regions, how='left', predicate='within')
    return joined.drop(columns=['id', 'index_right'])


# Lamination offers at voivodeship and county level.
result_gdf_laminacja_woj = _attach_district(gdf_laminacja, poland_woj)
result_gdf_laminacja_pow = _attach_district(gdf_laminacja, poland)

# Hybrid-manicure offers at voivodeship and county level.
result_gdf_hybrydy_woj = _attach_district(gdf_hybrydy, poland_woj)
result_gdf_hybrydy_pow = _attach_district(gdf_hybrydy, poland)

In [10]:
# Preview the first two hybrid-manicure rows after the voivodeship join.
result_gdf_hybrydy_woj.head(2)

Unnamed: 0,Description,Duration,Location,Price,geometry,District
0,manicure hybrydowy,120,"Generała Władysława Sikorskiego 14, 09-410, Płock",120.0,POINT (19.75517 52.53789),mazowieckie
1,manicure hybrydowy baby boomer,180,"Generała Władysława Sikorskiego 14, 09-410, Płock",170.0,POINT (19.75517 52.53789),mazowieckie


In [11]:
# Preview the first two hybrid-manicure rows after the county (powiat) join.
result_gdf_hybrydy_pow.head(2)

Unnamed: 0,Description,Duration,Location,Price,geometry,District
0,manicure hybrydowy,120,"Generała Władysława Sikorskiego 14, 09-410, Płock",120.0,POINT (19.75517 52.53789),powiat Płock
1,manicure hybrydowy baby boomer,180,"Generała Władysława Sikorskiego 14, 09-410, Płock",170.0,POINT (19.75517 52.53789),powiat Płock


In [12]:
def cv_function(df, col_name, max_features):
    """Append bag-of-words count columns built from a text column.

    A ``CountVectorizer`` limited to ``max_features`` vocabulary entries is
    fitted on ``df[col_name]``; the resulting token counts (raw counts, not
    TF-IDF weights) are added as new columns named after the tokens.

    The input frame is not modified: a copy with a fresh 0..n-1 index is used
    so that the rows line up positionally with the count matrix.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text column.
    col_name : str
        Name of the text column to vectorise.
    max_features : int
        Maximum vocabulary size passed to ``CountVectorizer``.

    Returns
    -------
    pandas.DataFrame
        ``df`` with a reset index plus one count column per vocabulary token.
    """
    cv_vectorizer = CountVectorizer(max_features=max_features)
    cv_matrix = cv_vectorizer.fit_transform(df[col_name])

    # Dense frame of token counts; it gets a default 0..n-1 index.
    cv_df = pd.DataFrame(cv_matrix.toarray(), columns=cv_vectorizer.get_feature_names_out())

    # Re-index a copy rather than the caller's frame: concat aligns on the
    # index, and the incoming index is not guaranteed to be 0..n-1.
    base_df = df.reset_index(drop=True)

    return pd.concat([base_df, cv_df], axis=1)

In [13]:
def regression_base(df, dependent_col, independent_cols):
    """Fit an ``spreg.OLS`` model on columns of ``df`` and return it.

    ``dependent_col`` names the response column and ``independent_cols`` is
    the list of regressor columns; both are also passed on as variable names
    so that they appear in the model summary.
    """
    # Double brackets keep the response two-dimensional (n x 1).
    response = df[[dependent_col]].values
    predictors = df[independent_cols].values
    return spreg.OLS(
        response,
        predictors,
        name_y=dependent_col,
        name_x=independent_cols,
    )

In [14]:
def regression_regimes(df, dependent_col, independent_cols, neighborhood_col):
    """Fit an ``spreg.OLS_Regimes`` model with one intercept per regime.

    ``neighborhood_col`` names the column whose values assign each row to a
    regime. The constant term varies by regime, all regressors share a single
    coefficient, and one error variance is estimated for all regimes.
    """
    # Double brackets keep the response two-dimensional (n x 1).
    response = df[[dependent_col]].values
    predictors = df[independent_cols].values
    regime_labels = df[neighborhood_col].to_list()

    # One flag per regressor; False means the slope is not split by regime.
    slope_varies_by_regime = [False for _ in independent_cols]

    return spreg.OLS_Regimes(
        response,
        predictors,
        regime_labels,
        # A separate constant is estimated for every regime.
        constant_regi="many",
        cols2regi=slope_varies_by_regime,
        # A single sigma is estimated across regimes.
        regime_err_sep=False,
        name_y=dependent_col,
        name_x=independent_cols,
    )

# Laminacja

## Powiaty

In [15]:
# County-level lamination frame with count features for a 3-token vocabulary.
powiaty_laminacja = cv_function(
    df=result_gdf_laminacja_pow, col_name='Description', max_features=3
)

# Every column that is not listed here is used as a regressor.
columns_to_exclude = ['Location', 'Price', 'District', 'geometry', 'Description']
variable_names = powiaty_laminacja.columns.difference(columns_to_exclude).tolist()

In [16]:
# Preview the county-level lamination frame including the new token-count columns.
powiaty_laminacja.head(2)

Unnamed: 0,Description,Duration,Location,Price,geometry,District,brwi,laminacja,rzęs
0,laminacja rzęs farbka anastasia,60,"Aleje Jerozolimskie 107, 02-011, Warszawa",168.0,POINT (20.99493 52.22599),powiat Warszawa,0,1,1
1,laminacja rzęs farbka liza,60,"Aleje Jerozolimskie 107, 02-011, Warszawa",160.0,POINT (20.99493 52.22599),powiat Warszawa,0,1,1


In [17]:
# Baseline OLS for lamination prices (county-level feature frame, no regimes).
lam_model_base_pow = regression_base(
    df=powiaty_laminacja,
    dependent_col='Price',
    independent_cols=variable_names,
)
print(lam_model_base_pow.summary)

REGRESSION RESULTS
------------------

SUMMARY OF OUTPUT: ORDINARY LEAST SQUARES
-----------------------------------------
Data set            :     unknown
Weights matrix      :        None
Dependent Variable  :       Price                Number of Observations:        1323
Mean dependent var  :    146.7015                Number of Variables   :           5
S.D. dependent var  :     54.3118                Degrees of Freedom    :        1318
R-squared           :      0.5767
Adjusted R-squared  :      0.5754
Sum squared residual: 1.65073e+06                F-statistic           :    448.8922
Sigma-square        :    1252.454                Prob(F-statistic)     :  3.531e-244
S.E. of regression  :      35.390                Log likelihood        :   -6593.138
Sigma-square ML     :    1247.721                Akaike info criterion :   13196.276
S.E of regression ML:     35.3231                Schwarz criterion     :   13222.215

------------------------------------------------------------

In [18]:
# Coefficient table for the baseline county-level lamination model.
# Each t_stat entry is indexed at position 1 to obtain the p-value.
p_values = [stat[1] for stat in lam_model_base_pow.t_stat]
base_pow_lam = pd.DataFrame(
    {
        # betas come back as an N x 1 array, hence the flatten
        "Coeff.": lam_model_base_pow.betas.flatten(),
        "Std. Error": lam_model_base_pow.std_err.flatten(),
        "P-Value": p_values,
    },
    index=lam_model_base_pow.name_x,
).round(4)  # four decimals are enough for reading the table
base_pow_lam

Unnamed: 0,Coeff.,Std. Error,P-Value
CONSTANT,57.3679,7.88,0.0
Duration,0.8126,0.0304,0.0
brwi,57.295,3.4639,0.0
laminacja,12.6788,4.6705,0.0067
rzęs,5.034,5.8315,0.3882


In [19]:
# Same regressors, now with a separate intercept for every county (District).
lam_model_regimes_pow = regression_regimes(
    df=powiaty_laminacja,
    dependent_col='Price',
    independent_cols=variable_names,
    neighborhood_col='District',
)
print(lam_model_regimes_pow.summary)

  ci_result = sqrt(max_eigval / min_eigval)


REGRESSION RESULTS
------------------

SUMMARY OF OUTPUT: ORDINARY LEAST SQUARES - REGIMES
---------------------------------------------------
Data set            :     unknown
Weights matrix      :        None
Dependent Variable  :       Price                Number of Observations:        1323
Mean dependent var  :    146.7015                Number of Variables   :         194
S.D. dependent var  :     54.3118                Degrees of Freedom    :        1129
R-squared           :      0.6554
Adjusted R-squared  :      0.5965
Sum squared residual: 1.34385e+06                F-statistic           :     11.1251
Sigma-square        :    1190.305                Prob(F-statistic)     :   3.01e-162
S.E. of regression  :      34.501                Log likelihood        :   -6457.082
Sigma-square ML     :    1015.763                Akaike info criterion :   13302.164
S.E of regression ML:     31.8710                Schwarz criterion     :   14308.569

----------------------------------------

In [20]:
# Coefficient table for the county-regime lamination model.
# Each t_stat entry is indexed at position 1 to obtain the p-value.
p_values = [stat[1] for stat in lam_model_regimes_pow.t_stat]
regimes_pow_lam = pd.DataFrame(
    {
        # betas come back as an N x 1 array, hence the flatten
        "Coeff.": lam_model_regimes_pow.betas.flatten(),
        "Std. Error": lam_model_regimes_pow.std_err.flatten(),
        "P-Value": p_values,
    },
    index=lam_model_regimes_pow.name_x,
).round(4)  # four decimals are enough for reading the table
regimes_pow_lam

Unnamed: 0,Coeff.,Std. Error,P-Value
powiat Biała Podlaska_CONSTANT,38.4430,35.3852,0.2775
powiat Białystok_CONSTANT,46.3793,10.5593,0.0000
powiat Bielsko-Biała_CONSTANT,45.9446,14.1023,0.0012
powiat Bydgoszcz_CONSTANT,46.8965,11.5908,0.0001
powiat Bytom_CONSTANT,51.8127,16.1731,0.0014
...,...,...,...
powiat żywiecki_CONSTANT,-1.7041,25.6698,0.9471
_Global_Duration,0.8431,0.0317,0.0000
_Global_brwi,55.0107,3.5226,0.0000
_Global_laminacja,13.6516,4.6882,0.0037


In [21]:
# Refit the county-regime lamination model with a 4-token vocabulary (max_features=4).
powiaty_laminacja = cv_function(
    df=result_gdf_laminacja_pow, col_name='Description', max_features=4
)
variable_names = powiaty_laminacja.columns.difference(columns_to_exclude).tolist()

lam_model_regimes_pow_4 = regression_regimes(
    df=powiaty_laminacja,
    dependent_col='Price',
    independent_cols=variable_names,
    neighborhood_col='District',
)
print(lam_model_regimes_pow_4.summary)

  ci_result = sqrt(max_eigval / min_eigval)


REGRESSION RESULTS
------------------

SUMMARY OF OUTPUT: ORDINARY LEAST SQUARES - REGIMES
---------------------------------------------------
Data set            :     unknown
Weights matrix      :        None
Dependent Variable  :       Price                Number of Observations:        1323
Mean dependent var  :    146.7015                Number of Variables   :         195
S.D. dependent var  :     54.3118                Degrees of Freedom    :        1128
R-squared           :      0.6567
Adjusted R-squared  :      0.5977
Sum squared residual:  1.3386e+06                F-statistic           :     11.1242
Sigma-square        :    1186.701                Prob(F-statistic)     :  1.343e-162
S.E. of regression  :      34.449                Log likelihood        :   -6454.489
Sigma-square ML     :    1011.790                Akaike info criterion :   13298.979
S.E of regression ML:     31.8086                Schwarz criterion     :   14310.572

----------------------------------------

In [22]:
# Coefficient table for the county-regime lamination model with 4 tokens.
# Each t_stat entry is indexed at position 1 to obtain the p-value.
p_values = [stat[1] for stat in lam_model_regimes_pow_4.t_stat]
regimes_pow_4_lam = pd.DataFrame(
    {
        # betas come back as an N x 1 array, hence the flatten
        "Coeff.": lam_model_regimes_pow_4.betas.flatten(),
        "Std. Error": lam_model_regimes_pow_4.std_err.flatten(),
        "P-Value": p_values,
    },
    index=lam_model_regimes_pow_4.name_x,
).round(4)  # four decimals are enough for reading the table
regimes_pow_4_lam

Unnamed: 0,Coeff.,Std. Error,P-Value
powiat Biała Podlaska_CONSTANT,38.7410,35.3318,0.2731
powiat Białystok_CONSTANT,47.3476,10.5533,0.0000
powiat Bielsko-Biała_CONSTANT,47.1858,14.0933,0.0008
powiat Bydgoszcz_CONSTANT,47.2030,11.5742,0.0000
powiat Bytom_CONSTANT,52.0383,16.1489,0.0013
...,...,...,...
_Global_Duration,0.8409,0.0316,0.0000
_Global_brwi,55.3118,3.5202,0.0000
_Global_farbka,-8.2312,3.9110,0.0355
_Global_laminacja,13.3956,4.6827,0.0043


## Województwa

In [23]:
# Voivodeship-level lamination frame with count features for a 5-token vocabulary.
wojewodztwa_laminacja = cv_function(
    df=result_gdf_laminacja_woj, col_name='Description', max_features=5
)

# Every column that is not listed here is used as a regressor.
columns_to_exclude = ['Location', 'Price', 'District', 'geometry', 'Description']
variable_names = wojewodztwa_laminacja.columns.difference(columns_to_exclude).tolist()

In [24]:
# Preview the voivodeship-level lamination frame including the token-count columns.
wojewodztwa_laminacja.head(2)

Unnamed: 0,Description,Duration,Location,Price,geometry,District,brwi,farbka,henna,laminacja,rzęs
0,laminacja rzęs farbka anastasia,60,"Aleje Jerozolimskie 107, 02-011, Warszawa",168.0,POINT (20.99493 52.22599),mazowieckie,0,1,0,1,1
1,laminacja rzęs farbka liza,60,"Aleje Jerozolimskie 107, 02-011, Warszawa",160.0,POINT (20.99493 52.22599),mazowieckie,0,1,0,1,1


In [25]:
# Baseline OLS for lamination prices (voivodeship-level feature frame, no regimes).
lam_model_base_woj = regression_base(
    df=wojewodztwa_laminacja,
    dependent_col='Price',
    independent_cols=variable_names,
)
print(lam_model_base_woj.summary)

REGRESSION RESULTS
------------------

SUMMARY OF OUTPUT: ORDINARY LEAST SQUARES
-----------------------------------------
Data set            :     unknown
Weights matrix      :        None
Dependent Variable  :       Price                Number of Observations:        1323
Mean dependent var  :    146.7015                Number of Variables   :           7
S.D. dependent var  :     54.3118                Degrees of Freedom    :        1316
R-squared           :      0.5831
Adjusted R-squared  :      0.5812
Sum squared residual: 1.62568e+06                F-statistic           :    306.7926
Sigma-square        :    1235.320                Prob(F-statistic)     :  6.913e-246
S.E. of regression  :      35.147                Log likelihood        :   -6583.021
Sigma-square ML     :    1228.784                Akaike info criterion :   13180.042
S.E of regression ML:     35.0540                Schwarz criterion     :   13216.356

------------------------------------------------------------

In [26]:
# Coefficient table for the baseline voivodeship-level lamination model.
# Each t_stat entry is indexed at position 1 to obtain the p-value.
p_values = [stat[1] for stat in lam_model_base_woj.t_stat]
base_woj_lam = pd.DataFrame(
    {
        # betas come back as an N x 1 array, hence the flatten
        "Coeff.": lam_model_base_woj.betas.flatten(),
        "Std. Error": lam_model_base_woj.std_err.flatten(),
        "P-Value": p_values,
    },
    index=lam_model_base_woj.name_x,
).round(4)  # four decimals are enough for reading the table
base_woj_lam

Unnamed: 0,Coeff.,Std. Error,P-Value
CONSTANT,57.7598,7.8375,0.0
Duration,0.8041,0.0303,0.0
brwi,55.0241,3.4977,0.0
farbka,-6.9022,3.729,0.0644
henna,16.2693,3.9986,0.0001
laminacja,15.0323,4.6829,0.0014
rzęs,2.598,5.8237,0.6556


In [27]:
# Same regressors, now with a separate intercept for every voivodeship (District).
lam_model_regimes_woj = regression_regimes(
    df=wojewodztwa_laminacja,
    dependent_col='Price',
    independent_cols=variable_names,
    neighborhood_col='District',
)
print(lam_model_regimes_woj.summary)

REGRESSION RESULTS
------------------

SUMMARY OF OUTPUT: ORDINARY LEAST SQUARES - REGIMES
---------------------------------------------------
Data set            :     unknown
Weights matrix      :        None
Dependent Variable  :       Price                Number of Observations:        1323
Mean dependent var  :    146.7015                Number of Variables   :          22
S.D. dependent var  :     54.3118                Degrees of Freedom    :        1301
R-squared           :      0.6174
Adjusted R-squared  :      0.6112
Sum squared residual: 1.49204e+06                F-statistic           :     99.9668
Sigma-square        :    1146.840                Prob(F-statistic)     :  2.014e-253
S.E. of regression  :      33.865                Log likelihood        :   -6526.276
Sigma-square ML     :    1127.769                Akaike info criterion :   13096.551
S.E of regression ML:     33.5823                Schwarz criterion     :   13210.680

----------------------------------------

In [28]:
# Coefficient table for the voivodeship-regime lamination model.
# Each t_stat entry is indexed at position 1 to obtain the p-value.
p_values = [stat[1] for stat in lam_model_regimes_woj.t_stat]
regimes_woj_lam = pd.DataFrame(
    {
        # betas come back as an N x 1 array, hence the flatten
        "Coeff.": lam_model_regimes_woj.betas.flatten(),
        "Std. Error": lam_model_regimes_woj.std_err.flatten(),
        "P-Value": p_values,
    },
    index=lam_model_regimes_woj.name_x,
).round(4)  # four decimals are enough for reading the table
regimes_woj_lam

Unnamed: 0,Coeff.,Std. Error,P-Value
dolnośląskie_CONSTANT,44.9967,7.9793,0.0
kujawsko-pomorskie_CONSTANT,42.1804,9.2508,0.0
lubelskie_CONSTANT,49.8663,10.2497,0.0
lubuskie_CONSTANT,63.1878,10.6562,0.0
mazowieckie_CONSTANT,70.5022,7.7048,0.0
małopolskie_CONSTANT,53.5672,8.2151,0.0
opolskie_CONSTANT,49.1466,9.7456,0.0
podkarpackie_CONSTANT,54.4663,10.3557,0.0
podlaskie_CONSTANT,51.7929,9.8354,0.0
pomorskie_CONSTANT,46.6378,8.4776,0.0


In [29]:
# Refit the voivodeship-regime lamination model with a 7-token vocabulary (max_features=7).
wojewodztwa_laminacja = cv_function(
    df=result_gdf_laminacja_woj, col_name='Description', max_features=7
)
variable_names = wojewodztwa_laminacja.columns.difference(columns_to_exclude).tolist()

lam_model_regimes_woj_7 = regression_regimes(
    df=wojewodztwa_laminacja,
    dependent_col='Price',
    independent_cols=variable_names,
    neighborhood_col='District',
)
print(lam_model_regimes_woj_7.summary)

REGRESSION RESULTS
------------------

SUMMARY OF OUTPUT: ORDINARY LEAST SQUARES - REGIMES
---------------------------------------------------
Data set            :     unknown
Weights matrix      :        None
Dependent Variable  :       Price                Number of Observations:        1323
Mean dependent var  :    146.7015                Number of Variables   :          24
S.D. dependent var  :     54.3118                Degrees of Freedom    :        1299
R-squared           :      0.6184
Adjusted R-squared  :      0.6117
Sum squared residual: 1.48797e+06                F-statistic           :     91.5373
Sigma-square        :    1145.474                Prob(F-statistic)     :  3.482e-252
S.E. of regression  :      33.845                Log likelihood        :   -6524.469
Sigma-square ML     :    1124.694                Akaike info criterion :   13096.939
S.E of regression ML:     33.5365                Schwarz criterion     :   13221.443

----------------------------------------

In [30]:
# Coefficient table for the voivodeship-regime lamination model with 7 tokens.
# Each t_stat entry is indexed at position 1 to obtain the p-value.
p_values = [stat[1] for stat in lam_model_regimes_woj_7.t_stat]
regimes_woj_7_lam = pd.DataFrame(
    {
        # betas come back as an N x 1 array, hence the flatten
        "Coeff.": lam_model_regimes_woj_7.betas.flatten(),
        "Std. Error": lam_model_regimes_woj_7.std_err.flatten(),
        "P-Value": p_values,
    },
    index=lam_model_regimes_woj_7.name_x,
).round(4)  # four decimals are enough for reading the table
regimes_woj_7_lam

Unnamed: 0,Coeff.,Std. Error,P-Value
dolnośląskie_CONSTANT,45.2158,7.9855,0.0
kujawsko-pomorskie_CONSTANT,42.1446,9.2546,0.0
lubelskie_CONSTANT,50.4443,10.2702,0.0
lubuskie_CONSTANT,62.8976,10.6559,0.0
mazowieckie_CONSTANT,70.9034,7.7276,0.0
małopolskie_CONSTANT,54.2351,8.2209,0.0
opolskie_CONSTANT,49.3482,9.7437,0.0
podkarpackie_CONSTANT,54.9699,10.353,0.0
podlaskie_CONSTANT,51.8697,9.8354,0.0
pomorskie_CONSTANT,46.6966,8.4801,0.0


# Hybrydy

## Powiaty

In [31]:
# County-level hybrid-manicure frame with count features for a 5-token vocabulary.
powiaty_hybrydy = cv_function(
    df=result_gdf_hybrydy_pow, col_name='Description', max_features=5
)

# Every column that is not listed here is used as a regressor.
columns_to_exclude = ['Location', 'Price', 'District', 'geometry', 'Description']
variable_names = powiaty_hybrydy.columns.difference(columns_to_exclude).tolist()

In [32]:
# Preview the county-level hybrid-manicure frame including the token-count columns.
powiaty_hybrydy.head(2)

Unnamed: 0,Description,Duration,Location,Price,geometry,District,french,hybrydowy,kolor,manicure,ze
0,manicure hybrydowy,120,"Generała Władysława Sikorskiego 14, 09-410, Płock",120.0,POINT (19.75517 52.53789),powiat Płock,0,1,0,1,0
1,manicure hybrydowy baby boomer,180,"Generała Władysława Sikorskiego 14, 09-410, Płock",170.0,POINT (19.75517 52.53789),powiat Płock,0,1,0,1,0


In [33]:
# Baseline OLS for hybrid-manicure prices (county-level feature frame, no regimes).
hyb_model_base_pow = regression_base(
    df=powiaty_hybrydy,
    dependent_col='Price',
    independent_cols=variable_names,
)
print(hyb_model_base_pow.summary)

REGRESSION RESULTS
------------------

SUMMARY OF OUTPUT: ORDINARY LEAST SQUARES
-----------------------------------------
Data set            :     unknown
Weights matrix      :        None
Dependent Variable  :       Price                Number of Observations:        4536
Mean dependent var  :    116.7315                Number of Variables   :           7
S.D. dependent var  :     31.2829                Degrees of Freedom    :        4529
R-squared           :      0.2353
Adjusted R-squared  :      0.2342
Sum squared residual: 3.39398e+06                F-statistic           :    232.2043
Sigma-square        :     749.388                Prob(F-statistic)     :  2.392e-259
S.E. of regression  :      27.375                Log likelihood        :  -21445.276
Sigma-square ML     :     748.231                Akaike info criterion :   42904.553
S.E of regression ML:     27.3538                Schwarz criterion     :   42949.491

------------------------------------------------------------

In [34]:
# Coefficient table for the baseline county-level hybrid-manicure model.
# Each t_stat entry is indexed at position 1 to obtain the p-value.
p_values = [stat[1] for stat in hyb_model_base_pow.t_stat]
base_pow_hyb = pd.DataFrame(
    {
        # betas come back as an N x 1 array, hence the flatten
        "Coeff.": hyb_model_base_pow.betas.flatten(),
        "Std. Error": hyb_model_base_pow.std_err.flatten(),
        "P-Value": p_values,
    },
    index=hyb_model_base_pow.name_x,
).round(4)  # four decimals are enough for reading the table
base_pow_hyb

Unnamed: 0,Coeff.,Std. Error,P-Value
CONSTANT,25.9243,7.9849,0.0012
Duration,0.4509,0.0152,0.0
french,12.4331,1.4545,0.0
hybrydowy,53.1245,3.5225,0.0
kolor,-0.3203,1.55,0.8363
manicure,-3.6398,7.3398,0.62
ze,3.436,1.9525,0.0785


In [35]:
# Same regressors, now with a separate intercept for every county (District).
hyb_model_regimes_pow = regression_regimes(
    df=powiaty_hybrydy,
    dependent_col='Price',
    independent_cols=variable_names,
    neighborhood_col='District',
)
print(hyb_model_regimes_pow.summary)

  ci_result = sqrt(max_eigval / min_eigval)


REGRESSION RESULTS
------------------

SUMMARY OF OUTPUT: ORDINARY LEAST SQUARES - REGIMES
---------------------------------------------------
Data set            :     unknown
Weights matrix      :        None
Dependent Variable  :       Price                Number of Observations:        4536
Mean dependent var  :    116.7315                Number of Variables   :         285
S.D. dependent var  :     31.2829                Degrees of Freedom    :        4251
R-squared           :      0.3754
Adjusted R-squared  :      0.3337
Sum squared residual: 2.77194e+06                F-statistic           :      8.9969
Sigma-square        :     652.067                Prob(F-statistic)     :  3.038e-267
S.E. of regression  :      25.536                Log likelihood        :  -20986.106
Sigma-square ML     :     611.097                Akaike info criterion :   42542.213
S.E of regression ML:     24.7204                Schwarz criterion     :   44371.856

----------------------------------------

In [36]:
# Coefficient table for the county-regime hybrid-manicure model.
# Each t_stat entry is indexed at position 1 to obtain the p-value.
p_values = [stat[1] for stat in hyb_model_regimes_pow.t_stat]
regimes_pow_hyb = pd.DataFrame(
    {
        # betas come back as an N x 1 array, hence the flatten
        "Coeff.": hyb_model_regimes_pow.betas.flatten(),
        "Std. Error": hyb_model_regimes_pow.std_err.flatten(),
        "P-Value": p_values,
    },
    index=hyb_model_regimes_pow.name_x,
).round(4)  # four decimals are enough for reading the table
regimes_pow_hyb

Unnamed: 0,Coeff.,Std. Error,P-Value
powiat Biała Podlaska_CONSTANT,12.3405,19.5802,0.5286
powiat Białystok_CONSTANT,18.1316,9.0068,0.0442
powiat Bielsko-Biała_CONSTANT,21.6466,8.3309,0.0094
powiat Bydgoszcz_CONSTANT,21.7593,8.0094,0.0066
powiat Bytom_CONSTANT,35.7415,9.6630,0.0002
...,...,...,...
_Global_french,12.5590,1.3917,0.0000
_Global_hybrydowy,53.9296,3.3965,0.0000
_Global_kolor,1.5392,1.5117,0.3086
_Global_manicure,-2.2881,6.9659,0.7426


In [37]:
# Refit the county-regime hybrid-manicure model with a 7-token vocabulary (max_features=7).
powiaty_hybrydy = cv_function(
    df=result_gdf_hybrydy_pow, col_name='Description', max_features=7
)
variable_names = powiaty_hybrydy.columns.difference(columns_to_exclude).tolist()

hyb_model_regimes_pow_7 = regression_regimes(
    df=powiaty_hybrydy,
    dependent_col='Price',
    independent_cols=variable_names,
    neighborhood_col='District',
)
print(hyb_model_regimes_pow_7.summary)

  ci_result = sqrt(max_eigval / min_eigval)


REGRESSION RESULTS
------------------

SUMMARY OF OUTPUT: ORDINARY LEAST SQUARES - REGIMES
---------------------------------------------------
Data set            :     unknown
Weights matrix      :        None
Dependent Variable  :       Price                Number of Observations:        4536
Mean dependent var  :    116.7315                Number of Variables   :         287
S.D. dependent var  :     31.2829                Degrees of Freedom    :        4249
R-squared           :      0.3773
Adjusted R-squared  :      0.3354
Sum squared residual: 2.76368e+06                F-statistic           :      9.0009
Sigma-square        :     650.430                Prob(F-statistic)     :  9.731e-269
S.E. of regression  :      25.504                Log likelihood        :  -20979.338
Sigma-square ML     :     609.276                Akaike info criterion :   42532.676
S.E of regression ML:     24.6835                Schwarz criterion     :   44375.159

----------------------------------------

In [38]:
# Coefficient table for the county-regime hybrid-manicure model with 7 tokens.
# Each t_stat entry is indexed at position 1 to obtain the p-value.
p_values = [stat[1] for stat in hyb_model_regimes_pow_7.t_stat]
regimes_pow_7_hyb = pd.DataFrame(
    {
        # betas come back as an N x 1 array, hence the flatten
        "Coeff.": hyb_model_regimes_pow_7.betas.flatten(),
        "Std. Error": hyb_model_regimes_pow_7.std_err.flatten(),
        "P-Value": p_values,
    },
    index=hyb_model_regimes_pow_7.name_x,
).round(4)  # four decimals are enough for reading the table
regimes_pow_7_hyb

Unnamed: 0,Coeff.,Std. Error,P-Value
powiat Biała Podlaska_CONSTANT,11.9788,19.5560,0.5402
powiat Białystok_CONSTANT,17.4324,8.9977,0.0528
powiat Bielsko-Biała_CONSTANT,21.0775,8.3220,0.0114
powiat Bydgoszcz_CONSTANT,21.1156,8.0018,0.0083
powiat Bytom_CONSTANT,35.4098,9.6514,0.0002
...,...,...,...
_Global_hybrydy,4.5254,2.0261,0.0256
_Global_kolor,1.6756,1.5104,0.2673
_Global_manicure,-2.0483,6.9575,0.7685
_Global_nadbudową,6.2251,2.1887,0.0045


## Województwa

In [39]:
# Voivodeship-level hybrid-manicure frame with count features for a 5-token vocabulary.
wojewodztwa_hybrydy = cv_function(
    df=result_gdf_hybrydy_woj, col_name='Description', max_features=5
)

# Every column that is not listed here is used as a regressor.
columns_to_exclude = ['Location', 'Price', 'District', 'geometry', 'Description']
variable_names = wojewodztwa_hybrydy.columns.difference(columns_to_exclude).tolist()

In [40]:
# Preview the voivodeship-level hybrid-manicure frame including the token-count columns.
wojewodztwa_hybrydy.head(2)

Unnamed: 0,Description,Duration,Location,Price,geometry,District,french,hybrydowy,kolor,manicure,ze
0,manicure hybrydowy,120,"Generała Władysława Sikorskiego 14, 09-410, Płock",120.0,POINT (19.75517 52.53789),mazowieckie,0,1,0,1,0
1,manicure hybrydowy baby boomer,180,"Generała Władysława Sikorskiego 14, 09-410, Płock",170.0,POINT (19.75517 52.53789),mazowieckie,0,1,0,1,0


In [41]:
# Baseline OLS for hybrid-manicure prices (voivodeship-level feature frame, no regimes).
hyb_model_base_woj = regression_base(
    df=wojewodztwa_hybrydy,
    dependent_col='Price',
    independent_cols=variable_names,
)
print(hyb_model_base_woj.summary)

REGRESSION RESULTS
------------------

SUMMARY OF OUTPUT: ORDINARY LEAST SQUARES
-----------------------------------------
Data set            :     unknown
Weights matrix      :        None
Dependent Variable  :       Price                Number of Observations:        4536
Mean dependent var  :    116.7315                Number of Variables   :           7
S.D. dependent var  :     31.2829                Degrees of Freedom    :        4529
R-squared           :      0.2353
Adjusted R-squared  :      0.2342
Sum squared residual: 3.39398e+06                F-statistic           :    232.2043
Sigma-square        :     749.388                Prob(F-statistic)     :  2.392e-259
S.E. of regression  :      27.375                Log likelihood        :  -21445.276
Sigma-square ML     :     748.231                Akaike info criterion :   42904.553
S.E of regression ML:     27.3538                Schwarz criterion     :   42949.491

------------------------------------------------------------

In [42]:
# Coefficient table for the baseline voivodeship-level hybrid-manicure model.
# Each t_stat entry is indexed at position 1 to obtain the p-value.
p_values = [stat[1] for stat in hyb_model_base_woj.t_stat]
base_woj_hyb = pd.DataFrame(
    {
        # betas come back as an N x 1 array, hence the flatten
        "Coeff.": hyb_model_base_woj.betas.flatten(),
        "Std. Error": hyb_model_base_woj.std_err.flatten(),
        "P-Value": p_values,
    },
    index=hyb_model_base_woj.name_x,
).round(4)  # four decimals are enough for reading the table
base_woj_hyb

Unnamed: 0,Coeff.,Std. Error,P-Value
CONSTANT,25.9243,7.9849,0.0012
Duration,0.4509,0.0152,0.0
french,12.4331,1.4545,0.0
hybrydowy,53.1245,3.5225,0.0
kolor,-0.3203,1.55,0.8363
manicure,-3.6398,7.3398,0.62
ze,3.436,1.9525,0.0785


In [43]:
# Same regressors, now with a separate intercept for every voivodeship (District).
hyb_model_regimes_woj = regression_regimes(
    df=wojewodztwa_hybrydy,
    dependent_col='Price',
    independent_cols=variable_names,
    neighborhood_col='District',
)
print(hyb_model_regimes_woj.summary)

REGRESSION RESULTS
------------------

SUMMARY OF OUTPUT: ORDINARY LEAST SQUARES - REGIMES
---------------------------------------------------
Data set            :     unknown
Weights matrix      :        None
Dependent Variable  :       Price                Number of Observations:        4536
Mean dependent var  :    116.7315                Number of Variables   :          22
S.D. dependent var  :     31.2829                Degrees of Freedom    :        4514
R-squared           :      0.2716
Adjusted R-squared  :      0.2682
Sum squared residual: 3.23255e+06                F-statistic           :     80.1607
Sigma-square        :     716.117                Prob(F-statistic)     :  5.944e-291
S.E. of regression  :      26.760                Log likelihood        :  -21334.755
Sigma-square ML     :     712.644                Akaike info criterion :   42713.510
S.E of regression ML:     26.6954                Schwarz criterion     :   42854.746

----------------------------------------

  ci_result = sqrt(max_eigval / min_eigval)


In [44]:
# Coefficient table for the voivodeship-regime hybrid-manicure model.
# Each t_stat entry is indexed at position 1 to obtain the p-value.
p_values = [stat[1] for stat in hyb_model_regimes_woj.t_stat]
regimes_woj_hyb = pd.DataFrame(
    {
        # betas come back as an N x 1 array, hence the flatten
        "Coeff.": hyb_model_regimes_woj.betas.flatten(),
        "Std. Error": hyb_model_regimes_woj.std_err.flatten(),
        "P-Value": p_values,
    },
    index=hyb_model_regimes_woj.name_x,
).round(4)  # four decimals are enough for reading the table
regimes_woj_hyb

Unnamed: 0,Coeff.,Std. Error,P-Value
dolnośląskie_CONSTANT,25.4517,7.9734,0.0014
kujawsko-pomorskie_CONSTANT,23.9263,7.9804,0.0027
lubelskie_CONSTANT,17.0768,8.0775,0.0346
lubuskie_CONSTANT,25.73,8.3284,0.002
mazowieckie_CONSTANT,36.0513,7.8588,0.0
małopolskie_CONSTANT,25.4784,7.917,0.0013
opolskie_CONSTANT,29.1034,8.5808,0.0007
podkarpackie_CONSTANT,21.7989,8.0771,0.007
podlaskie_CONSTANT,35.7462,8.6379,0.0
pomorskie_CONSTANT,29.2995,7.9043,0.0002


In [45]:
# Voivodeship regimes again, this time with max features = 7 passed to cv_function.
wojewodztwa_hybrydy = cv_function(
    result_gdf_hybrydy_woj,
    'Description',
    7,
)
# Regressors: every column of the result that is not listed in columns_to_exclude.
variable_names = wojewodztwa_hybrydy.columns.difference(columns_to_exclude).tolist()

# 'Price' is the dependent variable (see the printed summary); 'District' is
# presumably the regime column -- confirm against regression_regimes.
hyb_model_regimes_woj_7 = regression_regimes(
    wojewodztwa_hybrydy,
    'Price',
    variable_names,
    'District',
)
print(hyb_model_regimes_woj_7.summary)

REGRESSION RESULTS
------------------

SUMMARY OF OUTPUT: ORDINARY LEAST SQUARES - REGIMES
---------------------------------------------------
Data set            :     unknown
Weights matrix      :        None
Dependent Variable  :       Price                Number of Observations:        4536
Mean dependent var  :    116.7315                Number of Variables   :          24
S.D. dependent var  :     31.2829                Degrees of Freedom    :        4512
R-squared           :      0.2743
Adjusted R-squared  :      0.2706
Sum squared residual: 3.22057e+06                F-statistic           :     74.1596
Sigma-square        :     713.779                Prob(F-statistic)     :  1.219e-292
S.E. of regression  :      26.717                Log likelihood        :  -21326.334
Sigma-square ML     :     710.003                Akaike info criterion :   42700.669
S.E of regression ML:     26.6459                Schwarz criterion     :   42854.744

----------------------------------------

In [46]:
# Coefficient table for the 7-feature voivodeship-regime model fitted on
# hybrid-manicure prices: one row per name in the model's name_x list.
regimes_woj_7_hyb = pd.DataFrame(
    data={
        # betas is returned as an Nx1 array, so collapse it to one dimension
        "Coeff.": hyb_model_regimes_woj_7.betas.flatten(),
        # standard errors, flattened the same way
        "Std. Error": hyb_model_regimes_woj_7.std_err.flatten(),
        # every t_stat entry is a (statistic, p-value) pair; keep the second item
        "P-Value": [p_value for _statistic, p_value in hyb_model_regimes_woj_7.t_stat],
    },
    index=hyb_model_regimes_woj_7.name_x,
).round(decimals=4)  # report four decimal places
regimes_woj_7_hyb

Unnamed: 0,Coeff.,Std. Error,P-Value
dolnośląskie_CONSTANT,24.9183,7.9615,0.0018
kujawsko-pomorskie_CONSTANT,23.0286,7.9705,0.0039
lubelskie_CONSTANT,16.3734,8.0663,0.0424
lubuskie_CONSTANT,25.2248,8.3157,0.0024
mazowieckie_CONSTANT,34.8383,7.8519,0.0
małopolskie_CONSTANT,24.7297,7.9062,0.0018
opolskie_CONSTANT,28.5042,8.5683,0.0009
podkarpackie_CONSTANT,21.1215,8.0656,0.0089
podlaskie_CONSTANT,35.1107,8.6252,0.0
pomorskie_CONSTANT,28.4294,7.8943,0.0003


# Podsumowanie modeli z $R^2$

In [47]:
# R2 / adjusted R2 comparison tables, one per (service, regime level) pair.
# In every table the rows run: plain OLS first, then the two regime models.

# Lash lamination, regimes = powiaty
df_lam_pow = pd.DataFrame(
    data=[
        [fitted.r2, fitted.ar2]
        for fitted in (lam_model_base_pow, lam_model_regimes_pow, lam_model_regimes_pow_4)
    ],
    index=["OLS", "OLS_Regimes (CV features = 3)", "OLS_Regimes (CV features = 4)"],
    columns=["R2", "Adj. R2"],
)

# Hybrid manicure, regimes = powiaty
df_hyb_pow = pd.DataFrame(
    data=[
        [fitted.r2, fitted.ar2]
        for fitted in (hyb_model_base_pow, hyb_model_regimes_pow, hyb_model_regimes_pow_7)
    ],
    index=["OLS", "OLS_Regimes (CV features = 5)", "OLS_Regimes (CV features = 7)"],
    columns=["R2", "Adj. R2"],
)

# Lash lamination, regimes = voivodeships
df_lam_woj = pd.DataFrame(
    data=[
        [fitted.r2, fitted.ar2]
        for fitted in (lam_model_base_woj, lam_model_regimes_woj, lam_model_regimes_woj_7)
    ],
    index=["OLS", "OLS_Regimes (CV features = 5)", "OLS_Regimes (CV features = 7)"],
    columns=["R2", "Adj. R2"],
)

# Hybrid manicure, regimes = voivodeships
df_hyb_woj = pd.DataFrame(
    data=[
        [fitted.r2, fitted.ar2]
        for fitted in (hyb_model_base_woj, hyb_model_regimes_woj, hyb_model_regimes_woj_7)
    ],
    index=["OLS", "OLS_Regimes (CV features = 5)", "OLS_Regimes (CV features = 7)"],
    columns=["R2", "Adj. R2"],
)

In [51]:
# Show each summary table under its caption, in the same order as before:
# lamination/powiaty, hybrid/powiaty, lamination/voivodeships, hybrid/voivodeships.
for _caption, _frame in (
    ('Laminacja rzęs - Regimes = Powiaty', df_lam_pow),
    ('Manicure hybrydowy - Regimes = Powiaty', df_hyb_pow),
    ('Laminacja rzęs - Regimes = Województwa', df_lam_woj),
    ('Manicure hybrydowy - Regimes = Województwa', df_hyb_woj),
):
    print(_caption)
    display(_frame)


Laminacja rzęs - Regimes = Powiaty


Unnamed: 0,R2,Adj. R2
OLS,0.576692,0.575407
OLS_Regimes (CV features = 3),0.655387,0.596476
OLS_Regimes (CV features = 4),0.656735,0.597698


Manicure hybrydowy - Regimes = Powiaty


Unnamed: 0,R2,Adj. R2
OLS,0.235254,0.234241
OLS_Regimes (CV features = 5),0.375415,0.333687
OLS_Regimes (CV features = 7),0.377276,0.33536


Laminacja rzęs - Regimes = Województwa


Unnamed: 0,R2,Adj. R2
OLS,0.583116,0.581216
OLS_Regimes (CV features = 5),0.617387,0.611211
OLS_Regimes (CV features = 7),0.61843,0.611674


Manicure hybrydowy - Regimes = Województwa


Unnamed: 0,R2,Adj. R2
OLS,0.235254,0.234241
OLS_Regimes (CV features = 5),0.271627,0.268238
OLS_Regimes (CV features = 7),0.274326,0.270627
