In [7]:
from helper import get_training_data, train_lasso_regression, plot_result_correlation, get_best_lasso_model, train_xgboost,  get_best_pca_lasso_model, train_mean_model
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
from sklearn import metrics
from sklearn.preprocessing import RobustScaler
import matplotlib.pyplot as plt
from helper import prepare_socios

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Reload imported modules before each cell runs (autoreload mode 2), so edits
# to the local `helper` module are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Turn off pandas' chained-assignment (SettingWithCopy) warning.
pd.options.mode.chained_assignment = None
# NOTE(review): this silences *every* warning for the rest of the session,
# including convergence/deprecation warnings from the model code.
import warnings
warnings.filterwarnings('ignore')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Best models

#### Unemployment rate

In [8]:
# Benchmark every model on each city for one socio-economic target and collect,
# per city, the MSE, the R² and the percentage MSE improvement over the naive
# baseline. French and German cities share one loop; only `country` differs.
results = pd.DataFrame(columns = ['improvement_lasso', 'improvement_lasso_boosted', 'improvement_pca_lasso', 'improvement_pca_lasso_boosted', 'improvement_mean_model'])
results_mse = pd.DataFrame(columns = ['mse_naive','mse_lasso', 'mse_lasso_boosted', 'mse_pca_lasso', 'mse_pca_lasso_boosted', 'mse_mean_model'])
results_r2 = pd.DataFrame(columns = ['r2_naive', 'r2_lasso', 'r2_lasso_boosted', 'r2_pca_lasso', 'r2_pca_lasso_boosted', 'r2_mean_model'])
target = 'unemployment_rate'
dens_type = 'count'
radius = 500
socio_year = 2015  # forwarded to every helper call below

# Order used to pick this target's centre/scale out of the fitted scaler.
# NOTE(review): assumes the scaler was fitted on the targets in exactly this order - confirm in helper.
targets = ['unemployment_rate', 'income_levels', 'foreign_nationals']

# (city, country) pairs; the order fixes the row order of the result tables.
city_countries = [
    ('marseille', 'FR'), ('lyon', 'FR'), ('paris', 'FR'),
    ('hamburg', 'DE'), ('berlin', 'DE'), ('bremen', 'DE'),
]

for city, country in city_countries:
    agg = get_training_data(city, country, radius, dens_type, socio_year)

    # Keyword arguments shared by every get_best_*_model call for this city.
    model_kwargs = dict(agg=agg, target=target, city=city, country=country,
                        socio_year=socio_year, density_type=dens_type, radius=radius)

    # NOTE(review): each helper call with a different `output=` appears to redo
    # the model search (the cell log repeats the PCA shapes three times per city);
    # returning all outputs from a single call would avoid the repeated work.

    # Lasso: this frame also supplies the `y_test` and `naive` columns used below.
    predicts = get_best_lasso_model(output='predicts', **model_kwargs)
    predicts_master = predicts.copy().rename(columns={'y_pred': 'y_pred_lasso'})

    # Lasso boosted: XGBoost on the columns reported by the Lasso helper.
    cols = get_best_lasso_model(output='used_columns', **model_kwargs)
    predicts = train_xgboost(agg, target, cols, 'predicts')
    predicts_master.loc[:, 'y_pred_lasso_boosted'] = predicts.y_pred

    # PCA Lasso
    predicts = get_best_pca_lasso_model(output='predicts', **model_kwargs)
    predicts_master.loc[:, 'y_pred_pca_lasso'] = predicts.y_pred

    # PCA Lasso boosted: XGBoost on the PCA components reported by the PCA-Lasso helper.
    comps = get_best_pca_lasso_model(output='components', **model_kwargs)
    cols = get_best_pca_lasso_model(output='used_columns', **model_kwargs)
    reduced_data = pd.DataFrame(comps)
    # Re-attach the last five columns of `agg` (presumably the non-feature/target
    # columns - TODO confirm). `join` aligns on the index, so this assumes `agg`
    # carries the same default integer index as the components frame.
    reduced_data = reduced_data.join(agg.iloc[:, -5:])
    predicts = train_xgboost(reduced_data, target, cols, 'predicts')
    predicts_master.loc[:, 'y_pred_pca_lasso_boosted'] = predicts.y_pred

    # Ensemble: mean of the two boosted models only (averaging all four model
    # predictions is the disabled alternative).
    predicts_master.loc[:, 'y_pred_mean_model'] = predicts_master[['y_pred_lasso_boosted', 'y_pred_pca_lasso_boosted']].mean(axis = 1)

    # Undo the target scaling on every column (truth, naive and predictions) by
    # building a scaler that carries only this target's centre and scale.
    scaler = get_training_data(city, country, radius, dens_type, socio_year, 'scaler')
    target_idx = targets.index(target)
    scaler_new = RobustScaler()
    scaler_new.center_, scaler_new.scale_ = scaler.center_[target_idx], scaler.scale_[target_idx]
    predicts_master = pd.DataFrame(scaler_new.inverse_transform(predicts_master), columns=predicts_master.columns)

    y_true = predicts_master.y_test

    naive_mse = metrics.mean_squared_error(y_true, predicts_master.naive)
    mse_lasso = metrics.mean_squared_error(y_true, predicts_master.y_pred_lasso)
    mse_lasso_boosted = metrics.mean_squared_error(y_true, predicts_master.y_pred_lasso_boosted)
    mse_pca_lasso = metrics.mean_squared_error(y_true, predicts_master.y_pred_pca_lasso)
    mse_pca_lasso_boosted = metrics.mean_squared_error(y_true, predicts_master.y_pred_pca_lasso_boosted)
    mse_mean_model = metrics.mean_squared_error(y_true, predicts_master.y_pred_mean_model)

    naive_r2 = metrics.r2_score(y_true, predicts_master.naive)
    r2_lasso = metrics.r2_score(y_true, predicts_master.y_pred_lasso)
    r2_lasso_boosted = metrics.r2_score(y_true, predicts_master.y_pred_lasso_boosted)
    r2_pca_lasso = metrics.r2_score(y_true, predicts_master.y_pred_pca_lasso)
    r2_pca_lasso_boosted = metrics.r2_score(y_true, predicts_master.y_pred_pca_lasso_boosted)
    r2_mean_model = metrics.r2_score(y_true, predicts_master.y_pred_mean_model)

    # Improvement = percentage reduction in MSE relative to the naive baseline.
    improvement_lasso = 100 - (mse_lasso/naive_mse)*100
    improvement_lasso_boosted = 100 - (mse_lasso_boosted/naive_mse)*100
    improvement_pca_lasso = 100 - (mse_pca_lasso/naive_mse)*100
    improvement_pca_lasso_boosted = 100 - (mse_pca_lasso_boosted/naive_mse)*100
    improvement_mean_model = 100 - (mse_mean_model/naive_mse)*100

    results.loc[city, :] = [improvement_lasso, improvement_lasso_boosted, improvement_pca_lasso, improvement_pca_lasso_boosted, improvement_mean_model]
    results_mse.loc[city, :] = [naive_mse, mse_lasso, mse_lasso_boosted, mse_pca_lasso, mse_pca_lasso_boosted, mse_mean_model]
    results_r2.loc[city, :] = [naive_r2, r2_lasso, r2_lasso_boosted, r2_pca_lasso, r2_pca_lasso_boosted, r2_mean_model]

# Extra summary row: mean improvement across all cities.
results.loc['mean'] = results.mean()
results

shape of training data (344, 241)
0.1
0.1
shape before pca: (344, 234)
number of pca components: 89
shape after pca: (344, 89)
0.1
shape before pca: (344, 234)
number of pca components: 89
shape after pca: (344, 89)
0.1
shape before pca: (344, 234)
number of pca components: 89
shape after pca: (344, 89)
0.1
shape of training data (344, 241)
shape of training data (176, 241)
0.1
0.1
shape before pca: (176, 234)
number of pca components: 78
shape after pca: (176, 78)
0.2
shape before pca: (176, 234)
number of pca components: 78
shape after pca: (176, 78)
0.2
shape before pca: (176, 234)
number of pca components: 78
shape after pca: (176, 78)
0.2
shape of training data (176, 241)
shape of training data (861, 241)
0.01
0.01
shape before pca: (861, 234)
number of pca components: 149
shape after pca: (861, 149)
0.01
shape before pca: (861, 234)
number of pca components: 149
shape after pca: (861, 149)
0.01
shape before pca: (861, 234)
number of pca components: 149
shape after pca: (861, 149)

Unnamed: 0,improvement_lasso,improvement_lasso_boosted,improvement_pca_lasso,improvement_pca_lasso_boosted,improvement_mean_model
marseille,36.237994,46.617391,43.62065,30.827761,44.048386
lyon,31.843344,24.791114,33.00613,7.729169,22.211441
paris,33.411697,35.641824,35.80564,33.906922,36.824347
hamburg,37.845791,44.28745,36.532992,33.509673,50.470966
berlin,20.572959,29.959573,31.670685,37.933511,38.420876
bremen,39.88469,53.157809,46.909647,49.527599,58.580892
mean,33.299412,39.07586,37.924291,32.239106,41.759485


In [9]:
# Show R² of the naive baseline and of each model, one row per city.
results_r2

Unnamed: 0,r2_naive,r2_lasso,r2_lasso_boosted,r2_pca_lasso,r2_pca_lasso_boosted,r2_mean_model
marseille,-0.001364,0.361511,0.465446,0.435438,0.307334,0.439721
lyon,-0.001873,0.317157,0.246502,0.328806,0.075563,0.220657
paris,-0.014116,0.324717,0.347333,0.348994,0.329739,0.359325
hamburg,-0.124421,0.301125,0.373556,0.286364,0.252369,0.443085
berlin,-0.00026,0.205523,0.299414,0.316529,0.379174,0.384049
bremen,-0.043206,0.372873,0.511339,0.446158,0.473469,0.567913


In [10]:
# Show MSE of the naive baseline and of each model, one row per city.
results_mse

Unnamed: 0,mse_naive,mse_lasso,mse_lasso_boosted,mse_pca_lasso,mse_pca_lasso_boosted,mse_mean_model
marseille,0.010888,0.006942,0.005812,0.006138,0.007531,0.006092
lyon,0.002615,0.001782,0.001967,0.001752,0.002413,0.002034
paris,0.001694,0.001128,0.00109,0.001088,0.00112,0.00107
hamburg,0.000365,0.000227,0.000203,0.000232,0.000243,0.000181
berlin,0.000859,0.000682,0.000601,0.000587,0.000533,0.000529
bremen,0.001918,0.001153,0.000899,0.001018,0.000968,0.000795


In [11]:
# RMSE table: element-wise square root of the MSE table, with each
# 'mse_*' column relabelled to the matching 'rmse_*' name.
mse_to_rmse = {
    mse_name: 'r' + mse_name
    for mse_name in ['mse_naive', 'mse_lasso', 'mse_lasso_boosted', 'mse_pca_lasso', 'mse_pca_lasso_boosted', 'mse_mean_model']
}
results_rmse = (results_mse ** 0.5).rename(columns=mse_to_rmse)
results_rmse

Unnamed: 0,rmse_naive,rmse_lasso,rmse_lasso_boosted,rmse_pca_lasso,rmse_pca_lasso_boosted,rmse_mean_model
marseille,0.104344,0.08332,0.076237,0.078348,0.086782,0.07805
lyon,0.051135,0.042216,0.044346,0.041854,0.04912,0.0451
paris,0.041161,0.033588,0.033021,0.032979,0.033463,0.032716
hamburg,0.0191,0.015058,0.014257,0.015217,0.015575,0.013442
berlin,0.029304,0.026116,0.024525,0.024223,0.023086,0.022996
bremen,0.043799,0.033959,0.029977,0.031914,0.031117,0.028188


In [12]:
# Merge the four metric tables side by side, group the columns per model
# (mse, rmse, r2, improvement) and write the combined table to Excel.
# join='inner' keeps only row labels present in all four tables, so the
# 'mean' row that exists only in `results` is not part of the export.
model_order = ['naive', 'lasso', 'lasso_boosted', 'pca_lasso', 'pca_lasso_boosted', 'mean_model']
column_order = [
    f'{metric}_{model}'
    for model in model_order
    for metric in ('mse', 'rmse', 'r2', 'improvement')
    # the naive baseline has no improvement column
    if not (model == 'naive' and metric == 'improvement')
]
combined = pd.concat([results, results_r2, results_mse, results_rmse], axis=1, join='inner')
output = combined[column_order]
output.to_excel(f'output/results/results_{target}_{dens_type}_{radius}.xlsx')
output

Unnamed: 0,mse_naive,rmse_naive,r2_naive,mse_lasso,rmse_lasso,r2_lasso,improvement_lasso,mse_lasso_boosted,rmse_lasso_boosted,r2_lasso_boosted,...,r2_pca_lasso,improvement_pca_lasso,mse_pca_lasso_boosted,rmse_pca_lasso_boosted,r2_pca_lasso_boosted,improvement_pca_lasso_boosted,mse_mean_model,rmse_mean_model,r2_mean_model,improvement_mean_model
marseille,0.010888,0.104344,-0.001364,0.006942,0.08332,0.361511,36.237994,0.005812,0.076237,0.465446,...,0.435438,43.62065,0.007531,0.086782,0.307334,30.827761,0.006092,0.07805,0.439721,44.048386
lyon,0.002615,0.051135,-0.001873,0.001782,0.042216,0.317157,31.843344,0.001967,0.044346,0.246502,...,0.328806,33.00613,0.002413,0.04912,0.075563,7.729169,0.002034,0.0451,0.220657,22.211441
paris,0.001694,0.041161,-0.014116,0.001128,0.033588,0.324717,33.411697,0.00109,0.033021,0.347333,...,0.348994,35.80564,0.00112,0.033463,0.329739,33.906922,0.00107,0.032716,0.359325,36.824347
hamburg,0.000365,0.0191,-0.124421,0.000227,0.015058,0.301125,37.845791,0.000203,0.014257,0.373556,...,0.286364,36.532992,0.000243,0.015575,0.252369,33.509673,0.000181,0.013442,0.443085,50.470966
berlin,0.000859,0.029304,-0.00026,0.000682,0.026116,0.205523,20.572959,0.000601,0.024525,0.299414,...,0.316529,31.670685,0.000533,0.023086,0.379174,37.933511,0.000529,0.022996,0.384049,38.420876
bremen,0.001918,0.043799,-0.043206,0.001153,0.033959,0.372873,39.88469,0.000899,0.029977,0.511339,...,0.446158,46.909647,0.000968,0.031117,0.473469,49.527599,0.000795,0.028188,0.567913,58.580892


#### Foreign nationals

In [13]:
# Benchmark every model on each city for one socio-economic target and collect,
# per city, the MSE, the R² and the percentage MSE improvement over the naive
# baseline. French and German cities share one loop; only `country` differs.
results = pd.DataFrame(columns = ['improvement_lasso', 'improvement_lasso_boosted', 'improvement_pca_lasso', 'improvement_pca_lasso_boosted', 'improvement_mean_model'])
results_mse = pd.DataFrame(columns = ['mse_naive','mse_lasso', 'mse_lasso_boosted', 'mse_pca_lasso', 'mse_pca_lasso_boosted', 'mse_mean_model'])
results_r2 = pd.DataFrame(columns = ['r2_naive', 'r2_lasso', 'r2_lasso_boosted', 'r2_pca_lasso', 'r2_pca_lasso_boosted', 'r2_mean_model'])
target = 'foreign_nationals'
dens_type = 'count'
radius = 500
socio_year = 2015  # forwarded to every helper call below

# Order used to pick this target's centre/scale out of the fitted scaler.
# NOTE(review): assumes the scaler was fitted on the targets in exactly this order - confirm in helper.
targets = ['unemployment_rate', 'income_levels', 'foreign_nationals']

# (city, country) pairs; the order fixes the row order of the result tables.
city_countries = [
    ('marseille', 'FR'), ('lyon', 'FR'), ('paris', 'FR'),
    ('hamburg', 'DE'), ('berlin', 'DE'), ('bremen', 'DE'),
]

for city, country in city_countries:
    agg = get_training_data(city, country, radius, dens_type, socio_year)

    # Keyword arguments shared by every get_best_*_model call for this city.
    model_kwargs = dict(agg=agg, target=target, city=city, country=country,
                        socio_year=socio_year, density_type=dens_type, radius=radius)

    # NOTE(review): each helper call with a different `output=` appears to redo
    # the model search (the cell log repeats the PCA shapes three times per city);
    # returning all outputs from a single call would avoid the repeated work.

    # Lasso: this frame also supplies the `y_test` and `naive` columns used below.
    predicts = get_best_lasso_model(output='predicts', **model_kwargs)
    predicts_master = predicts.copy().rename(columns={'y_pred': 'y_pred_lasso'})

    # Lasso boosted: XGBoost on the columns reported by the Lasso helper.
    cols = get_best_lasso_model(output='used_columns', **model_kwargs)
    predicts = train_xgboost(agg, target, cols, 'predicts')
    predicts_master.loc[:, 'y_pred_lasso_boosted'] = predicts.y_pred

    # PCA Lasso
    predicts = get_best_pca_lasso_model(output='predicts', **model_kwargs)
    predicts_master.loc[:, 'y_pred_pca_lasso'] = predicts.y_pred

    # PCA Lasso boosted: XGBoost on the PCA components reported by the PCA-Lasso helper.
    comps = get_best_pca_lasso_model(output='components', **model_kwargs)
    cols = get_best_pca_lasso_model(output='used_columns', **model_kwargs)
    reduced_data = pd.DataFrame(comps)
    # Re-attach the last five columns of `agg` (presumably the non-feature/target
    # columns - TODO confirm). `join` aligns on the index, so this assumes `agg`
    # carries the same default integer index as the components frame.
    reduced_data = reduced_data.join(agg.iloc[:, -5:])
    predicts = train_xgboost(reduced_data, target, cols, 'predicts')
    predicts_master.loc[:, 'y_pred_pca_lasso_boosted'] = predicts.y_pred

    # Ensemble: mean of the two boosted models only (averaging all four model
    # predictions is the disabled alternative).
    predicts_master.loc[:, 'y_pred_mean_model'] = predicts_master[['y_pred_lasso_boosted', 'y_pred_pca_lasso_boosted']].mean(axis = 1)

    # Undo the target scaling on every column (truth, naive and predictions) by
    # building a scaler that carries only this target's centre and scale.
    scaler = get_training_data(city, country, radius, dens_type, socio_year, 'scaler')
    target_idx = targets.index(target)
    scaler_new = RobustScaler()
    scaler_new.center_, scaler_new.scale_ = scaler.center_[target_idx], scaler.scale_[target_idx]
    predicts_master = pd.DataFrame(scaler_new.inverse_transform(predicts_master), columns=predicts_master.columns)

    y_true = predicts_master.y_test

    naive_mse = metrics.mean_squared_error(y_true, predicts_master.naive)
    mse_lasso = metrics.mean_squared_error(y_true, predicts_master.y_pred_lasso)
    mse_lasso_boosted = metrics.mean_squared_error(y_true, predicts_master.y_pred_lasso_boosted)
    mse_pca_lasso = metrics.mean_squared_error(y_true, predicts_master.y_pred_pca_lasso)
    mse_pca_lasso_boosted = metrics.mean_squared_error(y_true, predicts_master.y_pred_pca_lasso_boosted)
    mse_mean_model = metrics.mean_squared_error(y_true, predicts_master.y_pred_mean_model)

    naive_r2 = metrics.r2_score(y_true, predicts_master.naive)
    r2_lasso = metrics.r2_score(y_true, predicts_master.y_pred_lasso)
    r2_lasso_boosted = metrics.r2_score(y_true, predicts_master.y_pred_lasso_boosted)
    r2_pca_lasso = metrics.r2_score(y_true, predicts_master.y_pred_pca_lasso)
    r2_pca_lasso_boosted = metrics.r2_score(y_true, predicts_master.y_pred_pca_lasso_boosted)
    r2_mean_model = metrics.r2_score(y_true, predicts_master.y_pred_mean_model)

    # Improvement = percentage reduction in MSE relative to the naive baseline.
    improvement_lasso = 100 - (mse_lasso/naive_mse)*100
    improvement_lasso_boosted = 100 - (mse_lasso_boosted/naive_mse)*100
    improvement_pca_lasso = 100 - (mse_pca_lasso/naive_mse)*100
    improvement_pca_lasso_boosted = 100 - (mse_pca_lasso_boosted/naive_mse)*100
    improvement_mean_model = 100 - (mse_mean_model/naive_mse)*100

    results.loc[city, :] = [improvement_lasso, improvement_lasso_boosted, improvement_pca_lasso, improvement_pca_lasso_boosted, improvement_mean_model]
    results_mse.loc[city, :] = [naive_mse, mse_lasso, mse_lasso_boosted, mse_pca_lasso, mse_pca_lasso_boosted, mse_mean_model]
    results_r2.loc[city, :] = [naive_r2, r2_lasso, r2_lasso_boosted, r2_pca_lasso, r2_pca_lasso_boosted, r2_mean_model]

# Extra summary row: mean improvement across all cities.
results.loc['mean'] = results.mean()
results

shape of training data (344, 241)
0.01
0.01
shape before pca: (344, 234)
number of pca components: 89
shape after pca: (344, 89)
0.01
shape before pca: (344, 234)
number of pca components: 89
shape after pca: (344, 89)
0.01
shape before pca: (344, 234)
number of pca components: 89
shape after pca: (344, 89)
0.01
shape of training data (344, 241)
shape of training data (176, 241)
0.1
0.1
shape before pca: (176, 234)
number of pca components: 78
shape after pca: (176, 78)
0.1
shape before pca: (176, 234)
number of pca components: 78
shape after pca: (176, 78)
0.1
shape before pca: (176, 234)
number of pca components: 78
shape after pca: (176, 78)
0.1
shape of training data (176, 241)
shape of training data (861, 241)
0.01
0.01
shape before pca: (861, 234)
number of pca components: 149
shape after pca: (861, 149)
0.01
shape before pca: (861, 234)
number of pca components: 149
shape after pca: (861, 149)
0.01
shape before pca: (861, 234)
number of pca components: 149
shape after pca: (861,

Unnamed: 0,improvement_lasso,improvement_lasso_boosted,improvement_pca_lasso,improvement_pca_lasso_boosted,improvement_mean_model
marseille,76.39919,76.004913,75.980939,56.636859,71.805086
lyon,29.984223,32.686478,33.155555,12.918532,25.830271
paris,31.701701,28.600894,34.219826,27.258995,31.350111
hamburg,33.929667,-0.037406,34.308746,1.667621,17.818021
berlin,-4.849867,48.718819,13.097955,41.454991,59.031382
bremen,16.322966,36.230962,48.996513,58.450928,53.64617
mean,30.581314,37.03411,39.959922,33.064654,43.24684


In [14]:
# Show R² of the naive baseline and of each model, one row per city.
results_r2

Unnamed: 0,r2_naive,r2_lasso,r2_lasso_boosted,r2_pca_lasso,r2_pca_lasso_boosted,r2_mean_model
marseille,-0.026125,0.757826,0.75378,0.753534,0.55504,0.710685
lyon,-0.002119,0.298359,0.325439,0.330139,0.12734,0.256731
paris,-0.021716,0.302185,0.270504,0.327913,0.256794,0.298593
hamburg,-0.26129,0.166661,-0.261762,0.171443,-0.240257,-0.036553
berlin,-0.033907,-0.084051,0.4698,0.101513,0.394699,0.576422
bremen,-0.012699,0.152604,0.354212,0.483488,0.579233,0.530575


In [15]:
# Show MSE of the naive baseline and of each model, one row per city.
results_mse

Unnamed: 0,mse_naive,mse_lasso,mse_lasso_boosted,mse_pca_lasso,mse_pca_lasso_boosted,mse_mean_model
marseille,0.009139,0.002157,0.002193,0.002195,0.003963,0.002577
lyon,0.002496,0.001748,0.00168,0.001669,0.002174,0.001851
paris,0.002873,0.001962,0.002051,0.00189,0.00209,0.001972
hamburg,0.005245,0.003465,0.005247,0.003446,0.005158,0.00431
berlin,0.003961,0.004153,0.002031,0.003443,0.002319,0.001623
bremen,0.006663,0.005576,0.004249,0.003398,0.002768,0.003089


In [16]:
# RMSE table: element-wise square root of the MSE table, with each
# 'mse_*' column relabelled to the matching 'rmse_*' name.
mse_to_rmse = {
    mse_name: 'r' + mse_name
    for mse_name in ['mse_naive', 'mse_lasso', 'mse_lasso_boosted', 'mse_pca_lasso', 'mse_pca_lasso_boosted', 'mse_mean_model']
}
results_rmse = (results_mse ** 0.5).rename(columns=mse_to_rmse)
results_rmse

Unnamed: 0,rmse_naive,rmse_lasso,rmse_lasso_boosted,rmse_pca_lasso,rmse_pca_lasso_boosted,rmse_mean_model
marseille,0.095596,0.046441,0.046827,0.046851,0.06295,0.05076
lyon,0.049962,0.041806,0.040991,0.040848,0.046623,0.043028
paris,0.053601,0.044298,0.045292,0.043473,0.045716,0.044411
hamburg,0.072423,0.058868,0.072436,0.058699,0.071816,0.065654
berlin,0.062939,0.064448,0.045071,0.058673,0.048158,0.040285
bremen,0.081628,0.07467,0.065185,0.058296,0.052617,0.055576


In [17]:
# Merge the four metric tables side by side, group the columns per model
# (mse, rmse, r2, improvement) and write the combined table to Excel.
# join='inner' keeps only row labels present in all four tables, so the
# 'mean' row that exists only in `results` is not part of the export.
model_order = ['naive', 'lasso', 'lasso_boosted', 'pca_lasso', 'pca_lasso_boosted', 'mean_model']
column_order = [
    f'{metric}_{model}'
    for model in model_order
    for metric in ('mse', 'rmse', 'r2', 'improvement')
    # the naive baseline has no improvement column
    if not (model == 'naive' and metric == 'improvement')
]
combined = pd.concat([results, results_r2, results_mse, results_rmse], axis=1, join='inner')
output = combined[column_order]
output.to_excel(f'output/results/results_{target}_{dens_type}_{radius}.xlsx')
output

Unnamed: 0,mse_naive,rmse_naive,r2_naive,mse_lasso,rmse_lasso,r2_lasso,improvement_lasso,mse_lasso_boosted,rmse_lasso_boosted,r2_lasso_boosted,...,r2_pca_lasso,improvement_pca_lasso,mse_pca_lasso_boosted,rmse_pca_lasso_boosted,r2_pca_lasso_boosted,improvement_pca_lasso_boosted,mse_mean_model,rmse_mean_model,r2_mean_model,improvement_mean_model
marseille,0.009139,0.095596,-0.026125,0.002157,0.046441,0.757826,76.39919,0.002193,0.046827,0.75378,...,0.753534,75.980939,0.003963,0.06295,0.55504,56.636859,0.002577,0.05076,0.710685,71.805086
lyon,0.002496,0.049962,-0.002119,0.001748,0.041806,0.298359,29.984223,0.00168,0.040991,0.325439,...,0.330139,33.155555,0.002174,0.046623,0.12734,12.918532,0.001851,0.043028,0.256731,25.830271
paris,0.002873,0.053601,-0.021716,0.001962,0.044298,0.302185,31.701701,0.002051,0.045292,0.270504,...,0.327913,34.219826,0.00209,0.045716,0.256794,27.258995,0.001972,0.044411,0.298593,31.350111
hamburg,0.005245,0.072423,-0.26129,0.003465,0.058868,0.166661,33.929667,0.005247,0.072436,-0.261762,...,0.171443,34.308746,0.005158,0.071816,-0.240257,1.667621,0.00431,0.065654,-0.036553,17.818021
berlin,0.003961,0.062939,-0.033907,0.004153,0.064448,-0.084051,-4.849867,0.002031,0.045071,0.4698,...,0.101513,13.097955,0.002319,0.048158,0.394699,41.454991,0.001623,0.040285,0.576422,59.031382
bremen,0.006663,0.081628,-0.012699,0.005576,0.07467,0.152604,16.322966,0.004249,0.065185,0.354212,...,0.483488,48.996513,0.002768,0.052617,0.579233,58.450928,0.003089,0.055576,0.530575,53.64617


#### Income levels

In [18]:
# Evaluate the Lasso, boosted Lasso, PCA-Lasso, boosted PCA-Lasso and averaged
# ("mean model") predictions for `target` in every city, and collect three tables:
# improvement over the naive baseline (%), MSE and R².
results = pd.DataFrame(columns=['improvement_lasso', 'improvement_lasso_boosted', 'improvement_pca_lasso', 'improvement_pca_lasso_boosted', 'improvement_mean_model'])
results_mse = pd.DataFrame(columns=['mse_naive', 'mse_lasso', 'mse_lasso_boosted', 'mse_pca_lasso', 'mse_pca_lasso_boosted', 'mse_mean_model'])
results_r2 = pd.DataFrame(columns=['r2_naive', 'r2_lasso', 'r2_lasso_boosted', 'r2_pca_lasso', 'r2_pca_lasso_boosted', 'r2_mean_model'])
target = 'income_levels'
dens_type = 'count'
radius = 500

# The position of `target` in this list selects its center/scale entry from the
# scaler returned by get_training_data (loop-invariant, so defined once).
targets = ['unemployment_rate', 'income_levels', 'foreign_nationals']

# French and German cities share one loop body; the order fixes the row order of the result tables.
city_country_pairs = [
    ('marseille', 'FR'), ('lyon', 'FR'), ('paris', 'FR'),
    ('hamburg', 'DE'), ('berlin', 'DE'), ('bremen', 'DE'),
]

for city, country in city_country_pairs:
    agg = get_training_data(city, country, radius, dens_type, 2015)

    # Keyword arguments shared by every best-model lookup for this city.
    model_kwargs = dict(agg=agg, target=target, city=city, country=country,
                        socio_year=2015, density_type=dens_type, radius=radius)

    # Lasso: its prediction frame is the base that the other predictions are added to.
    predicts = get_best_lasso_model(output='predicts', **model_kwargs)
    predicts_master = predicts.copy()
    predicts_master = predicts_master.rename(columns={'y_pred': 'y_pred_lasso'})

    # Lasso boosted: XGBoost trained on the columns reported by the best Lasso model.
    cols = get_best_lasso_model(output='used_columns', **model_kwargs)
    predicts = train_xgboost(agg, target, cols, 'predicts')
    predicts_master.loc[:, 'y_pred_lasso_boosted'] = predicts.y_pred

    # PCA Lasso
    predicts = get_best_pca_lasso_model(output='predicts', **model_kwargs)
    predicts_master.loc[:, 'y_pred_pca_lasso'] = predicts.y_pred

    # PCA Lasso boosted: XGBoost trained on the PCA components plus the last five columns of agg.
    comps = get_best_pca_lasso_model(output='components', **model_kwargs)
    cols = get_best_pca_lasso_model(output='used_columns', **model_kwargs)
    reduced_data = pd.DataFrame(comps)
    # NOTE(review): join aligns on the index; this presumably relies on agg having the
    # same index as pd.DataFrame(comps) - confirm against get_training_data.
    reduced_data = reduced_data.join(agg.iloc[:, -5:])
    predicts = train_xgboost(reduced_data, target, cols, 'predicts')
    predicts_master.loc[:, 'y_pred_pca_lasso_boosted'] = predicts.y_pred

    # Mean model: row-wise average of the two boosted predictions only.
    predicts_master.loc[:, 'y_pred_mean_model'] = predicts_master[['y_pred_lasso_boosted', 'y_pred_pca_lasso_boosted']].mean(axis=1)

    # Undo the scaling of `target`: a fresh RobustScaler receives only that target's
    # center/scale entry and is applied to every column of predicts_master at once.
    scaler = get_training_data(city, country, radius, dens_type, 2015, 'scaler')
    scaler_new = RobustScaler()
    scaler_new.center_, scaler_new.scale_ = scaler.center_[targets.index(target)], scaler.scale_[targets.index(target)]
    predicts_master = pd.DataFrame(scaler_new.inverse_transform(predicts_master), columns=predicts_master.columns)

    # Mean squared error of the naive baseline and of each model against y_test.
    naive_mse = metrics.mean_squared_error(predicts_master.y_test, predicts_master.naive)
    mse_lasso = metrics.mean_squared_error(predicts_master.y_test, predicts_master.y_pred_lasso)
    mse_lasso_boosted = metrics.mean_squared_error(predicts_master.y_test, predicts_master.y_pred_lasso_boosted)
    mse_pca_lasso = metrics.mean_squared_error(predicts_master.y_test, predicts_master.y_pred_pca_lasso)
    mse_pca_lasso_boosted = metrics.mean_squared_error(predicts_master.y_test, predicts_master.y_pred_pca_lasso_boosted)
    mse_mean_model = metrics.mean_squared_error(predicts_master.y_test, predicts_master.y_pred_mean_model)

    # R² of the naive baseline and of each model against y_test.
    naive_r2 = metrics.r2_score(predicts_master.y_test, predicts_master.naive)
    r2_lasso = metrics.r2_score(predicts_master.y_test, predicts_master.y_pred_lasso)
    r2_lasso_boosted = metrics.r2_score(predicts_master.y_test, predicts_master.y_pred_lasso_boosted)
    r2_pca_lasso = metrics.r2_score(predicts_master.y_test, predicts_master.y_pred_pca_lasso)
    r2_pca_lasso_boosted = metrics.r2_score(predicts_master.y_test, predicts_master.y_pred_pca_lasso_boosted)
    r2_mean_model = metrics.r2_score(predicts_master.y_test, predicts_master.y_pred_mean_model)

    # Improvement = percentage reduction of the MSE relative to the naive baseline.
    improvement_lasso = 100 - (mse_lasso / naive_mse) * 100
    improvement_lasso_boosted = 100 - (mse_lasso_boosted / naive_mse) * 100
    improvement_pca_lasso = 100 - (mse_pca_lasso / naive_mse) * 100
    improvement_pca_lasso_boosted = 100 - (mse_pca_lasso_boosted / naive_mse) * 100
    improvement_mean_model = 100 - (mse_mean_model / naive_mse) * 100

    # One row per city in each result table.
    results.loc[city, :] = [improvement_lasso, improvement_lasso_boosted, improvement_pca_lasso, improvement_pca_lasso_boosted, improvement_mean_model]
    results_mse.loc[city, :] = [naive_mse, mse_lasso, mse_lasso_boosted, mse_pca_lasso, mse_pca_lasso_boosted, mse_mean_model]
    results_r2.loc[city, :] = [naive_r2, r2_lasso, r2_lasso_boosted, r2_pca_lasso, r2_pca_lasso_boosted, r2_mean_model]

# Append the column-wise average over all cities as an extra 'mean' row, then display.
results.loc['mean'] = results.mean()
results

shape of training data (344, 241)
0.01
0.01
shape before pca: (344, 234)
number of pca components: 89
shape after pca: (344, 89)
0.01
shape before pca: (344, 234)
number of pca components: 89
shape after pca: (344, 89)
0.01
shape before pca: (344, 234)
number of pca components: 89
shape after pca: (344, 89)
0.01
shape of training data (344, 241)
shape of training data (176, 241)
0.1
0.1
shape before pca: (176, 234)
number of pca components: 78
shape after pca: (176, 78)
0.1
shape before pca: (176, 234)
number of pca components: 78
shape after pca: (176, 78)
0.1
shape before pca: (176, 234)
number of pca components: 78
shape after pca: (176, 78)
0.1
shape of training data (176, 241)
shape of training data (861, 241)
0.01
0.01
shape before pca: (861, 234)
number of pca components: 149
shape after pca: (861, 149)
0.01
shape before pca: (861, 234)
number of pca components: 149
shape after pca: (861, 149)
0.01
shape before pca: (861, 234)
number of pca components: 149
shape after pca: (861,

Unnamed: 0,improvement_lasso,improvement_lasso_boosted,improvement_pca_lasso,improvement_pca_lasso_boosted,improvement_mean_model
marseille,75.808083,75.410048,75.292222,63.616406,74.650453
lyon,59.479554,73.639557,60.051457,54.497191,66.901292
paris,71.828664,77.246015,71.402328,70.827758,76.240227
hamburg,30.486513,20.605822,25.780608,10.479869,24.572252
berlin,32.914038,30.439961,26.350371,41.664389,47.456212
bremen,48.217937,34.158441,50.52992,47.638577,50.071681
mean,53.122465,51.916641,51.567818,48.120698,56.648686


In [19]:
# Display the per-city R² table for the naive baseline and every model variant.
results_r2

Unnamed: 0,r2_naive,r2_lasso,r2_lasso_boosted,r2_pca_lasso,r2_pca_lasso_boosted,r2_mean_model
marseille,-0.000792,0.757889,0.753906,0.752726,0.635876,0.746304
lyon,-0.000858,0.594448,0.736169,0.600172,0.544581,0.668729
paris,-0.005564,0.716719,0.771194,0.712432,0.706654,0.76108
hamburg,-0.042585,0.275263,0.172249,0.2262,0.066677,0.213602
berlin,-0.001062,0.328428,0.303661,0.262721,0.416024,0.474004
bremen,-0.205356,0.375842,0.206375,0.40371,0.368859,0.398186


In [20]:
# Display the per-city MSE table for the naive baseline and every model variant.
results_mse

Unnamed: 0,mse_naive,mse_lasso,mse_lasso_boosted,mse_pca_lasso,mse_pca_lasso_boosted,mse_mean_model
marseille,361248.317935,87392.893778,88830.788791,89256.432466,131435.120036,91574.81241
lyon,244401.878031,99032.73099,64425.41831,97634.990117,111209.719136,80893.863221
paris,840140.3386,236678.757885,191165.402656,240260.580683,245087.773446,199615.435048
hamburg,667237.806043,463820.266147,529747.970736,495219.846156,597312.156661,503282.452925
berlin,731046.943231,490429.8739,508516.535189,538413.361542,426460.699285,384119.757317
bremen,626827.178684,324584.043911,412712.784469,310091.905607,328215.630347,312964.271764


In [21]:
# RMSE table: element-wise square root of the MSE table, with each 'mse_*'
# column relabelled to the matching 'rmse_*' name.
rmse_column_names = {
    f'mse_{variant}': f'rmse_{variant}'
    for variant in ('naive', 'lasso', 'lasso_boosted', 'pca_lasso', 'pca_lasso_boosted', 'mean_model')
}
results_rmse = (results_mse ** 0.5).rename(columns=rmse_column_names)
results_rmse

Unnamed: 0,rmse_naive,rmse_lasso,rmse_lasso_boosted,rmse_pca_lasso,rmse_pca_lasso_boosted,rmse_mean_model
marseille,601.039365,295.622891,298.044944,298.75815,362.539819,302.613305
lyon,494.370183,314.694663,253.821627,312.465982,333.481213,284.418465
paris,916.591697,486.496411,437.224659,490.16383,495.063403,446.783432
hamburg,816.846256,681.043513,727.837874,703.718584,772.859726,709.424029
berlin,855.012832,700.306985,713.103453,733.766558,653.039585,619.773957
bremen,791.724181,569.722778,642.42726,556.858964,572.901065,559.432098


In [22]:
# Combine all metric tables side by side. join='inner' keeps only the index labels
# present in every table, so the extra 'mean' row of `results` is not carried over.
metric_tables = [results, results_r2, results_mse, results_rmse]
output = pd.concat(metric_tables, axis=1, join='inner')

# Column order: the naive baseline first (it has no improvement column), then for
# each model variant its MSE, RMSE, R² and improvement.
ordered_columns = ['mse_naive', 'rmse_naive', 'r2_naive'] + [
    f'{metric}_{variant}'
    for variant in ('lasso', 'lasso_boosted', 'pca_lasso', 'pca_lasso_boosted', 'mean_model')
    for metric in ('mse', 'rmse', 'r2', 'improvement')
]
output = output[ordered_columns]

# Persist the combined table; the file name encodes target, density type and radius.
output.to_excel(f'output/results/results_{target}_{dens_type}_{radius}.xlsx')
output

Unnamed: 0,mse_naive,rmse_naive,r2_naive,mse_lasso,rmse_lasso,r2_lasso,improvement_lasso,mse_lasso_boosted,rmse_lasso_boosted,r2_lasso_boosted,...,r2_pca_lasso,improvement_pca_lasso,mse_pca_lasso_boosted,rmse_pca_lasso_boosted,r2_pca_lasso_boosted,improvement_pca_lasso_boosted,mse_mean_model,rmse_mean_model,r2_mean_model,improvement_mean_model
marseille,361248.317935,601.039365,-0.000792,87392.893778,295.622891,0.757889,75.808083,88830.788791,298.044944,0.753906,...,0.752726,75.292222,131435.120036,362.539819,0.635876,63.616406,91574.81241,302.613305,0.746304,74.650453
lyon,244401.878031,494.370183,-0.000858,99032.73099,314.694663,0.594448,59.479554,64425.41831,253.821627,0.736169,...,0.600172,60.051457,111209.719136,333.481213,0.544581,54.497191,80893.863221,284.418465,0.668729,66.901292
paris,840140.3386,916.591697,-0.005564,236678.757885,486.496411,0.716719,71.828664,191165.402656,437.224659,0.771194,...,0.712432,71.402328,245087.773446,495.063403,0.706654,70.827758,199615.435048,446.783432,0.76108,76.240227
hamburg,667237.806043,816.846256,-0.042585,463820.266147,681.043513,0.275263,30.486513,529747.970736,727.837874,0.172249,...,0.2262,25.780608,597312.156661,772.859726,0.066677,10.479869,503282.452925,709.424029,0.213602,24.572252
berlin,731046.943231,855.012832,-0.001062,490429.8739,700.306985,0.328428,32.914038,508516.535189,713.103453,0.303661,...,0.262721,26.350371,426460.699285,653.039585,0.416024,41.664389,384119.757317,619.773957,0.474004,47.456212
bremen,626827.178684,791.724181,-0.205356,324584.043911,569.722778,0.375842,48.217937,412712.784469,642.42726,0.206375,...,0.40371,50.52992,328215.630347,572.901065,0.368859,47.638577,312964.271764,559.432098,0.398186,50.071681
