## Transfer testing scheme
#### Option 1: Use all cities from both countries and predict one
#### Option 2: Use the other cities from the same country to predict a city in that country
#### Option 3: Train a meta learner getting predictions of city models as input
#### Option 4: Train one city model and predict all other cities

In [1]:
from helper import get_training_data, get_csv_as_gpd, get_best_pca_lasso_model, get_best_lasso_model, train_xgboost
from sklearn.preprocessing import StandardScaler, RobustScaler
import pandas as pd
from sklearn.linear_model import Lasso
from sklearn import linear_model
from sklearn.model_selection import GridSearchCV
from sklearn import linear_model
from sklearn.decomposition import PCA
from sklearn import metrics
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import seaborn as sns
import random
from scipy import stats

%load_ext autoreload
%autoreload 2
import warnings
warnings.filterwarnings('ignore')

## Generate master data set with all cities

In [2]:
# Build the master dataset: the per-city training frames of all French and
# German cities stacked into a single frame.
cities_fr = ['marseille', 'lyon', 'paris']
cities_de = ['berlin', 'hamburg', 'bremen']

# Collect the per-city frames and concatenate once. DataFrame.append was
# deprecated in pandas 1.4 and removed in 2.0, and appending inside a loop
# re-copies the accumulated frame on every iteration.
city_frames = []
for country_code, city_names in (('FR', cities_fr), ('DE', cities_de)):
    # an empty city list simply contributes nothing
    for city in city_names:
        # same positional arguments as before: city, country, 1000, 'count', 2015
        agg = get_training_data(city, country_code, 1000, 'count', 2015)
        city_frames.append(agg)

# fall back to an empty frame when both city lists are empty
agg_full = pd.concat(city_frames) if city_frames else pd.DataFrame()

print(agg_full.shape)
# drop every column that contains at least one missing value
agg_full = agg_full.dropna(axis=1)
agg_full = agg_full.reset_index(drop=True)
print(agg_full.shape)
agg_full

shape of training data (344, 243)
shape of training data (176, 243)
shape of training data (861, 243)
shape of training data (122, 243)
shape of training data (125, 243)
shape of training data (129, 243)
(1757, 243)
(1757, 243)


Unnamed: 0,assigned_city,districts_admin_level_11_id,convenience_area_count_1000,camera_surveillance_area_count_1000,tourist_info_area_count_1000,attraction_area_count_1000,pharmacy_area_count_1000,post_box_area_count_1000,bank_area_count_1000,bakery_area_count_1000,...,jewish_min_dist,muslim_min_dist,christian_min_dist,buddhist_min_dist,dist_to_cc,foreign_nationals,unemployment_rate,income_levels,random_noise,geometry
0,marseille,FR-official_id-130020106-admin_level-11,-0.444444,-0.187919,-0.300000,0.0,-0.571429,-0.810811,-0.444444,-0.622222,...,2.841163,2.829058,0.903775,-0.176350,1.454689,-0.684105,-0.521390,1.029202,1.916586,"POLYGON ((5.54771 43.34452, 5.54753 43.34451, ..."
1,marseille,FR-official_id-130050702-admin_level-11,-0.333333,-0.161074,0.200000,0.0,-0.642857,-0.756757,-0.444444,-0.533333,...,4.198778,4.817203,1.997066,0.033113,2.122303,-0.669459,-0.619313,1.074703,2.538804,"POLYGON ((5.51860 43.28693, 5.51839 43.28692, ..."
2,marseille,FR-official_id-130710105-admin_level-11,-0.333333,-0.080537,-0.200000,0.0,-0.285714,-0.864865,-0.111111,-0.355556,...,3.466202,0.986584,1.253323,-1.716229,1.267491,-0.703629,-0.573965,0.504457,3.436804,"POLYGON ((5.35367 43.37689, 5.35352 43.37659, ..."
3,marseille,FR-official_id-130750102-admin_level-11,-0.444444,-0.107383,-0.400000,0.0,-0.428571,-0.108108,-0.333333,-0.444444,...,1.389558,1.039531,0.382529,0.341271,0.925596,-0.676339,-0.598862,0.612597,2.055997,"POLYGON ((5.47010 43.34248, 5.47001 43.34236, ..."
4,marseille,FR-official_id-130750104-admin_level-11,-0.444444,-0.187919,-0.400000,0.0,-0.571429,-0.324324,-0.444444,-0.444444,...,2.477888,2.057345,2.331364,0.880692,1.379049,-0.691519,-0.454218,0.928744,1.661400,"POLYGON ((5.46373 43.34745, 5.46349 43.34758, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1752,bremen,DE-official_id-04012000021-admin_level-11,-0.333333,0.000000,-0.470588,0.0,-0.500000,-0.571429,-0.500000,-0.166667,...,0.231711,0.428310,1.972046,1.890189,1.928484,-0.230593,0.339796,0.320310,-0.059541,"POLYGON ((8.63110 53.55557, 8.63111 53.55555, ..."
1753,bremen,DE-official_id-04012000022-admin_level-11,0.000000,3.000000,-0.352941,3.0,-0.500000,-0.428571,0.000000,-0.500000,...,-0.741527,0.497816,0.866531,2.180809,2.220471,-0.648528,0.037946,0.463917,2.279603,"POLYGON ((8.59095 53.59348, 8.59097 53.59336, ..."
1754,bremen,DE-official_id-04012000023-admin_level-11,-0.333333,0.000000,-0.470588,0.0,-1.000000,-0.714286,-0.500000,-0.500000,...,0.570909,0.510213,-0.230222,1.836003,1.873291,-0.565739,0.610655,-0.051318,2.660508,"POLYGON ((8.63762 53.50365, 8.63758 53.50363, ..."
1755,bremen,DE-official_id-04012000024-admin_level-11,0.333333,0.000000,0.000000,0.0,1.000000,0.000000,0.500000,0.333333,...,-0.720531,-0.654242,-0.060436,2.079863,2.117092,0.881500,1.951820,-1.320075,1.193018,"POLYGON ((8.58696 53.56894, 8.58672 53.56694, ..."


### Leave-one-city-out

In [3]:
# Empty result containers for the leave-one-city-out evaluation.
target_columns = ['unemployment_rate', 'foreign_nationals', 'income_levels']
residual_columns = ['residuals_naive', 'residuals_m5']

# filled with one row per held-out city and one column per target variable
loo_results_impr = pd.DataFrame(columns=target_columns)
loo_results_r2_score = pd.DataFrame(columns=target_columns)

# squared errors of the naive and M5 predictions, accumulated over all
# held-out cities (one frame per target variable)
full_residuals_unemployment, full_residuals_income, full_residuals_fn = (
    pd.DataFrame(columns=residual_columns) for _ in range(3)
)

#### Results

In [5]:
# Leave-one-city-out evaluation for the unemployment rate.
# Every city is held out once; both models are trained on the remaining
# cities of BOTH countries and scored on the held-out city.
#
# The French and German cities previously ran through two copy-pasted loops,
# and the German loop relied on `target` leaking out of the French one.
# A single loop with an explicit target removes that hidden dependency.
target = 'unemployment_rate'
held_out_cities = [
    ('marseille', 'FR'), ('lyon', 'FR'), ('paris', 'FR'),
    ('berlin', 'DE'), ('hamburg', 'DE'), ('bremen', 'DE'),
]
# candidate regularisation strengths shared by both Lasso grid searches
alphas = [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 2, 3, 5]
parameters = {'alpha': alphas}

for city, country in held_out_cities:
    target_city = city

    # train on every city except the held-out one, test on the held-out city
    train = agg_full[agg_full.assigned_city != target_city]
    test = agg_full[agg_full.assigned_city == target_city]

    # Per-row weight = 1 - (share of the training rows belonging to the row's
    # city), so large cities do not dominate the fit. Mapping the shares onto
    # the rows keeps the weights aligned with `train` whatever its row order.
    city_share = train.assigned_city.map(train.assigned_city.value_counts(normalize=True))
    weights_train = (1 - city_share).tolist()

    # Features are all columns except the first two (city and district id) and
    # the last five (the three targets, random_noise and geometry).
    X_train = train.iloc[:, 2:-5]
    y_train = train.loc[:, [target]]
    X_test = test.iloc[:, 2:-5]
    y_test = test.loc[:, [target]].reset_index(drop=True)

    # --- Model 1: XGBoost on the raw features selected by a weighted Lasso ---
    lasso = linear_model.Lasso(max_iter=50000)
    clf1 = GridSearchCV(lasso, parameters, scoring=['neg_mean_squared_error'], refit='neg_mean_squared_error')
    clf1.fit(X_train, y_train, sample_weight=weights_train)
    # keep only the features with a non-zero Lasso coefficient
    cols_lasso = X_train.loc[:, clf1.best_estimator_.coef_ != 0].columns.tolist()
    boosted_lasso = train_xgboost(train, target, cols_lasso, 'classifier', learner_type='transfer', weights=weights_train)
    boosted_lasso_predicts = boosted_lasso.predict(X_test[cols_lasso])

    # --- Model 2: XGBoost on the PCA components selected by a weighted Lasso ---
    # NOTE(review): the helper is called twice with identical inputs, once for
    # the transformed training data and once for the fitted PCA object; it
    # presumably refits each time -- confirm against helper.py.
    pca_kwargs = dict(agg=train, target=target, city=city, country=country, socio_year=2015,
                      density_type='count', radius=1000, meta_learner=False)
    comps = get_best_pca_lasso_model(output='components', **pca_kwargs)
    pca = get_best_pca_lasso_model(output='pca_classifier', **pca_kwargs)
    reduced_data = pd.DataFrame(comps)
    # re-attach the last five columns (targets, noise, geometry) to the components
    reduced_data = reduced_data.join(train.iloc[:, -5:].reset_index(drop=True))

    pca_lasso = linear_model.Lasso(max_iter=50000)
    clf2 = GridSearchCV(pca_lasso, parameters, scoring=['neg_mean_squared_error'], refit='neg_mean_squared_error')
    clf2.fit(reduced_data.iloc[:, :-5], reduced_data.loc[:, [target]], sample_weight=weights_train)
    # keep only the components with a non-zero Lasso coefficient
    cols_pca_lasso = reduced_data.iloc[:, :-5].loc[:, clf2.best_estimator_.coef_ != 0].columns.tolist()
    boosted_pca_lasso = train_xgboost(reduced_data, target, cols_pca_lasso, 'classifier', learner_type='transfer', weights=weights_train)

    # project the held-out city with the PCA fitted on the training cities
    reduced_test_data = pd.DataFrame(pca.transform(X_test))
    boosted_pca_lasso_predicts = boosted_pca_lasso.predict(reduced_test_data[cols_pca_lasso])

    # --- M5: mean of the two model predictions ---
    predicts = pd.DataFrame({'boosted_lasso': boosted_lasso_predicts, 'boosted_pca_lasso': boosted_pca_lasso_predicts})
    predicts.loc[:, 'mean_model'] = predicts.mean(axis=1)

    # naive baseline: predict the training-set mean of the target everywhere
    naive_pred = [y_train.mean().values[0]] * len(y_test)
    mse_m5 = metrics.mean_squared_error(y_test, predicts.mean_model)
    mse_naive = metrics.mean_squared_error(y_test, naive_pred)
    # percentage reduction in MSE relative to the naive baseline
    loo_results_impr.loc[city, target] = 100 - (mse_m5 / mse_naive * 100)
    loo_results_r2_score.loc[city, target] = metrics.r2_score(y_test, predicts.mean_model)

    # Squared errors per neighbourhood, built directly instead of assigning
    # into an empty frame with .loc (which relies on pandas enlarging it).
    residuals = pd.DataFrame({
        'residuals_naive': (y_test[target] - naive_pred) ** 2,
        'residuals_m5': (y_test[target] - predicts.mean_model) ** 2,
    })
    full_residuals_unemployment = pd.concat([full_residuals_unemployment, residuals])

shape before pca: (1413, 236)
number of pca components: 173
shape after pca: (1413, 173)
0.1
shape before pca: (1413, 236)
number of pca components: 173
shape after pca: (1413, 173)
0.1
shape before pca: (1581, 236)
number of pca components: 169
shape after pca: (1581, 169)
0.2
shape before pca: (1581, 236)
number of pca components: 169
shape after pca: (1581, 169)
0.2
shape before pca: (896, 236)
number of pca components: 146
shape after pca: (896, 146)
0.01
shape before pca: (896, 236)
number of pca components: 146
shape after pca: (896, 146)
0.01
shape before pca: (1635, 236)
number of pca components: 167
shape after pca: (1635, 167)
0.1
shape before pca: (1635, 236)
number of pca components: 167
shape after pca: (1635, 167)
0.1
shape before pca: (1632, 236)
number of pca components: 171
shape after pca: (1632, 171)
0.1
shape before pca: (1632, 236)
number of pca components: 171
shape after pca: (1632, 171)
0.1
shape before pca: (1628, 236)
number of pca components: 169
shape after 

In [6]:
# Leave-one-city-out evaluation for the income levels.
# Every city is held out once; both models are trained on the remaining
# cities of BOTH countries and scored on the held-out city.
#
# The French and German cities previously ran through two copy-pasted loops,
# and the German loop relied on `target` leaking out of the French one.
# A single loop with an explicit target removes that hidden dependency.
target = 'income_levels'
held_out_cities = [
    ('marseille', 'FR'), ('lyon', 'FR'), ('paris', 'FR'),
    ('berlin', 'DE'), ('hamburg', 'DE'), ('bremen', 'DE'),
]
# candidate regularisation strengths shared by both Lasso grid searches
alphas = [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 2, 3, 5]
parameters = {'alpha': alphas}

for city, country in held_out_cities:
    target_city = city

    # train on every city except the held-out one, test on the held-out city
    train = agg_full[agg_full.assigned_city != target_city]
    test = agg_full[agg_full.assigned_city == target_city]

    # Per-row weight = 1 - (share of the training rows belonging to the row's
    # city), so large cities do not dominate the fit. Mapping the shares onto
    # the rows keeps the weights aligned with `train` whatever its row order.
    city_share = train.assigned_city.map(train.assigned_city.value_counts(normalize=True))
    weights_train = (1 - city_share).tolist()

    # Features are all columns except the first two (city and district id) and
    # the last five (the three targets, random_noise and geometry).
    X_train = train.iloc[:, 2:-5]
    y_train = train.loc[:, [target]]
    X_test = test.iloc[:, 2:-5]
    y_test = test.loc[:, [target]].reset_index(drop=True)

    # --- Model 1: XGBoost on the raw features selected by a weighted Lasso ---
    lasso = linear_model.Lasso(max_iter=50000)
    clf1 = GridSearchCV(lasso, parameters, scoring=['neg_mean_squared_error'], refit='neg_mean_squared_error')
    clf1.fit(X_train, y_train, sample_weight=weights_train)
    # keep only the features with a non-zero Lasso coefficient
    cols_lasso = X_train.loc[:, clf1.best_estimator_.coef_ != 0].columns.tolist()
    boosted_lasso = train_xgboost(train, target, cols_lasso, 'classifier', learner_type='transfer', weights=weights_train)
    boosted_lasso_predicts = boosted_lasso.predict(X_test[cols_lasso])

    # --- Model 2: XGBoost on the PCA components selected by a weighted Lasso ---
    # NOTE(review): the helper is called twice with identical inputs, once for
    # the transformed training data and once for the fitted PCA object; it
    # presumably refits each time -- confirm against helper.py.
    pca_kwargs = dict(agg=train, target=target, city=city, country=country, socio_year=2015,
                      density_type='count', radius=1000, meta_learner=False)
    comps = get_best_pca_lasso_model(output='components', **pca_kwargs)
    pca = get_best_pca_lasso_model(output='pca_classifier', **pca_kwargs)
    reduced_data = pd.DataFrame(comps)
    # re-attach the last five columns (targets, noise, geometry) to the components
    reduced_data = reduced_data.join(train.iloc[:, -5:].reset_index(drop=True))

    pca_lasso = linear_model.Lasso(max_iter=50000)
    clf2 = GridSearchCV(pca_lasso, parameters, scoring=['neg_mean_squared_error'], refit='neg_mean_squared_error')
    clf2.fit(reduced_data.iloc[:, :-5], reduced_data.loc[:, [target]], sample_weight=weights_train)
    # keep only the components with a non-zero Lasso coefficient
    cols_pca_lasso = reduced_data.iloc[:, :-5].loc[:, clf2.best_estimator_.coef_ != 0].columns.tolist()
    boosted_pca_lasso = train_xgboost(reduced_data, target, cols_pca_lasso, 'classifier', learner_type='transfer', weights=weights_train)

    # project the held-out city with the PCA fitted on the training cities
    reduced_test_data = pd.DataFrame(pca.transform(X_test))
    boosted_pca_lasso_predicts = boosted_pca_lasso.predict(reduced_test_data[cols_pca_lasso])

    # --- M5: mean of the two model predictions ---
    predicts = pd.DataFrame({'boosted_lasso': boosted_lasso_predicts, 'boosted_pca_lasso': boosted_pca_lasso_predicts})
    predicts.loc[:, 'mean_model'] = predicts.mean(axis=1)

    # naive baseline: predict the training-set mean of the target everywhere
    naive_pred = [y_train.mean().values[0]] * len(y_test)
    mse_m5 = metrics.mean_squared_error(y_test, predicts.mean_model)
    mse_naive = metrics.mean_squared_error(y_test, naive_pred)
    # percentage reduction in MSE relative to the naive baseline
    loo_results_impr.loc[city, target] = 100 - (mse_m5 / mse_naive * 100)
    loo_results_r2_score.loc[city, target] = metrics.r2_score(y_test, predicts.mean_model)

    # Squared errors per neighbourhood, built directly instead of assigning
    # into an empty frame with .loc (which relies on pandas enlarging it).
    residuals = pd.DataFrame({
        'residuals_naive': (y_test[target] - naive_pred) ** 2,
        'residuals_m5': (y_test[target] - predicts.mean_model) ** 2,
    })
    full_residuals_income = pd.concat([full_residuals_income, residuals])

shape before pca: (1413, 236)
number of pca components: 173
shape after pca: (1413, 173)
0.01
shape before pca: (1413, 236)
number of pca components: 173
shape after pca: (1413, 173)
0.01
shape before pca: (1581, 236)
number of pca components: 169
shape after pca: (1581, 169)
0.1
shape before pca: (1581, 236)
number of pca components: 169
shape after pca: (1581, 169)
0.1
shape before pca: (896, 236)
number of pca components: 146
shape after pca: (896, 146)
0.01
shape before pca: (896, 236)
number of pca components: 146
shape after pca: (896, 146)
0.01
shape before pca: (1635, 236)
number of pca components: 167
shape after pca: (1635, 167)
0.1
shape before pca: (1635, 236)
number of pca components: 167
shape after pca: (1635, 167)
0.1
shape before pca: (1632, 236)
number of pca components: 171
shape after pca: (1632, 171)
0.1
shape before pca: (1632, 236)
number of pca components: 171
shape after pca: (1632, 171)
0.1
shape before pca: (1628, 236)
number of pca components: 169
shape afte

In [7]:
# Leave-one-city-out evaluation for the share of foreign nationals.
# Every city is held out once; both models are trained on the remaining
# cities of BOTH countries and scored on the held-out city.
#
# The French and German cities previously ran through two copy-pasted loops,
# and the German loop relied on `target` leaking out of the French one.
# A single loop with an explicit target removes that hidden dependency.
target = 'foreign_nationals'
held_out_cities = [
    ('marseille', 'FR'), ('lyon', 'FR'), ('paris', 'FR'),
    ('berlin', 'DE'), ('hamburg', 'DE'), ('bremen', 'DE'),
]
# Candidate regularisation strengths. For this target the grid of the
# PCA-Lasso search extends up to 10, while the raw-feature Lasso stops at 5.
lasso_alphas = [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 2, 3, 5]
pca_lasso_alphas = lasso_alphas + [6, 7, 8, 9, 10]

for city, country in held_out_cities:
    target_city = city

    # train on every city except the held-out one, test on the held-out city
    train = agg_full[agg_full.assigned_city != target_city]
    test = agg_full[agg_full.assigned_city == target_city]

    # Per-row weight = 1 - (share of the training rows belonging to the row's
    # city), so large cities do not dominate the fit. Mapping the shares onto
    # the rows keeps the weights aligned with `train` whatever its row order.
    city_share = train.assigned_city.map(train.assigned_city.value_counts(normalize=True))
    weights_train = (1 - city_share).tolist()

    # Features are all columns except the first two (city and district id) and
    # the last five (the three targets, random_noise and geometry).
    X_train = train.iloc[:, 2:-5]
    y_train = train.loc[:, [target]]
    X_test = test.iloc[:, 2:-5]
    y_test = test.loc[:, [target]].reset_index(drop=True)

    # --- Model 1: XGBoost on the raw features selected by a weighted Lasso ---
    alphas = lasso_alphas
    lasso = linear_model.Lasso(max_iter=50000)
    parameters = {'alpha': alphas}
    clf1 = GridSearchCV(lasso, parameters, scoring=['neg_mean_squared_error'], refit='neg_mean_squared_error')
    clf1.fit(X_train, y_train, sample_weight=weights_train)
    # keep only the features with a non-zero Lasso coefficient
    cols_lasso = X_train.loc[:, clf1.best_estimator_.coef_ != 0].columns.tolist()
    boosted_lasso = train_xgboost(train, target, cols_lasso, 'classifier', learner_type='transfer', weights=weights_train)
    boosted_lasso_predicts = boosted_lasso.predict(X_test[cols_lasso])

    # --- Model 2: XGBoost on the PCA components selected by a weighted Lasso ---
    # NOTE(review): the helper is called twice with identical inputs, once for
    # the transformed training data and once for the fitted PCA object; it
    # presumably refits each time -- confirm against helper.py.
    pca_kwargs = dict(agg=train, target=target, city=city, country=country, socio_year=2015,
                      density_type='count', radius=1000, meta_learner=False)
    comps = get_best_pca_lasso_model(output='components', **pca_kwargs)
    pca = get_best_pca_lasso_model(output='pca_classifier', **pca_kwargs)
    reduced_data = pd.DataFrame(comps)
    # re-attach the last five columns (targets, noise, geometry) to the components
    reduced_data = reduced_data.join(train.iloc[:, -5:].reset_index(drop=True))

    alphas = pca_lasso_alphas
    pca_lasso = linear_model.Lasso(max_iter=50000)
    parameters = {'alpha': alphas}
    clf2 = GridSearchCV(pca_lasso, parameters, scoring=['neg_mean_squared_error'], refit='neg_mean_squared_error')
    clf2.fit(reduced_data.iloc[:, :-5], reduced_data.loc[:, [target]], sample_weight=weights_train)
    # keep only the components with a non-zero Lasso coefficient
    cols_pca_lasso = reduced_data.iloc[:, :-5].loc[:, clf2.best_estimator_.coef_ != 0].columns.tolist()
    boosted_pca_lasso = train_xgboost(reduced_data, target, cols_pca_lasso, 'classifier', learner_type='transfer', weights=weights_train)

    # project the held-out city with the PCA fitted on the training cities
    reduced_test_data = pd.DataFrame(pca.transform(X_test))
    boosted_pca_lasso_predicts = boosted_pca_lasso.predict(reduced_test_data[cols_pca_lasso])

    # --- M5: mean of the two model predictions ---
    predicts = pd.DataFrame({'boosted_lasso': boosted_lasso_predicts, 'boosted_pca_lasso': boosted_pca_lasso_predicts})
    predicts.loc[:, 'mean_model'] = predicts.mean(axis=1)

    # naive baseline: predict the training-set mean of the target everywhere
    naive_pred = [y_train.mean().values[0]] * len(y_test)
    mse_m5 = metrics.mean_squared_error(y_test, predicts.mean_model)
    mse_naive = metrics.mean_squared_error(y_test, naive_pred)
    # percentage reduction in MSE relative to the naive baseline
    loo_results_impr.loc[city, target] = 100 - (mse_m5 / mse_naive * 100)
    loo_results_r2_score.loc[city, target] = metrics.r2_score(y_test, predicts.mean_model)

    # Squared errors per neighbourhood, built directly instead of assigning
    # into an empty frame with .loc (which relies on pandas enlarging it).
    residuals = pd.DataFrame({
        'residuals_naive': (y_test[target] - naive_pred) ** 2,
        'residuals_m5': (y_test[target] - predicts.mean_model) ** 2,
    })
    full_residuals_fn = pd.concat([full_residuals_fn, residuals])

shape before pca: (1413, 236)
number of pca components: 173
shape after pca: (1413, 173)
0.01
shape before pca: (1413, 236)
number of pca components: 173
shape after pca: (1413, 173)
0.01
shape before pca: (1581, 236)
number of pca components: 169
shape after pca: (1581, 169)
0.2
shape before pca: (1581, 236)
number of pca components: 169
shape after pca: (1581, 169)
0.2
shape before pca: (896, 236)
number of pca components: 146
shape after pca: (896, 146)
0.01
shape before pca: (896, 236)
number of pca components: 146
shape after pca: (896, 146)
0.01
shape before pca: (1635, 236)
number of pca components: 167
shape after pca: (1635, 167)
0.1
shape before pca: (1635, 236)
number of pca components: 167
shape after pca: (1635, 167)
0.1
shape before pca: (1632, 236)
number of pca components: 171
shape after pca: (1632, 171)
0.5
shape before pca: (1632, 236)
number of pca components: 171
shape after pca: (1632, 171)
0.5
shape before pca: (1628, 236)
number of pca components: 169
shape afte

In [8]:
# Add a 'mean' row with the column-wise average of the percentage MSE
# improvement over the naive baseline, then display the table.
impr_column_means = loo_results_impr.mean()
loo_results_impr.loc['mean'] = impr_column_means
loo_results_impr

Unnamed: 0,unemployment_rate,foreign_nationals,income_levels
marseille,25.676754,31.633758,19.183957
lyon,0.048349,-1.309613,-6.150607
paris,13.251059,13.005593,9.975073
berlin,40.128826,33.333416,20.37572
hamburg,39.076534,19.465109,20.52393
bremen,25.784639,5.899169,15.773473
mean,23.99436,17.004572,13.280258


In [9]:
# Add a 'mean' row with the column-wise average R^2 score, then display the table.
r2_column_means = loo_results_r2_score.mean()
loo_results_r2_score.loc['mean'] = r2_column_means
loo_results_r2_score

Unnamed: 0,unemployment_rate,foreign_nationals,income_levels
marseille,0.245659,0.302784,0.191492
lyon,-0.004347,-0.013445,-0.063776
paris,0.127158,0.119054,0.099139
berlin,0.336146,0.320267,0.201038
hamburg,0.325853,0.19186,0.20353
bremen,0.228731,0.009735,0.131206
mean,0.209867,0.155042,0.127105


In [10]:
# Two-sample t-test on the squared errors of the naive baseline vs. the M5 model (unemployment)
# NOTE(review): both columns look like errors on the same test neighborhoods, so a paired
# test (stats.ttest_rel) may be more appropriate than an independent one - confirm
naive_sq_err = full_residuals_unemployment['residuals_naive']
m5_sq_err = full_residuals_unemployment['residuals_m5']
t_stat, p = stats.ttest_ind(naive_sq_err, m5_sq_err)
print(f'TTest: t={t_stat}, p={p}')

TTest: t=2.5451136095572804, p=0.01096663558897814


In [11]:
# Two-sample t-test on the squared errors of the naive baseline vs. the M5 model (income)
# NOTE(review): both columns look like errors on the same test neighborhoods, so a paired
# test (stats.ttest_rel) may be more appropriate than an independent one - confirm
naive_sq_err = full_residuals_income['residuals_naive']
m5_sq_err = full_residuals_income['residuals_m5']
t_stat, p = stats.ttest_ind(naive_sq_err, m5_sq_err)
print(f'TTest: t={t_stat}, p={p}')

TTest: t=2.1952284341199224, p=0.028212430401222476


In [12]:
# Two-sample t-test on the squared errors of the naive baseline vs. the M5 model (foreign nationals)
# NOTE(review): both columns look like errors on the same test neighborhoods, so a paired
# test (stats.ttest_rel) may be more appropriate than an independent one - confirm
naive_sq_err = full_residuals_fn['residuals_naive']
m5_sq_err = full_residuals_fn['residuals_m5']
t_stat, p = stats.ttest_ind(naive_sq_err, m5_sq_err)
print(f'TTest: t={t_stat}, p={p}')

TTest: t=2.576098193401887, p=0.010032728721579526


### Leave-one-city-in-country-out

In [13]:
# Empty containers for the leave-one-city-in-country-out run:
# two result tables with one column per target, and one squared-error collector per target.
target_columns = ['unemployment_rate', 'foreign_nationals', 'income_levels']
residual_columns = ['residuals_naive', 'residuals_m5']

locico_results_impr = pd.DataFrame(columns=target_columns)
locico_results_r2_score = pd.DataFrame(columns=target_columns)

# three independent frames (not three references to one object)
full_residuals_unemployment, full_residuals_income, full_residuals_fn = (
    pd.DataFrame(columns=residual_columns) for _ in range(3)
)

### France

In [14]:
# restrict the master data set to the French cities; print the shape before and after
french_cities = ['marseille', 'lyon', 'paris']
is_french = agg_full.assigned_city.isin(french_cities)
print(agg_full.shape)
agg_full_fr = agg_full[is_french]
print(agg_full_fr.shape)

(1757, 243)
(1381, 243)


In [15]:
for city in ['marseille', 'lyon', 'paris']:
    # Leave-one-city-in-country-out (France, unemployment rate): hold out `city`,
    # train on the other French cities and compare the M5 mean model to a naive baseline.
    country = 'FR'
    target = 'unemployment_rate'
    target_city = city

    # train on all French cities but the target city; test on the target city
    train = agg_full_fr[agg_full_fr.assigned_city != target_city]
    test = agg_full_fr[agg_full_fr.assigned_city == target_city]

    # weight each neighborhood with 1 - (share of its city in the training set).
    # The weight is mapped per row so it stays aligned with `train` even when the rows
    # of one city are not stored contiguously (concatenating per-city lists assumed that).
    city_share = train.assigned_city.value_counts() / len(train)
    weights_train = train.assigned_city.map(1 - city_share).tolist()

    # split data in X and y
    # NOTE(review): assumes the first two columns are not features and the last five
    # columns hold the targets - confirm against get_training_data
    X_train = train.iloc[:, 2:-5]
    y_train = train.loc[:, [target]]
    X_test = test.iloc[:, 2:-5]
    y_test = test.loc[:, [target]].reset_index(drop=True)

    # alpha grid shared by both Lasso grid searches
    alphas = [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 2, 3, 5]
    parameters = {'alpha': alphas}

    # Boosted Lasso: Lasso selects the features, XGBoost is trained on the selection
    lasso = linear_model.Lasso(max_iter=50000)
    clf1 = GridSearchCV(lasso, parameters, scoring=['neg_mean_squared_error'],
                        refit='neg_mean_squared_error')
    clf1.fit(X_train, y_train, sample_weight=weights_train)
    cols_lasso = X_train.loc[:, clf1.best_estimator_.coef_ != 0].columns.tolist()
    if cols_lasso:
        boosted_lasso = train_xgboost(train, target, cols_lasso, 'classifier',
                                      learner_type='transfer', weights=weights_train)
        boosted_lasso_predicts = boosted_lasso.predict(X_test[cols_lasso])
    else:
        # Lasso kept no feature: use the Lasso prediction itself
        boosted_lasso_predicts = clf1.predict(X_test)

    # PCA Lasso boosted: the helper is called once per requested output
    # (components of the training data first, then the PCA object used on the test set)
    pca_args = dict(agg=train, target=target, city=city, country=country, socio_year=2015,
                    density_type='count', radius=1000, meta_learner=False)
    comps = get_best_pca_lasso_model(output='components', **pca_args)
    pca = get_best_pca_lasso_model(output='pca_classifier', **pca_args)
    reduced_data = pd.DataFrame(comps).join(train.iloc[:, -5:].reset_index(drop=True))

    # train lasso regression on the pca components
    pca_lasso = linear_model.Lasso(max_iter=50000)
    clf2 = GridSearchCV(pca_lasso, parameters, scoring=['neg_mean_squared_error'],
                        refit='neg_mean_squared_error')
    clf2.fit(reduced_data.iloc[:, :-5], reduced_data.loc[:, [target]], sample_weight=weights_train)

    # keep the components with a non-zero coefficient and train xgboost on them
    cols_pca_lasso = reduced_data.iloc[:, :-5].loc[:, clf2.best_estimator_.coef_ != 0].columns.tolist()
    boosted_pca_lasso = train_xgboost(reduced_data, target, cols_pca_lasso, 'classifier',
                                      learner_type='transfer', weights=weights_train)

    # apply pca on the test set and make predictions
    reduced_test_data = pd.DataFrame(pca.transform(X_test))
    boosted_pca_lasso_predicts = boosted_pca_lasso.predict(reduced_test_data[cols_pca_lasso])

    # M5 prediction: row-wise mean of the two boosted models
    predicts = pd.DataFrame({'boosted_lasso': boosted_lasso_predicts,
                             'boosted_pca_lasso': boosted_pca_lasso_predicts})
    predicts['mean_model'] = predicts.mean(axis=1)

    # naive baseline: training mean of the target for every test neighborhood
    naive_pred = [y_train.mean().values[0]] * len(y_test)
    mse_m5 = metrics.mean_squared_error(y_test, predicts.mean_model)
    mse_naive = metrics.mean_squared_error(y_test, naive_pred)
    # improvement in percent of the naive MSE
    locico_results_impr.loc[city, target] = 100 - (mse_m5 / mse_naive * 100)
    locico_results_r2_score.loc[city, target] = metrics.r2_score(y_test, predicts.mean_model)

    # squared errors per test neighborhood, built directly as a frame instead of
    # assigning columns into an empty DataFrame
    residuals = pd.DataFrame({
        'residuals_naive': (y_test[target] - naive_pred) ** 2,
        'residuals_m5': (y_test[target] - predicts.mean_model) ** 2,
    })

    full_residuals_unemployment = pd.concat([full_residuals_unemployment, residuals])

shape before pca: (1037, 236)
number of pca components: 146
shape after pca: (1037, 146)
0.1
shape before pca: (1037, 236)
number of pca components: 146
shape after pca: (1037, 146)
0.1
shape before pca: (1205, 236)
number of pca components: 144
shape after pca: (1205, 144)
0.2
shape before pca: (1205, 236)
number of pca components: 144
shape after pca: (1205, 144)
0.2
shape before pca: (520, 236)
number of pca components: 98
shape after pca: (520, 98)
0.01
shape before pca: (520, 236)
number of pca components: 98
shape after pca: (520, 98)
0.01


In [17]:
for city in ['marseille', 'lyon', 'paris']:
    # Leave-one-city-in-country-out (France, income levels): hold out `city`,
    # train on the other French cities and compare the M5 mean model to a naive baseline.
    country = 'FR'
    target = 'income_levels'
    target_city = city

    # train on all French cities but the target city; test on the target city
    train = agg_full_fr[agg_full_fr.assigned_city != target_city]
    test = agg_full_fr[agg_full_fr.assigned_city == target_city]

    # weight each neighborhood with 1 - (share of its city in the training set).
    # The weight is mapped per row so it stays aligned with `train` even when the rows
    # of one city are not stored contiguously (concatenating per-city lists assumed that).
    city_share = train.assigned_city.value_counts() / len(train)
    weights_train = train.assigned_city.map(1 - city_share).tolist()

    # split data in X and y
    # NOTE(review): assumes the first two columns are not features and the last five
    # columns hold the targets - confirm against get_training_data
    X_train = train.iloc[:, 2:-5]
    y_train = train.loc[:, [target]]
    X_test = test.iloc[:, 2:-5]
    y_test = test.loc[:, [target]].reset_index(drop=True)

    # alpha grid shared by both Lasso grid searches
    alphas = [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 2, 3, 5]
    parameters = {'alpha': alphas}

    # Boosted Lasso: Lasso selects the features, XGBoost is trained on the selection
    lasso = linear_model.Lasso(max_iter=50000)
    clf1 = GridSearchCV(lasso, parameters, scoring=['neg_mean_squared_error'],
                        refit='neg_mean_squared_error')
    clf1.fit(X_train, y_train, sample_weight=weights_train)
    cols_lasso = X_train.loc[:, clf1.best_estimator_.coef_ != 0].columns.tolist()
    if cols_lasso:
        boosted_lasso = train_xgboost(train, target, cols_lasso, 'classifier',
                                      learner_type='transfer', weights=weights_train)
        boosted_lasso_predicts = boosted_lasso.predict(X_test[cols_lasso])
    else:
        # Lasso kept no feature: use the Lasso prediction itself
        boosted_lasso_predicts = clf1.predict(X_test)

    # PCA Lasso boosted: the helper is called once per requested output
    # (components of the training data first, then the PCA object used on the test set)
    pca_args = dict(agg=train, target=target, city=city, country=country, socio_year=2015,
                    density_type='count', radius=1000, meta_learner=False)
    comps = get_best_pca_lasso_model(output='components', **pca_args)
    pca = get_best_pca_lasso_model(output='pca_classifier', **pca_args)
    reduced_data = pd.DataFrame(comps).join(train.iloc[:, -5:].reset_index(drop=True))

    # train lasso regression on the pca components
    pca_lasso = linear_model.Lasso(max_iter=50000)
    clf2 = GridSearchCV(pca_lasso, parameters, scoring=['neg_mean_squared_error'],
                        refit='neg_mean_squared_error')
    clf2.fit(reduced_data.iloc[:, :-5], reduced_data.loc[:, [target]], sample_weight=weights_train)

    # keep the components with a non-zero coefficient and train xgboost on them
    cols_pca_lasso = reduced_data.iloc[:, :-5].loc[:, clf2.best_estimator_.coef_ != 0].columns.tolist()
    boosted_pca_lasso = train_xgboost(reduced_data, target, cols_pca_lasso, 'classifier',
                                      learner_type='transfer', weights=weights_train)

    # apply pca on the test set and make predictions
    reduced_test_data = pd.DataFrame(pca.transform(X_test))
    boosted_pca_lasso_predicts = boosted_pca_lasso.predict(reduced_test_data[cols_pca_lasso])

    # M5 prediction: row-wise mean of the two boosted models
    predicts = pd.DataFrame({'boosted_lasso': boosted_lasso_predicts,
                             'boosted_pca_lasso': boosted_pca_lasso_predicts})
    predicts['mean_model'] = predicts.mean(axis=1)

    # naive baseline: training mean of the target for every test neighborhood
    naive_pred = [y_train.mean().values[0]] * len(y_test)
    mse_m5 = metrics.mean_squared_error(y_test, predicts.mean_model)
    mse_naive = metrics.mean_squared_error(y_test, naive_pred)
    # improvement in percent of the naive MSE
    locico_results_impr.loc[city, target] = 100 - (mse_m5 / mse_naive * 100)
    locico_results_r2_score.loc[city, target] = metrics.r2_score(y_test, predicts.mean_model)

    # squared errors per test neighborhood, built directly as a frame instead of
    # assigning columns into an empty DataFrame
    residuals = pd.DataFrame({
        'residuals_naive': (y_test[target] - naive_pred) ** 2,
        'residuals_m5': (y_test[target] - predicts.mean_model) ** 2,
    })

    full_residuals_income = pd.concat([full_residuals_income, residuals])

shape before pca: (1037, 236)
number of pca components: 146
shape after pca: (1037, 146)
0.01
shape before pca: (1037, 236)
number of pca components: 146
shape after pca: (1037, 146)
0.01
shape before pca: (1205, 236)
number of pca components: 144
shape after pca: (1205, 144)
0.1
shape before pca: (1205, 236)
number of pca components: 144
shape after pca: (1205, 144)
0.1
shape before pca: (520, 236)
number of pca components: 98
shape after pca: (520, 98)
0.01
shape before pca: (520, 236)
number of pca components: 98
shape after pca: (520, 98)
0.01


In [18]:
for city in ['marseille', 'lyon', 'paris']:
    # Leave-one-city-in-country-out (France, foreign nationals): hold out `city`,
    # train on the other French cities and compare the M5 mean model to a naive baseline.
    country = 'FR'
    target = 'foreign_nationals'
    target_city = city

    # train on all French cities but the target city; test on the target city
    train = agg_full_fr[agg_full_fr.assigned_city != target_city]
    test = agg_full_fr[agg_full_fr.assigned_city == target_city]

    # weight each neighborhood with 1 - (share of its city in the training set).
    # The weight is mapped per row so it stays aligned with `train` even when the rows
    # of one city are not stored contiguously (concatenating per-city lists assumed that).
    city_share = train.assigned_city.value_counts() / len(train)
    weights_train = train.assigned_city.map(1 - city_share).tolist()

    # split data in X and y
    # NOTE(review): assumes the first two columns are not features and the last five
    # columns hold the targets - confirm against get_training_data
    X_train = train.iloc[:, 2:-5]
    y_train = train.loc[:, [target]]
    X_test = test.iloc[:, 2:-5]
    y_test = test.loc[:, [target]].reset_index(drop=True)

    # alpha grid shared by both Lasso grid searches
    alphas = [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 2, 3, 5]
    parameters = {'alpha': alphas}

    # Boosted Lasso: Lasso selects the features, XGBoost is trained on the selection
    lasso = linear_model.Lasso(max_iter=50000)
    clf1 = GridSearchCV(lasso, parameters, scoring=['neg_mean_squared_error'],
                        refit='neg_mean_squared_error')
    clf1.fit(X_train, y_train, sample_weight=weights_train)
    cols_lasso = X_train.loc[:, clf1.best_estimator_.coef_ != 0].columns.tolist()
    if cols_lasso:
        boosted_lasso = train_xgboost(train, target, cols_lasso, 'classifier',
                                      learner_type='transfer', weights=weights_train)
        boosted_lasso_predicts = boosted_lasso.predict(X_test[cols_lasso])
    else:
        # Lasso kept no feature: use the Lasso prediction itself, as the sibling
        # cells for unemployment rate and income levels already do
        boosted_lasso_predicts = clf1.predict(X_test)

    # PCA Lasso boosted: the helper is called once per requested output
    # (components of the training data first, then the PCA object used on the test set)
    pca_args = dict(agg=train, target=target, city=city, country=country, socio_year=2015,
                    density_type='count', radius=1000, meta_learner=False)
    comps = get_best_pca_lasso_model(output='components', **pca_args)
    pca = get_best_pca_lasso_model(output='pca_classifier', **pca_args)
    reduced_data = pd.DataFrame(comps).join(train.iloc[:, -5:].reset_index(drop=True))

    # train lasso regression on the pca components
    pca_lasso = linear_model.Lasso(max_iter=50000)
    clf2 = GridSearchCV(pca_lasso, parameters, scoring=['neg_mean_squared_error'],
                        refit='neg_mean_squared_error')
    clf2.fit(reduced_data.iloc[:, :-5], reduced_data.loc[:, [target]], sample_weight=weights_train)

    # keep the components with a non-zero coefficient and train xgboost on them
    cols_pca_lasso = reduced_data.iloc[:, :-5].loc[:, clf2.best_estimator_.coef_ != 0].columns.tolist()
    boosted_pca_lasso = train_xgboost(reduced_data, target, cols_pca_lasso, 'classifier',
                                      learner_type='transfer', weights=weights_train)

    # apply pca on the test set and make predictions
    reduced_test_data = pd.DataFrame(pca.transform(X_test))
    boosted_pca_lasso_predicts = boosted_pca_lasso.predict(reduced_test_data[cols_pca_lasso])

    # M5 prediction: row-wise mean of the two boosted models
    predicts = pd.DataFrame({'boosted_lasso': boosted_lasso_predicts,
                             'boosted_pca_lasso': boosted_pca_lasso_predicts})
    predicts['mean_model'] = predicts.mean(axis=1)

    # naive baseline: training mean of the target for every test neighborhood
    naive_pred = [y_train.mean().values[0]] * len(y_test)
    mse_m5 = metrics.mean_squared_error(y_test, predicts.mean_model)
    mse_naive = metrics.mean_squared_error(y_test, naive_pred)
    # improvement in percent of the naive MSE
    locico_results_impr.loc[city, target] = 100 - (mse_m5 / mse_naive * 100)
    locico_results_r2_score.loc[city, target] = metrics.r2_score(y_test, predicts.mean_model)

    # squared errors per test neighborhood, built directly as a frame instead of
    # assigning columns into an empty DataFrame
    residuals = pd.DataFrame({
        'residuals_naive': (y_test[target] - naive_pred) ** 2,
        'residuals_m5': (y_test[target] - predicts.mean_model) ** 2,
    })

    full_residuals_fn = pd.concat([full_residuals_fn, residuals])

shape before pca: (1037, 236)
number of pca components: 146
shape after pca: (1037, 146)
0.01
shape before pca: (1037, 236)
number of pca components: 146
shape after pca: (1037, 146)
0.01
shape before pca: (1205, 236)
number of pca components: 144
shape after pca: (1205, 144)
0.2
shape before pca: (1205, 236)
number of pca components: 144
shape after pca: (1205, 144)
0.2
shape before pca: (520, 236)
number of pca components: 98
shape after pca: (520, 98)
0.01
shape before pca: (520, 236)
number of pca components: 98
shape after pca: (520, 98)
0.01


### Germany

In [19]:
# restrict the master data set to the German cities; print the shape before and after
german_cities = ['berlin', 'hamburg', 'bremen']
is_german = agg_full.assigned_city.isin(german_cities)
print(agg_full.shape)
agg_full_de = agg_full[is_german]
print(agg_full_de.shape)

(1757, 243)
(376, 243)


In [20]:
target = 'unemployment_rate'
for city in ['berlin', 'hamburg', 'bremen']:
    # Leave-one-city-in-country-out (Germany, unemployment rate): hold out `city`,
    # train on the other German cities and compare the M5 mean model to a naive baseline.
    country = 'DE'
    target_city = city

    # train on all German cities but the target city; test on the target city
    train = agg_full_de[agg_full_de.assigned_city != target_city]
    test = agg_full_de[agg_full_de.assigned_city == target_city]

    # weight each neighborhood with 1 - (share of its city in the training set).
    # The weight is mapped per row so it stays aligned with `train` even when the rows
    # of one city are not stored contiguously (concatenating per-city lists assumed that).
    city_share = train.assigned_city.value_counts() / len(train)
    weights_train = train.assigned_city.map(1 - city_share).tolist()

    # split data in X and y
    # NOTE(review): assumes the first two columns are not features and the last five
    # columns hold the targets - confirm against get_training_data
    X_train = train.iloc[:, 2:-5]
    y_train = train.loc[:, [target]]
    X_test = test.iloc[:, 2:-5]
    y_test = test.loc[:, [target]].reset_index(drop=True)

    # alpha grid shared by both Lasso grid searches
    alphas = [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 2, 3, 5]
    parameters = {'alpha': alphas}

    # Boosted Lasso: Lasso selects the features, XGBoost is trained on the selection
    lasso = linear_model.Lasso(max_iter=50000)
    clf1 = GridSearchCV(lasso, parameters, scoring=['neg_mean_squared_error'],
                        refit='neg_mean_squared_error')
    clf1.fit(X_train, y_train, sample_weight=weights_train)
    cols_lasso = X_train.loc[:, clf1.best_estimator_.coef_ != 0].columns.tolist()
    if cols_lasso:
        boosted_lasso = train_xgboost(train, target, cols_lasso, 'classifier',
                                      learner_type='transfer', weights=weights_train)
        boosted_lasso_predicts = boosted_lasso.predict(X_test[cols_lasso])
    else:
        # Lasso kept no feature: use the Lasso prediction itself, as the French
        # unemployment and income cells already do
        boosted_lasso_predicts = clf1.predict(X_test)

    # PCA Lasso boosted: the helper is called once per requested output
    # (components of the training data first, then the PCA object used on the test set)
    pca_args = dict(agg=train, target=target, city=city, country=country, socio_year=2015,
                    density_type='count', radius=1000, meta_learner=False)
    comps = get_best_pca_lasso_model(output='components', **pca_args)
    pca = get_best_pca_lasso_model(output='pca_classifier', **pca_args)
    reduced_data = pd.DataFrame(comps).join(train.iloc[:, -5:].reset_index(drop=True))

    # train lasso regression on the pca components
    pca_lasso = linear_model.Lasso(max_iter=50000)
    clf2 = GridSearchCV(pca_lasso, parameters, scoring=['neg_mean_squared_error'],
                        refit='neg_mean_squared_error')
    clf2.fit(reduced_data.iloc[:, :-5], reduced_data.loc[:, [target]], sample_weight=weights_train)

    # keep the components with a non-zero coefficient and train xgboost on them
    cols_pca_lasso = reduced_data.iloc[:, :-5].loc[:, clf2.best_estimator_.coef_ != 0].columns.tolist()
    boosted_pca_lasso = train_xgboost(reduced_data, target, cols_pca_lasso, 'classifier',
                                      learner_type='transfer', weights=weights_train)

    # apply pca on the test set and make predictions
    reduced_test_data = pd.DataFrame(pca.transform(X_test))
    boosted_pca_lasso_predicts = boosted_pca_lasso.predict(reduced_test_data[cols_pca_lasso])

    # M5 prediction: row-wise mean of the two boosted models
    predicts = pd.DataFrame({'boosted_lasso': boosted_lasso_predicts,
                             'boosted_pca_lasso': boosted_pca_lasso_predicts})
    predicts['mean_model'] = predicts.mean(axis=1)

    # naive baseline: training mean of the target for every test neighborhood
    naive_pred = [y_train.mean().values[0]] * len(y_test)
    mse_m5 = metrics.mean_squared_error(y_test, predicts.mean_model)
    mse_naive = metrics.mean_squared_error(y_test, naive_pred)
    # improvement in percent of the naive MSE
    locico_results_impr.loc[city, target] = 100 - (mse_m5 / mse_naive * 100)
    locico_results_r2_score.loc[city, target] = metrics.r2_score(y_test, predicts.mean_model)

    # squared errors per test neighborhood, built directly as a frame instead of
    # assigning columns into an empty DataFrame
    residuals = pd.DataFrame({
        'residuals_naive': (y_test[target] - naive_pred) ** 2,
        'residuals_m5': (y_test[target] - predicts.mean_model) ** 2,
    })

    full_residuals_unemployment = pd.concat([full_residuals_unemployment, residuals])

shape before pca: (254, 236)
number of pca components: 91
shape after pca: (254, 91)
0.1
shape before pca: (254, 236)
number of pca components: 91
shape after pca: (254, 91)
0.1
shape before pca: (251, 236)
number of pca components: 106
shape after pca: (251, 106)
0.1
shape before pca: (251, 236)
number of pca components: 106
shape after pca: (251, 106)
0.1
shape before pca: (247, 236)
number of pca components: 95
shape after pca: (247, 95)
0.1
shape before pca: (247, 236)
number of pca components: 95
shape after pca: (247, 95)
0.1


In [21]:
target = 'income_levels'
for city in ['berlin', 'hamburg', 'bremen']:
    # Leave-one-city-in-country-out (Germany, income levels): hold out `city`,
    # train on the other German cities and compare the M5 mean model to a naive baseline.
    country = 'DE'
    target_city = city

    # train on all German cities but the target city; test on the target city
    train = agg_full_de[agg_full_de.assigned_city != target_city]
    test = agg_full_de[agg_full_de.assigned_city == target_city]

    # weight each neighborhood with 1 - (share of its city in the training set).
    # The weight is mapped per row so it stays aligned with `train` even when the rows
    # of one city are not stored contiguously (concatenating per-city lists assumed that).
    city_share = train.assigned_city.value_counts() / len(train)
    weights_train = train.assigned_city.map(1 - city_share).tolist()

    # split data in X and y
    # NOTE(review): assumes the first two columns are not features and the last five
    # columns hold the targets - confirm against get_training_data
    X_train = train.iloc[:, 2:-5]
    y_train = train.loc[:, [target]]
    X_test = test.iloc[:, 2:-5]
    y_test = test.loc[:, [target]].reset_index(drop=True)

    # alpha grid shared by both Lasso grid searches
    alphas = [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 2, 3, 5]
    parameters = {'alpha': alphas}

    # Boosted Lasso: Lasso selects the features, XGBoost is trained on the selection
    lasso = linear_model.Lasso(max_iter=50000)
    clf1 = GridSearchCV(lasso, parameters, scoring=['neg_mean_squared_error'],
                        refit='neg_mean_squared_error')
    clf1.fit(X_train, y_train, sample_weight=weights_train)
    cols_lasso = X_train.loc[:, clf1.best_estimator_.coef_ != 0].columns.tolist()
    if cols_lasso:
        boosted_lasso = train_xgboost(train, target, cols_lasso, 'classifier',
                                      learner_type='transfer', weights=weights_train)
        boosted_lasso_predicts = boosted_lasso.predict(X_test[cols_lasso])
    else:
        # Lasso kept no feature: use the Lasso prediction itself, as the French
        # unemployment and income cells already do
        boosted_lasso_predicts = clf1.predict(X_test)

    # PCA Lasso boosted: the helper is called once per requested output
    # (components of the training data first, then the PCA object used on the test set)
    pca_args = dict(agg=train, target=target, city=city, country=country, socio_year=2015,
                    density_type='count', radius=1000, meta_learner=False)
    comps = get_best_pca_lasso_model(output='components', **pca_args)
    pca = get_best_pca_lasso_model(output='pca_classifier', **pca_args)
    reduced_data = pd.DataFrame(comps).join(train.iloc[:, -5:].reset_index(drop=True))

    # train lasso regression on the pca components
    pca_lasso = linear_model.Lasso(max_iter=50000)
    clf2 = GridSearchCV(pca_lasso, parameters, scoring=['neg_mean_squared_error'],
                        refit='neg_mean_squared_error')
    clf2.fit(reduced_data.iloc[:, :-5], reduced_data.loc[:, [target]], sample_weight=weights_train)

    # keep the components with a non-zero coefficient and train xgboost on them
    cols_pca_lasso = reduced_data.iloc[:, :-5].loc[:, clf2.best_estimator_.coef_ != 0].columns.tolist()
    boosted_pca_lasso = train_xgboost(reduced_data, target, cols_pca_lasso, 'classifier',
                                      learner_type='transfer', weights=weights_train)

    # apply pca on the test set and make predictions
    reduced_test_data = pd.DataFrame(pca.transform(X_test))
    boosted_pca_lasso_predicts = boosted_pca_lasso.predict(reduced_test_data[cols_pca_lasso])

    # M5 prediction: row-wise mean of the two boosted models
    predicts = pd.DataFrame({'boosted_lasso': boosted_lasso_predicts,
                             'boosted_pca_lasso': boosted_pca_lasso_predicts})
    predicts['mean_model'] = predicts.mean(axis=1)

    # naive baseline: training mean of the target for every test neighborhood
    naive_pred = [y_train.mean().values[0]] * len(y_test)
    mse_m5 = metrics.mean_squared_error(y_test, predicts.mean_model)
    mse_naive = metrics.mean_squared_error(y_test, naive_pred)
    # improvement in percent of the naive MSE
    locico_results_impr.loc[city, target] = 100 - (mse_m5 / mse_naive * 100)
    locico_results_r2_score.loc[city, target] = metrics.r2_score(y_test, predicts.mean_model)

    # squared errors per test neighborhood, built directly as a frame instead of
    # assigning columns into an empty DataFrame
    residuals = pd.DataFrame({
        'residuals_naive': (y_test[target] - naive_pred) ** 2,
        'residuals_m5': (y_test[target] - predicts.mean_model) ** 2,
    })

    full_residuals_income = pd.concat([full_residuals_income, residuals])

shape before pca: (254, 236)
number of pca components: 91
shape after pca: (254, 91)
0.1
shape before pca: (254, 236)
number of pca components: 91
shape after pca: (254, 91)
0.1
shape before pca: (251, 236)
number of pca components: 106
shape after pca: (251, 106)
0.1
shape before pca: (251, 236)
number of pca components: 106
shape after pca: (251, 106)
0.1
shape before pca: (247, 236)
number of pca components: 95
shape after pca: (247, 95)
0.2
shape before pca: (247, 236)
number of pca components: 95
shape after pca: (247, 95)
0.2


In [22]:
target = 'foreign_nationals'
for city in ['berlin', 'hamburg', 'bremen']:
    # Leave-one-city-in-country-out (Germany, foreign nationals): hold out `city`,
    # train on the other German cities and compare the M5 mean model to a naive baseline.
    country = 'DE'
    target_city = city

    # train on all German cities but the target city; test on the target city
    train = agg_full_de[agg_full_de.assigned_city != target_city]
    test = agg_full_de[agg_full_de.assigned_city == target_city]

    # weight each neighborhood with 1 - (share of its city in the training set).
    # The weight is mapped per row so it stays aligned with `train` even when the rows
    # of one city are not stored contiguously (concatenating per-city lists assumed that).
    city_share = train.assigned_city.value_counts() / len(train)
    weights_train = train.assigned_city.map(1 - city_share).tolist()

    # split data in X and y
    # NOTE(review): assumes the first two columns are not features and the last five
    # columns hold the targets - confirm against get_training_data
    X_train = train.iloc[:, 2:-5]
    y_train = train.loc[:, [target]]
    X_test = test.iloc[:, 2:-5]
    y_test = test.loc[:, [target]].reset_index(drop=True)

    # alpha grid shared by both Lasso grid searches
    alphas = [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 2, 3, 5]
    parameters = {'alpha': alphas}

    # Boosted Lasso: Lasso selects the features, XGBoost is trained on the selection
    lasso = linear_model.Lasso(max_iter=50000)
    clf1 = GridSearchCV(lasso, parameters, scoring=['neg_mean_squared_error'],
                        refit='neg_mean_squared_error')
    clf1.fit(X_train, y_train, sample_weight=weights_train)
    cols_lasso = X_train.loc[:, clf1.best_estimator_.coef_ != 0].columns.tolist()
    if cols_lasso:
        boosted_lasso = train_xgboost(train, target, cols_lasso, 'classifier',
                                      learner_type='transfer', weights=weights_train)
        boosted_lasso_predicts = boosted_lasso.predict(X_test[cols_lasso])
    else:
        # Lasso kept no feature: use the Lasso prediction itself, as the French
        # unemployment and income cells already do
        boosted_lasso_predicts = clf1.predict(X_test)

    # PCA Lasso boosted: the helper is called once per requested output
    # (components of the training data first, then the PCA object used on the test set)
    pca_args = dict(agg=train, target=target, city=city, country=country, socio_year=2015,
                    density_type='count', radius=1000, meta_learner=False)
    comps = get_best_pca_lasso_model(output='components', **pca_args)
    pca = get_best_pca_lasso_model(output='pca_classifier', **pca_args)
    reduced_data = pd.DataFrame(comps).join(train.iloc[:, -5:].reset_index(drop=True))

    # train lasso regression on the pca components
    pca_lasso = linear_model.Lasso(max_iter=50000)
    clf2 = GridSearchCV(pca_lasso, parameters, scoring=['neg_mean_squared_error'],
                        refit='neg_mean_squared_error')
    clf2.fit(reduced_data.iloc[:, :-5], reduced_data.loc[:, [target]], sample_weight=weights_train)

    # keep the components with a non-zero coefficient and train xgboost on them
    cols_pca_lasso = reduced_data.iloc[:, :-5].loc[:, clf2.best_estimator_.coef_ != 0].columns.tolist()
    boosted_pca_lasso = train_xgboost(reduced_data, target, cols_pca_lasso, 'classifier',
                                      learner_type='transfer', weights=weights_train)

    # apply pca on the test set and make predictions
    reduced_test_data = pd.DataFrame(pca.transform(X_test))
    boosted_pca_lasso_predicts = boosted_pca_lasso.predict(reduced_test_data[cols_pca_lasso])

    # M5 prediction: row-wise mean of the two boosted models
    predicts = pd.DataFrame({'boosted_lasso': boosted_lasso_predicts,
                             'boosted_pca_lasso': boosted_pca_lasso_predicts})
    predicts['mean_model'] = predicts.mean(axis=1)

    # naive baseline: training mean of the target for every test neighborhood
    naive_pred = [y_train.mean().values[0]] * len(y_test)
    mse_m5 = metrics.mean_squared_error(y_test, predicts.mean_model)
    mse_naive = metrics.mean_squared_error(y_test, naive_pred)
    # improvement in percent of the naive MSE
    locico_results_impr.loc[city, target] = 100 - (mse_m5 / mse_naive * 100)
    locico_results_r2_score.loc[city, target] = metrics.r2_score(y_test, predicts.mean_model)

    # squared errors per test neighborhood, built directly as a frame instead of
    # assigning columns into an empty DataFrame
    residuals = pd.DataFrame({
        'residuals_naive': (y_test[target] - naive_pred) ** 2,
        'residuals_m5': (y_test[target] - predicts.mean_model) ** 2,
    })

    full_residuals_fn = pd.concat([full_residuals_fn, residuals])

shape before pca: (254, 236)
number of pca components: 91
shape after pca: (254, 91)
0.1
shape before pca: (254, 236)
number of pca components: 91
shape after pca: (254, 91)
0.1
shape before pca: (251, 236)
number of pca components: 106
shape after pca: (251, 106)
0.5
shape before pca: (251, 236)
number of pca components: 106
shape after pca: (251, 106)
0.5
shape before pca: (247, 236)
number of pca components: 95
shape after pca: (247, 95)
0.5
shape before pca: (247, 236)
number of pca components: 95
shape after pca: (247, 95)
0.5


In [23]:
# add the column-wise average as a 'mean' summary row, then display the table
locico_impr_column_means = locico_results_impr.mean(axis=0)
locico_results_impr.loc['mean'] = locico_impr_column_means
locico_results_impr

Unnamed: 0,unemployment_rate,foreign_nationals,income_levels
marseille,17.468847,-7.680231,26.136523
lyon,7.783741,-1.566205,4.194064
paris,11.678524,-1.900219,18.793013
berlin,32.501895,26.161711,26.487949
hamburg,33.993034,18.685804,15.942349
bremen,14.290642,-18.533477,13.959362
mean,19.619447,2.527897,17.585543


In [24]:
# add the column-wise average R2 as a 'mean' summary row, then display the table
locico_r2_column_means = locico_results_r2_score.mean(axis=0)
locico_results_r2_score.loc['mean'] = locico_r2_column_means
locico_results_r2_score

Unnamed: 0,unemployment_rate,foreign_nationals,income_levels
marseille,0.172597,-0.102659,0.261354
lyon,0.077668,-0.015891,0.041089
paris,0.114996,-0.03962,0.187774
berlin,0.324856,0.239313,0.233561
hamburg,0.338173,0.183433,0.157895
bremen,0.140561,-0.331548,0.116151
mean,0.194808,-0.011162,0.166304


In [25]:
# Two-sample t-test on the squared errors of the naive baseline vs. the M5 model (unemployment)
# NOTE(review): both columns are computed on the same test neighborhoods, so a paired
# test (stats.ttest_rel) may be more appropriate than an independent one - confirm
naive_sq_err = full_residuals_unemployment['residuals_naive']
m5_sq_err = full_residuals_unemployment['residuals_m5']
t_stat, p = stats.ttest_ind(naive_sq_err, m5_sq_err)
print(f'TTest: t={t_stat}, p={p}')

TTest: t=2.1245393944445814, p=0.033694728357986974


In [26]:
# Two-sample t-test on the squared errors of the naive baseline vs. the M5 model (income)
# NOTE(review): both columns are computed on the same test neighborhoods, so a paired
# test (stats.ttest_rel) may be more appropriate than an independent one - confirm
naive_sq_err = full_residuals_income['residuals_naive']
m5_sq_err = full_residuals_income['residuals_m5']
t_stat, p = stats.ttest_ind(naive_sq_err, m5_sq_err)
print(f'TTest: t={t_stat}, p={p}')

TTest: t=4.020724673492724, p=5.895844927935324e-05


In [27]:
# Two-sample t-test on the squared errors of the naive baseline vs. the M5 model (foreign national rate)
# NOTE(review): both columns are computed on the same test neighborhoods, so a paired
# test (stats.ttest_rel) may be more appropriate than an independent one - confirm
naive_sq_err = full_residuals_fn['residuals_naive']
m5_sq_err = full_residuals_fn['residuals_m5']
t_stat, p = stats.ttest_ind(naive_sq_err, m5_sq_err)
print(f'TTest: t={t_stat}, p={p}')

TTest: t=-0.004499546435017125, p=0.9964101490373795


### All to all city prediction

In [28]:
# create output dataframes
improvement_matrix = pd.DataFrame(columns = ['lyon', 'paris', 'marseille', 'berlin', 'hamburg', 'bremen'])
r2_score_matrix = pd.DataFrame(columns = ['lyon', 'paris', 'marseille', 'berlin', 'hamburg', 'bremen'])
#set target
target = 'unemployment_rate'
for train_city in ['lyon', 'paris', 'marseille', 'berlin', 'hamburg', 'bremen']:
    
    predicts_master = pd.DataFrame(columns = ['y_pred_lasso_boosted', 'y_pred_pca_lasso_boosted', 'y_pred_mean_model', 'y_pred_naive'])
    #load data depending on country
    if (train_city in ['marseille', 'lyon', 'paris']):
        country = 'FR'
    else:
        country = 'DE'
    agg_subset = agg_full[agg_full.assigned_city==train_city].reset_index(drop = True)

    
    # train lasso boosted (M2)
    cols_lasso = get_best_lasso_model(agg=agg_subset, target=target, city=train_city, country=country, socio_year=2015, density_type='count', radius = 1000, output ='used_columns')
    predicts = train_xgboost(agg_subset, target, cols_lasso, 'predicts')
    lasso_xgb = train_xgboost(agg_subset, target, cols_lasso, 'classifier')
    
    predicts_master.loc[:, 'y_pred_lasso_boosted'] = predicts.y_pred
    predicts_master.loc[:, 'y_pred_naive'] = predicts.naive

    # PCA Lasso boosted (M4)
    comps = get_best_pca_lasso_model(agg=agg_subset, target=target, city=train_city, country=country, socio_year=2015,  density_type='count', radius = 1000, output = 'components')
    pca = get_best_pca_lasso_model(agg=agg_subset, target=target, city=train_city, country=country, socio_year=2015, density_type='count', radius = 1000, output = 'pca_classifier')
    cols_pca_lasso = get_best_pca_lasso_model(agg=agg_subset, target=target, city=train_city, country=country, socio_year=2015, density_type='count', radius = 1000, output = 'used_columns')
    reduced_data = pd.DataFrame(comps)
    reduced_data = reduced_data.join(agg_subset.iloc[:,-5:])
    predicts = train_xgboost(reduced_data, target, cols_pca_lasso, 'predicts')
    pca_lasso_xgb = train_xgboost(reduced_data, target, cols_pca_lasso, 'classifier')
    predicts_master.loc[:,'y_pred_pca_lasso_boosted'] = predicts.y_pred

    #get predictions of M5
    predicts_master.loc[:, 'y_pred_mean_model'] = predicts_master[['y_pred_lasso_boosted', 'y_pred_pca_lasso_boosted']].mean(axis = 1)
    predicts_master.loc[:, 'y_test'] = predicts.y_test
    
    #compute performance metrics
    naive_mse = metrics.mean_squared_error(predicts_master.y_test, predicts_master.y_pred_naive) 
    mse_mean_model = metrics.mean_squared_error(predicts_master.y_test, predicts_master.y_pred_mean_model)
    improvement_mean_model = 100 - (mse_mean_model/naive_mse)*100
    
    improvement_matrix.loc[train_city, train_city] = improvement_mean_model
    r2_score_matrix.loc[train_city, train_city] = metrics.r2_score(predicts_master.y_test, predicts_master.y_pred_mean_model)

    # used the trained model and apply it as it is to the other cities
    for target_city in ['lyon', 'paris', 'marseille', 'berlin', 'hamburg', 'bremen']:
        #make sure target city is different from train city
        if(target_city != train_city):
            
            #determine country of city 
            if (target_city in ['marseille', 'lyon', 'paris']):
                country = 'FR'
            else:
                country = 'DE'
            
            predicts_master = pd.DataFrame(columns = ['y_pred_lasso_boosted', 'y_pred_pca_lasso_boosted', 'y_pred_mean_model', 'y_pred_naive'])
            #filter testdata
            test_data = agg_full[agg_full.assigned_city==target_city].reset_index(drop = True)

            #get predictions of M2
            predicts_master.loc[:,'y_pred_lasso_boosted'] = lasso_xgb.predict(test_data.iloc[:,2:-5][cols_lasso])

            #get predictions of M4
            reduced_data = pd.DataFrame(pca.transform(test_data.iloc[:,2:-5]))
            reduced_data = reduced_data.join(test_data.iloc[:,-5:])
            predicts_master.loc[:,'y_pred_pca_lasso_boosted'] = pca_lasso_xgb.predict(reduced_data.iloc[:,:-5][cols_pca_lasso])

            #combine M2 and M4 predictions to M5
            predicts_master.loc[:, 'y_pred_mean_model'] = predicts_master[['y_pred_lasso_boosted', 'y_pred_pca_lasso_boosted']].mean(axis = 1)
            predicts_master.loc[:, 'y_test'] = test_data[target]
            predicts_master.loc[:, 'y_pred_naive'] = test_data[target].mean()

            #compute performance metrics
            naive_mse = metrics.mean_squared_error(predicts_master.y_test, predicts_master.y_pred_naive) 
            mse_mean_model = metrics.mean_squared_error(predicts_master.y_test, predicts_master.y_pred_mean_model)
            improvement_mean_model = 100 - (mse_mean_model/naive_mse)*100

            improvement_matrix.loc[train_city, target_city] = improvement_mean_model
            r2_score_matrix.loc[train_city, target_city] = metrics.r2_score(predicts_master.y_test, predicts_master.y_pred_mean_model)

0.2
shape before pca: (176, 236)
number of pca components: 70
shape after pca: (176, 70)
0.2
shape before pca: (176, 236)
number of pca components: 70
shape after pca: (176, 70)
0.2
shape before pca: (176, 236)
number of pca components: 70
shape after pca: (176, 70)
0.2
0.01
shape before pca: (861, 236)
number of pca components: 134
shape after pca: (861, 134)
0.01
shape before pca: (861, 236)
number of pca components: 134
shape after pca: (861, 134)
0.01
shape before pca: (861, 236)
number of pca components: 134
shape after pca: (861, 134)
0.01
0.1
shape before pca: (344, 236)
number of pca components: 69
shape after pca: (344, 69)
0.1
shape before pca: (344, 236)
number of pca components: 69
shape after pca: (344, 69)
0.1
shape before pca: (344, 236)
number of pca components: 69
shape after pca: (344, 69)
0.1
0.1
shape before pca: (122, 236)
number of pca components: 71
shape after pca: (122, 71)
0.1
shape before pca: (122, 236)
number of pca components: 71
shape after pca: (122, 71)

In [29]:
# Percent MSE improvement of the mean model over the naive baseline, as filled by the
# all-to-all cell above (target 'unemployment_rate'): rows = train city, columns = evaluated city.
improvement_matrix

Unnamed: 0,lyon,paris,marseille,berlin,hamburg,bremen
lyon,42.387262,-5.008299,2.008875,-19.230927,-27.25256,-17.609691
paris,5.084881,36.334713,8.570473,-110.828487,-104.829046,-97.911119
marseille,-13.058004,14.278103,46.929571,25.176568,0.01777,5.574265
berlin,-19.014588,-8.327019,12.433117,38.398618,27.555578,-7.580129
hamburg,-21.631801,-20.104518,-10.837818,38.137327,50.256995,31.791296
bremen,-17.974252,-28.299383,-21.523371,9.791983,25.823198,56.940528


In [30]:
# R2 of the mean model, as filled by the all-to-all cell above
# (target 'unemployment_rate'): rows = train city, columns = evaluated city.
r2_score_matrix

Unnamed: 0,lyon,paris,marseille,berlin,hamburg,bremen
lyon,0.422793,-0.050083,0.020089,-0.192309,-0.272526,-0.176097
paris,0.050849,0.35436,0.085705,-1.108285,-1.04829,-0.979111
marseille,-0.13058,0.142781,0.468572,0.251766,0.000178,0.055743
berlin,-0.190146,-0.08327,0.124331,0.383826,0.275556,-0.075801
hamburg,-0.216318,-0.201045,-0.108378,0.381373,0.440679,0.317913
bremen,-0.179743,-0.282994,-0.215234,0.09792,0.258232,0.550801


In [31]:
# All-to-all transfer for a single target.
# For every train city the two boosted base models (M2, M4) are fitted and averaged
# into M5. M5 is scored once on the predictions/`y_test` returned by
# `train_xgboost(..., 'predicts')` for the train city itself (diagonal entry) and then
# applied unchanged to the full data of every other city (off-diagonal entries).
# Rows of both result matrices are train cities, columns are the evaluated cities.
transfer_cities = ['lyon', 'paris', 'marseille', 'berlin', 'hamburg', 'bremen']
french_cities = ['marseille', 'lyon', 'paris']
prediction_columns = ['y_pred_lasso_boosted', 'y_pred_pca_lasso_boosted', 'y_pred_mean_model', 'y_pred_naive']


def _improvement_and_r2(frame):
    """Return (percent MSE improvement of the mean model over the naive column, R2 of the mean model)."""
    naive_mse = metrics.mean_squared_error(frame.y_test, frame.y_pred_naive)
    mse_mean_model = metrics.mean_squared_error(frame.y_test, frame.y_pred_mean_model)
    improvement_mean_model = 100 - (mse_mean_model/naive_mse)*100
    return improvement_mean_model, metrics.r2_score(frame.y_test, frame.y_pred_mean_model)


# create output dataframes
improvement_matrix = pd.DataFrame(columns = transfer_cities)
r2_score_matrix = pd.DataFrame(columns = transfer_cities)
#set target
target = 'foreign_nationals'
for train_city in transfer_cities:

    # the helpers need the country of the city the models are trained on
    country = 'FR' if train_city in french_cities else 'DE'
    agg_subset = agg_full[agg_full.assigned_city==train_city].reset_index(drop = True)
    helper_kwargs = dict(agg=agg_subset, target=target, city=train_city, country=country,
                         socio_year=2015, density_type='count', radius=1000)

    # train lasso boosted (M2)
    cols_lasso = get_best_lasso_model(output='used_columns', **helper_kwargs)
    lasso_predicts = train_xgboost(agg_subset, target, cols_lasso, 'predicts')
    lasso_xgb = train_xgboost(agg_subset, target, cols_lasso, 'classifier')

    # PCA Lasso boosted (M4)
    # The helper hands back one artefact per call, so it is invoked once per output;
    # the cell log shows the PCA shapes once per call.
    comps = get_best_pca_lasso_model(output='components', **helper_kwargs)
    pca = get_best_pca_lasso_model(output='pca_classifier', **helper_kwargs)
    cols_pca_lasso = get_best_pca_lasso_model(output='used_columns', **helper_kwargs)
    # append the last five columns of the city data to the components
    # (train_xgboost needs the target column alongside the features)
    reduced_data = pd.DataFrame(comps).join(agg_subset.iloc[:,-5:])
    pca_predicts = train_xgboost(reduced_data, target, cols_pca_lasso, 'predicts')
    pca_lasso_xgb = train_xgboost(reduced_data, target, cols_pca_lasso, 'classifier')

    # Collect the in-city predictions. Plain column assignment is used instead of
    # `.loc[:, col] = ...` because the frame starts without rows.
    predicts_master = pd.DataFrame(columns = prediction_columns)
    predicts_master['y_pred_lasso_boosted'] = lasso_predicts.y_pred
    predicts_master['y_pred_naive'] = lasso_predicts.naive
    predicts_master['y_pred_pca_lasso_boosted'] = pca_predicts.y_pred
    #get predictions of M5 (row-wise mean of M2 and M4)
    predicts_master['y_pred_mean_model'] = predicts_master[['y_pred_lasso_boosted', 'y_pred_pca_lasso_boosted']].mean(axis = 1)
    predicts_master['y_test'] = pca_predicts.y_test

    #compute performance metrics for the train city itself
    improvement, r2 = _improvement_and_r2(predicts_master)
    improvement_matrix.loc[train_city, train_city] = improvement
    r2_score_matrix.loc[train_city, train_city] = r2

    # use the trained models and apply them as they are to the other cities
    for target_city in transfer_cities:
        #make sure target city is different from train city
        if target_city == train_city:
            continue

        #filter testdata; same feature slice (columns 2 .. -5) as used below for the PCA
        test_data = agg_full[agg_full.assigned_city==target_city].reset_index(drop = True)
        test_features = test_data.iloc[:,2:-5]

        predicts_master = pd.DataFrame(columns = prediction_columns)
        #get predictions of M2
        predicts_master['y_pred_lasso_boosted'] = lasso_xgb.predict(test_features[cols_lasso])

        #get predictions of M4 (project the test features with the PCA fitted on the train city)
        reduced_test = pd.DataFrame(pca.transform(test_features))
        predicts_master['y_pred_pca_lasso_boosted'] = pca_lasso_xgb.predict(reduced_test[cols_pca_lasso])

        #combine M2 and M4 predictions to M5
        predicts_master['y_pred_mean_model'] = predicts_master[['y_pred_lasso_boosted', 'y_pred_pca_lasso_boosted']].mean(axis = 1)
        predicts_master['y_test'] = test_data[target]
        # The naive baseline is the evaluated city's own mean, so for these
        # off-diagonal entries the improvement equals 100 * R2.
        predicts_master['y_pred_naive'] = test_data[target].mean()

        #compute performance metrics
        improvement, r2 = _improvement_and_r2(predicts_master)
        improvement_matrix.loc[train_city, target_city] = improvement
        r2_score_matrix.loc[train_city, target_city] = r2

0.1
shape before pca: (176, 236)
number of pca components: 70
shape after pca: (176, 70)
0.2
shape before pca: (176, 236)
number of pca components: 70
shape after pca: (176, 70)
0.2
shape before pca: (176, 236)
number of pca components: 70
shape after pca: (176, 70)
0.2
0.01
shape before pca: (861, 236)
number of pca components: 134
shape after pca: (861, 134)
0.01
shape before pca: (861, 236)
number of pca components: 134
shape after pca: (861, 134)
0.01
shape before pca: (861, 236)
number of pca components: 134
shape after pca: (861, 134)
0.01
0.01
shape before pca: (344, 236)
number of pca components: 69
shape after pca: (344, 69)
0.01
shape before pca: (344, 236)
number of pca components: 69
shape after pca: (344, 69)
0.01
shape before pca: (344, 236)
number of pca components: 69
shape after pca: (344, 69)
0.01
0.1
shape before pca: (122, 236)
number of pca components: 71
shape after pca: (122, 71)
0.1
shape before pca: (122, 236)
number of pca components: 71
shape after pca: (122,

In [32]:
# Percent MSE improvement of the mean model over the naive baseline, as filled by the
# all-to-all cell above (target 'foreign_nationals'): rows = train city, columns = evaluated city.
improvement_matrix

Unnamed: 0,lyon,paris,marseille,berlin,hamburg,bremen
lyon,27.669349,3.678184,4.345853,-10.638002,-1.56839,-30.527752
paris,5.722788,35.326112,-4.636117,-15.310389,-14.889929,-52.978752
marseille,-18.435029,-14.281441,72.405118,40.503535,10.713761,-18.719399
berlin,-35.181444,0.877989,20.654546,67.603515,-0.428588,-21.65871
hamburg,-20.895484,-40.023319,4.035169,1.4879,25.432159,-39.840821
bremen,-28.167978,-12.88348,11.717505,22.545811,6.403182,32.320905


In [33]:
# R2 of the mean model, as filled by the all-to-all cell above
# (target 'foreign_nationals'): rows = train city, columns = evaluated city.
r2_score_matrix

Unnamed: 0,lyon,paris,marseille,berlin,hamburg,bremen
lyon,0.275161,0.036782,0.043459,-0.10638,-0.015684,-0.305278
paris,0.057228,0.339217,-0.046361,-0.153104,-0.148899,-0.529788
marseille,-0.18435,-0.142814,0.716842,0.405035,0.107138,-0.187194
berlin,-0.351814,0.00878,0.206545,0.66505,-0.004286,-0.216587
hamburg,-0.208955,-0.400233,0.040352,0.014879,0.059483,-0.398408
bremen,-0.28168,-0.128835,0.117175,0.225458,0.064032,0.314615


In [34]:
# All-to-all transfer for a single target.
# For every train city the two boosted base models (M2, M4) are fitted and averaged
# into M5. M5 is scored once on the predictions/`y_test` returned by
# `train_xgboost(..., 'predicts')` for the train city itself (diagonal entry) and then
# applied unchanged to the full data of every other city (off-diagonal entries).
# Rows of both result matrices are train cities, columns are the evaluated cities.
transfer_cities = ['lyon', 'paris', 'marseille', 'berlin', 'hamburg', 'bremen']
french_cities = ['marseille', 'lyon', 'paris']
prediction_columns = ['y_pred_lasso_boosted', 'y_pred_pca_lasso_boosted', 'y_pred_mean_model', 'y_pred_naive']


def _improvement_and_r2(frame):
    """Return (percent MSE improvement of the mean model over the naive column, R2 of the mean model)."""
    naive_mse = metrics.mean_squared_error(frame.y_test, frame.y_pred_naive)
    mse_mean_model = metrics.mean_squared_error(frame.y_test, frame.y_pred_mean_model)
    improvement_mean_model = 100 - (mse_mean_model/naive_mse)*100
    return improvement_mean_model, metrics.r2_score(frame.y_test, frame.y_pred_mean_model)


# create output dataframes
improvement_matrix = pd.DataFrame(columns = transfer_cities)
r2_score_matrix = pd.DataFrame(columns = transfer_cities)
#set target
target = 'income_levels'
for train_city in transfer_cities:

    # the helpers need the country of the city the models are trained on
    country = 'FR' if train_city in french_cities else 'DE'
    agg_subset = agg_full[agg_full.assigned_city==train_city].reset_index(drop = True)
    helper_kwargs = dict(agg=agg_subset, target=target, city=train_city, country=country,
                         socio_year=2015, density_type='count', radius=1000)

    # train lasso boosted (M2)
    cols_lasso = get_best_lasso_model(output='used_columns', **helper_kwargs)
    lasso_predicts = train_xgboost(agg_subset, target, cols_lasso, 'predicts')
    lasso_xgb = train_xgboost(agg_subset, target, cols_lasso, 'classifier')

    # PCA Lasso boosted (M4)
    # The helper hands back one artefact per call, so it is invoked once per output;
    # the cell log shows the PCA shapes once per call.
    comps = get_best_pca_lasso_model(output='components', **helper_kwargs)
    pca = get_best_pca_lasso_model(output='pca_classifier', **helper_kwargs)
    cols_pca_lasso = get_best_pca_lasso_model(output='used_columns', **helper_kwargs)
    # append the last five columns of the city data to the components
    # (train_xgboost needs the target column alongside the features)
    reduced_data = pd.DataFrame(comps).join(agg_subset.iloc[:,-5:])
    pca_predicts = train_xgboost(reduced_data, target, cols_pca_lasso, 'predicts')
    pca_lasso_xgb = train_xgboost(reduced_data, target, cols_pca_lasso, 'classifier')

    # Collect the in-city predictions. Plain column assignment is used instead of
    # `.loc[:, col] = ...` because the frame starts without rows.
    predicts_master = pd.DataFrame(columns = prediction_columns)
    predicts_master['y_pred_lasso_boosted'] = lasso_predicts.y_pred
    predicts_master['y_pred_naive'] = lasso_predicts.naive
    predicts_master['y_pred_pca_lasso_boosted'] = pca_predicts.y_pred
    #get predictions of M5 (row-wise mean of M2 and M4)
    predicts_master['y_pred_mean_model'] = predicts_master[['y_pred_lasso_boosted', 'y_pred_pca_lasso_boosted']].mean(axis = 1)
    predicts_master['y_test'] = pca_predicts.y_test

    #compute performance metrics for the train city itself
    improvement, r2 = _improvement_and_r2(predicts_master)
    improvement_matrix.loc[train_city, train_city] = improvement
    r2_score_matrix.loc[train_city, train_city] = r2

    # use the trained models and apply them as they are to the other cities
    for target_city in transfer_cities:
        #make sure target city is different from train city
        if target_city == train_city:
            continue

        #filter testdata; same feature slice (columns 2 .. -5) as used below for the PCA
        test_data = agg_full[agg_full.assigned_city==target_city].reset_index(drop = True)
        test_features = test_data.iloc[:,2:-5]

        predicts_master = pd.DataFrame(columns = prediction_columns)
        #get predictions of M2
        predicts_master['y_pred_lasso_boosted'] = lasso_xgb.predict(test_features[cols_lasso])

        #get predictions of M4 (project the test features with the PCA fitted on the train city)
        reduced_test = pd.DataFrame(pca.transform(test_features))
        predicts_master['y_pred_pca_lasso_boosted'] = pca_lasso_xgb.predict(reduced_test[cols_pca_lasso])

        #combine M2 and M4 predictions to M5
        predicts_master['y_pred_mean_model'] = predicts_master[['y_pred_lasso_boosted', 'y_pred_pca_lasso_boosted']].mean(axis = 1)
        predicts_master['y_test'] = test_data[target]
        # The naive baseline is the evaluated city's own mean, so for these
        # off-diagonal entries the improvement equals 100 * R2.
        predicts_master['y_pred_naive'] = test_data[target].mean()

        #compute performance metrics
        improvement, r2 = _improvement_and_r2(predicts_master)
        improvement_matrix.loc[train_city, target_city] = improvement
        r2_score_matrix.loc[train_city, target_city] = r2

0.1
shape before pca: (176, 236)
number of pca components: 70
shape after pca: (176, 70)
0.1
shape before pca: (176, 236)
number of pca components: 70
shape after pca: (176, 70)
0.1
shape before pca: (176, 236)
number of pca components: 70
shape after pca: (176, 70)
0.1
0.01
shape before pca: (861, 236)
number of pca components: 134
shape after pca: (861, 134)
0.01
shape before pca: (861, 236)
number of pca components: 134
shape after pca: (861, 134)
0.01
shape before pca: (861, 236)
number of pca components: 134
shape after pca: (861, 134)
0.01
0.01
shape before pca: (344, 236)
number of pca components: 69
shape after pca: (344, 69)
0.01
shape before pca: (344, 236)
number of pca components: 69
shape after pca: (344, 69)
0.01
shape before pca: (344, 236)
number of pca components: 69
shape after pca: (344, 69)
0.01
0.1
shape before pca: (122, 236)
number of pca components: 71
shape after pca: (122, 71)
0.1
shape before pca: (122, 236)
number of pca components: 71
shape after pca: (122,

In [35]:
# Percent MSE improvement of the mean model over the naive baseline, as filled by the
# all-to-all cell above (target 'income_levels'): rows = train city, columns = evaluated city.
improvement_matrix

Unnamed: 0,lyon,paris,marseille,berlin,hamburg,bremen
lyon,57.652806,-18.648556,-6.881163,-33.784398,-16.021425,-19.262632
paris,-1.703752,74.625626,18.331365,7.726413,-2.025867,-9.452516
marseille,-3.09627,24.203725,75.309903,0.537203,-0.14184,-2.34901
berlin,-12.370467,4.711074,2.237851,29.731272,12.085897,-2.648139
hamburg,-22.138298,4.940516,6.012667,22.020487,47.218826,11.799119
bremen,-31.671434,-4.316979,-17.581762,8.666853,-14.314297,45.940758


In [36]:
# R2 of the mean model, as filled by the all-to-all cell above
# (target 'income_levels'): rows = train city, columns = evaluated city.
r2_score_matrix

Unnamed: 0,lyon,paris,marseille,berlin,hamburg,bremen
lyon,0.576165,-0.186486,-0.068812,-0.337844,-0.160214,-0.192626
paris,-0.017038,0.744844,0.183314,0.077264,-0.020259,-0.094525
marseille,-0.030963,0.242037,0.752903,0.005372,-0.001418,-0.02349
berlin,-0.123705,0.047111,0.022379,0.296566,0.120859,-0.026481
hamburg,-0.221383,0.049405,0.060127,0.220205,0.449712,0.117991
bremen,-0.316714,-0.04317,-0.175818,0.086669,-0.143143,0.348394


## Meta Learner trained on all cities

In [37]:
# Empty result tables for the meta learner: one column per socio-economic target,
# rows are added per held-out city by the cells below.
meta_learner_targets = ['unemployment_rate', 'foreign_nationals', 'income_levels']
meta_learner_results = pd.DataFrame(columns=list(meta_learner_targets))
meta_learner_r2_score = pd.DataFrame(columns=list(meta_learner_targets))

In [38]:
# Leave-one-city-out meta learner for a single target.
# For each held-out `target_city`: every other city is split into a training part and a
# 30-row meta-training part; per source city the two boosted base models are fitted on
# the training part and their averaged prediction becomes one feature column; a linear
# regression on those columns is fitted on the meta-training rows and scored on the
# held-out city.
#set target
target = 'unemployment_rate'

meta_city_order = ['marseille', 'lyon', 'paris', 'berlin', 'hamburg', 'bremen']
# column order of the base-model features fed to the meta learner
meta_feature_order = ['marseille', 'lyon', 'paris', 'hamburg', 'bremen', 'berlin']
# order in which the city models are fitted, with the country passed to the helpers
base_model_cities = [('marseille', 'FR'), ('lyon', 'FR'), ('paris', 'FR'),
                     ('bremen', 'DE'), ('hamburg', 'DE'), ('berlin', 'DE')]


def _mean_model(lasso_pred, pca_lasso_pred):
    """Return the row-wise mean of the two base-model prediction vectors."""
    both = pd.DataFrame({'boosted_lasso': lasso_pred, 'boosted_pca_lasso': pca_lasso_pred})
    return both.mean(axis = 1)


for target_city in meta_city_order:

    # Split every source city into training / meta-training rows; the held-out city is
    # used whole as test set. Parts are collected in lists and concatenated once
    # (DataFrame.append was removed in pandas 2.0 and copies on every call).
    train_parts = []
    meta_train_parts = []
    test_full = pd.DataFrame(columns = agg_full.columns)
    for city in meta_city_order:
        subset = agg_full[agg_full.assigned_city == city]
        if city == target_city:
            test_full = subset
            continue
        train, meta_train = train_test_split(subset, test_size=30, random_state=41)
        train_parts.append(train)
        meta_train_parts.append(meta_train)

    #reset index for all three subsets
    train_full = pd.concat(train_parts).reset_index(drop = True)
    meta_train_full = pd.concat(meta_train_parts).reset_index(drop = True)
    test_full = test_full.reset_index(drop = True)

    print('shape of training data: '+str(train_full.shape))
    print('shape of meta training data: '+str(meta_train_full.shape))
    print('shape of test data: '+str(test_full.shape))

    # one feature column per source city (held-out city excluded)
    source_cities = [city for city in meta_feature_order if city != target_city]
    meta_learner_train = pd.DataFrame(columns = source_cities)
    meta_learner_test = pd.DataFrame(columns = source_cities)

    # feature slice (columns 2 .. -5) of the meta-training and test rows
    X_meta_train_full = meta_train_full.iloc[:,2:-5]
    X_test_full = test_full.iloc[:,2:-5]

    for city, country in base_model_cities:
        if city == target_city:
            continue

        #filter out data of respective city
        city_training_data = train_full[train_full.assigned_city == city]
        helper_kwargs = dict(agg=city_training_data, target=target, city=city, country=country,
                             socio_year=2015, density_type='count', radius=1000, meta_learner=True)

        # Lasso Boosted: columns the lasso model selects, then the fitted xgboost model
        cols_lasso = get_best_lasso_model(output='used_columns', **helper_kwargs)
        boosted_lasso = train_xgboost(city_training_data, target, cols_lasso, 'classifier', learner_type = 'meta')
        boosted_lasso_predicts_meta = boosted_lasso.predict(X_meta_train_full[cols_lasso])
        boosted_lasso_predicts_test = boosted_lasso.predict(X_test_full[cols_lasso])

        # PCA Lasso boosted: components, selected component columns and fitted PCA
        # (the helper hands back one artefact per call)
        comps = get_best_pca_lasso_model(output='components', **helper_kwargs)
        cols_pca_lasso = get_best_pca_lasso_model(output='used_columns', **helper_kwargs)
        pca = get_best_pca_lasso_model(output='pca_classifier', **helper_kwargs)
        #create reduced training data based on pca components plus the last five data columns
        reduced_data = pd.DataFrame(comps).join(city_training_data.iloc[:,-5:].reset_index(drop = True))
        boosted_pca_lasso = train_xgboost(reduced_data, target, cols_pca_lasso, 'classifier', learner_type = 'meta')

        #apply pca to meta_train data
        print('meta full: '+str(meta_train_full.shape))
        reduced_X_meta_train_full = pd.DataFrame(pca.transform(X_meta_train_full))
        print('reduced pca: '+str(reduced_X_meta_train_full.shape))

        # apply pca to test data
        print('test full: '+str(test_full.shape))
        reduced_X_test_full = pd.DataFrame(pca.transform(X_test_full))
        print('reduced pca: '+str(reduced_X_test_full.shape))

        # make predictions for meta_train data and test data
        boosted_pca_lasso_predicts_meta = boosted_pca_lasso.predict(reduced_X_meta_train_full[cols_pca_lasso])
        boosted_pca_lasso_predicts_test = boosted_pca_lasso.predict(reduced_X_test_full[cols_pca_lasso])

        # the mean of both base models is this city's feature for the meta learner
        meta_learner_train[city] = _mean_model(boosted_lasso_predicts_meta, boosted_pca_lasso_predicts_meta)
        meta_learner_test[city] = _mean_model(boosted_lasso_predicts_test, boosted_pca_lasso_predicts_test)

    # add the target column to the meta_train set and test set and add cities back to dataframe
    meta_learner_train[target] = meta_train_full[target]
    meta_learner_test[target] = test_full[target]
    meta_learner_train['assigned_city'] = meta_train_full.assigned_city
    meta_learner_test['assigned_city'] = test_full.assigned_city

    # features are selected by name so the target / city columns can never leak in
    clf = LinearRegression()
    clf.fit(meta_learner_train[source_cities], meta_learner_train[[target]])

    #make prediction for test data; the naive baseline is the meta-training mean of the target
    y_pred = clf.predict(meta_learner_test[source_cities])
    naive_pred = [meta_learner_train[[target]].mean().values[0]] * len(meta_learner_test[[target]])

    model_mse = metrics.mean_squared_error(meta_learner_test[[target]], y_pred)
    naive_mse = metrics.mean_squared_error(meta_learner_test[[target]], naive_pred)
    meta_learner_results.loc[target_city, target] = (100-(model_mse/naive_mse*100))
    meta_learner_r2_score.loc[target_city, target] = metrics.r2_score(meta_learner_test[[target]], y_pred)

shape of training data: (1263, 243)
shape of meta training data: (150, 243)
shape of test data: (344, 243)
0.2
shape before pca: (146, 236)
number of pca components: 66
shape after pca: (146, 66)
0.2
shape before pca: (146, 236)
number of pca components: 66
shape after pca: (146, 66)
0.2
shape before pca: (146, 236)
number of pca components: 66
shape after pca: (146, 66)
0.2
meta full: (150, 243)
reduced pca: (150, 66)
test full: (344, 243)
reduced pca: (344, 66)
0.01
shape before pca: (831, 236)
number of pca components: 133
shape after pca: (831, 133)
0.01
shape before pca: (831, 236)
number of pca components: 133
shape after pca: (831, 133)
0.01
shape before pca: (831, 236)
number of pca components: 133
shape after pca: (831, 133)
0.01
meta full: (150, 243)
reduced pca: (150, 133)
test full: (344, 243)
reduced pca: (344, 133)
0.2
shape before pca: (99, 236)
number of pca components: 53
shape after pca: (99, 53)
0.1
shape before pca: (99, 236)
number of pca components: 53
shape after

shape before pca: (146, 236)
number of pca components: 66
shape after pca: (146, 66)
0.2
meta full: (150, 243)
reduced pca: (150, 66)
test full: (125, 243)
reduced pca: (125, 66)
0.01
shape before pca: (831, 236)
number of pca components: 133
shape after pca: (831, 133)
0.01
shape before pca: (831, 236)
number of pca components: 133
shape after pca: (831, 133)
0.01
shape before pca: (831, 236)
number of pca components: 133
shape after pca: (831, 133)
0.01
meta full: (150, 243)
reduced pca: (150, 133)
test full: (125, 243)
reduced pca: (125, 133)
0.2
shape before pca: (99, 236)
number of pca components: 53
shape after pca: (99, 53)
0.1
shape before pca: (99, 236)
number of pca components: 53
shape after pca: (99, 53)
0.1
shape before pca: (99, 236)
number of pca components: 53
shape after pca: (99, 53)
0.1
meta full: (150, 243)
reduced pca: (150, 53)
test full: (125, 243)
reduced pca: (125, 53)
0.1
shape before pca: (92, 236)
number of pca components: 58
shape after pca: (92, 58)
0.1
sh

In [39]:
# Option 3 (meta learner) for one target variable.
# For every held-out target city:
#   1. split each remaining city into base-model training data and 30 rows of meta-training data,
#   2. train two boosted base models per remaining city (lasso-selected features and PCA+lasso components),
#   3. use the mean of both base-model predictions per city as one input feature of the meta learner,
#   4. fit a linear regression on these features and score it on the held-out city.
target = 'foreign_nationals'

# (city, country) pairs in the order in which the base models are trained
base_models = [('marseille', 'FR'), ('lyon', 'FR'), ('paris', 'FR'),
               ('bremen', 'DE'), ('hamburg', 'DE'), ('berlin', 'DE')]

# keyword arguments shared by every call of the model selection helpers
helper_kwargs = dict(socio_year=2015, density_type='count', radius=1000, meta_learner=True)


def _mean_model(lasso_pred, pca_lasso_pred):
    # row-wise mean of both base-model predictions; float64 keeps the meta
    # features in one dtype regardless of the base models' output dtype
    both = pd.DataFrame({'boosted_lasso': lasso_pred, 'boosted_pca_lasso': pca_lasso_pred}, dtype='float64')
    return both.mean(axis=1)


for target_city in ['marseille', 'lyon', 'paris', 'berlin', 'hamburg', 'bremen']:

    # collect the per-city pieces in lists and concatenate once
    # (DataFrame.append is deprecated and no longer available in pandas 2.x)
    train_parts = []
    meta_train_parts = []
    test_full = pd.DataFrame(columns=agg_full.columns)

    # filter data for respective city
    for city in ['marseille', 'lyon', 'paris', 'berlin', 'hamburg', 'bremen']:
        subset = agg_full[agg_full.assigned_city == city]

        if target_city != city:
            # every non-target city contributes training data and 30 rows of meta training data
            train, meta_train = train_test_split(subset, test_size=30, random_state=41)
            train_parts.append(train)
            meta_train_parts.append(meta_train)
        else:
            # the target city is held out completely and used as test data
            test_full = subset

    # build all three subsets with a fresh 0..n-1 index
    train_full = pd.concat(train_parts, ignore_index=True)
    meta_train_full = pd.concat(meta_train_parts, ignore_index=True)
    test_full = test_full.reset_index(drop=True)

    print('shape of training data: '+str(train_full.shape))
    print('shape of meta training data: '+str(meta_train_full.shape))
    print('shape of test data: '+str(test_full.shape))

    # column layout of the meta learner frames: one column per remaining city, then the target
    cities = ['marseille', 'lyon', 'paris', 'hamburg', 'bremen', 'berlin', target]
    cities.remove(target_city)
    feature_cities = cities[:-1]

    # split data in feature and target
    X_meta_train_full = meta_train_full.iloc[:, 2:-5]
    y_meta_train_full = meta_train_full[[target]]

    X_test_full = test_full.iloc[:, 2:-5]
    y_test_full = test_full[[target]]

    # mean base-model prediction per city, for the meta training rows and the test rows
    meta_predictions = {}
    test_predictions = {}

    for city, country in base_models:
        if city == target_city:
            continue

        # filter out data of respective city
        city_training_data = train_full[train_full.assigned_city == city]
        city_kwargs = dict(agg=city_training_data, target=target, city=city, country=country, **helper_kwargs)

        # Lasso Boosted
        # get columns lasso model selects
        cols_lasso = get_best_lasso_model(output='used_columns', **city_kwargs)
        # get xgboost model trained on the selected columns
        boosted_lasso = train_xgboost(city_training_data, target, cols_lasso, 'classifier', learner_type='meta')
        # make prediction for meta_train data and test data
        boosted_lasso_predicts_meta = boosted_lasso.predict(X_meta_train_full[cols_lasso])
        boosted_lasso_predicts_test = boosted_lasso.predict(X_test_full[cols_lasso])

        # PCA Lasso boosted
        # NOTE(review): the helper is called three times with identical inputs and, judging by the
        # repeated log output, refits each time - consider returning all three results from one call
        # get the components pca returns
        comps = get_best_pca_lasso_model(output='components', **city_kwargs)
        # get columns to select the most important components from pca
        cols_pca_lasso = get_best_pca_lasso_model(output='used_columns', **city_kwargs)
        # get fitted pca model
        pca = get_best_pca_lasso_model(output='pca_classifier', **city_kwargs)
        # create reduced training data based on pca components and re-attach the last five columns
        reduced_data = pd.DataFrame(comps).join(city_training_data.iloc[:, -5:].reset_index(drop=True))
        # get trained xgboost
        boosted_pca_lasso = train_xgboost(reduced_data, target, cols_pca_lasso, 'classifier', learner_type='meta')

        # apply pca to meta_train data
        print('meta full: '+str(meta_train_full.shape))
        reduced_X_meta_train_full = pd.DataFrame(pca.transform(X_meta_train_full))
        print('reduced pca: '+str(reduced_X_meta_train_full.shape))

        # apply pca to test data
        print('test full: '+str(test_full.shape))
        reduced_X_test_full = pd.DataFrame(pca.transform(X_test_full))
        print('reduced pca: '+str(reduced_X_test_full.shape))

        # make predictions for meta_train data and test data
        boosted_pca_lasso_predicts_meta = boosted_pca_lasso.predict(reduced_X_meta_train_full[cols_pca_lasso])
        boosted_pca_lasso_predicts_test = boosted_pca_lasso.predict(reduced_X_test_full[cols_pca_lasso])

        # the city's meta feature is the mean of both base-model predictions
        meta_predictions[city] = _mean_model(boosted_lasso_predicts_meta, boosted_pca_lasso_predicts_meta)
        test_predictions[city] = _mean_model(boosted_lasso_predicts_test, boosted_pca_lasso_predicts_test)

    # assemble the meta learner frames: city predictions, then the target, then the city label
    meta_learner_train = pd.DataFrame(meta_predictions, columns=feature_cities)
    meta_learner_test = pd.DataFrame(test_predictions, columns=feature_cities)
    meta_learner_train[target] = y_meta_train_full[target]
    meta_learner_test[target] = y_test_full[target]
    meta_learner_train['assigned_city'] = meta_train_full.assigned_city
    meta_learner_test['assigned_city'] = test_full.assigned_city

    # train meta learner on the city prediction columns only
    clf = LinearRegression()
    clf.fit(meta_learner_train[feature_cities], meta_learner_train[[target]])

    # make prediction for test data
    y_pred = clf.predict(meta_learner_test[feature_cities])
    # naive baseline: always predict the mean target value of the meta training data
    naive_pred = [meta_learner_train[[target]].mean().values[0]] * len(meta_learner_test[[target]])

    # percentage MSE reduction relative to the naive baseline, and plain R2 on the held-out city
    meta_learner_results.loc[target_city, target] = (100-(metrics.mean_squared_error(meta_learner_test[[target]], y_pred)/metrics.mean_squared_error(meta_learner_test[[target]], naive_pred)*100))
    meta_learner_r2_score.loc[target_city, target] = metrics.r2_score(meta_learner_test[[target]], y_pred)

shape of training data: (1263, 243)
shape of meta training data: (150, 243)
shape of test data: (344, 243)
0.1
shape before pca: (146, 236)
number of pca components: 66
shape after pca: (146, 66)
0.2
shape before pca: (146, 236)
number of pca components: 66
shape after pca: (146, 66)
0.2
shape before pca: (146, 236)
number of pca components: 66
shape after pca: (146, 66)
0.2
meta full: (150, 243)
reduced pca: (150, 66)
test full: (344, 243)
reduced pca: (344, 66)
0.01
shape before pca: (831, 236)
number of pca components: 133
shape after pca: (831, 133)
0.01
shape before pca: (831, 236)
number of pca components: 133
shape after pca: (831, 133)
0.01
shape before pca: (831, 236)
number of pca components: 133
shape after pca: (831, 133)
0.01
meta full: (150, 243)
reduced pca: (150, 133)
test full: (344, 243)
reduced pca: (344, 133)
0.1
shape before pca: (99, 236)
number of pca components: 53
shape after pca: (99, 53)
0.5
shape before pca: (99, 236)
number of pca components: 53
shape after

shape before pca: (146, 236)
number of pca components: 66
shape after pca: (146, 66)
0.2
shape before pca: (146, 236)
number of pca components: 66
shape after pca: (146, 66)
0.2
meta full: (150, 243)
reduced pca: (150, 66)
test full: (125, 243)
reduced pca: (125, 66)
0.01
shape before pca: (831, 236)
number of pca components: 133
shape after pca: (831, 133)
0.01
shape before pca: (831, 236)
number of pca components: 133
shape after pca: (831, 133)
0.01
shape before pca: (831, 236)
number of pca components: 133
shape after pca: (831, 133)
0.01
meta full: (150, 243)
reduced pca: (150, 133)
test full: (125, 243)
reduced pca: (125, 133)
0.1
shape before pca: (99, 236)
number of pca components: 53
shape after pca: (99, 53)
0.5
shape before pca: (99, 236)
number of pca components: 53
shape after pca: (99, 53)
0.5
shape before pca: (99, 236)
number of pca components: 53
shape after pca: (99, 53)
0.5
meta full: (150, 243)
reduced pca: (150, 53)
test full: (125, 243)
reduced pca: (125, 53)
0.1


In [40]:
# Option 3 (meta learner) for one target variable.
# For every held-out target city:
#   1. split each remaining city into base-model training data and 30 rows of meta-training data,
#   2. train two boosted base models per remaining city (lasso-selected features and PCA+lasso components),
#   3. use the mean of both base-model predictions per city as one input feature of the meta learner,
#   4. fit a linear regression on these features and score it on the held-out city.
target = 'income_levels'

# (city, country) pairs in the order in which the base models are trained
base_models = [('marseille', 'FR'), ('lyon', 'FR'), ('paris', 'FR'),
               ('bremen', 'DE'), ('hamburg', 'DE'), ('berlin', 'DE')]

# keyword arguments shared by every call of the model selection helpers
helper_kwargs = dict(socio_year=2015, density_type='count', radius=1000, meta_learner=True)


def _mean_model(lasso_pred, pca_lasso_pred):
    # row-wise mean of both base-model predictions; float64 keeps the meta
    # features in one dtype regardless of the base models' output dtype
    both = pd.DataFrame({'boosted_lasso': lasso_pred, 'boosted_pca_lasso': pca_lasso_pred}, dtype='float64')
    return both.mean(axis=1)


for target_city in ['marseille', 'lyon', 'paris', 'berlin', 'hamburg', 'bremen']:

    # collect the per-city pieces in lists and concatenate once
    # (DataFrame.append is deprecated and no longer available in pandas 2.x)
    train_parts = []
    meta_train_parts = []
    test_full = pd.DataFrame(columns=agg_full.columns)

    # filter data for respective city
    for city in ['marseille', 'lyon', 'paris', 'berlin', 'hamburg', 'bremen']:
        subset = agg_full[agg_full.assigned_city == city]

        if target_city != city:
            # every non-target city contributes training data and 30 rows of meta training data
            train, meta_train = train_test_split(subset, test_size=30, random_state=41)
            train_parts.append(train)
            meta_train_parts.append(meta_train)
        else:
            # the target city is held out completely and used as test data
            test_full = subset

    # build all three subsets with a fresh 0..n-1 index
    train_full = pd.concat(train_parts, ignore_index=True)
    meta_train_full = pd.concat(meta_train_parts, ignore_index=True)
    test_full = test_full.reset_index(drop=True)

    print('shape of training data: '+str(train_full.shape))
    print('shape of meta training data: '+str(meta_train_full.shape))
    print('shape of test data: '+str(test_full.shape))

    # column layout of the meta learner frames: one column per remaining city, then the target
    cities = ['marseille', 'lyon', 'paris', 'hamburg', 'bremen', 'berlin', target]
    cities.remove(target_city)
    feature_cities = cities[:-1]

    # split data in feature and target
    X_meta_train_full = meta_train_full.iloc[:, 2:-5]
    y_meta_train_full = meta_train_full[[target]]

    X_test_full = test_full.iloc[:, 2:-5]
    y_test_full = test_full[[target]]

    # mean base-model prediction per city, for the meta training rows and the test rows
    meta_predictions = {}
    test_predictions = {}

    for city, country in base_models:
        if city == target_city:
            continue

        # filter out data of respective city
        city_training_data = train_full[train_full.assigned_city == city]
        city_kwargs = dict(agg=city_training_data, target=target, city=city, country=country, **helper_kwargs)

        # Lasso Boosted
        # get columns lasso model selects
        cols_lasso = get_best_lasso_model(output='used_columns', **city_kwargs)
        # get xgboost model trained on the selected columns
        boosted_lasso = train_xgboost(city_training_data, target, cols_lasso, 'classifier', learner_type='meta')
        # make prediction for meta_train data and test data
        boosted_lasso_predicts_meta = boosted_lasso.predict(X_meta_train_full[cols_lasso])
        boosted_lasso_predicts_test = boosted_lasso.predict(X_test_full[cols_lasso])

        # PCA Lasso boosted
        # NOTE(review): the helper is called three times with identical inputs and, judging by the
        # repeated log output, refits each time - consider returning all three results from one call
        # get the components pca returns
        comps = get_best_pca_lasso_model(output='components', **city_kwargs)
        # get columns to select the most important components from pca
        cols_pca_lasso = get_best_pca_lasso_model(output='used_columns', **city_kwargs)
        # get fitted pca model
        pca = get_best_pca_lasso_model(output='pca_classifier', **city_kwargs)
        # create reduced training data based on pca components and re-attach the last five columns
        reduced_data = pd.DataFrame(comps).join(city_training_data.iloc[:, -5:].reset_index(drop=True))
        # get trained xgboost
        boosted_pca_lasso = train_xgboost(reduced_data, target, cols_pca_lasso, 'classifier', learner_type='meta')

        # apply pca to meta_train data
        print('meta full: '+str(meta_train_full.shape))
        reduced_X_meta_train_full = pd.DataFrame(pca.transform(X_meta_train_full))
        print('reduced pca: '+str(reduced_X_meta_train_full.shape))

        # apply pca to test data
        print('test full: '+str(test_full.shape))
        reduced_X_test_full = pd.DataFrame(pca.transform(X_test_full))
        print('reduced pca: '+str(reduced_X_test_full.shape))

        # make predictions for meta_train data and test data
        boosted_pca_lasso_predicts_meta = boosted_pca_lasso.predict(reduced_X_meta_train_full[cols_pca_lasso])
        boosted_pca_lasso_predicts_test = boosted_pca_lasso.predict(reduced_X_test_full[cols_pca_lasso])

        # the city's meta feature is the mean of both base-model predictions
        meta_predictions[city] = _mean_model(boosted_lasso_predicts_meta, boosted_pca_lasso_predicts_meta)
        test_predictions[city] = _mean_model(boosted_lasso_predicts_test, boosted_pca_lasso_predicts_test)

    # assemble the meta learner frames: city predictions, then the target, then the city label
    meta_learner_train = pd.DataFrame(meta_predictions, columns=feature_cities)
    meta_learner_test = pd.DataFrame(test_predictions, columns=feature_cities)
    meta_learner_train[target] = y_meta_train_full[target]
    meta_learner_test[target] = y_test_full[target]
    meta_learner_train['assigned_city'] = meta_train_full.assigned_city
    meta_learner_test['assigned_city'] = test_full.assigned_city

    # train meta learner on the city prediction columns only
    clf = LinearRegression()
    clf.fit(meta_learner_train[feature_cities], meta_learner_train[[target]])

    # make prediction for test data
    y_pred = clf.predict(meta_learner_test[feature_cities])
    # naive baseline: always predict the mean target value of the meta training data
    naive_pred = [meta_learner_train[[target]].mean().values[0]] * len(meta_learner_test[[target]])

    # percentage MSE reduction relative to the naive baseline, and plain R2 on the held-out city
    meta_learner_results.loc[target_city, target] = (100-(metrics.mean_squared_error(meta_learner_test[[target]], y_pred)/metrics.mean_squared_error(meta_learner_test[[target]], naive_pred)*100))
    meta_learner_r2_score.loc[target_city, target] = metrics.r2_score(meta_learner_test[[target]], y_pred)

shape of training data: (1263, 243)
shape of meta training data: (150, 243)
shape of test data: (344, 243)
0.1
shape before pca: (146, 236)
number of pca components: 66
shape after pca: (146, 66)
0.1
shape before pca: (146, 236)
number of pca components: 66
shape after pca: (146, 66)
0.1
shape before pca: (146, 236)
number of pca components: 66
shape after pca: (146, 66)
0.1
meta full: (150, 243)
reduced pca: (150, 66)
test full: (344, 243)
reduced pca: (344, 66)
0.01
shape before pca: (831, 236)
number of pca components: 133
shape after pca: (831, 133)
0.01
shape before pca: (831, 236)
number of pca components: 133
shape after pca: (831, 133)
0.01
shape before pca: (831, 236)
number of pca components: 133
shape after pca: (831, 133)
0.01
meta full: (150, 243)
reduced pca: (150, 133)
test full: (344, 243)
reduced pca: (344, 133)
0.1
shape before pca: (99, 236)
number of pca components: 53
shape after pca: (99, 53)
0.2
shape before pca: (99, 236)
number of pca components: 53
shape after

shape before pca: (146, 236)
number of pca components: 66
shape after pca: (146, 66)
0.1
shape before pca: (146, 236)
number of pca components: 66
shape after pca: (146, 66)
0.1
meta full: (150, 243)
reduced pca: (150, 66)
test full: (125, 243)
reduced pca: (125, 66)
0.01
shape before pca: (831, 236)
number of pca components: 133
shape after pca: (831, 133)
0.01
shape before pca: (831, 236)
number of pca components: 133
shape after pca: (831, 133)
0.01
shape before pca: (831, 236)
number of pca components: 133
shape after pca: (831, 133)
0.01
meta full: (150, 243)
reduced pca: (150, 133)
test full: (125, 243)
reduced pca: (125, 133)
0.1
shape before pca: (99, 236)
number of pca components: 53
shape after pca: (99, 53)
0.2
shape before pca: (99, 236)
number of pca components: 53
shape after pca: (99, 53)
0.2
shape before pca: (99, 236)
number of pca components: 53
shape after pca: (99, 53)
0.2
meta full: (150, 243)
reduced pca: (150, 53)
test full: (125, 243)
reduced pca: (125, 53)
0.1


In [41]:
# Add a 'mean' row holding the average R2 score per target over all cities.
# A 'mean' row left over from an earlier run of this cell is dropped before averaging,
# so that it is never counted as if it were a city when the city scores have changed.
meta_learner_r2_score.loc['mean'] = meta_learner_r2_score.drop(index='mean', errors='ignore').mean()
meta_learner_r2_score

Unnamed: 0,unemployment_rate,foreign_nationals,income_levels
marseille,0.115469,0.161379,0.09823
lyon,-0.173374,-0.152299,-0.095444
paris,0.148987,-0.111399,0.134582
berlin,0.198991,-0.077528,0.126903
hamburg,0.070679,0.12844,0.06697
bremen,0.076006,-0.04257,0.139417
mean,0.072793,-0.015663,0.078443


In [42]:
# Show the meta learner's improvement over the naive baseline per target city (rows) and target (columns):
# 100 - MSE(meta learner) / MSE(constant mean of the meta training targets) * 100.
# Positive values mean the meta learner has a lower MSE than the baseline on the held-out city.
meta_learner_results

Unnamed: 0,unemployment_rate,foreign_nationals,income_levels
marseille,12.189508,17.691429,10.959759
lyon,-16.79867,-14.959591,-9.343855
paris,15.476607,-9.775911,13.609479
berlin,31.147518,-7.203995,13.143534
hamburg,20.359341,12.85219,10.349731
bremen,11.396219,4.625742,18.145471
