In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline

import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import Image

from my_defs import* 
import os
from multiprocessing import cpu_count

# Seed shared by the stochastic steps of this notebook.
seed = 42
# Roughly half of the available cores. The searches below pass
# `n_jobs=jobs - 1`; joblib rejects n_jobs == 0, so `jobs` is kept at 2 or
# more to guarantee at least one worker on small machines.
jobs = max(2, round(cpu_count() / 2))

In [None]:
# Read the download and upload tables from CSV files in the working directory.
data_down, data_up = (
    pd.read_csv(csv_path)
    for csv_path in ('data_down_full.csv', 'data_up_full.csv')
)

In [None]:
# Columns removed from the upload table before modelling.
# Unlike the download list below, this one also contains 'Innenstadt Nord'.
drop_cols_u = [
    'connected', 'date', 'year', 'rawTimesamp',
    'month', 'id', 'throughput_var', 'ci',
    'tavg', 'tmax', 'tmin', 'wdir', 'pres', 'tsun', 'dir',
    'campus', 'highway', 'urban', 'suburban',
    'Barop', 'Brünninghausen', 'Eichlinghofen', 'Groß-Barop', 'Hombruch',
    'Innenstadt Nord',
    'KGV Ruhrwaldstraße', 'Kirchhörde', 'Klinikviertel', 'Kruckel',
    'Löttringhausen', 'Lücklemberg', 'Mitte', 'Persebeck', 'Renninghausen',
    'Salingen', 'Syburg', 'Wellinghofen', 'Wichlinghofen',
]

# Columns removed from the download table before modelling.
drop_cols_d = [
    'connected', 'date', 'year', 'rawTimesamp',
    'month', 'id', 'throughput_var', 'ci',
    'tavg', 'tmax', 'tmin', 'wdir', 'pres', 'tsun',
    'campus', 'highway', 'urban', 'suburban', 'dir',
    'Barop', 'Brünninghausen', 'Eichlinghofen', 'Groß-Barop', 'Hombruch',
    'KGV Ruhrwaldstraße', 'Kirchhörde', 'Klinikviertel', 'Kruckel',
    'Löttringhausen', 'Lücklemberg', 'Mitte', 'Persebeck', 'Renninghausen',
    'Salingen', 'Syburg', 'Wellinghofen', 'Wichlinghofen',
]

# Rebinding the same names means this cell cannot be re-run without
# re-loading the CSV files first (the columns are already gone).
data_down = data_down.drop(columns=drop_cols_d)
data_up = data_up.drop(columns=drop_cols_u)


# possible feature transformations (currently disabled)
#data_up['rsrp'] = 10**((data_up['rsrp']/10)/1000)
#data_down['rsrp'] = 10**((data_down['rsrp']/10)/1000)

#data_up['rsrq'] = 10**(data_up['rsrq']/20)
#data_down['rsrq'] = 10**(data_down['rsrq']/20)

In [None]:
# Hold out 20 % of each table as a test set. shuffle=False keeps the rows in
# their existing order, so the test set is the tail of each table;
# random_state only matters when shuffling and is passed unchanged.
x_train_d, x_test_d, y_train_d, y_test_d = train_test_split(
    data_down.drop(columns=['throughput']),
    data_down["throughput"],
    test_size=0.2,
    random_state=42,
    shuffle=False,
)

x_train_u, x_test_u, y_train_u, y_test_u = train_test_split(
    data_up.drop(columns=['throughput']),
    data_up["throughput"],
    test_size=0.2,
    random_state=42,
    shuffle=False,
)

# XGB Download

In [None]:
%%time
model = Pipeline(steps=[('scaler', None),
                        ('regressor', xgb.XGBRegressor())])     

pg = [{
    'scaler': [
       # None,
        RobustScaler(),
       # StandardScaler(),
       # MinMaxScaler()
    ],    
    'regressor__booster': [ 
        'gbtree'
    ],                       
    'regressor__n_estimators': [
        105,
       # 100,
        #110,
      #  95,
       # 166,
       # 65
    ],                          
    'regressor__max_depth ': [
        38,
       # None,
       # 37, 
       # 39, 
        #40,
       # 47
        
    ],
    'regressor__learning_rate': [
       # 0.1,
        #0.12,
        0.11,
        #0.099

    ],
    'regressor__min_child_weight': [
        #2,
       # 3,
        #4,
     #   9,
       # 11,
       # 15,
        18,
        #22,
        #25
        
    ],
    'regressor__reg_alpha': [
       # 0.55,
       # 0.65,
        0.6,
       # 0.7
    ],
    'regressor__reg_lambda': [
        0.9,
       # 0.8,
        #0.85,
       # 0.95
    ],
    'regressor__objective': [
        'reg:squarederror',
       # 'reg:squaredlogerror'
    ],   
    'regressor__base_score':[
        0,
        0.5
    ], 
    'regressor__verbosity':[  # kill warnings 
        0  
    ]
}] 
xgb_d = RandomizedSearchCV(
    model, scoring='neg_mean_squared_error', 
    param_distributions=pg, cv=4, refit=True,
    n_jobs=jobs - 1, verbose = 0, random_state = seed)    

# training
xgb_d.fit(x_train_d, y_train_d)


In [None]:
# best_score_ is a negated MSE (see the search's scoring), so flip the sign
# and take the root to report an RMSE.
print("Score:", np.sqrt(-xgb_d.best_score_))
display("Parameters:", xgb_d.best_params_)

In [None]:
# Final download XGBoost model with hard-coded hyperparameters.
# NOTE(review): scale() comes from my_defs; presumably it fits the scaler on
# its first argument and transforms the second - confirm.
scaled_xtr_d = scale(x_train_d, x_train_d, RobustScaler())
scaled_xtest_d = scale(x_train_d, x_test_d, RobustScaler())

xgb_down = fit_xgb(  # function in my_defs
    scaled_xtr_d,
    y_train_d,
    objective='reg:squarederror',
    booster="gbtree",
    n_estimators=105,
    max_depth=38,
    learning_rate=0.11,
    min_child_weight=18,
    reg_alpha=0.6,
    reg_lambda=0.9,
    base_score=0.5,
    random_state=seed,
)

# Score on the scaled hold-out split.
xgb_score_down = rmse(xgb_down, scaled_xtest_d, y_test_d)  # function in my_defs


In [None]:


%%time
# pipeline models: Scaler -> RFR
model = Pipeline(steps=[('scaler', None),
                        ('regressor', xgb.XGBRegressor())])     
# grid search params
pg = [{
    'scaler': [
       #None,
        RobustScaler()
       #StandardScaler(),
       #MinMaxScaler()
    ],    
    'regressor__booster': [ 
        'gbtree'
    ],                       
    'regressor__n_estimators': [
     #  100,
     # 135,
      # 125,
     #  120,
        130,
       #150
        
    ],                          
    'regressor__max_depth ': [
        #one,
       #3,
        #,
      # 12,
      # 17,
      #20,
        26,
      # 31,
       #37,
       #40,
       #60

    ],
    'regressor__learning_rate': [
     #  0.12,
       0.11
       #0.10,
       #0.15
    ],
   'regressor__min_child_weight': [
       #,
   #  7,
       6
     # 10,
    #  8
   ],
   'regressor__reg_alpha': [
       0.35
     # 0.3,
      #0.55,
      #.25
   ],
   'regressor__reg_lambda': [
      #0.9,
       0.85,
      #0.95,
      #0.8
   ],
    'regressor__objective': [
        'reg:squarederror'
    ],
    'regressor__base_score':[0],
    'regressor__verbosity':[0]
        
        }] 
xgb_u = RandomizedSearchCV(
    model, scoring='neg_mean_squared_error', 
    param_distributions=pg, cv=4,refit=True,
    n_jobs=jobs - 1, verbose = 0, random_state = seed)    

# training
xgb_u.fit(x_train_u, y_train_u)


In [None]:
# best_score_ is a negated MSE (see the search's scoring), so flip the sign
# and take the root to report an RMSE.
print("Score:", np.sqrt(-xgb_u.best_score_))
display("Parameters:", xgb_u.best_params_)

In [None]:
# Final upload XGBoost model with hard-coded hyperparameters.
# NOTE(review): scale() comes from my_defs; presumably it fits the scaler on
# its first argument and transforms the second - confirm.
scaled_xtr_u = scale(x_train_u, x_train_u, RobustScaler())
scaled_xtest_u = scale(x_train_u, x_test_u, RobustScaler())

# max_depth is set to 26, the only depth listed in the upload search grid;
# this call previously passed 27, a value the search never evaluated.
xgb_up = fit_xgb(  # function in my_defs
    scaled_xtr_u,
    y_train_u,
    objective='reg:squarederror',
    booster="gbtree",
    n_estimators=130,
    max_depth=26,
    learning_rate=0.11,
    min_child_weight=6,
    reg_alpha=0.35,
    reg_lambda=0.85,
    base_score=0,
    random_state=seed,
)

# Score on the scaled hold-out split.
xgb_score_up = rmse(xgb_up, scaled_xtest_u, y_test_u)  # function in my_defs

# Random Forest Download

In [None]:
%%time
model = Pipeline(steps=[('scaler', None),
                        ('regressor', RandomForestRegressor())])     

pg = [{
    'scaler': [
        None ,
      #  StandardScaler(), 
      #  RobustScaler()
    ],    
    'regressor__n_estimators': [ 
       # 128,
        125,
       # 140,
      #  130
        
        
        #115
    ],                             
    'regressor__max_depth': [
        #20,
      #  25,
       # 15,
      #  20,
        30,
       # 35,
       # 40
    ],
    'regressor__min_samples_split': [
        #None,
       # 3,
        9,
        #8,
        #10,
       # 11
        #6
       # 8
        
    ],
    'regressor__max_features': [
        'auto'
    ],
    'regressor__ccp_alpha': [
        0.0035455
        #0.003575, 
        #0.0035465
    ],   
    'regressor__max_samples':[
      #  None,
      #  0.35,
      #  0.5,
        #0.4,
        0.45
        #0.25
    ]
}] 
rf_d = RandomizedSearchCV(
    model, scoring='neg_mean_squared_error', 
    param_distributions=pg, cv=4, refit=True,
    n_jobs=jobs - 1, verbose = 0, random_state = seed)    


rf_d.fit(x_train_d, y_train_d)

In [None]:
# best_score_ is a negated MSE (see the search's scoring), so flip the sign
# and take the root to report an RMSE.
print("Score:", np.sqrt(-rf_d.best_score_))
display("Parameters:", rf_d.best_params_)

In [None]:
# Final download random forest with hard-coded hyperparameters.
# The features are used unscaled; the scaling calls stay disabled:
#scaled_xtr = scale(x_train_d, x_train_d, StandardScaler())
#scaled_xtest = scale(x_train_d, x_test_d, StandardScaler())

rf_down = fit_rf(  # function in my_defs
    x_train_d,
    y_train_d,
    n_estimators=125,
    max_depth=30,
    min_samples_split=9,
    max_samples=0.45,
    ccp_alpha=0.0035455,
    random_state=seed,
)

# Score on the hold-out split.
rf_score_down = rmse(rf_down, x_test_d, y_test_d)  # function in my_defs

# Random Forest Upload

In [None]:
%%time
model = Pipeline(steps=[('scaler', None),
                        ('regressor', RandomForestRegressor())])     

pg = [{
    'scaler': [
        None 
    #   StandardScaler(), 
     #  RobustScaler()
    ],    
    'regressor__n_estimators': [ 
        #20,
        150
        #00,
        #25,
        #15,
        
       #110,
       #90,
      # 130,
        #40

    ],                             
    'regressor__max_depth': [
     #  None,
     #  16,
        65
        #9,
        #7
     ## 20,
     #  25
      # 
        
    ],
    'regressor__min_samples_split': [
        
       #10,
       #3,
       5,
      # 4
        #,
        #0
        #6
    ],
    'regressor__max_features': [
        'auto'
    ],
    'regressor__max_samples':[
        #one,
       #0.5,
        0.7,
       #0.65,
       #0.75
    ],
    'regressor__ccp_alpha':[

        0.001462

    ]
     
}] 
rf_u = RandomizedSearchCV(
    model, scoring='neg_mean_squared_error', 
    param_distributions=pg, cv=4,refit=True,
    n_jobs=jobs - 1, verbose = 0, random_state = seed)    

# training
rf_u.fit(x_train_u, y_train_u)

In [None]:
# best_score_ is a negated MSE (see the search's scoring), so flip the sign
# and take the root to report an RMSE.
print("Score:", np.sqrt(-rf_u.best_score_))
display("Parameters:", rf_u.best_params_)

In [None]:
# Final upload random forest with hard-coded hyperparameters.
# The features are used unscaled; the scaling calls stay disabled:
#scaled_xtr = scale(x_train_u, x_train_u, RobustScaler())
#scaled_xtest = scale(x_train_u, x_test_u, RobustScaler())

# min_samples_split is set to 5, the only value listed in the upload search
# grid; this call previously passed 4, a value the search never evaluated.
rf_up = fit_rf(  # function in my_defs
    x_train_u,
    y_train_u,
    n_estimators=150,
    max_depth=65,
    min_samples_split=5,
    max_samples=0.7,
    ccp_alpha=0.001462,
    random_state=seed,
)

# Score on the hold-out split.
rf_score_up = rmse(rf_up, x_test_u, y_test_u)  # function in my_defs

# Visualization 

In [None]:
# Plot the download XGBoost predictions for the scaled hold-out split against
# the true throughput values.
pred = xgb_down.predict(scaled_xtest_d)
visualize_prediction_value_ordered_examples(  # function in my_defs
    y_test_d,
    pred,
    title='XGBoost Download Predictions and Ground Truth Values',
    score=xgb_score_down,
)

In [None]:
# Plot the upload XGBoost predictions for the scaled hold-out split against
# the true throughput values.
pred = xgb_up.predict(scaled_xtest_u)
visualize_prediction_value_ordered_examples(  # function in my_defs
    y_test_u,
    pred,
    title='XGBoost Upload Predictions and Ground Truth Values',
    score=xgb_score_up,
)

In [None]:
# Plot the download random-forest predictions for the hold-out split against
# the true throughput values.
pred = rf_down.predict(x_test_d)
visualize_prediction_value_ordered_examples(  # function in my_defs
    y_test_d,
    pred,
    title='Random Forrest Download Predictions and Ground Truth Values',
    score=rf_score_down,
)

In [None]:
# Plot the upload random-forest predictions for the hold-out split against
# the true throughput values.
pred = rf_up.predict(x_test_u)
visualize_prediction_value_ordered_examples(  # function in my_defs
    y_test_u,
    pred,
    title='Random Forrest Upload Predictions and Ground Truth values',
    score=rf_score_up,
)

# Learning Curve

In [None]:
%%time

try:
    display(Image(filename='rf_learning_curve.png'))
    
except FileNotFoundError:
    _, axes = plt.subplots(1,2, figsize=(35, 18))
    plot_learning_curves( # function in my_defs 
       rf_u, x_train_u, y_train_u,random_state = seed,
        train_sizes=np.linspace(0.01, 1.0, 50), cv=3,
        scoring='neg_root_mean_squared_error', model_name='RandomForrest Learning Curve (Upload)', 
        ax=axes[0], n_jobs=-1)
    plot_learning_curves( # function in my_defs
        rf_d, x_train_d, y_train_d,random_state = seed,
        train_sizes=np.linspace(0.01, 1.0, 50), cv=3,
        scoring='neg_root_mean_squared_error', model_name='RandomForrest Learning Curve (Download)', 
        ax=axes[1], n_jobs=-1)
    plt.savefig('rf_learning_curve' + '.png', dpi=200)

In [None]:
%%time

try:
    display(Image(filename='xgb_learning_curve.png'))
    
except FileNotFoundError:
    _, axes = plt.subplots(1,2, figsize=(35, 18))
    plot_learning_curves( # function in my_defs
        xgb_u, x_train_u, y_train_u,random_state = seed,
        train_sizes=np.linspace(0.01, 1.0, 40), cv=3,
        scoring='neg_root_mean_squared_error', model_name='XGBoost Learnong Curve (Upload)', 
        ax=axes[0], n_jobs=-1)
    plot_learning_curves( # function in my_defs
        xgb_d, x_train_d, y_train_d,random_state = seed,
        train_sizes=np.linspace(0.01, 1.0, 40), cv=3,
        scoring='neg_root_mean_squared_error', model_name='XGBoost Learning Curve (Download)', 
        ax=axes[1], n_jobs=-1)
    plt.savefig('xgb_learning_curve' + '.png', dpi=200)