In [22]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error,r2_score,mean_squared_error,explained_variance_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostRegressor
from sklearn.linear_model import LinearRegression
import xgboost as xgb
import lightgbm as lgb
import matplotlib.pyplot as plt
import connectorx as cx
import seaborn as sns
import mlflow
from mlflow.models import infer_signature
import shap
import os

In [78]:
#!pip install shap==0.44.1




In [2]:
print(shap.__version__)

0.44.1


In [9]:
pd.set_option('display.max_columns',None)

In [8]:
sql_command="""
select * from(SELECT DISTINCT isg_osm_id,next_isg_osm_id AS conn_isg_osm_id,
	cur_fc,next_fc AS conn_fc,
	highway_id ,next_highway_id AS conn_highway_id,
	countyid,next_countyid AS conn_countyid,
	region ,next_region AS conn_region,
	roadwaytype_id, next_roadwaytype_id as conn_roadwaytype_id,
	next_connected_count AS conn_count,next_pcc AS conn_ncc_pcc,
	cur_lanes,next_lanes AS conn_lanes,
	cur_maxspeed,next_maxspeed AS conn_maxspeed,
	cur_final_place ,next_final_place AS conn_final_place,
	volume,next_volume AS conn_volume,
'next' AS conn_tag
FROM tm_new_data.nys_hour_8_volume_240321
WHERE next_isg_osm_id>0 AND volume>0 AND cur_fc IN (1,2,3) AND next_fc IN (1,2,3) AND next_connected_count > 1 AND next_pcc=1
UNION 
SELECT DISTINCT isg_osm_id, prev_isg_osm_id, 
	cur_fc, prev_fc, 
	highway_id, prev_highway_id, 
	countyid, prev_countyid,
	region, prev_region, 
	roadwaytype_id, prev_roadwaytype_id,
	prev_connected_count, prev_ncc,  
	cur_lanes, prev_lanes, 
	cur_maxspeed, prev_maxspeed,
	cur_final_place, prev_final_place,
	volume, prev_volume,
	'prev' as conn_tag
FROM tm_new_data.nys_hour_8_volume_240321 
WHERE prev_isg_osm_id >0 AND volume>0 AND cur_fc IN (1,2,3) AND prev_fc IN (1,2,3) AND prev_connected_count = 1 AND prev_ncc > 1)foo
"""

data=cx.read_sql("postgres://postgres:123@NjMpTs@20.62.75.39:9030/isgpostgres",sql_command)

In [10]:
data.columns

Index(['isg_osm_id', 'conn_isg_osm_id', 'cur_fc', 'conn_fc', 'highway_id',
       'conn_highway_id', 'countyid', 'conn_countyid', 'region', 'conn_region',
       'roadwaytype_id', 'conn_roadwaytype_id', 'conn_count', 'conn_ncc_pcc',
       'cur_lanes', 'conn_lanes', 'cur_maxspeed', 'conn_maxspeed',
       'cur_final_place', 'conn_final_place', 'volume', 'conn_volume',
       'conn_tag'],
      dtype='object')

In [11]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9001 entries, 0 to 9000
Data columns (total 23 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   isg_osm_id           9001 non-null   Int64 
 1   conn_isg_osm_id      9001 non-null   Int64 
 2   cur_fc               9001 non-null   Int64 
 3   conn_fc              9001 non-null   Int64 
 4   highway_id           9001 non-null   Int64 
 5   conn_highway_id      9001 non-null   Int64 
 6   countyid             9001 non-null   Int64 
 7   conn_countyid        9001 non-null   Int64 
 8   region               9001 non-null   Int64 
 9   conn_region          9001 non-null   Int64 
 10  roadwaytype_id       9001 non-null   Int64 
 11  conn_roadwaytype_id  9001 non-null   Int64 
 12  conn_count           9001 non-null   Int64 
 13  conn_ncc_pcc         9001 non-null   Int64 
 14  cur_lanes            9001 non-null   Int64 
 15  conn_lanes           9001 non-null   Int64 
 16  cur_ma

In [12]:
f_places=set(data['cur_final_place'].unique().tolist()+data['conn_final_place'].unique().tolist())
places={}

for cnt , i in enumerate(f_places):
    places.update({i:cnt})

data['cur_final_place_enc']=list(map(lambda x:places[x],data['cur_final_place']))
data['conn_final_place_enc']=list(map(lambda x:places[x],data['conn_final_place']))
data

Unnamed: 0,isg_osm_id,conn_isg_osm_id,cur_fc,conn_fc,highway_id,conn_highway_id,countyid,conn_countyid,region,conn_region,roadwaytype_id,conn_roadwaytype_id,conn_count,conn_ncc_pcc,cur_lanes,conn_lanes,cur_maxspeed,conn_maxspeed,cur_final_place,conn_final_place,volume,conn_volume,conn_tag,cur_final_place_enc,conn_final_place_enc
0,32215673003,32215673002,1,1,1,1,94,94,10,10,10,10,1,2,2,2,55,55,locality,locality,2824,3281,prev,5,5
1,46189288001,166038501001,2,2,3,3,84,84,11,11,10,10,2,1,3,2,35,35,neighbourhood,neighbourhood,1228,-1,next,6,6
2,20110634001,20110634002,1,1,2,2,42,42,4,4,16,16,2,1,1,1,25,25,town,town,229,-1,next,3,3
3,987686553001,962631300001,1,1,1,1,83,83,11,11,12,12,1,2,3,3,50,50,city,city,3255,3725,prev,0,0
4,5647861001,5642412001,1,1,1,2,76,76,1,1,10,16,2,1,2,1,55,25,town,town,412,7,next,3,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8996,5647861002,5647861001,1,1,1,1,76,76,1,1,10,10,1,2,2,2,55,55,town,town,405,412,prev,3,3
8997,684488140002,684488140001,3,3,6,6,83,83,11,11,16,16,1,2,1,1,25,25,city,city,1008,-1,prev,0,0
8998,935852141005,24114123001,1,1,1,2,92,92,10,10,10,16,2,1,3,1,55,25,locality,locality,527,204,next,5,5
8999,20037392001,38624580001,1,1,2,1,36,36,5,5,16,10,1,2,1,2,25,65,city,city,40,-1,prev,0,0


In [52]:
#speed_cat=set(data['cur_maxspeed'].unique().tolist()+data['conn_maxspeed'].unique().tolist())
#speed={}
#for cnt , i in enumerate(speed_cat):
#    speed.update({i:cnt})
#data['cur_maxspeed_enc']=list(map(lambda x:speed[x],data['cur_maxspeed']))
#data['conn_maxspeed_enc']=list(map(lambda x:speed[x],data['conn_maxspeed']))

In [13]:
seen_data=data[data['conn_volume']>0]
unseen_data=data[data['conn_volume']<0]
seen_data

Unnamed: 0,isg_osm_id,conn_isg_osm_id,cur_fc,conn_fc,highway_id,conn_highway_id,countyid,conn_countyid,region,conn_region,roadwaytype_id,conn_roadwaytype_id,conn_count,conn_ncc_pcc,cur_lanes,conn_lanes,cur_maxspeed,conn_maxspeed,cur_final_place,conn_final_place,volume,conn_volume,conn_tag,cur_final_place_enc,conn_final_place_enc
0,32215673003,32215673002,1,1,1,1,94,94,10,10,10,10,1,2,2,2,55,55,locality,locality,2824,3281,prev,5,5
3,987686553001,962631300001,1,1,1,1,83,83,11,11,12,12,1,2,3,3,50,50,city,city,3255,3725,prev,0,0
4,5647861001,5642412001,1,1,1,2,76,76,1,1,10,16,2,1,2,1,55,25,town,town,412,7,next,3,3
6,28772519001,352006365001,1,1,2,2,92,92,10,10,16,16,1,2,1,2,25,25,village,village,292,1740,prev,1,1
7,20197226001,448596015005,1,1,2,1,68,68,8,8,16,10,1,2,1,2,40,55,town,town,44,1145,prev,3,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8994,999149827998001,9991033574469002,2,2,3,3,92,92,10,10,1,1,2,1,1,1,55,55,locality,locality,1782,1675,next,5,5
8995,382313898001,124083370001,1,1,2,1,76,76,1,1,16,10,1,2,1,3,55,55,town,city,337,711,prev,3,0
8996,5647861002,5647861001,1,1,1,1,76,76,1,1,10,10,1,2,2,2,55,55,town,town,405,412,prev,3,3
8998,935852141005,24114123001,1,1,1,2,92,92,10,10,10,16,2,1,3,1,55,25,locality,locality,527,204,next,5,5


In [14]:
total_feature_lst=['isg_osm_id', 'conn_isg_osm_id', 'cur_fc', 'conn_fc', 'highway_id',
       'conn_highway_id', 'countyid', 'conn_countyid', 'region', 'conn_region',
       'roadwaytype_id', 'conn_roadwaytype_id', 'conn_count', 'conn_ncc_pcc',
       'cur_lanes', 'conn_lanes', 'cur_maxspeed', 'conn_maxspeed', 'cur_maxspeed_enc', 'conn_maxspeed_enc',
       'cur_final_place', 'conn_final_place', 'volume', 'conn_volume',
       'conn_tag','cur_final_place_enc','conn_final_place_enc']

input_feature_lst=['cur_fc', 'conn_fc', 'highway_id',
       'conn_highway_id', 'countyid', 'conn_countyid', 'region', 'conn_region',
       'roadwaytype_id', 'conn_roadwaytype_id', 'conn_count', 'conn_ncc_pcc',
       'cur_lanes', 'conn_lanes', 'cur_maxspeed', 'conn_maxspeed',
        'cur_final_place_enc','conn_final_place_enc','volume']

target_feature_lst=['conn_volume']

In [15]:
unique_seen_data=seen_data[input_feature_lst+target_feature_lst].drop_duplicates()
unique_seen_data.shape

(4834, 20)

In [64]:
def log_model_summary(xgb_reg,X_train,X_val,X_test,y_train,y_val,y_test):
    test_data=X_test.copy()
    test_data['conn_volume']=y_test['conn_volume']
    test_data['predicted_conn_volume']=y_pred
    test_data['predicted_conn_volume']=test_data['predicted_conn_volume'].round().astype(int)
    test_data['diff']=test_data['conn_volume']-test_data['predicted_conn_volume']
    test_data['geh_value']=test_data.apply(lambda row:geh_check(row['conn_volume'],row['predicted_conn_volume']),axis=1)
    test_data['geh_value']=test_data['geh_value'].round(2)
    
    #Log Parameters
    xgb_params = xgb_reg.get_xgb_params()
    mlflow.log_params(xgb_params)

    # Log Metrics
    mse = round(mean_squared_error(y_test, y_pred),3)
    rmse = round(np.sqrt(mse),3)
    r2 = round(r2_score(y_test, y_pred),3)
    n = X_test.shape[0]
    p = X_test.shape[1]
    adjusted_r2score =  round((1 - (1 - r2) * (n - 1) / (n - p - 1)),3)
    mae = round(mean_absolute_error(y_test, y_pred),3)
    
    m_geh = round(mean_geh(y_test.values, y_pred),3)
    geh_grt_5 = test_data[test_data['geh_value']>5].shape[0]
    geh_error_per = round(((geh_grt_5/X_test.shape[0])*100),3)
    
    metrics = {'mse':mse,'rmse':rmse,'r2score':r2,'mae':mae,'adjusted_r2score' : adjusted_r2score,
               'mean_geh':m_geh,'geh_grt_5':geh_grt_5,'geh_error':geh_error_per,'geh_error_per':geh_error_per}
    
    mlflow.log_metrics(metrics)
    
    
     # log tags
    tags = {'Model_Name' : 'Stacking',
            
            'Train Size' : X_train.shape[0],
            'Valid Size' : X_val.shape[0],
            'Test Size' : X_test.shape[0],
            'No. Of Input Feature' : len(input_feature_lst),
            'No. Of Target Feature ': len(target_feature_lst),
            'input_feature_lst':input_feature_lst,
            'target_feature_lst': target_feature_lst}
    
    mlflow.set_tags(tags)
    
    return test_data

In [17]:
def geh_check(c_vol,n_vol):
    cur_vol=c_vol
    next_vol=n_vol
    geh_value=float(np.sqrt(2*np.sum((cur_vol-next_vol)**2)/np.sum(cur_vol+next_vol)))
    
    return geh_value

In [18]:
import statistics
def mean_geh(c_vol,n_vol):
    geh_lst=[]
    
    for i,j in zip(c_vol,n_vol):
        geh_lst.append(geh_check(i,j))
    
    return statistics.mean(geh_lst)

In [63]:
def stacking_model(X_train,X_val,X_test,y_train,y_val,y_test):
    
    params= {'subsample': 0.8, 'n_estimators': 10000, 'max_depth': 8,'learning_rate': 0.02, 
             'colsample_bytree': 0.8, 'gamma' : 5 , 'tree_method': 'exact', 'alpha': 5, 'lambda': 5,
             'min_child_weight':5,
            'objective': 'count:poisson','base_score': np.log(np.mean(y_train)),
             'early_stopping_rounds': 20, 'eval_metric': ['rmse', 'mae']}
    
    #XGBRegressor
    eval_set=[(X_train,y_train),(X_val,y_val)]
    xgb_reg=xgb.XGBRegressor(**params)
    xgb_reg.fit(X_train,y_train,eval_set=eval_set)
    xgb_val_pred=xgb_reg.predict(X_val)
    xgb_test_pred=xgb_reg.predict(X_test)
 
    
    #AdaBoost
    adaboost=AdaBoostRegressor(n_estimators=10000,learning_rate=0.02)
    adaboost.fit(X_train,y_train)
    ada_val_pred=adaboost.predict(X_val)
    ada_test_pred=adaboost.predict(X_test)
    
    val_set=X_val.copy()
    val_set['xgb']=xgb_val_pred
    val_set['ada']=ada_val_pred
    test_set=X_test.copy()
    test_set['xgb']=xgb_test_pred
    test_set['ada']=ada_test_pred
  
    
    return val_set,test_set

In [32]:
X,y=unique_seen_data[input_feature_lst],unique_seen_data[target_feature_lst]
X_train,X_temp,y_train,y_temp=train_test_split(X,y,train_size=0.7,random_state=42)             #Test
X_val,X_test,y_val,y_test=train_test_split(X_temp,y_temp,train_size=0.5,random_state=42)       #Validation

Unnamed: 0,cur_fc,conn_fc,highway_id,conn_highway_id,countyid,conn_countyid,region,conn_region,roadwaytype_id,conn_roadwaytype_id,conn_count,conn_ncc_pcc,cur_lanes,conn_lanes,cur_maxspeed,conn_maxspeed,cur_final_place_enc,conn_final_place_enc,volume,xgb,ada
7215,1,1,2,1,82,82,11,11,16,10,1,2,1,4,40,45,0,0,92,3995.202393,3697.649672
2956,1,1,1,1,94,94,10,10,10,10,1,2,4,4,55,55,5,5,2607,4018.082764,4703.476813
5998,3,3,5,5,84,84,11,11,1,1,2,1,1,2,25,25,6,6,436,456.073700,923.425065
3258,1,1,1,1,49,49,6,6,10,10,1,2,2,2,65,65,3,3,909,1002.579285,1148.037584
1066,1,1,2,1,48,48,3,3,16,13,1,2,1,3,25,65,3,3,260,1201.449219,2554.087318
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
640,1,1,1,1,42,42,4,4,10,10,2,1,3,3,65,65,3,3,1747,1508.631592,2494.165308
3159,1,1,1,1,89,89,11,11,10,10,1,2,3,3,50,50,0,0,4149,4510.565918,4135.312982
3838,1,1,1,2,92,92,10,10,10,16,2,1,2,1,55,25,1,5,4216,743.018250,884.924171
6281,1,1,1,1,82,82,11,11,10,10,2,1,4,3,50,50,0,0,5336,4615.131348,4056.339286


In [22]:
os.environ['MLFLOW_TRACKING_USERNAME']='isgadmin'
os.environ['MLFLOW_TRACKING_PASSWORD']='infosense12@'
tracking_server_uri = "https://ptt2-mlflow.isgsuite.com/"
mlflow.set_tracking_uri(tracking_server_uri)
tracking_uri=mlflow.get_tracking_uri()
print("Tracking URI:",tracking_uri)

Tracking URI: https://ptt2-mlflow.isgsuite.com/


In [62]:
#experiment_name = "TurningMovement_Latest"
#experiment = mlflow.set_experiment(experiment_name)
#experiment_id = experiment.experiment_id
run_name = 'stacking_with_fc_123_run without capacity'
description = 'prediction of conn_volume where (pcc=1 and prev_ncc>1) and (ncc>1 and next_pcc=1) for fc 1,2,3 (Models:XGB,ADB)'
# Get Experiment Details
print(f"Experiment_id: {experiment.experiment_id}")
print(f"Run Name: {run_name}")
print(f"Description: {description}")
# print(f"Artifact Location: {experiment.artifact_location}")

Experiment_id: 448241412080362865
Run Name: stacking_with_fc_123_run without capacity
Description: prediction of conn_volume where (pcc=1 and prev_ncc>1) and (ncc>1 and next_pcc=1) for fc 1,2,3 (Models:XGB,ADB)


In [87]:
with mlflow.start_run(experiment_id="448241412080362865",run_name=run_name,description=description) as run:

    params= {'subsample': 0.8, 'n_estimators': 10000, 'max_depth': 8,'learning_rate': 0.02, 
             'colsample_bytree': 0.8, 'gamma' : 5 , 'tree_method': 'exact', 'alpha': 5, 'lambda': 5,
             'min_child_weight':5,
            'objective': 'count:poisson','base_score': np.log(np.mean(y_train)),
             'early_stopping_rounds': 20, 'eval_metric': ['rmse', 'mae']}
    
    #XGBRegressor
    xgb_reg=xgb.XGBRegressor(**params)
    xgb.set_param(early_stopping_rounds=None)
    
    #Train the model
    eval_set=[(X_train,y_train),(X_val,y_val)]

    #xgb_reg.fit(X_train,y_train,eval_set=eval_set)
    

    
    #mlflow.shap.log_explanation(xgb_reg.predict,X_test)
    
    #Stacking
    val_set,test_set=stacking_model(X_train,X_val,X_test,y_train,y_val,y_test)
    xgb_reg.fit(val_set,y_val,eval_set=None,verbose=False)
   
    
    #Make predictions on test set
    y_pred=xgb_reg.predict(test_set)
    
    #Log model summary
    Test_data=log_model_summary(xgb_reg,X_train,X_val,X_test,y_train,y_val,y_test)
    
    #Feature Importance
    explainer=shap.TreeExplainer(xgb_reg)
    shap_values=explainer.shap_values(test_set)
    fig=shap.summary_plot(shap_values,test_set,plot_type='bar',show=False)
    image_path="Feature_Importance.png"
    plt.savefig(image_path)
    mlflow.log_artifact(image_path,"images")
    
    #log learning curve
    results = xgb_reg.evals_result()
    training_rounds = range(len(results['validation_0']['rmse']))
    plt.figure(figsize=(12, 6))
    plt.plot(training_rounds, results['validation_0']['rmse'], label='Training Loss')
    plt.plot(training_rounds, results['validation_1']['rmse'], label='Validation Loss')
    plt.xlabel('Boosting Round')
    plt.ylabel('RMSE') 
    plt.title('XGBoost Training and Validation Loss')
    plt.legend()

    image_path = "Learning_Curve.png"
    plt.savefig(image_path)
    mlflow.log_artifact(image_path, "images")
    
    #Actual vs. Predictions
    plt.figure(figsize=(8,6))
    sns.scatterplot(data=Test_data,x='conn_volume',y='predicted_conn_volume',color='red' )
    sns.lineplot(x=Test_data['conn_volume'],y=Test_data['predicted_conn_volume'],color='blue')
    plt.title('Actual vs. Predicted')
    plt.xlabel('Actual Volume')
    plt.ylabel('Predicted Volume')
    plt.legend()
    plt.tight_layout()
    image_path="Actual_vs_Prediction_PLot.png"
    plt.savefig(image_path)
    mlflow.log_artifact(image_path,"images")
    
    #Actual vs. Prediction Distribution
    fig,axes=plt.subplots(1,2,figsize=(12,8))
    sns.histplot(Test_data['conn_volume'],color='green',kde=True,ax=axes[0])
    axes[0].set_title("Actual Distribution")
    axes[0].set_xlabel("Values")
    axes[0].set_ylabel("Frequency")
    sns.histplot(Test_data['predicted_conn_volume'],color='blue',kde=True,ax=axes[1])
    axes[1].set_title("Predicted Distribution")
    axes[1].set_xlabel("Values")
    axes[1].set_ylabel("Frequency")
    plt.tight_layout()

    image_path="Actual_vs_Prediction_Distribution.png"
    plt.savefig(image_path)
    mlflow.log_artifact(image_path,"images")
    
    #Residuals
    plt.figure(figsize=(12,6))
    sns.histplot(Test_data['diff'],bins=100,kde=False)
    plt.title("Residual Distributions")
    plt.xlabel("Values")
    plt.ylabel("Frequency")
    plt.xlim(-600,600)

    image_path="Residual_PLot.png"
    plt.savefig(image_path)
    mlflow.log_artifact(image_path,"images")
    
    #Log Model
    signature=infer_signature(X_test,xgb_reg.predict(X_test))
    mlflow.sklearn.log_model(xgb_reg,"model",signature=signature)
    
    mlflow.end_run()

[0]	validation_0-rmse:2469.95407	validation_0-mae:1781.41926	validation_1-rmse:2320.47019	validation_1-mae:1653.61077
[1]	validation_0-rmse:2469.87686	validation_0-mae:1781.31294	validation_1-rmse:2320.39389	validation_1-mae:1653.50399
[2]	validation_0-rmse:2469.79863	validation_0-mae:1781.20517	validation_1-rmse:2320.31661	validation_1-mae:1653.39575
[3]	validation_0-rmse:2469.71910	validation_0-mae:1781.09577	validation_1-rmse:2320.23803	validation_1-mae:1653.28586
[4]	validation_0-rmse:2469.63869	validation_0-mae:1780.98496	validation_1-rmse:2320.15857	validation_1-mae:1653.17455
[5]	validation_0-rmse:2469.55700	validation_0-mae:1780.87257	validation_1-rmse:2320.07786	validation_1-mae:1653.06161
[6]	validation_0-rmse:2469.47411	validation_0-mae:1780.75859	validation_1-rmse:2319.99597	validation_1-mae:1652.94706
[7]	validation_0-rmse:2469.39020	validation_0-mae:1780.64307	validation_1-rmse:2319.91306	validation_1-mae:1652.83096
[8]	validation_0-rmse:2469.30500	validation_0-mae:1780.5

A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
Parameters: { "early_stopping_round" } are not used.



ValueError: Must have at least 1 validation dataset for early stopping.

In [31]:
X_train['conn_volume']=y_train['conn_volume']

In [32]:
X_train=pd.merge(seen_data[['isg_osm_id','conn_isg_osm_id']],X_train,left_index=True,right_index=True)
X_train

Unnamed: 0,isg_osm_id_x,conn_isg_osm_id_x,isg_osm_id_y,conn_isg_osm_id_y,cur_fc,conn_fc,highway_id,conn_highway_id,countyid,conn_countyid,region,conn_region,roadwaytype_id,conn_roadwaytype_id,conn_count,conn_ncc_pcc,cur_lanes,conn_lanes,cur_maxspeed,conn_maxspeed,cur_final_place_enc,conn_final_place_enc,volume,conn_volume
0,32215673003,32215673002,32215673003,32215673002,1,1,1,1,94,94,10,10,10,10,1,2,2,2,55,55,6,6,2824,3281
3,987686553001,962631300001,987686553001,962631300001,1,1,1,1,83,83,11,11,12,12,1,2,3,3,50,50,4,4,3255,3725
4,5647861001,5642412001,5647861001,5642412001,1,1,1,2,76,76,1,1,10,16,2,1,2,1,55,25,1,1,412,7
6,28772519001,352006365001,28772519001,352006365001,1,1,2,2,92,92,10,10,16,16,1,2,1,2,25,25,5,5,292,1740
7,20197226001,448596015005,20197226001,448596015005,1,1,2,1,68,68,8,8,16,10,1,2,1,2,40,55,1,1,44,1145
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8986,39352199002,20111580001,39352199002,20111580001,1,1,1,2,42,42,4,4,10,16,2,1,2,1,55,40,1,1,2022,281
8988,538843228001,25429543001,538843228001,25429543001,1,1,2,2,83,83,11,11,18,18,1,2,1,1,50,50,4,4,1210,1630
8990,32131901003,24115658001,32131901003,24115658001,1,1,1,2,92,92,10,10,10,16,2,1,3,1,55,25,6,6,2318,351
8996,5647861002,5647861001,5647861002,5647861001,1,1,1,1,76,76,1,1,10,10,1,2,2,2,55,55,1,1,405,412


In [34]:
unseen_pred=xgb_reg.predict(unseen_data[input_feature_lst])
unseen_pred

array([ 904.1501 ,  125.62899, 3163.8882 , ...,  642.9021 ,  906.60315,
        545.92084], dtype=float32)

In [35]:
unseen_data['conn_volume']=unseen_pred
unseen_data['conn_volume']=unseen_data['conn_volume'].round().astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unseen_data['conn_volume']=unseen_pred
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unseen_data['conn_volume']=unseen_data['conn_volume'].round().astype(int)


In [36]:
unseen_data

Unnamed: 0,isg_osm_id,conn_isg_osm_id,cur_fc,conn_fc,highway_id,conn_highway_id,countyid,conn_countyid,region,conn_region,roadwaytype_id,conn_roadwaytype_id,conn_count,conn_ncc_pcc,cur_lanes,conn_lanes,cur_maxspeed,conn_maxspeed,cur_final_place,conn_final_place,volume,conn_volume,conn_tag,cur_final_place_enc,conn_final_place_enc
1,46189288001,166038501001,2,2,3,3,84,84,11,11,10,10,2,1,3,2,35,35,neighbourhood,neighbourhood,1228,904,next,0,0
2,20110634001,20110634002,1,1,2,2,42,42,4,4,16,16,2,1,1,1,25,25,town,town,229,126,next,1,1
5,124369932003,124369932004,1,1,1,1,42,42,4,4,10,10,2,1,3,3,55,55,city,city,3901,3164,next,4,4
11,12719627001,40118581001,1,1,2,1,57,57,9,9,16,10,1,2,1,2,25,65,town,town,5,330,prev,1,1
16,44164413001,195743216001,3,3,5,5,84,84,11,11,10,10,1,2,4,3,25,25,neighbourhood,neighbourhood,174,844,prev,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8991,998193989441005,998193989441004,3,3,5,5,94,94,10,10,1,1,1,2,2,2,40,40,locality,locality,252,418,prev,6,6
8992,1044700779002,20180331001,1,1,1,2,94,94,10,10,10,16,2,1,3,1,55,25,locality,locality,4287,374,next,6,6
8993,1025754084001,1025754084002,2,2,3,3,86,86,8,8,10,10,2,1,2,2,55,55,town,town,626,643,next,1,1
8997,684488140002,684488140001,3,3,6,6,83,83,11,11,16,16,1,2,1,1,25,25,city,city,1008,907,prev,4,4


In [37]:
unseen_data.to_csv('unseen_prediction.csv')