# Prediction Code

This file demonstrate the code we used to predict the missing values in our dataset.

In [1]:
import warnings
warnings.simplefilter(action='ignore')
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, KFold
import matplotlib.pyplot as plt 
import numpy as np
from sklearn.model_selection import GridSearchCV
import pandas as pd
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.base import BaseEstimator


In [2]:
def SplitSet(target: str, filename: str) -> pd.DataFrame:
    """
    Input: the target column name, the name of the original dataset
    Output: x, y, xtrain, xtest, ytrain, ytest
    This function split the original file into train and test set.
    """
    df = pd.read_csv(filename)
    df = df.drop(['ID'],axis = 1)
    df = df.dropna(axis=0, how='all')
    df = df.dropna(axis=0,subset = [target])
    y = df[target]
    x = df.drop([target],axis = 1)
    xtrain, xtest, ytrain, ytest=train_test_split(x, y, test_size=0.15,random_state =10)
    return x, y, xtrain, xtest, ytrain, ytest

def XgbModel(xtrain: pd.DataFrame, ytrain: pd.DataFrame,x: pd.DataFrame,y: pd.DataFrame) -> BaseEstimator:
    """
    Input: the trainset for x, the training set for y, x, and y
    Output: the model
    This function gives the best xgbmodel using gridserch and cross validation and then return it.
    """
    params = {
    'learning_rate': [0.1, 0.01, 0.05],
    'n_estimators': [100,600,1000],
    'max_depth': [1,3],
    'lambda':[0,0.5,1],
    'alpha':[0,0.5,1]
    }

    xgbr = xgb.XGBRegressor(seed = 20)
    clf = GridSearchCV(estimator=xgbr, 
                    param_grid=params,
                    scoring='neg_mean_absolute_percentage_error', 
                    verbose=1,
                    cv=2,
                    )
    clf.fit(xtrain, ytrain)
    print("Best parameters:", clf.best_params_)
    print("Best scores:", clf.best_score_)

    xgb_model = clf.best_estimator_
    scores = cross_val_score(xgbr, x, y, cv=5, scoring='neg_mean_absolute_percentage_error')
    scores =  np.absolute(scores)
    print('MAPE CV Score: %.3f (%.3f)' % (scores.mean(), scores.std()) )
    
    return xgb_model



def Predict(previous_file: str, target: str, xgbr: BaseEstimator , new_file_name : str , p_v: str) -> None:
    """
    Input: the previous_file name , the name of the target column, the xgb model, the name of the new file, the addition string
    Output: None
    This function add the prediction of a traget column to previous file and then save it in a new file.
    """
    previous_df = pd.read_csv(previous_file)
    orginal_df = pd.read_csv('./data/REST_data_1.csv')

    predict_df = orginal_df[pd.isnull(orginal_df[target])]
    true_x_for_prediction = predict_df.drop(columns=['ID',target],inplace= False)
    ypred = xgbr.predict(true_x_for_prediction)

    ypred = [ '%.2f' % elem for elem in ypred ]

    predict_df[target] = ypred
    predict_df[target] = predict_df[target].apply(lambda x: f"{x}"+p_v)

    previous_df.set_index(['ID'], inplace=True)
    previous_df.update(predict_df.set_index(['ID']))
    previous_df.reset_index( inplace=True)
    previous_df.to_csv(new_file_name,index = False)

In [27]:
#doing first round of prediction
Target = "REST: Staff size MOD" 
print("First round prediction start for:",Target)
x,y,xtrain, xtest, ytrain, ytest = SplitSet(Target,'./data/REST_data_1_Cleaned.csv')
xgbr = XgbModel(xtrain,ytrain, x,y)
Predict('./data/REST_data_1.csv',Target, xgbr,'./data/REST_data_1_predicted_v1_without_label.csv','')
Predict('./data/REST_data_1.csv',Target, xgbr,'./data/REST_data_1_predicted_v1.csv','(p1)')
print("-----Prediction END-------")

Targetlist = [ "REST: No Wtr MOD","REST: No Ktch Wkr MOD","REST: No tables MOD","REST: Monthly Expenses MOD"]
for Target in Targetlist:
    print("First round prediction start for:",Target)
    x,y,xtrain, xtest, ytrain, ytest = SplitSet(Target,'./data/REST_data_1_Cleaned.csv')
    xgbr = XgbModel(xtrain,ytrain, x,y)
    Predict('./data/REST_data_1_predicted_v1_without_label.csv',Target, xgbr,'./data/REST_data_1_predicted_v1_without_label.csv','')
    Predict('./data/REST_data_1_predicted_v1.csv',Target, xgbr,'./data/REST_data_1_predicted_v1.csv','(p1)')
    print("-----Prediction END-------")


#doing second round of prediction
Target = "REST: Monthly Expenses MOD"
print("Second round prediction start for:",Target)
x,y,xtrain, xtest, ytrain, ytest = SplitSet(Target,'./data/REST_data_1_predicted_v1_without_label.csv')
xgbr = XgbModel(xtrain,ytrain, x,y)
Predict('./data/REST_data_1_predicted_v1_without_label.csv',Target, xgbr,'./data/REST_data_1_predicted_v2_without_label.csv','')
Predict('./data/REST_data_1_predicted_v1.csv',Target, xgbr,'./data/REST_data_1_predicted_v2.csv','(p2)')
print("-----Prediction END-------")

Targetlist = ["REST: Total Partners MOD","REST: No Act Part MOD","REST:  No Pass Part MOD"]
for Target in Targetlist:
    print("Second round prediction start for:",Target)
    x,y,xtrain, xtest, ytrain, ytest = SplitSet(Target,'./data/REST_data_1_predicted_v1_without_label.csv')
    xgbr = XgbModel(xtrain,ytrain, x,y)
    Predict('./data/REST_data_1_predicted_v2_without_label.csv',Target, xgbr,'./data/REST_data_1_predicted_v2_without_label.csv','')
    Predict('./data/REST_data_1_predicted_v2.csv',Target, xgbr,'./data/REST_data_1_predicted_v2.csv','(p2)')
    print("-----Prediction END-------")

First round prediction start for: REST: Staff size MOD
Fitting 2 folds for each of 162 candidates, totalling 324 fits
Best parameters: {'alpha': 0, 'lambda': 1, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100}
Best scores: -0.4712055346919227
MAPE CV Score: 0.346 (0.154)
-----Prediction END-------
First round prediction start for: REST: No Wtr MOD
Fitting 2 folds for each of 162 candidates, totalling 324 fits
Best parameters: {'alpha': 0.5, 'lambda': 1, 'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 100}
Best scores: -0.31172121821608
MAPE CV Score: 0.222 (0.089)
-----Prediction END-------
First round prediction start for: REST: No Ktch Wkr MOD
Fitting 2 folds for each of 162 candidates, totalling 324 fits
Best parameters: {'alpha': 0, 'lambda': 0, 'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 600}
Best scores: -0.233649934857513
MAPE CV Score: 0.297 (0.193)
-----Prediction END-------
First round prediction start for: REST: No tables MOD
Fitting 2 folds for 

In [41]:
#third round
Target = "REST: Monthly Sales MOD"
print("Third round prediction start for:",Target)
x,y,xtrain, xtest, ytrain, ytest = SplitSet(Target,'./data/REST_data_1_predicted_v2_without_label.csv')
xgbr = XgbModel(xtrain,ytrain, x,y)
Predict('./data/REST_data_1_predicted_v2.csv',Target, xgbr,'./data/REST_data_1_predicted_v3.csv','(p3)')

print("-----Prediction END-------")

Targetlist = ["REST: Dividends MOD","REST: Monthly rent MOD","REST: Total Capital MOD"]
for Target in Targetlist:
    print("Third round prediction start for:",Target)
    x,y,xtrain, xtest, ytrain, ytest = SplitSet(Target,'./data/REST_data_1_predicted_v2_without_label.csv')
    xgbr = XgbModel(xtrain,ytrain, x,y)
    Predict('./data/REST_data_1_predicted_v3.csv',Target, xgbr,'./data/REST_data_1_predicted_v3.csv','(p3)')
    print("-----Prediction END-------")

Third round prediction start for: REST: Monthly Sales MOD
Fitting 2 folds for each of 162 candidates, totalling 324 fits
Best parameters: {'alpha': 1, 'lambda': 1, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100}
Best scores: -0.5222423824244933
MAPE CV Score: 0.576 (0.180)
-----Prediction END-------
Third round prediction start for: REST: Dividends MOD
Fitting 2 folds for each of 162 candidates, totalling 324 fits
Best parameters: {'alpha': 1, 'lambda': 0.5, 'learning_rate': 0.01, 'max_depth': 1, 'n_estimators': 100}
Best scores: -0.6039485981551684
MAPE CV Score: 0.901 (0.249)
-----Prediction END-------
Third round prediction start for: REST: Monthly rent MOD
Fitting 2 folds for each of 162 candidates, totalling 324 fits
Best parameters: {'alpha': 0, 'lambda': 1, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100}
Best scores: -0.5886857927641571
MAPE CV Score: 0.721 (0.330)
-----Prediction END-------
Third round prediction start for: REST: Total Capital MOD
Fittin