# Import and Loading Data

In [79]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import os
import seaborn as sns
from catboost import CatBoostClassifier
import pickle

%matplotlib inline

In [80]:
RESOLUTION = 8

In [81]:
data_8 = os.path.join(f"final_training_dataset.xlsx")
data = pd.read_excel(os.path.join(data_8), usecols=lambda x: 'Unnamed' not in x)
data_org = data.copy()
# first drop the h3, lat and lng attributes
data.drop(columns=['h3', 'lat', 'lng'], inplace=True)
# consider this as a classification problem
data['y'] = (data['y'] > 0).astype(int)
data = data.astype(int)

In [82]:
y = data['y']
data.pop('y')
from sklearn.model_selection import train_test_split
df = data.copy()
y_train =  y.copy()

In [83]:
df_test = pd.read_excel('test_data.xlsx',usecols=lambda x: 'Unnamed' not in x)
df_predict = df_test.copy()
df_predict.drop(columns=df.columns, inplace=True)

In [84]:
df_test = df_test.loc[:, df.columns]

In [85]:
assert (df.columns == df_test.columns).all()

# General Functions

In [86]:
# define a function to apply the same function on both train and test datasets
def apply(function, df_train, df_test, axis=1):
    df_train = df_train.apply(function, axis=axis)
    df_test = df_test.apply(function, axis=axis)
    return df_train, df_test

def drop(cols, df_train, df_test):
    if isinstance(cols, str):
        cols = [cols]
    return df_train.drop(columns=cols), df_test.drop(columns=cols)    

In [87]:
def divide(a, b):
    return a / b if b != 0 else 0

In [88]:
def pdf_feat(df, col):
    sns.displot(df, x=col, hue='y', palette='dark')
    plt.show()

In [89]:
from sklearn.feature_selection import mutual_info_regression

def make_mi_scores(X, y):
    X = X.copy()
    # discrete features are the ones with type int
    discrete_features = [pd.api.types.is_integer_dtype(t) for t in X.dtypes]

    mi_scores = mutual_info_regression(X, y, discrete_features=discrete_features, random_state=0)
    mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
    mi_scores = mi_scores.sort_values(ascending=False)
    return mi_scores


def plot_mi_scores(scores):
    scores = scores.sort_values(ascending=True)
    width = np.arange(len(scores))
    ticks = list(scores.index)
    plt.barh(width, scores)
    plt.yticks(width, ticks)
    plt.title("Mutual Information Scores")
    
def mi_scores(x, y):
    mi_scores = make_mi_scores(x, y)
    plot_mi_scores(mi_scores)
    plt.show()

# Transforming Data

In [90]:
for c in df.columns:
    if len(df[df[c] != 0]) == 0:
        print(c)

In [91]:
out_dict = {}
for c in df.columns:
    # for each column: filter by rows who do not have zero at that column and retrive the column
    q3, q1 = np.percentile(df[df[c] != 0][c], [75, 25])
    iqr = q3 - q1
    min_v = q1 - 1.5 * iqr
    max_v = q3 + 1.5 * iqr
    out_dict[c] = (min_v, max_v)


cols = df.columns
def fix_outliers(row):
    for c in cols:
        v = row[c]
        min_v, max_v = out_dict[c]
        if v != 0 and v > max_v:
            row[c] = max_v
        elif v != 0 and v < min_v:
            row[c] = min_v
    return row

df_t = df.apply(fix_outliers, axis=1)

In [92]:
bus_s = 'bus_count'
bus_f = 'bus_freq_count'
metro  = 'TotalPassengers'
def set_tranport(row):
    row['transport'] = (row[bus_s] + 2) * np.sqrt(np.log(row[bus_f] + 2) ) + row[metro]
    return row

a_a = 'accomodation_area'
a_c = 'accomodation_count'
pop = 'population'

def get_acc_feats(row):
    row['density'] = divide(row[pop], row[a_a])
    row['app_area'] = divide(row[a_a], row[a_c]) 
    row[a_a] / row[a_c] if row[a_c] != 0 else 0
    return row
# set accomodation and transport
df, df_test = apply(get_acc_feats, df, df_test)
df, df_test = apply(set_tranport, df, df_test)
df, df_test = drop(["count_highway_pedestrian", "length_highway_pedestrian"], df, df_test)


In [93]:
# df['y'] = y_train
# for c in df.loc[:, "app_area":].columns:
#     pdf_feat(df, c)
# df.pop('y')

### conclusions
The columns should receive different treatments:
1. boxcox transformation:
    * bus\_frequency\_count
    * education\_area
    * parking\_area
    * accomodation\_count
    * accomodation\_area
    * highway\_primary 
    * count\_h\_2
    * c\_h\_2
    * c\_h\_r
    * density
    * app\_area
2. discretization:
    * education\_count
    * fianacial\_count
    * commercial\_count
    * comercial\_area
    * health\_care\_count
    * sustenance\_count
    * l\_h\_2
    * l\_h\_3
    * l\_h\_r
3. nothing:
    * parking\_count
    * bus\_count
    * entertainment\_count
    * population
4. dropped:
    * financial\_area
    * health\_care\_area
    * entertainment\_area
    * sustenance\_area
    * sports, government
    * TotalPassengers

## Applying the necessary transformations

In [94]:
import scipy.stats as st
# first let's drop some columns:
drop_columns = ["financial_area", "health_care_area", "entertainment_area", "sustenance_area", 
                "sports_count", "sports_area", "government_area", "government_count", metro, bus_f]

box_cox_cols = ["education_area", "parking_area", "accomodation_count", "accomodation_area",
                 "count_highway_secondary", "length_highway_secondary",
                "count_highway_residential", "density", "app_area", "length_highway_tertiary"]
discretize_cols = ["count_highway_primary", "length_highway_primary", "education_count", "financial_count", "commercial_count", "commercial_area", "health_care_count", 
                   "sustenance_count", "length_highway_secondary", "length_highway_tertiary", "length_highway_residential"]

# drop
df, df_test = drop(drop_columns, df, df_test)


In [95]:
# transform with boxcox
for c in box_cox_cols:
    df[c] ,_params = st.boxcox(df[c] + 1)
    df_test[c] = st.boxcox(df_test[c] + 1, _params)            

In [96]:
# discretize
from feature_engine.discretisation import DecisionTreeDiscretiser
discretize_cols
disc = DecisionTreeDiscretiser(cv=3, param_grid={"max_depth": [2, 3], "random_state":[11]} , random_state=11, scoring='precision', variables=discretize_cols, regression=False)

# fit the transformer
df_t= disc.fit_transform(df, y_train)
df_test_t = pd.DataFrame(disc.transform(df_test), columns=df_test.columns)

In [97]:
assert (df_test_t.columns == df_t.columns).all()

In [98]:
df = df_t.copy()
df_test = df_test_t.copy()

In [99]:
df.shape, df_test.shape

((6054, 25), (4905, 25))

In [100]:
traffic = ["transport", "bus_count", 
           "parking_count", "parking_area", "count_highway_tertiary" , 
           "count_highway_residential", "length_highway_tertiary",
           "length_highway_residential","count_highway_secondary",
           "length_highway_secondary","length_highway_primary","count_highway_primary"]
population = [col for col in df.columns if col not in traffic]


In [101]:
mi_scores = make_mi_scores(df, y_train)

In [102]:
top_8 = list(mi_scores.index)[:9]
top_8.remove('accomodation_area')

In [103]:
# top_8 = ["accomodation_area"             
# ,"density"                       
# ,"accomodation_count"            
# ,"app_area"                      
# ,"transport"                     
# ,"bus_count"                     
# , "education_count"               
# , "population" ]                

In [104]:
traffic_cols = list(set(top_8).union(traffic)) 
population_cols = list(set(top_8).union(population))
print(traffic_cols)
print(population_cols)


['density', 'transport', 'parking_area', 'education_area', 'app_area', 'population', 'bus_count', 'count_highway_tertiary', 'count_highway_residential', 'count_highway_secondary', 'length_highway_secondary', 'count_highway_primary', 'length_highway_residential', 'length_highway_primary', 'length_highway_tertiary', 'parking_count', 'accomodation_count']
['education_count', 'entertainment_count', 'density', 'transport', 'education_area', 'app_area', 'population', 'bus_count', 'financial_count', 'accomodation_area', 'commercial_area', 'commercial_count', 'sustenance_count', 'health_care_count', 'parking_count', 'accomodation_count']


In [105]:
df_traffic = df.loc[:, traffic_cols]
df_test_traffic = df_test.loc[:, traffic_cols]

df_pop = df.loc[:, population_cols]
df_test_pop = df_test.loc[:, population_cols]

In [106]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
def apply_PCA(df, df_test):
    ss = StandardScaler()
    pca = PCA(n_components=0.95)

    df_s = pd.DataFrame(ss.fit_transform(df), columns=df.columns)
    df_test_s = pd.DataFrame(ss.transform(df_test), columns=df_test.columns)

    df_p = pd.DataFrame(pca.fit_transform(df_s))
    df_test_p = pd.DataFrame(pca.transform(df_test_s))

    df = df_p.copy()
    df_test = df_test_p.copy()
    return df, df_test
    

In [107]:
df, df_test = apply_PCA(df, df_test)

In [108]:
df_pop, df_test_pop = apply_PCA(df_pop, df_test_pop)
df_traffic, df_test_traffic = apply_PCA(df_traffic, df_test_traffic)

In [109]:
# best_model = pickle.load(open("best_model.sav", 'rb'))
# print(best_model)

# Predicting

In [110]:
best_model = pickle.load(open(os.path.join("models","XGB.sav"), 'rb'))
print(best_model)

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1.0,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0.5, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.02, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=5, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=600,
              n_jobs=1, nthread=1, num_parallel_tree=1, predictor='auto',
              random_state=42, reg_alpha=0, ...)


In [112]:
def predict_save(df, y_train, df_test, model_name, file_name):
    best_model.fit(df, y_train)
    y_pred = best_model.predict_proba(df_test)
    predictions = (y_pred[:, 1].tolist())
    
    df_p = df_predict.copy()
    df_p['rating'] = [p * 100 for p in predictions]
    df_p['model'] = [model_name for _ in predictions]
    df_p = df_p.sort_values(by='rating')
    
    new_index = pd.Index(list(range(len(df_p))))
    df_p = df_p.set_index(new_index)
    df_p = df_p.reset_index() 
    df_p = df_p.rename(columns={"index":"rank"}) 
    df_p['rank'] = df_p['rank'] + 1
    df_p.to_excel(f"{file_name}.xlsx")
    

In [113]:
predict_save(df, y_train, df_test, "main", "main")
predict_save(df_pop, y_train, df_test_pop, "Population", "Population")
predict_save(df_traffic, y_train, df_test_traffic, "Traffic", "Traffic")

Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


