In [None]:
import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the feature table and discard the unnamed index column together with
# the name/ID columns listed below.
places_df = pd.read_csv('data/final_all_feature_acc.csv').drop(
    columns=[
        'Unnamed: 0',
        'tumbol_name',
        'amphur_name',
        'province_name',
        'amphur_ID',
        'province_ID',
        'TUMBOL_ID',
    ]
)
places_df.head()

In [None]:
# Narrow the frame to three predictor columns plus the regression target.
# NOTE(review): 'port, pier ' carries a trailing space — presumably it matches
# the CSV header exactly; confirm before "fixing" it.
places_df = places_df.loc[
    :,
    [
        'total_population',
        'SusCo gas station',
        'port, pier ',
        'poverty_rate_accessibility',
    ],
]
places_df

In [None]:
# Feature matrix: every column except the target.
X = places_df.drop(columns=['poverty_rate_accessibility']).to_numpy()
# Target as an (n, 1) column vector, the 2-D layout the scaler cell expects.
y = places_df['poverty_rate_accessibility'].to_numpy().reshape(-1, 1)
print(X.shape, y.shape)

In [None]:
# '''Normalize X using population'''
# for i in range(X.shape[1]):
    # X[:,i] = X[:,i] / places_df['total_population'].astype('float')
    # X[:,i] = X[:,i].astype('float') / 9.0
    # break

In [None]:
'''Train-test split'''
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. The seed is fixed so that the split
# -- and therefore the reported MAE and the exported feature importances --
# is identical on every Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

In [None]:
class log_scaler():
    """Shifted logarithmic scaler.

    ``transform`` maps ``x`` to ``log_base(x + a)`` and ``inverse_transform``
    maps it back with ``base ** x - a``.

    Parameters
    ----------
    base : float, default 10
        Base of the logarithm. Must be positive and different from 1.
    a : float, default 1
        Offset added to the input before taking the logarithm.

    Raises
    ------
    ValueError
        If ``base`` is not positive or equals 1 (the logarithm would be
        undefined and every transform would yield NaN/inf).
    """

    def __init__(self, base=10, a=1):
        # `not base > 0` (rather than `base <= 0`) also rejects NaN.
        if not base > 0 or base == 1:
            raise ValueError(
                "base must be a positive number different from 1, got %r" % (base,)
            )
        self.a = a
        self.base = base

    def transform(self, x):
        """Return ``log_base(x + a)`` using the change-of-base identity.

        Inputs with ``x + a <= 0`` are passed to ``np.log10`` unchanged, so
        they yield NaN or -inf as NumPy defines it.
        """
        return np.log10(x + self.a) / np.log10(self.base)

    def inverse_transform(self, x):
        """Return ``base ** x - a``, the inverse of :meth:`transform`.

        The result is floating point even for integer input.
        """
        # A float base is required: NumPy raises ValueError when an integer
        # base is raised to a negative integer exponent.
        return np.power(float(self.base), x) - self.a

In [None]:
'''Normalize y'''
from sklearn.preprocessing import StandardScaler


def _log_features(values):
    """Natural log of ``values`` after adding a 1e-3 offset."""
    return np.log(values + 1e-3)


# Despite the label above, this cell standardises both the (log-transformed)
# features and the target. Each scaler is fitted on the training split only.
# NOTE: X_train / X_test / y_train are overwritten in place, so re-running this
# cell without re-running the split transforms already-transformed data.
X_scaler, y_scaler = StandardScaler(), StandardScaler()

X_train = X_scaler.fit_transform(_log_features(X_train))
X_test = X_scaler.transform(_log_features(X_test))

# Disabled alternative: log-scale y with log_scaler() before standardising.
# y_test is not transformed here; predictions are mapped back to its scale later.
y_train = y_scaler.fit_transform(y_train)

In [None]:
'''Pipeline'''
from catboost import CatBoostRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# No sklearn Pipeline is actually assembled here: the model is a single
# CatBoost regressor trained with an MAE objective. `iterations` and `depth`
# are deliberately not passed, so the library defaults apply.
regressor = CatBoostRegressor(learning_rate=1, loss_function='MAE')

regressor.fit(X_train, y_train)

In [None]:
# Predict on the held-out features and undo the target standardisation, so
# y_pred ends up as an (n, 1) column on the same scale as y_test.
# (The optional log_scaler inverse step remains disabled.)
y_pred = y_scaler.inverse_transform(
    regressor.predict(X_test).reshape(-1, 1)
)

In [None]:
from sklearn.metrics import mean_absolute_error

# Mean absolute error between the untransformed test targets and the
# inverse-transformed predictions.
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
mae

In [None]:
# Display the column labels of places_df (the feature columns plus the target).
places_df.columns

In [None]:
# Pair each feature column name with the importance reported by the fitted
# regressor, then show the (up to) 50 most important ones.
feat_imp_df = pd.DataFrame({
    'features': places_df.drop(columns=['poverty_rate_accessibility']).columns,
    'importances': regressor.feature_importances_,
})
feat_imp_df.sort_values(by='importances', ascending=False).head(50)

In [None]:
# Persist the feature-importance table (without the row index).
feat_imp_df.to_csv(path_or_buf='acc_feat_imp.csv', index=False)

In [None]:
# Side-by-side table of observed vs. predicted targets. Both arrays are
# (n, 1) columns, so each is flattened to 1-D before building the frame.
y_df = pd.DataFrame({'test': y_test[:, 0], 'pred': y_pred[:, 0]})
y_df.head(20)

In [None]:
# Line plot of the 'test' and 'pred' columns against the row index.
y_df.plot.line()

In [None]:
# Overlay observed and predicted values per test row. A legend and axis labels
# are needed because the two series are otherwise indistinguishable.
fig, ax = plt.subplots()
ax.scatter(y_df.index, y_df['test'], label='test')
ax.scatter(y_df.index, y_df['pred'], label='pred')
ax.set_xlabel('test-set row')
ax.set_ylabel('poverty_rate_accessibility')
ax.set_title('Observed vs. predicted (test split)')
ax.legend()
plt.show()

In [None]:
# Compare the distributions of observed and predicted values. The bars are
# semi-transparent so the second histogram does not hide the first.
fig, ax = plt.subplots()
ax.hist(y_df['test'], alpha=0.6, label='test')
ax.hist(y_df['pred'], alpha=0.6, label='pred')
ax.set_xlabel('poverty_rate_accessibility')
ax.set_ylabel('count')
ax.set_title('Distribution of observed vs. predicted (test split)')
ax.legend()
plt.show()

In [None]:
# Largest observed target value in the test split.
np.max(y_test)

In [None]:
'''Correlation'''
import matplotlib.pyplot as plt

# Pairwise correlation of every column in places_df, drawn as a heatmap.
# Tick labels and a colorbar are added so each cell can be mapped back to a
# pair of columns and to a correlation value.
corr_matrix = places_df.corr()

fig, ax = plt.subplots(figsize=(20, 20))
heatmap = ax.imshow(corr_matrix)
ax.set_xticks(range(len(corr_matrix.columns)))
ax.set_xticklabels(corr_matrix.columns, rotation=90)
ax.set_yticks(range(len(corr_matrix.index)))
ax.set_yticklabels(corr_matrix.index)
fig.colorbar(heatmap, ax=ax)
plt.show()

In [None]:
# Rank every column by the absolute value of its correlation with the target
# (strongest first) and write the ranking to disk.
(
    places_df.corr()['poverty_rate_accessibility']
    .abs()
    .sort_values(ascending=False)
    .to_frame()
    .to_csv('acc_corr.csv')
)