In [None]:
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# Import my modules.
# The directory one level above the current working directory is appended to
# the import path before `modules` is imported.
import sys, os
from pathlib import Path
current_dir = os.fspath(Path().resolve())
sys.path.append(current_dir + '/../')

from modules import utils
from modules import models
from modules import preprocess

# Reload the local modules so that edits made on disk are picked up by the
# running kernel without a restart.
import importlib
for m in (utils, models, preprocess):
    importlib.reload(m)

In [None]:
# Show up to 100 rows and 100 columns when pandas renders a DataFrame.
pd.options.display.max_rows = 100
pd.options.display.max_columns = 100

In [None]:
# Short alias for the 'microbusiness_density' name
# (presumably the target column -- confirm against utils).
mbd = 'microbusiness_density'

# Load the raw frames, then build the combined frame used by the cells below.
# NOTE(review): coord=True presumably adds the `lat`/`lng` columns that later
# cells read from df_all -- confirm in utils.merge_dataset.
df_train, df_test, df_subm = utils.load_dataset()
df_all, df_census = utils.merge_dataset(
    df_train,
    df_test,
    pop=False,
    unemploy=False,
    census=False,
    coord=True,
    fix_pop=True,
    outlier=False,
)

In [None]:
!pip install -qq reverse_geocoder

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from haversine import haversine
from umap import UMAP

In [None]:
# Sinusoidal encoding of the coordinates at `emb_size` geometrically spaced
# frequencies. Note the column order here is (lng, lat).
coordinates = df_all[['lng', 'lat']].values

# Encoding tricks
emb_size = 20
precision = 1e6

# Add a trailing axis of length 1 so each coordinate broadcasts against the
# frequency vector: (rows, 2) -> (rows, 2, 1).
latlon = np.expand_dims(coordinates, axis=-1)

# m ** emb_size == precision, so the frequencies grow geometrically from 1
# up to precision / m.
m = np.exp(np.log(precision)/emb_size)
angle_freq = m ** np.arange(emb_size)
angle_freq = angle_freq.reshape(1,1, emb_size)
latlon = latlon * angle_freq  # (rows, 2, emb_size)

# Even channels hold the cosine and odd channels the sine of the scaled
# coordinate. Without the sine step the odd channels would keep the raw
# scaled values instead of a bounded encoding.
latlon[..., 0::2] = np.cos(latlon[..., 0::2])
latlon[..., 1::2] = np.sin(latlon[..., 1::2])

In [None]:
import reverse_geocoder as rg

# One (lat, lng) tuple per row of df_all, in row order.
# NOTE: this rebinds `coordinates`; from here on it is a list of tuples.
coordinates = list(zip(df_all['lat'], df_all['lng']))

# Look up every coordinate and keep the 'admin2' field of each match as the
# row's place name.
results = rg.search(coordinates)
df_all['place'] = [match['admin2'] for match in results]

# Keep a geocoded place name only when it also occurs in the `county` column;
# every other name is collapsed into the single bucket 'Other'.
places = list(np.unique(df_all['county'].values))

# Set view of `places` for O(1) membership tests; scanning the list once per
# row is needlessly slow on a large frame.
_known_places = frozenset(places)

def replace(x):
    """Return `x` if it is one of the known county names, else 'Other'."""
    return x if x in _known_places else 'Other'

df_all['place'] = df_all['place'].apply(replace)

# Integer-encode the cleaned place labels.
le = LabelEncoder()
df_all['place'] = le.fit_transform(df_all['place'])

# PCA projection of the (lat, lng) pairs. The projection is computed once and
# sliced, instead of calling transform() separately for each output column.
# Despite the names, pca_lat / pca_lon are the first / second principal
# component, not a latitude / longitude.
pca = PCA().fit(coordinates)
pca_coords = pca.transform(coordinates)
df_all['pca_lat'] = pca_coords[:, 0]
df_all['pca_lon'] = pca_coords[:, 1]

# 2-D UMAP embedding of the same pairs (fixed random_state for
# reproducibility). transform() is comparatively expensive, so it is also
# evaluated only once and both columns are taken from the same result.
umap = UMAP(n_components=2,
            n_neighbors=50,
            random_state=2023).fit(coordinates)

umap_coords = umap.transform(coordinates)
df_all['umap_lat'] = umap_coords[:, 0]
df_all['umap_lon'] = umap_coords[:, 1]

In [None]:
def rot(df, angles=(15, 30, 45)):
    """Add angle-weighted combinations of `lat` and `lng` as new columns.

    For every angle ``a`` (in degrees) two columns are written to `df`:

        rot_{a}_x = cos(a) * lat + sin(a) * lng
        rot_{a}_y = cos(a) * lat - sin(a) * lng

    NOTE(review): the x/y pair is only an orthogonal change of axes when
    cos(a)**2 == sin(a)**2 (e.g. 45 degrees); for other angles it is not a
    true rotation. The formula is kept unchanged so existing features keep
    their values -- confirm that this is the intended definition.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with `lat` and `lng` columns. It is modified in place.
    angles : iterable of numbers, optional
        Angles in degrees; each one also appears verbatim in the generated
        column names. Defaults to (15, 30, 45).

    Returns
    -------
    pandas.DataFrame
        The same `df` object, with the new columns added.
    """
    for angle in angles:
        theta = np.radians(angle)
        # Each weighted term is shared by the x and y columns; compute it once.
        lat_term = np.cos(theta) * df['lat']
        lng_term = np.sin(theta) * df['lng']

        df[f'rot_{angle}_x'] = lat_term + lng_term
        df[f'rot_{angle}_y'] = lat_term - lng_term

    return df

# Add the rot_<angle>_x / rot_<angle>_y columns to df_all (rot writes them
# into the frame it is given and returns that same frame).
df_all = rot(df_all)