In [23]:
import numpy as np
import pandas as pd
import os
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from pathlib import Path
from sklearn.datasets import fetch_california_housing
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import mean_squared_error as mse
from IPython.display import display
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgbm

# Loading data and concatenating with original

In [2]:
# setting a base_dir so we don't have to type all the paths
BASE_DIR = Path("/kaggle/input/playground-series-s3e1/")

In [3]:
train_df = pd.read_csv(BASE_DIR / "train.csv")
test_df = pd.read_csv(BASE_DIR / "test.csv")

original_df = fetch_california_housing(as_frame=True)
original_df = original_df["frame"]

train_large = pd.concat([train_df, original_df], axis=0).drop_duplicates().reset_index(drop=True)

# Some preprocessing

In [4]:
cols_to_use = train_large.columns[1:-1]
display(cols_to_use)
display(len(cols_to_use))

Index(['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup',
       'Latitude', 'Longitude'],
      dtype='object')

8

In [5]:
train_large.drop(columns=["id"], axis=1, inplace=True)

In [6]:
test_df.drop(columns=["id"], axis=1, inplace=True)

In [7]:
train_large.isnull().sum()

MedInc         0
HouseAge       0
AveRooms       0
AveBedrms      0
Population     0
AveOccup       0
Latitude       0
Longitude      0
MedHouseVal    0
dtype: int64

In [8]:
df = pd.concat([train_large, test_df], axis=0, ignore_index=True)

# Feature Engineering

Copying all the feature engineering from https://www.kaggle.com/code/dmitryuarov/ps-s3e1-coordinates-key-to-victory

## Encoding Trick

In [9]:
emb_size = 20
precision = 1e6 

latlon = np.expand_dims(df[['Latitude', 'Longitude']].values, axis=-1) 

# display(latlon)
# display(latlon.shape)

m = np.exp(np.log(precision) / emb_size) 

angle_freq = m ** np.arange(emb_size) 
angle_freq = angle_freq.reshape(1, 1, emb_size) 

latlon = latlon * angle_freq 
latlon[..., 0::2] = np.cos(latlon[..., 0::2]) 
latlon[..., 1::2] = np.sin(latlon[..., 1::2]) 
latlon = latlon.reshape(-1, 2 * emb_size) 

In [10]:
df['exp_latlon1'] = [lat[0] for lat in latlon]
df['exp_latlon2'] = [lat[1] for lat in latlon]

## Coords with PCA & UMAP

In [11]:
from sklearn.decomposition import PCA

coordinates = df[['Latitude', 'Longitude']].values
pca = PCA().fit(coordinates)

df['pca_lat'] = pca.transform(coordinates)[:,0]
df['pca_lon'] = pca.transform(coordinates)[:,1]

In [12]:
from umap import UMAP
umap = UMAP(n_components=2, n_neighbors=50, random_state=228).fit(coordinates)
df['umap_lat'] = umap.transform(coordinates)[:,0]
df['umap_lon'] = umap.transform(coordinates)[:,1]

## Cartesian Coords Rotation

In [21]:
df['rot_15_x'] = (np.cos(np.radians(15)) * df['Longitude']) + \
                  (np.sin(np.radians(15)) * df['Latitude'])
    
df['rot_15_y'] = (np.cos(np.radians(15)) * df['Latitude']) + \
                  (np.sin(np.radians(15)) * df['Longitude'])
    
df['rot_30_x'] = (np.cos(np.radians(30)) * df['Longitude']) + \
                  (np.sin(np.radians(30)) * df['Latitude'])
    
df['rot_30_y'] = (np.cos(np.radians(30)) * df['Latitude']) + \
                  (np.sin(np.radians(30)) * df['Longitude'])
    
df['rot_45_x'] = (np.cos(np.radians(44)) * df['Longitude']) + \
                  (np.sin(np.radians(45)) * df['Latitude'])

## Coords Location

In [14]:
!pip install reverse_geocoder

Collecting reverse_geocoder
  Downloading reverse_geocoder-1.5.1.tar.gz (2.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: reverse_geocoder
  Building wheel for reverse_geocoder (setup.py) ... [?25ldone
[?25h  Created wheel for reverse_geocoder: filename=reverse_geocoder-1.5.1-py3-none-any.whl size=2268088 sha256=b66242bb35c177aab76b468e0d0e5b72ede0a1f5b7fdce9dcc0ebaed3034262d
  Stored in directory: /root/.cache/pip/wheels/34/6e/70/5423639428a2cac8ea7eb467214a4254b549b381f306a9c790
Successfully built reverse_geocoder
Installing collected packages: reverse_geocoder
Successfully installed reverse_geocoder-1.5.1
[0m

In [15]:
import reverse_geocoder as rg

coordinates = list(zip(df['Latitude'], df['Longitude']))
results = rg.search(coordinates)
df['place'] = [x['admin2'] for x in results]

places = ['Los Angeles County', 'Orange County', 'Kern County',
          'Alameda County', 'San Francisco County', 'Ventura County',
          'Santa Clara County', 'Fresno County', 'Santa Barbara County',
          'Contra Costa County', 'Yolo County', 'Monterey County',
          'Riverside County', 'Napa County']

def replace(x):
    if x in places:
        return x
    else:
        return 'Other'
    
df['place'] = df['place'].apply(lambda x: replace(x))
le = LabelEncoder()
df['place'] = le.fit_transform(df['place'])

Loading formatted geocoded file...


# Distance to Cities and Coast Lines

In [16]:
from haversine import haversine

Sac = (38.576931, -121.494949)
SF = (37.780080, -122.420160)
SJ = (37.334789, -121.888138)
LA = (34.052235, -118.243683)
SD = (32.715759, -117.163818)

df['dist_Sac'] = df.apply(lambda x: haversine((x['Latitude'], x['Longitude']), Sac, unit='ft'), axis=1)
df['dist_SF'] = df.apply(lambda x: haversine((x['Latitude'], x['Longitude']), SF, unit='ft'), axis=1)
df['dist_SJ'] = df.apply(lambda x: haversine((x['Latitude'], x['Longitude']), SJ, unit='ft'), axis=1)
df['dist_LA'] = df.apply(lambda x: haversine((x['Latitude'], x['Longitude']), LA, unit='ft'), axis=1)
df['dist_SD'] = df.apply(lambda x: haversine((x['Latitude'], x['Longitude']), SD, unit='ft'), axis=1)
df['dist_nearest_city'] = df[['dist_Sac', 'dist_SF', 'dist_SJ', 
                              'dist_LA', 'dist_SD']].min(axis=1)

In [17]:
from shapely.geometry import LineString, Point

coast_points = LineString([(32.6644, -117.1613), (33.2064, -117.3831),
                           (33.7772, -118.2024), (34.4634, -120.0144),
                           (35.4273, -120.8819), (35.9284, -121.4892),
                           (36.9827, -122.0289), (37.6114, -122.4916),
                           (38.3556, -123.0603), (39.7926, -123.8217),
                           (40.7997, -124.1881), (41.7558, -124.1976)])

df['dist_to_coast'] = df.apply(lambda x: Point(x['Latitude'], x['Longitude']).distance(coast_points), axis=1)

In [18]:
# saving the dataset to save time for futher experiments
df.to_csv("playground-s03e01-ultimate.csv", index=False)

# Preprocessing v2 - Could be final

In [19]:
train = df.iloc[:-len(test_df),:]
test = df.iloc[-len(test_df):,:].drop('MedHouseVal', axis=1).reset_index(drop=True)

X = train.drop('MedHouseVal', axis=1)
y = train['MedHouseVal']

# Modeling

## First, let's train the LightGBM model using tuned params

In [20]:
lgbm_params = {'n_estimators': 10000,
 'num_rounds': 206,
 'learning_rate': 0.2296610244279599,
 'num_leaves': 600,
 'max_depth': 11,
 'min_data_in_leaf': 500,
 'lambda_l1': 10,
 'lambda_l2': 10,
 'min_gain_to_split': 0.05042836322143955,
 'bagging_fraction': 0.7,
 'bagging_freq': 1,
 'feature_fraction': 0.5}

In [26]:
model = lgbm.LGBMRegressor(objective="regression", **lgbm_params)
model.fit(X, y)





LGBMRegressor(bagging_fraction=0.7, bagging_freq=1, feature_fraction=0.5,
              lambda_l1=10, lambda_l2=10, learning_rate=0.2296610244279599,
              max_depth=11, min_data_in_leaf=500,
              min_gain_to_split=0.05042836322143955, n_estimators=10000,
              num_leaves=600, num_rounds=206, objective='regression')

In [27]:
y_pred_test = model.predict(test)
y_pred_test

array([0.67861983, 0.99859168, 3.78281902, ..., 1.25010114, 3.57119255,
       3.54106872])

In [28]:
# since we dropped the id column of the first test df and we need that for submissio file
test_df_2 = pd.read_csv(BASE_DIR / "test.csv")

In [29]:
submission_df = pd.DataFrame(data={'id': test_df_2.id, 'MedHouseVal': y_pred_test})
submission_df.MedHouseVal.clip(0, 5, inplace=True)

In [30]:
submission_df.to_csv("submission.csv", index=False)