In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

from gc import collect

import warnings
warnings.filterwarnings('ignore')

import os
print(os.listdir("../Data"))

['.DS_Store', 'measurements.csv']


In [2]:
# Filtering and cleaning rows as we go and using only Captured Time, Latitude, Longitude, Value, and Unit cols
chunksize = 10 ** 6
chunk_list = []
for chunk in pd.read_csv('../Data/measurements.csv', usecols=[0, 1, 2, 3, 4, 8], chunksize=chunksize):
    chunk = chunk[chunk['Unit'] == 'cpm']
    chunk.value = chunk.Value/350
    chunk = chunk[chunk.Value > 0]
    chunk['year'] = pd.to_datetime(chunk['Captured Time'],  errors = 'coerce').dt.to_period('Y')
    chunk.dropna(axis=0, how='any', inplace=True)
    chunk['year'] = chunk['year'].astype('str').astype('int')
    chunk = chunk[chunk['year'] <= 2020]
    chunk = chunk.sample(n=12000)
    chunk_list.append(chunk)
    collect()
    if chunk.index[0] > 20000000:
        break
df = pd.concat(chunk_list)

del(chunk_list)
collect()

0

In [3]:
# Number of measurements
print('Number of measurements: ', df.shape[0])

Number of measurements:  252000


In [4]:
df = df.drop(columns=['Unit'])
# Renaming columns
df.columns = ['time', 'lat', 'lon', 'value', 'height', 'year']


In [5]:
df.head()

Unnamed: 0,time,lat,lon,value,height,year
639910,2017-11-25 21:55:10,38.40362,-122.81847,38.0,65.0,2017
562963,2017-11-26 18:20:54,35.659491,139.72785,26.0,92.0,2017
242151,2017-12-03 00:28:16,37.64444,140.797711,116.0,426.0,2017
672283,2017-11-25 03:20:28,37.796306,140.514413,63.0,65.0,2017
909970,2017-11-19 16:02:00,37.64444,140.797711,43.0,426.0,2017


In [6]:
X = df.iloc[:, [1, 2]].values  # lat, lon, height and year
Y = df.iloc[:, [3]].values  # value

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 0)

In [9]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [23]:
from sklearn.ensemble import RandomForestRegressor

regressor = RandomForestRegressor(max_features=1,
                                  n_estimators=180, 
                                  random_state=20,
                                 )
regressor.fit(X_train, Y_train)
y_pred = regressor.predict(X_test)

In [24]:
from sklearn import metrics

In [25]:
print("Mean Absolute Error: ", metrics.mean_absolute_error(Y_test, y_pred))
print("Mean Squared Error: ", metrics.mean_squared_error(Y_test, y_pred))
print("Root Mean Squared Error: ", np.sqrt(metrics.mean_squared_error(Y_test, y_pred)))

Mean Absolute Error:  15.538676451078095
Mean Squared Error:  388.66007964391736
Root Mean Squared Error:  19.71446371687339


In [26]:
print("The accuracy: ", metrics.r2_score(Y_test, y_pred))

The accuracy:  0.5971864168894585


In [28]:
import pickle
pkl_filename = "pickle_model.pkl"

In [30]:
# Save the model
with open(pkl_filename, 'wb') as file:
    pickle.dump(regressor, file)