In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook as tqdm
from sklearn.neighbors import KNeighborsClassifier

In [2]:
df = pd.read_csv('./GlobalLandTemperaturesByCity.csv')
df.dropna(inplace=True)

In [3]:
def convert(tude):
    multiplier = 1 if tude[-1] in ['N', 'E'] else -1
    return multiplier * sum(float(x) / 60 ** n for n, x in enumerate(tude[:-1].split('-')))

In [4]:
df['Latitude'] = df['Latitude'].apply(lambda x: convert(x))
df['Longitude'] = df['Longitude'].apply(lambda x: convert(x))
df.head()

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude
0,1743-11-01,6.068,1.737,Århus,Denmark,57.05,10.33
5,1744-04-01,5.788,3.624,Århus,Denmark,57.05,10.33
6,1744-05-01,10.644,1.283,Århus,Denmark,57.05,10.33
7,1744-06-01,14.051,1.347,Århus,Denmark,57.05,10.33
8,1744-07-01,16.082,1.396,Århus,Denmark,57.05,10.33


In [5]:
df['dt'] = df['dt'].apply(lambda x: int(x.split('-')[0]))
res = df.groupby(by=['dt', 'Latitude', 'Longitude'])
res

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x13dd1c990>

In [6]:
big = res.filter(lambda x: len(x) > 6) # At least 6 records per year
res = big.groupby(by=['dt', 'Latitude', 'Longitude'])['AverageTemperature'].mean().reset_index()
res = res[res['dt'] >= 1945]

In [7]:
res['year'] = res['dt']
res['latitude'] = res['Latitude']
res['longitude'] = res['Longitude']
res['temperature'] = res['AverageTemperature']
del res['dt']
del res['Latitude']
del res['Longitude']
del res['AverageTemperature']
res.head()

Unnamed: 0,year,latitude,longitude,temperature
172396,1945,-52.24,-70.95,6.64025
172397,1945,-45.81,-68.08,11.689167
172398,1945,-45.81,169.62,7.0655
172399,1945,-44.2,172.17,9.797083
172400,1945,-42.59,-65.45,14.03075


In [8]:
res.to_json('temperatures_by_city.json', orient='records')

In [9]:
df = res
df = df.round({'latitude': 1, 'longitude': 1})
df.head()

Unnamed: 0,year,latitude,longitude,temperature
172396,1945,-52.2,-71.0,6.64025
172397,1945,-45.8,-68.1,11.689167
172398,1945,-45.8,169.6,7.0655
172399,1945,-44.2,172.2,9.797083
172400,1945,-42.6,-65.4,14.03075


In [10]:
len(df), len(df.drop_duplicates(['year', 'latitude', 'longitude']))

(94254, 94254)

In [11]:
cities = pd.read_csv('./worldcitiespop.csv')
cities.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,Country,City,AccentCity,Region,Population,Latitude,Longitude
0,ad,aixas,Aixàs,6,,42.483333,1.466667
1,ad,aixirivali,Aixirivali,6,,42.466667,1.5
2,ad,aixirivall,Aixirivall,6,,42.466667,1.5
3,ad,aixirvall,Aixirvall,6,,42.466667,1.5
4,ad,aixovall,Aixovall,6,,42.466667,1.483333


In [12]:
loc = cities[['Latitude', 'Longitude']].copy()
loc['latitude'] = loc['Latitude']
loc['longitude'] = loc['Longitude']
del loc['Latitude']
del loc['Longitude']

loc = loc.round({'latitude': 1, 'longitude': 1})
loc['x'] = loc['latitude']
loc['y'] = loc['longitude']
loc = loc.round({'x': 0, 'y': 0})
loc.drop_duplicates(['x', 'y'], inplace=True)
del loc['x']
del loc['y']

# Randomly remove some locations for performance
drop_indices = np.random.choice(loc.index, int(len(loc) * 0.5), replace=False)
loc.drop(drop_indices, inplace=True)

print(len(loc))
loc.head()

6942


Unnamed: 0,latitude,longitude
94,25.3,55.3
100,25.3,55.9
103,24.2,55.8
120,24.3,53.2
178,23.0,53.4


In [13]:
known_locations = df.drop_duplicates(['latitude', 'longitude'])
len(known_locations), len(loc)

(1366, 6942)

In [14]:
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(known_locations[['latitude', 'longitude']].values, np.zeros(len(known_locations)))

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')

In [15]:
res = []
for i, location in tqdm(loc.iterrows(), total=len(loc)):
    lat, lng = location['latitude'], location['longitude']
    dist, ids = knn.kneighbors(X=[(lat, lng)], n_neighbors=3)
    known = known_locations.iloc[ids.flatten().tolist()]
    
    real = df[df['latitude'].isin(known['latitude']) & df['longitude'].isin(known['longitude'])]
    real = real.groupby(by=['year'])['temperature'].mean().reset_index()
    real['latitude'] = lat
    real['longitude'] = lng
    res.append(real)

HBox(children=(IntProgress(value=0, max=6942), HTML(value='')))




In [16]:
res = pd.concat(res)
res = pd.concat([df, res], sort=False)
res = res.drop_duplicates(['year', 'latitude', 'longitude'])
res.to_json('temperatures_by_city.json', orient='records')

In [17]:
len(res), len(df)

(572769, 94254)