In [1]:
import osmnx as ox
import branca.colormap as cm
import folium
from folium import plugins

import networkx as nx
from shapely import wkt


from folium import PolyLine, Map, plugins, Marker
from folium.plugins import MousePosition
import random
import matplotlib.pyplot as plt

import pandas as pd
import numpy as np
import time
from sklearn.cluster import DBSCAN
from geopy.distance import great_circle
from shapely.geometry import MultiPoint
from datetime import datetime as dt
%matplotlib inline



In [3]:
# Input tables: `df` supplies the id / lat / lon columns used below,
# `interests` supplies per-id rows keyed by an `id` column.
locs_csv, interests_csv = '../data/stupino_locs.csv', '../data/stupino_interests.csv'
df = pd.read_csv(locs_csv)
interests = pd.read_csv(interests_csv)

In [22]:
# Keep only users that have more than MIN_POINTS_ID recorded rows.
MIN_POINTS_ID = 10
unique_id = pd.unique(df['id'])  # every id, in order of first appearance

# value_counts() is sorted by count, not by first appearance, so its boolean
# result must NOT be used as a positional mask over `unique_id` (that would
# select the first K ids seen in the file instead of the K most active ones).
# Take the ids straight from the value_counts index instead.
points_per_user = df['id'].value_counts()
users = points_per_user.index[points_per_user > MIN_POINTS_ID].to_numpy()

# Coordinates of the retained users only.
latlon = df.loc[df['id'].isin(users), ['id', 'lat', 'lon']]

In [23]:
# Number of user ids retained by the MIN_POINTS_ID filter.
users.shape

(55385,)

In [11]:
# Number of rows recorded per id, most frequent first.
df[['id']].value_counts()

id    
4450      7015
4127      5969
416       5595
1628      5450
3209      5267
          ... 
109697       1
109698       1
109699       1
109701       1
151890       1
Name: count, Length: 151891, dtype: int64

In [39]:
# Empty id / lat / lon frame; the clustering loop below adds each user's
# representative points to it.
users_clusters_df = pd.DataFrame({'id' : [], 'lat' : [], 'lon' : []})
def get_centermost_point(cluster):
    """Return the member of ``cluster`` nearest to the cluster's centroid.

    Parameters
    ----------
    cluster : sequence of coordinate pairs
        The points of one cluster. The caller in this notebook passes rows
        ordered as (lat, lon); the same ordering is handed to
        ``great_circle`` for both the point and the centroid.

    Returns
    -------
    tuple
        The coordinates of the cluster member with the smallest great-circle
        distance (in metres) to the centroid of all members.
    """
    center = MultiPoint(cluster).centroid
    centroid = (center.x, center.y)

    def metres_from_centroid(point):
        return great_circle(point, centroid).m

    return tuple(min(cluster, key=metres_from_centroid))

def add2map(num, bs_pt, cell_nm, icon=True, color='red', group=None):
    """Place one marker on ``group`` or, when no group is given, on ``basic_map``.

    Parameters
    ----------
    num : any
        Accepted for call compatibility; not used by the function.
    bs_pt : sequence
        Marker location; elements 0 and 1 are passed to ``Marker`` in that order.
    cell_nm : any
        Label shown in the tooltip (prefixed with ``'id:'``), in the popup and
        as the number inside the styled icon.
    icon : bool, default True
        When true, draw a ``BeautifyIcon`` marker; otherwise pass ``icon=None``
        to ``Marker``.
    color : str, default 'red'
        Border and text colour of the styled icon.
    group : optional
        Container to add the marker to. Falls back to the module-level
        ``basic_map`` when falsy.
    """
    label = str(cell_nm)
    icon_obj = (
        plugins.BeautifyIcon(
            icon='arrow-down', icon_shape='marker',
            border_color=color, text_color=color,
            number=cell_nm, inner_icon_style='margin-top:0;'
        )
        if icon
        else None
    )
    Marker(
        location=[bs_pt[0], bs_pt[1]],
        tooltip='id:' + label,
        popup=label,
        icon=icon_obj,
    ).add_to(group if group else basic_map)

# --- Clustering configuration -------------------------------------------------
MIN_SAMPLES = 10             # DBSCAN min_samples
MAX_DIST_M = 30              # DBSCAN neighbourhood radius, in metres
ms_per_radian = 6371008.8    # mean Earth radius in metres (= metres per radian of arc)
USER_ID = range(10)          # ids to process
DRAW_MAP = True
eps_rad = MAX_DIST_M / ms_per_radian   # haversine metric works in radians
epsilon = eps_rad
x = 'lon'
y = 'lat'

# --- Base map, centred on the mean of all recorded coordinates ----------------
city_center = df[['lat', 'lon']].mean()
latlon_pair = [city_center.lat, city_center.lon]
if DRAW_MAP:
    basic_map = Map(
        location=latlon_pair,
        zoom_start=12,
        tiles='OpenStreetMap',
        control_scale=True,
        prefer_canvas=True,
        )
else:
    basic_map = None

# Measure_Control = plugins.MeasureControl(
#     position='topright',
#     primary_length_unit='meters',
#     primary_area_unit='sqmeters',
# )

# --- Per-user clustering ------------------------------------------------------
start_global_time = time.time()
per_user_frames = []   # collected here and concatenated once after the loop
for cur_user in USER_ID:
    # Build the mask from `latlon` itself: it is a filtered subset of `df`,
    # so a mask computed on `df` does not line up with it row for row.
    df1 = latlon.loc[latlon['id'] == cur_user, ['lat', 'lon']]
    if df1.empty:
        # DBSCAN cannot be fitted on zero samples (e.g. the user was removed
        # by the MIN_POINTS_ID filter or does not exist).
        print('user id: {:,}\tno points available; skipped.'.format(cur_user))
        continue

    start_time = time.time()
    coords = df1[[y, x]].values   # rows ordered (lat, lon)
    db = DBSCAN(eps=epsilon, min_samples=MIN_SAMPLES, algorithm='ball_tree',
                metric='haversine').fit(np.radians(coords))
    cluster_labels = db.labels_

    # DBSCAN labels real clusters 0..k-1 and noise -1. Counting distinct labels
    # would count the noise label as a cluster, so derive k from the largest
    # label instead (max == -1 means "noise only", i.e. k == 0).
    num_clusters = int(cluster_labels.max()) + 1

    if num_clusters > 0:
        # One representative (centermost) point per real cluster.
        centermost_points = [
            get_centermost_point(coords[cluster_labels == n])
            for n in range(num_clusters)
        ]
        lats, lons = zip(*centermost_points)
        rs = pd.DataFrame({'id': cur_user, x: lons, y: lats})
    else:
        # No cluster found: fall back to the mean of all of the user's points.
        rs = pd.DataFrame({'id': cur_user, x: [df1['lon'].mean()], y: [df1['lat'].mean()]})

    per_user_frames.append(rs)
    message = 'user id: {:,}\t{:,} points -> {:,} cluster(s); {:,.2f} s.'
    print(message.format(cur_user, len(df1), len(rs), time.time()-start_time))

    if DRAW_MAP:
        # Layer with the representative points (styled markers).
        map_clusters = folium.FeatureGroup(name="clusters of id  " + str(cur_user), show=False)
        basic_map.add_child(map_clusters)
        for row in rs.itertuples():
            add2map(row.Index, [row.lat, row.lon], cur_user, group=map_clusters)

        # Layer with every recorded location of the user (default markers).
        all_locations = folium.FeatureGroup(name="locs of id " + str(cur_user), show=False)
        basic_map.add_child(all_locations)
        user_locs = df.loc[df['id'] == cur_user, ['lat', 'lon']]
        for row in user_locs.itertuples():
            add2map(row.Index, [row.lat, row.lon], cur_user, icon=False, group=all_locations)

# Single concatenation instead of growing the frame on every iteration.
users_clusters_df = pd.concat([users_clusters_df, *per_user_frames], ignore_index=True)

if DRAW_MAP:
    folium.LayerControl().add_to(basic_map)
    MousePosition().add_to(basic_map)
    # basic_map.add_child(Measure_Control)
message = "running time: {:,.2f} s"
print(message.format(time.time() - start_global_time))

    

user id: 0	910 points -> 5 cluster(s); 0.05 s.
user id: 1	781 points -> 3 cluster(s); 0.05 s.
user id: 2	1,358 points -> 4 cluster(s); 0.10 s.
user id: 3	396 points -> 4 cluster(s); 0.02 s.
user id: 4	349 points -> 1 cluster(s); 0.00 s.
user id: 5	304 points -> 4 cluster(s); 0.01 s.
user id: 6	968 points -> 4 cluster(s); 0.05 s.
user id: 7	592 points -> 10 cluster(s); 0.03 s.
user id: 8	604 points -> 2 cluster(s); 0.04 s.
user id: 9	317 points -> 2 cluster(s); 0.02 s.
running time: 1.87 s


In [40]:
# Write the representative points to clusters.csv without the row index.
users_clusters_df.to_csv("clusters.csv", index=False);

In [41]:
# Display the map built above as the cell output (None when DRAW_MAP is False).
basic_map

In [42]:
# Interest rows of the processed users, indexed and sorted by user id.
# Built as a new frame with a non-mutating chain; the former
# pd.option_context("display.max_rows", ...) wrapper was dropped because
# nothing is displayed here, so it had no effect.
users_interests = (
    interests[interests['id'].isin(USER_ID)]
    .set_index('id')
    .sort_index()
)

# Exported transposed: one column per user id.
users_interests.T.to_csv('selected_users_interests.csv')