### Import relevant libraries

In [1]:
import pandas as pd
from dateutil import tz
from datetime import datetime, timedelta
import mplleaflet

%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt

import geopandas as gpd
from shapely.geometry import Point

### Load shapefiles of Kuala Lumpur

In [2]:
wpkl = gpd.read_file('result/09-WPKL-New-DM.shp')
wpkl = wpkl.to_crs({'init' :'epsg:4326'})

cols_retain = ['PAR_BARU',
    'DM_BARU',
    'geometry',
    ]

current_cols = wpkl.columns.values

cols_to_drop = list(set(current_cols) - set(cols_retain))

wpkl = wpkl.drop(cols_to_drop, axis=1)

wpkl['origin_count'] = 0
wpkl['destination_count'] = 0

wpkl.head()

Unnamed: 0,PAR_BARU,DM_BARU,geometry,origin_count,destination_count
0,KEPONG,PEKAN KEPONG,"POLYGON ((101.6454217755897 3.218750667079842,...",0,0
1,KEPONG,KAMPONG MELAYU KEPONG,"POLYGON ((101.6560859902314 3.224697689434247,...",0,0
2,KEPONG,JINJANG TEMPATAN KEDUA,"POLYGON ((101.6629725705848 3.219366439092332,...",0,0
3,KEPONG,JINJANG TEMPATAN PERTAMA,"POLYGON ((101.6599515959298 3.224439884906962,...",0,0
4,KEPONG,JINJANG TEMPATAN KETIGA,"POLYGON ((101.6595923570077 3.219603131907105,...",0,0


### Data Cleaning

In [3]:
def DataCleaning(user):
    df = pd.read_csv(user, header=None)
    df.columns = ['index_del', 'Bounding Box', 'Coordinates', 'DateTime', 'Geo Enabled',
                  'del', 'Language', 'Name', 'Tweet', 'Username']
    df = df.drop(['index_del', 'del', 'Bounding Box', 'Geo Enabled', 'Language'], axis=1)
    df = df[df['Coordinates'] != 'not shown']
    df = df.sort_values('DateTime') 
    df = df.drop_duplicates()
    df = df.dropna()

    df["Day of the Week"] = ""
    df["Time"] = ""
    df["Date"] = ""

    for i in range(len(df.DateTime)):
        # auto-detect timezones
        from_zone = tz.tzutc()
        to_zone = tz.tzlocal()
        df.DateTime.iloc[i] = datetime.strptime(str(df.DateTime.iloc[i]), '%a %b %d %H:%M:%S %z %Y')
        df.DateTime.iloc[i] = df.DateTime.iloc[i].replace(tzinfo=from_zone)
        df.DateTime.iloc[i] = df.DateTime.iloc[i].astimezone(to_zone)

        df['Day of the Week'].iloc[i] = df.DateTime.iloc[i].weekday()
        df['Time'].iloc[i] = df.DateTime.iloc[i].strftime('%H:%M:%S')
        df['Date'].iloc[i] = df.DateTime.iloc[i].strftime('%d %B %Y')

    latitudes = []
    longitudes = []
    for i in df['Coordinates']:
        i = list(i[1:-1].split(', '))
        latitudes.append(i[0])
        longitudes.append(i[1])

    se = pd.Series(latitudes)
    df['Latitudes'] = se.values
    se = pd.Series(longitudes)
    df['Longitudes'] = se.values
    
    df['Latitudes'] = pd.to_numeric(df['Latitudes'])
    df['Longitudes']= pd.to_numeric(df['Longitudes'])
    
    df['geometry'] = list(zip(df['Longitudes'], df['Latitudes']))

    df['geometry'] = df['geometry'].apply(Point)

    gdf = gpd.GeoDataFrame(df, geometry='geometry')
    gdf.crs = {'init' :'epsg:4326'}
    
    cols_retain = ['DateTime',
                    'Name',
                    'Username',
                    'Time',
                    'Date',
                    'geometry'
                  ]

    current_cols = gdf.columns.values

    cols_to_drop = list(set(current_cols) - set(cols_retain))

    gdf = gdf.drop(cols_to_drop, axis=1)

    return gdf.head()

In [4]:
'''
List of user csv files
'''
users = ['''LIST OF USER CSV FILES''']

### Origin + Destination Counting

In [6]:
not_masjid_jamek_gdf = wpkl.drop([116, 117, 119, 120, 121, 122, 123, 124])
downtown = wpkl.iloc[[116, 117, 119, 120, 121, 122, 123, 124]]

def destination_count(users):
    for user in users:
        cleaned_data = DataCleaning(user)
        if len(cleaned_data) > 1:
            for i in range(len(cleaned_data)-1):
                current_point = cleaned_data['geometry'].iloc[i]
                next_point = cleaned_data['geometry'].iloc[i+1]
                if cleaned_data["Date"].iloc[i]==cleaned_data["Date"].iloc[i+1]:
                    for i in range(len(downtown)):
                        if downtown['geometry'].iloc[i].contains(current_point):
                            for i in range(len(not_masjid_jamek_gdf)):
                                if not_masjid_jamek_gdf['geometry'].iloc[i].contains(next_point):
                                    not_masjid_jamek_gdf['destination_count'].iloc[i] += 1
                                    
def origin_count(users):
    for user in users:
        cleaned_data = DataCleaning(user)
        if len(cleaned_data) > 1:
            for i in range(len(cleaned_data)-1):
                current_point = cleaned_data['geometry'].iloc[i]
                next_point = cleaned_data['geometry'].iloc[i+1]
                if cleaned_data["Date"].iloc[i]==cleaned_data["Date"].iloc[i+1]:
                    for i in range(len(downtown)):
                        if downtown['geometry'].iloc[i].contains(next_point):
                            for i in range(len(not_masjid_jamek_gdf)):
                                if not_masjid_jamek_gdf['geometry'].iloc[i].contains(current_point):
                                    not_masjid_jamek_gdf['origin_count'].iloc[i] += 1

In [7]:
destination_count(users)
origin_count(users)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [11]:
f, ax = plt.subplots(1, figsize=(12, 12))
ax = not_masjid_jamek_gdf.plot(column='destination_count', ax=ax)
mplleaflet.display(fig=f, crs=wpkl.crs, tiles='cartodb_positron')

In [12]:
f, ax = plt.subplots(1, figsize=(12, 12))
ax = not_masjid_jamek_gdf.plot(column='origin_count', ax=ax)
mplleaflet.display(fig=f, crs=wpkl.crs, tiles='cartodb_positron')