# Plotting geotagged Tweets on a map

### Libraries required

In [1]:
import numpy as np
import pandas as pd
from dateutil import tz
import matplotlib.pyplot as plt
from mpl_toolkits.basemap import Basemap
from datetime import datetime, timedelta

%matplotlib inline

### Data loading and cleaning

In [2]:
'''
Loads the data and introduces column names
'''
# Passing `names` makes pandas treat the file as header-less, so every row is data.
raw_columns = ['index_del', 'Bounding Box', 'Coordinates', 'del', 'DateTime',
               'Geo Enabled', 'Language', 'Name', 'Tweet', 'Username',
               'del_1', 'del_2', 'del_3', 'del_4', 'del_5']
df = pd.read_csv('results.csv', names=raw_columns)

'''
Drops irrelevant columns from the dataframe
'''
unused_columns = ['index_del', 'del', 'Bounding Box', 'Geo Enabled', 'Language',
                  'Tweet', 'Name', 'del_1', 'del_2', 'del_3', 'del_4', 'del_5']
df = df.drop(unused_columns, axis=1)
df = df.drop_duplicates()
print('Number of data points collected: ', len(df))

# Rows without a geotag carry the literal string 'not shown' in 'Coordinates'.
# .copy() gives an independent frame, so the column assignments below are
# guaranteed to land in df_coord instead of a temporary view of df.
df_coord = df[df['Coordinates'] != 'not shown'].copy()
print('Number of data points geotagged: ', len(df_coord))

print('Number of data points with missing values: ', df_coord.isnull().sum().sum())
df_coord = df_coord.dropna()

'''
Introduce and reformat DateTime data for easy manipulation later as well as reconfigures timezone to local time zone
'''
# IANA zone names use underscores. With the misspelt 'Asia/Kuala Lumpur',
# tz.gettz() returns None and astimezone(None) silently falls back to the
# timezone of whatever machine runs the notebook.
to_zone = tz.gettz('Asia/Kuala_Lumpur')
if to_zone is None:
    raise ValueError("Timezone 'Asia/Kuala_Lumpur' is not available on this system")

# Parse the whole column in one vectorised call instead of a per-row loop.
# utc=True normalises every timestamp to UTC before the conversion to local time.
# NOTE(review): the previous code forced tzinfo to UTC regardless of the parsed
# offset; the two agree as long as every timestamp is recorded as +0000 - confirm.
df_coord['DateTime'] = pd.to_datetime(
    df_coord['DateTime'].astype(str),
    format='%a %b %d %H:%M:%S %z %Y',
    utc=True,
).dt.tz_convert(to_zone)

# Sort only after parsing: sorting the raw strings would order rows
# alphabetically by day name rather than chronologically.
df_coord = df_coord.sort_values('DateTime')

# Monday is 0 and Sunday is 6.
df_coord['Day of the Week'] = df_coord['DateTime'].dt.weekday
# Zero-padded 24-hour strings, so plain string comparison orders them by time of day.
df_coord['Time'] = df_coord['DateTime'].dt.strftime('%H:%M:%S')

KeyboardInterrupt: 

### Splitting data according to time

In [None]:
'''
Organises data first into weekday and weekends and then splits it into work and non-work hours
'''
# Creates dataframes to identify tweets on weekdays versus weekends
# ('Day of the Week' comes from datetime.weekday(): Monday is 0, Sunday is 6)
df_weekday = df_coord[df_coord['Day of the Week'] < 5]
df_weekend = df_coord[df_coord['Day of the Week'] > 4]

# Creates dataframes to identify tweets during and outside of workhours
# 'Time' holds zero-padded 'HH:MM:SS' strings, so string comparison matches
# chronological order. Work hours are the half-open range [09:00:00, 18:00:00).
in_workhours = (df_weekday['Time'] >= '09:00:00') & (df_weekday['Time'] < '18:00:00')
df_workhours = df_weekday[in_workhours]

# Non-work hours are the complement of the work-hours mask. The previous
# condition (before 09:00 AND after 18:00) could never be true, so no weekday
# tweet was ever counted as outside work hours.
df_non_workhours = df_weekday[~in_workhours]
df_non_workhours = pd.concat([df_non_workhours, df_weekend], ignore_index=True)

'''
Identifies the number of tweets collected for each dataframe
'''
print('Tweets during workhours: ', len(df_workhours))
print('Tweets outside workhours: ', len(df_non_workhours))

### Plotting the data

Certain sections have been commented out since only one data frame is being plotted here.

In [None]:
'''
Identifies the datasets being examined
'''
df_one = df_non_workhours
# df_two = df_workhours


def _add_lat_lon_columns(frame):
    '''
    Adds numeric 'Latitudes' and 'Longitudes' columns to frame (in place), parsed from its 'Coordinates' strings
    '''
    # Each entry is a bracketed pair such as "[a, b]": drop the first and last
    # character, then split on ', '.
    # NOTE(review): the first value is treated as latitude and the second as
    # longitude - confirm this matches how results.csv was written.
    pairs = [coord[1:-1].split(', ') for coord in frame['Coordinates']]
    # Convert to float so the projection receives numbers rather than strings.
    frame['Latitudes'] = [float(pair[0]) for pair in pairs]
    frame['Longitudes'] = [float(pair[1]) for pair in pairs]


def _plot_points(basemap, frame, colour):
    '''
    Projects every (longitude, latitude) pair in frame and draws the points with one plot call
    '''
    if frame.empty:
        return
    # Map (long, lat) to (x, y) for all rows at once.
    xs, ys = basemap(frame['Longitudes'].values, frame['Latitudes'].values)
    # 'o' draws markers only. The colour comes solely from the keyword: also
    # naming one in the format string (as in 'ok') is redundant and
    # matplotlib warns about it.
    plt.plot(xs, ys, 'o', markersize=3, color=colour)


'''
Creates a latitude and longitude dataframe column for df_one that will be used for plotting
'''
_add_lat_lon_columns(df_one)

'''
Creates a latitude and longitude dataframe column for df_two that will be used for plotting
'''
# _add_lat_lon_columns(df_two)

'''
Sets up the plot by adjusting boundaries and calling the GIS
'''
# Masjid Jamek Coordinates: 3.1489° N, 101.6956° E
fig = plt.figure(figsize=(20, 20))
m = Basemap(llcrnrlon = 101.68,
            llcrnrlat = 3.14,
            urcrnrlon = 101.73,
            urcrnrlat = 3.16,
            epsg=3380)

m.arcgisimage(service='World_Topo_Map', xpixels = 2000, verbose= True)

'''
Plots the points onto the plot that was set up
'''
# Map (long, lat) to (x, y) for plotting; the red X marks Masjid Jamek
x, y = m(101.6956, 3.1489)
plt.plot(x, y, 'X', markersize=10, color = 'red')

_plot_points(m, df_one, '#086518')

# _plot_points(m, df_two, '#0084bd')

plt.title('Non-work-hours')

plt.show()

fig.savefig('Non-work-hours.png')