# Assignment 2, CMPT826

##  STEP 1: Preparation

* Seyedeh Mina Mousavifar
* 11279515
* sem311

### Data Collection

In [None]:
import pandas as pd
import sqlalchemy as db

# Load the full `battery` and `gps` tables of the SHED10 database into
# the data frames `battery_data` and `gps_data`.
engine = db.create_engine('mysql://?@crepe.usask.ca:3306/SHED10')

# NOTE: the connection is deliberately left open at module level so that
# later cells can keep issuing queries through `connection`.
connection = engine.connect()
metadata = db.MetaData()

# Equivalent to 'SELECT * FROM battery'
battery = db.Table('battery', metadata, autoload=True, autoload_with=engine)
query = db.select([battery])

# getting data by executing the query above
BatteryResultProxy = connection.execute(query)
# column names are taken from the result itself (before it is consumed),
# not from the first row, so an empty table does not raise IndexError
battery_columns = list(BatteryResultProxy.keys())
BatteryResultSet = BatteryResultProxy.fetchall()

# converting data to data frame
battery_data = pd.DataFrame(BatteryResultSet, columns=battery_columns)

# removing index column from data
# (keyword form: recent pandas no longer accepts the axis positionally)
battery_data = battery_data.drop(columns='index')

# Equivalent to 'SELECT * FROM gps'
gps = db.Table('gps', metadata, autoload=True, autoload_with=engine)
query = db.select([gps])

# getting data by executing the query above
GPSResultProxy = connection.execute(query)
gps_columns = list(GPSResultProxy.keys())
GPSResultSet = GPSResultProxy.fetchall()

# converting data to data frame
gps_data = pd.DataFrame(GPSResultSet, columns=gps_columns)

# removing index column from data
gps_data = gps_data.drop(columns='index')

print('fetching data completed')

### Filtering Data

In [None]:
# Keep only users with enough battery records, and only GPS fixes that
# are accurate and inside the study bounding box.

# how many battery records each user contributed
battery_info = (
    battery_data
    .groupby('user_id')
    .size()
    .reset_index(name='record_count')
)

# cutoff: a fraction of the maximum possible number of records
# (one record every 5 minutes, 24 hours a day, for 30 days)
cutoff_percentage = 0.5
max_battery_info = (60 / 5) * 24 * 30
battery_cutoff = cutoff_percentage * max_battery_info

# users whose record count is above the cutoff
battery_info_50 = battery_info[battery_info['record_count'] > battery_cutoff]
users_filter_50 = len(battery_info_50)

# battery records of the retained users only (inner join on user_id)
user_battery = battery_data.merge(battery_info_50, on='user_id')

# GPS row filters, applied one after another in this order:
# accuracy below 100, then the latitude range, then the longitude range
row_filters = (
    lambda frame: frame['accu'] < 100,
    lambda frame: frame['lat'] > 52.058366,
    lambda frame: frame['lat'] < 52.214609,
    lambda frame: frame['lon'] > -106.7649138128,
    lambda frame: frame['lon'] < -106.52225319,
)
for keep in row_filters:
    gps_data = gps_data.loc[keep(gps_data)]

# restrict the remaining GPS records to the retained users
good_50_user_id = user_battery['user_id'].unique()
gps_data = gps_data.loc[gps_data['user_id'].isin(good_50_user_id)]

#### Stratify and Aggregate

In [None]:
import datetime
import math

import numpy as np
from pyproj import Proj

# Trim and order the GPS records, derive date/time parts, drop the
# December test data and build the table of 5-minute duty cycles.

# removing unnecessary columns
# (keyword form: recent pandas no longer accepts the axis positionally)
gps_data = gps_data.drop(columns=['alt', 'bearing',
                                  'speed', 'record_time_minute',
                                  'timestamp', 'pokemon'])

# sorting based on time
gps_data = gps_data.sort_values(['user_id', 'record_time']).dropna().reset_index()

# Converting record time to separate Date and Time variable
# (the column is parsed once and reused for every derived field)
record_times = pd.to_datetime(gps_data['record_time'])
gps_data['Dates'] = record_times.dt.date
gps_data['Time'] = record_times.dt.time
gps_data['Hour'] = record_times.dt.hour
gps_data['Minute'] = record_times.dt.minute
gps_data['Second'] = record_times.dt.second

# removing December test data
testdate = datetime.date(2016, 12, 9)
gps_data = gps_data[(gps_data['Dates'] > testdate)]

# finding study start date by finding minimum date after test date in December!
start_time = gps_data.record_time.min()

# finding study end date by finding maximum date
end_time = gps_data.record_time.max()

# total number of duty cycles during study
# floor + 1 (rather than ceil) so that the cycle containing end_time is
# always included, even when the study span is an exact multiple of
# 5 minutes; for every other span the result equals the ceiling.
elapsed_minutes = (end_time - start_time).total_seconds() / 60
n_duty = math.floor(elapsed_minutes / 5) + 1

# first column as each duty cycle start time
start_duty = pd.date_range(start_time, periods=n_duty, freq='5min')

# second column as each duty cycle end time: every cycle ends 5 minutes
# after it starts (also valid when there is only a single cycle)
end_duty = start_duty + pd.Timedelta(minutes=5)

duty_num = pd.Series(range(1, n_duty + 1))

duty_data = pd.DataFrame({'duty': duty_num,
                          'start_time': start_duty,
                          'end_time': end_duty})

def calc_duty(time):
    '''
    This function finds the duty cycle of a specific time during the study,
    i.e. the row of duty_data with start_time <= time < end_time
    :param time: record time
    :return: duty cycle number of given record time, or NaN (after printing
             'no duty cycle') when no duty cycle contains the time
    '''
    result = duty_data[(duty_data['start_time'] <= time) & (duty_data['end_time'] > time)]
    if result.empty:
        print('no duty cycle')
        # return a missing value instead of failing with IndexError on
        # the empty selection below
        return float('nan')
    return result.iloc[0].duty

# Assign every GPS record to its duty cycle, average the position per
# user and duty cycle, project to UTM and label 100-unit grid cells.

# finding duty cycle for gps records
# (only record_time is needed, so map over that column instead of
# building a Series for every row)
gps_data['duty_num'] = gps_data['record_time'].apply(calc_duty)


# calculating mean of latitude and longitude for every duty cycle
gps_data = gps_data.astype({'lat': 'float64', 'lon': 'float64'})
gps_data = gps_data.groupby(['user_id',
                             'duty_num']).agg(lat=('lat', 'mean'),
                                               lon=('lon', 'mean')).reset_index()

# converting to UTM
myproj = Proj('epsg:32613', proj='utm', zone=13,
              ellps='WGS84', preserve_units=True)

gps_data['x'], gps_data['y'] = myproj(gps_data['lon'].values,
                                      gps_data['lat'].values)
# binning
GRID_SIZE = 100

# find grid start point
start_x, start_y = gps_data.x.min(), gps_data.y.min()

# labeling grids: number of GRID_SIZE steps from the start point,
# rounded up (requires numpy as np)
gps_data['x_grid'] = np.ceil((gps_data['x'] - start_x)/GRID_SIZE)
gps_data['y_grid'] = np.ceil((gps_data['y'] - start_y)/GRID_SIZE)

# sort data
gps_data = gps_data.sort_values(['user_id', 'duty_num']).dropna()

# grid indices as integers (missing values were dropped above)
gps_data = gps_data.astype({'x_grid': 'int32', 'y_grid': 'int32'})

# creating grid cell labels (x,y) from the integer indices
gps_data['grid_label'] = (gps_data['x_grid'].astype('str')
                          + ',' + gps_data['y_grid'].astype('str'))