In [None]:
import time
import os
import datetime
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Directory holding one movement CSV per day; file names are processed in sorted order.
folder_movement_path = r'../input/movement-by-day/'
x = sorted(os.listdir(folder_movement_path))

# Print the shape of every file and keep a running total of the row counts.
dims = 0
for file_name in x:
    csv_path = os.path.join(folder_movement_path, file_name)
    df = pd.read_csv(csv_path)
    n_rows = df.shape[0]
    print('csv file_name = %s\t|====|\t dimension = %s'%(file_name, df.shape))
    dims = dims + n_rows
print('Total observations: %s'%dims)

In [None]:
# Read every daily CSV and stack them into one frame with a single concat.
# The last file in sorted order comes first, followed by the others in name order.
# (DataFrame.append no longer exists in pandas >= 2.0 and re-copied the growing
# frame on every iteration; pd.concat builds the result once.)
ordered_files = [x[-1]] + x[:-1]
df = pd.concat(
    [pd.read_csv(os.path.join(folder_movement_path, file_name)) for file_name in ordered_files]
)
print(df.shape)
df.head()

In [None]:
# Interpret `utc_timestamp` as seconds since the Unix epoch and convert it to
# timezone-naive UTC timestamps. pd.to_datetime is vectorised and, unlike
# datetime.fromtimestamp, does not depend on the machine's local timezone
# (nor on which object the name `datetime` is currently bound to).
df['utc_datetime'] = pd.to_datetime(df['utc_timestamp'], unit='s')
df.head()

In [None]:
import string

def valid_IP4(address):
    """Return True when `address` is a dotted-quad IPv4 string.

    The address must consist of exactly four '.'-separated parts, each made
    only of ASCII digits and with a numeric value between 0 and 255.
    Anything else (wrong number of parts, empty parts, signs, letters)
    yields False instead of raising.
    """
    parts = address.split(".")
    if len(parts) != 4:
        return False
    # isdigit() guards the int() call: non-numeric or empty octets are simply invalid.
    return all(
        item.isascii() and item.isdigit() and 0 <= int(item) <= 255
        for item in parts
    )

def valid_IP6(address):
    """Return True when `address` is a fully written-out IPv6 string.

    The address must consist of exactly eight ':'-separated groups and every
    group must be 1 to 4 hexadecimal digits. Compressed forms that use '::'
    produce fewer than eight groups and are therefore reported as False.
    Always returns a bool.
    """
    parts = address.split(":")
    if len(parts) != 8:
        return False
    # Every group has to pass, not just the first valid one.
    return all(
        1 <= len(item) <= 4 and all(ch in string.hexdigits for ch in item)
        for item in parts
    )
    
def ip_type(address):
    """Classify an address string.

    Returns 'IPv4' or 'IPv6' when the matching validator accepts the address,
    'unknown_ip' for the '0' placeholder, and 'invalid_ip' for anything else.
    """
    if valid_IP4(address):
        return 'IPv4'
    if valid_IP6(address):
        return 'IPv6'
    return 'unknown_ip' if address == '0' else 'invalid_ip'
    
# Missing addresses are replaced by the '0' placeholder, which ip_type reports as 'unknown_ip'.
df['ip_address'] = df['ip_address'].fillna('0')
# Label every row with the kind of address it carries.
df['is_ip_valid'] = df['ip_address'].apply(ip_type)
df.head()

In [None]:
# Report the frame's shape, remove fully duplicated rows in place, then report again.
shape_now = df.shape
print('Data-dimension; before drop_duplicates:', shape_now)
df.drop_duplicates(inplace = True)
shape_now = df.shape
print('Data-dimension; after drop_duplicates:', shape_now)

### Extract the unique `ad_id` values

In [None]:
# Distinct ad_ids, ordered as value_counts returns them (most frequent first).
ad_id_counts = df['ad_id'].value_counts()
unique_ad_id = ad_id_counts.index.tolist()
len(unique_ad_id)

## 2. Probability model
### 2.1. Define a probability model
Define the ground distance covered by one degree of latitude and longitude, [ref](https://www.usgs.gov/faqs/how-much-distance-does-a-degree-minute-and-second-cover-your-maps?qt-news_science_products=0#qt-news_science_products)

- One degree of `latitude` equals approximately 364,000 feet (69 miles; 1 mile ≈ 1.609 km), one minute equals 6,068 feet (1.15 miles), and one second equals 101 feet.
- One degree of `longitude` equals 288,200 feet (54.6 miles), one minute equals 4,800 feet (0.91 mile), and one second equals 80 feet. These longitude figures hold at about 38°N; the length of a degree of longitude shrinks as one moves toward the poles.

In [None]:
from datetime import datetime

def create_sample_df(df, ad_id):
    """Build the per-device movement table for one `ad_id`.

    Parameters:
        df    : frame with at least the columns 'ad_id', 'horizontal_accuracy',
                'latitude', 'longitude', 'utc_datetime' and 'distance'.
        ad_id : identifier whose rows are extracted.

    Returns a new DataFrame (the input is not modified), sorted chronologically,
    with the columns
        ad_id, horz_acc_km, latitude, longitude, utc_datetime,
        Han_dist      : the input 'distance' column, kept for reference,
        distance      : planar distance (km) from the fixed reference point,
        diff_time_sec : whole seconds until the next observation (0 for the last row),
        diff_dist_km  : planar distance (km) to the next observation (0 for the last row),
        day           : day of month of 'utc_datetime'.
    """
    ## restaurant coordinates (reference point)
    long_cen, lat_cen = -96.770709, 32.810594

    ## km per degree of longitude / latitude (miles per degree * 1.609)
    ## NOTE(review): 54.6 miles per degree of longitude matches roughly 38 degrees N;
    ## at lat_cen a degree of longitude is somewhat longer -- confirm the factor.
    ratio_long, ratio_lat = 54.6*1.609, 1.609*69

    ## extract this ad_id's rows once and work on an independent copy
    rows = df[df['ad_id'] == ad_id]
    sub_df = rows[['ad_id', 'horizontal_accuracy',
                   'latitude', 'longitude', 'utc_datetime']].copy()
    sub_df['Han_dist'] = rows['distance'].values  ## distance supplied with the data

    ## consecutive differences only make sense in chronological order;
    ## a stable sort keeps the original order of rows sharing a timestamp
    sub_df = sub_df.sort_values('utc_datetime', kind='mergesort')

    lon = sub_df['longitude'].to_numpy(dtype=float)
    lat = sub_df['latitude'].to_numpy(dtype=float)

    ## recompute the distance to the reference point with a flat-earth approximation
    sub_df['distance'] = np.hypot(ratio_long*(lon - long_cen), ratio_lat*(lat - lat_cen))

    ## elapsed whole seconds to the next observation; total_seconds() keeps the
    ## day component that timedelta.seconds would silently drop
    times = pd.to_datetime(sub_df['utc_datetime']).dt.floor('s')
    gap_to_next = times.diff().shift(-1)
    sub_df['diff_time_sec'] = gap_to_next.dt.total_seconds().fillna(0).astype(int).to_numpy()

    ## planar distance to the next observation; the last row has no successor
    step_km = np.zeros(len(sub_df))
    step_km[:-1] = np.hypot(ratio_lat*np.diff(lat), ratio_long*np.diff(lon))
    sub_df['diff_dist_km'] = step_km

    sub_df['day'] = times.dt.day.to_numpy()

    ## presumably metres -> kilometres; confirm the unit of 'horizontal_accuracy'
    sub_df['horizontal_accuracy'] = sub_df['horizontal_accuracy'] / 1000
    sub_df = sub_df.rename(columns = {'horizontal_accuracy': 'horz_acc_km'})
    return sub_df

# Build the movement table for one example device.
ad_id = '539f95a6-ce6e-4359-bab5-9f5c866936a2'
sub_df = create_sample_df(df, ad_id)

# Preview the observations whose recomputed distance is below 0.25.
is_close = sub_df['distance'] < 0.25
sub_df.loc[is_close].head()

In [None]:
# Index the sample by timestamp and list, in ascending order, the days that have observations.
new_df = sub_df.set_index('utc_datetime')
day_list = sorted(new_df['day'].value_counts().index)
day_list

### Extract the observations for one day

In [None]:
# Keep only the rows whose 'day' value is 24 and preview the first three.
new_df_day = new_df[new_df['day'] == 24]
new_df_day.head(3)

### Count the movements per day

In [None]:
# Pull the per-row series needed by this and the following cells.
distance = new_df_day['distance']
diff_dist = new_df_day['diff_dist_km'].values
diff_time = new_df_day['diff_time_sec'].values
N = len(diff_dist)

# A movement is a step with non-zero displacement that took longer than 30 seconds.
count_dist = sum(
    1
    for secs, km in zip(diff_time, diff_dist)
    if (secs > 30) & (km != 0)
)
count_dist

### 3. Count the direction changes per day

In [None]:
# A direction change is counted when two consecutive step values have opposite
# signs and the first step lasted longer than 10 seconds.
# NOTE(review): diff_dist_km is produced by a square root in create_sample_df,
# so it is never negative and this product test cannot fire -- confirm the
# intended definition of a direction change.
count_change_dir = sum(
    1
    for step, next_step, secs in zip(diff_dist[:-1], diff_dist[1:], diff_time[:-1])
    if (step*next_step < 0) & (secs > 10)
)
count_change_dir

### 4. Count the number of check-ins

In [None]:
# Work on plain arrays: new_df_day is indexed by timestamp, so integer lookups
# on the Series itself would rely on pandas' deprecated positional fallback.
horz_acc = new_df_day.horz_acc_km.values
dist_km = new_df_day['distance'].values
stay_sec = new_df_day['diff_time_sec'].values

thresh_3, thresh_4 = 50, 2*3600   # min / max stay (seconds) counted as a visit
err = 0.17                        # tolerance (km) between distance and 3 * accuracy
total_dist_2_res = 0
count_entered = 0

for d, acc, secs in zip(dist_km, horz_acc, stay_sec):
    # Check-in: within 0.25 km, distance consistent with 3x the reported
    # accuracy, and a stay strictly between the two time thresholds.
    if (d < 0.25) and (np.abs(d - 3*acc) < err) and (thresh_3 < secs < thresh_4):
        count_entered += 1
        total_dist_2_res += d

# Average distance over the counted check-ins (0 when there are none).
avg_distance = (total_dist_2_res/count_entered) if count_entered != 0 else 0
count_entered, avg_distance

### Wrapping it all together

In [None]:
def count_movement(ad_id_df, day, thresh_1 = 30, thresh_2 = 10, thresh_3 = 50, thresh_4 = 7200, err = 0.19):
    """
        Summarise one day of movements for a single ad_id.

        Input parameters:
            ad_id_df : dataframe of one ad_id with the columns 'day', 'distance',
                       'diff_dist_km', 'diff_time_sec' and 'horz_acc_km'
            day: value of the 'day' column to analyse
            thresh_1: minimum elapsed time (seconds) for a non-zero step to count as a movement
            thresh_2: minimum elapsed time (seconds) for a sign change between consecutive
                      steps to count as a change of direction
            thresh_3: min_time (seconds) that the customer stays for a visit to be counted
            thresh_4: max_time (seconds) that the customer stays for a visit to be counted
            err: tolerance on |distance - 3 * horz_acc_km| for a visit to be counted
        Return:
            (count_change_dir, count_dist, count_entered, avg_distance_2_rest, avg_distance_p_day)
            avg_distance_2_rest is 0 when no visit was counted; avg_distance_p_day is the
            mean of 'distance' over the day's rows.
    """
    new_df_day = ad_id_df[ad_id_df['day'] == day]
    avg_distance_p_day = new_df_day['distance'].mean()

    ## plain arrays: rows are addressed by position whatever the frame's index is
    distance = new_df_day['distance'].values
    diff_dist = new_df_day['diff_dist_km'].values
    diff_time = new_df_day['diff_time_sec'].values
    horz_acc = new_df_day['horz_acc_km'].values

    total_dist_2_res = 0
    count_change_dir = 0
    count_entered = 0
    count_dist = 0

    N = len(diff_dist)

    ## the loop body is skipped entirely for a day without observations
    for k in range(N):
        ## movement: a non-zero step that took longer than thresh_1 seconds
        if diff_time[k] > thresh_1 and diff_dist[k] != 0:
            count_dist += 1

        ## visit: within 0.25 of the reference point, distance consistent with
        ## 3x the reported accuracy, stay strictly between thresh_3 and thresh_4
        if (distance[k] < 0.25 and abs(distance[k] - 3*horz_acc[k]) < err
                and thresh_3 < diff_time[k] < thresh_4):
            count_entered += 1
            total_dist_2_res += distance[k]

        ## change of direction: consecutive steps of opposite sign, the first
        ## one lasting longer than thresh_2 seconds
        if k < N - 1 and diff_dist[k]*diff_dist[k+1] < 0 and diff_time[k] > thresh_2:
            count_change_dir += 1

    if count_entered != 0:
        avg_distance_2_rest = total_dist_2_res/count_entered
    else:
        avg_distance_2_rest = 0

    return count_change_dir, count_dist, count_entered, avg_distance_2_rest, avg_distance_p_day

# Run the summary for day 24 of the sample ad_id with the default thresholds.
count_movement(new_df, 24)

### Summary

In [None]:
def frame_prob(df, ad_id, days = range(24, 31)):
    """Build the per-day summary table for one ad_id.

    Parameters:
        df    : full movement dataframe, passed on to create_sample_df.
        ad_id : identifier to summarise.
        days  : iterable of 'day' values to report; defaults to 24..30.

    Returns a DataFrame indexed by day with the columns
        ad_id, count_change_dir, count_mvm, count_entered,
        avg_distance_2_rest, avg_distance_p_day, prob
    where prob = count_entered / count_mvm (0 when count_mvm is 0).
    """
    sub_df = create_sample_df(df, ad_id)
    new_df = sub_df.set_index('utc_datetime')

    columns = ['ad_id', 'count_change_dir', 'count_mvm', 'count_entered',
               'avg_distance_2_rest', 'avg_distance_p_day', 'prob']
    day_labels = list(days)

    # Collect one record per day and build the frame once, so numeric columns
    # get numeric dtypes instead of growing an all-object frame row by row.
    records = []
    for day in day_labels:
        count_change_dir, count_dist, count_entered, avg_dist_2_rest, avg_dist_p_day = count_movement(new_df, day)
        prob = count_entered / count_dist if count_dist != 0 else 0
        records.append([ad_id, count_change_dir, count_dist, count_entered,
                        avg_dist_2_rest, avg_dist_p_day, prob])

    return pd.DataFrame(records, index = day_labels, columns = columns)
# Per-day summary for the sample ad_id chosen earlier.
frame_prob(df, ad_id)

### Loop over all ad_id

In [None]:
# Restrict the run to the first 10 entries of unique_ad_id.
uni_ad_id = unique_ad_id[:10]
uni_ad_id

In [None]:
# Build the per-day summary of every selected ad_id and stack them once.
# (DataFrame.append no longer exists in pandas >= 2.0; pd.concat keeps each
# table's day index exactly as frame_prob returns it.)
frame = pd.concat([frame_prob(df, one_ad_id) for one_ad_id in uni_ad_id])
frame

In [None]:
# Write the combined summary table (including its day index) to disk.
frame.to_csv("final_result.csv")