In [1]:
import pandas as pd
import numpy as np

# functions defined for preprocessing

In [2]:
from itertools import permutations

def mergeAP(df,ap):
    '''integrate the information of access points'''
    data = df.merge(ap, on='SSID', how='left')
    return data

def outliers(df):
    '''assumption: RSSI values were constant during small continuous time periods
       correct outliers: using the mode of the records during small continuous time periods'''
    grouplist = ['UnixTime','Latitude','Longitude','SSID']
    for keylist,group in df.groupby(grouplist):
        if group['RSSI(dBm)'].nunique() > 1:
            val_count = pd.DataFrame(group['RSSI(dBm)'].value_counts())
            value = val_count.index[0]
            newind = group[group['RSSI(dBm)'] != value].index
            df.loc[newind,'RSSI(dBm)'] = value
            
def removeone(df):
    '''drop data with only one record from AP'''
    ind = []
    grouplist = ['UnixTime','Latitude','Longitude']
    for iterlist,group in df.groupby(grouplist):
        if len(group)==1:
            ind.append(group.index[0])
    df = df.drop(index=ind)
    return df

def datapreprocessing(df):
    '''clean data'''
    # drop columns: 'NO.' and 'Channel'
    df = df.drop(columns=['No.','Channel'])
    
    # correct outliers
    outliers(df)
    
    # drop_duplicates
    df = df.drop_duplicates()
    df = df.reset_index(drop=True)
    
    # drop data with only one record from AP
    df = removeone(df)
    
    return df


# permutation without repetition
def mypermutations(ourlist):
    '''For those nodes only having records from three APs, we do permutations without repitition;
    For those nodes only having records from two APs, we do permutations with repitition.
    eg:
        ourlist=[1,2,3], ouput=[(1, 2, 3), (1, 3, 2), (2, 1, 3), (2, 3, 1), (3, 1, 2), (3, 2, 1)]
        ourlist=[1,2], ouput=[(1, 2, 1), (2, 1, 1), (1, 1, 2), (1, 2, 2), (2, 1, 2), (2, 2, 1)]'''
    n = len(ourlist)
    if n==3: return list(permutations(ourlist))
    if n==2:
        l1 = [ourlist[0],ourlist[0],ourlist[1]]
        l2 = [ourlist[1],ourlist[0],ourlist[1]]
        res = list(set(permutations(l1))) + list(set(permutations(l2)))
        return res

# argumentation
def AugConcat(df):
    '''data augmentation'''
    value = pd.unique(df['SSID'])
    value = value.tolist()
    pervalue = mypermutations(value)

    arr = pd.DataFrame(columns=range(24))

    for tvalue in pervalue:
        arr2 = df[df['SSID']==tvalue[0]]
        
        for i in tvalue[1:]:
            newarr = df[df['SSID']==i]
            arr2 = pd.concat([arr2.reset_index(drop=True),newarr.reset_index(drop=True)], axis=1, ignore_index=True)
        
        arr = pd.concat([arr.reset_index(drop=True), arr2.reset_index(drop=True)], axis=0, ignore_index=True, join='outer') 
    
    return arr.iloc[:,:24]

# rearrange after augmentation
def AugArrange(df):
    '''rearrange after augmentation'''
    # empty dataframe
    arr1 = pd.DataFrame(columns=range(24))

    for keylist, group in df.groupby(['UnixTime','Latitude','Longitude']):
        arr2 = AugConcat(group)
        arr1 = pd.concat([arr1.reset_index(drop=True), arr2.reset_index(drop=True)], axis=0, ignore_index=True, join='outer') 
        
    # drop duplicated columns
    data = arr1.drop(columns=[8,9,10,16,17,18])
    data.columns = ['UnixTime','Latitude','Longitude','SSID_1','RSSI(dBm)_1','Obstacle_1','latitude_1','longitude_1','SSID_2','RSSI(dBm)_2','Obstacle_2','latitude_2','longitude_2','SSID_3','RSSI(dBm)_3','Obstacle_3','latitude_3','longitude_3']
    data[['SSID_1','RSSI(dBm)_1','Obstacle_1','SSID_2','RSSI(dBm)_2','Obstacle_2','SSID_3','RSSI(dBm)_3','Obstacle_3']] = data[['SSID_1','RSSI(dBm)_1','Obstacle_1','SSID_2','RSSI(dBm)_2','Obstacle_2','SSID_3','RSSI(dBm)_3','Obstacle_3']].astype('int')
    return data


# functions defined for modeling

In [3]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import max_error
from sklearn.metrics import r2_score
from sklearn.ensemble import ExtraTreesRegressor,RandomForestRegressor
from xgboost import XGBRegressor

from math import radians, cos, sin, asin, sqrt

def distance(lat1, lat2, lon1, lon2):
    
    # The math module contains a function named
    # radians which converts from degrees to radians.
    lon1 = radians(lon1)
    lon2 = radians(lon2)
    lat1 = radians(lat1)
    lat2 = radians(lat2)
    
    # Haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
 
    c = 2 * asin(sqrt(abs(a)))
        
    # Radius of earth in kilometers. Use 3956 for miles
    r = 6378137
      
    # calculate the result
    return(c * r)

def etrm(df):
    x_train = df.iloc[:,3:]
    la_train = df.iloc[:,1]
    lo_train = df.iloc[:,2]
    
    la_model = ExtraTreesRegressor()
    la_model.fit(x_train,la_train)
    
    lo_model = ExtraTreesRegressor()
    lo_model.fit(x_train,lo_train)
    
    return la_model,lo_model

def rfm(df):
    x_train = df.iloc[:,3:]
    la_train = df.iloc[:,1]
    lo_train = df.iloc[:,2]
    
    la_model = RandomForestRegressor()
    la_model.fit(x_train,la_train)
    
    lo_model = RandomForestRegressor()
    lo_model.fit(x_train,lo_train)
    
    return la_model,lo_model

def xgbm(df):
    x_train = df.iloc[:,3:]
    la_train = df.iloc[:,1]
    lo_train = df.iloc[:,2]
    
    la_model = XGBRegressor()
    la_model.fit(x_train,la_train)
    
    lo_model = XGBRegressor()
    lo_model.fit(x_train,lo_train)
    
    return la_model,lo_model


def test(df,la_model,lo_model):
    x_test = df.iloc[:,3:]
    la_test = df.iloc[:,1]
    lo_test = df.iloc[:,2]
    
    la_pred = la_model.predict(x_test)
    print('latitude MAE:', mean_absolute_error(la_test, la_pred))
    print('latitude MSE:', mean_squared_error(la_test, la_pred))
    print('latitude Maximum error:', max_error(la_test, la_pred))
    print('latitude r2 score:', r2_score(la_test, la_pred))
    
    lo_pred = lo_model.predict(x_test)
    print('longitude MAE:', mean_absolute_error(lo_test, lo_pred))
    print('longitude MSE:', mean_squared_error(lo_test, lo_pred))
    print('longitude Maximum error:', max_error(lo_test, lo_pred))
    print('longitude r2 score:', r2_score(lo_test, lo_pred))
    
    dist = list(map(distance, la_test, la_pred, lo_test, lo_pred))
    res = df.iloc[:,1:3]
    res['la_pred'] = la_pred
    res['lo_pred'] = lo_pred
    res['dist'] = dist
    
    derror = []
    for keylist, group in res.groupby(['Latitude','Longitude']):
        la, lo = group['Latitude'].unique(), group['Longitude'].unique() # RP的真实经纬度
        la_avg, lo_avg, dist_avg = np.mean(group['la_pred']), np.mean(group['lo_pred']), np.mean(group['dist'])
        d = distance(la,la_avg,lo,lo_avg)
        derror.append(d)
        print('actual latitude:',la, 'actual longitude:',lo)
        print('estimated latitude:',la_avg,'estimated longitude:', lo_avg)
        print('estimated distance error:',d,'average distance error:', dist_avg)
        
    print('mean error:',np.mean(derror))
    print('max error:',np.max(derror))

# dataset

In [4]:
ap_path = '/Users/weizehui/Documents/比赛/1 itu/1st_Dataset (Rev)/training_AP_info.xlsx'
ap_info = pd.read_excel(ap_path)
ap_info.rename(columns={'Latitude':'ap_Latitude', 'Longitude':'ap_Longitude'}, inplace=True)
ap_info = ap_info.iloc[:,:3]
ap_info

Unnamed: 0,SSID,ap_Latitude,ap_Longitude
0,1,35.158613,136.924697
1,2,35.158638,136.924374
2,3,35.158979,136.924477
3,4,35.158918,136.924765


In [5]:
train_path = '/Users/weizehui/Documents/比赛/1 itu/1st_Dataset (Rev)/training_dataset1_revised.csv'
train0 = pd.read_csv(train_path, parse_dates=['UnixTime'], date_parser=lambda x:pd.to_datetime(x,utc=True,unit='s'))
train0.head()

Unnamed: 0,No.,UnixTime,Latitude,Longitude,SSID,Channel,RSSI(dBm),Obstacle
0,1,2022-07-20 10:53:10+00:00,35.158622,136.924466,1,20,-66,0
1,2,2022-07-20 10:53:10+00:00,35.158622,136.924466,1,20,-66,0
2,3,2022-07-20 10:53:10+00:00,35.158622,136.924466,1,20,-66,0
3,4,2022-07-20 10:53:11+00:00,35.158622,136.924466,1,20,-66,0
4,5,2022-07-20 10:53:11+00:00,35.158622,136.924466,1,20,-66,0


In [6]:
ver_path = '/Users/weizehui/Documents/比赛/1 itu/1st_Dataset (Rev)/verification_dataset_revised.csv'
ver0 = pd.read_csv(ver_path, parse_dates=['UnixTime'], date_parser=lambda x:pd.to_datetime(x,utc=True,unit='s'))
ver0.head()

Unnamed: 0,No.,UnixTime,Latitude,Longitude,SSID,Channel,RSSI(dBm),Obstacle
0,1,2022-07-20 11:14:37+00:00,35.158649,136.924393,1,20,-75,0
1,2,2022-07-20 11:14:37+00:00,35.158649,136.924393,1,20,-75,0
2,3,2022-07-20 11:14:37+00:00,35.158649,136.924393,1,20,-75,0
3,4,2022-07-20 11:14:37+00:00,35.158649,136.924393,1,20,-75,0
4,5,2022-07-20 11:14:38+00:00,35.158649,136.924393,1,20,-75,0


# integrate the information of AP into the training and verification datasets

In [7]:
train = mergeAP(train0,ap_info)
train.head()

Unnamed: 0,No.,UnixTime,Latitude,Longitude,SSID,Channel,RSSI(dBm),Obstacle,ap_Latitude,ap_Longitude
0,1,2022-07-20 10:53:10+00:00,35.158622,136.924466,1,20,-66,0,35.158613,136.924697
1,2,2022-07-20 10:53:10+00:00,35.158622,136.924466,1,20,-66,0,35.158613,136.924697
2,3,2022-07-20 10:53:10+00:00,35.158622,136.924466,1,20,-66,0,35.158613,136.924697
3,4,2022-07-20 10:53:11+00:00,35.158622,136.924466,1,20,-66,0,35.158613,136.924697
4,5,2022-07-20 10:53:11+00:00,35.158622,136.924466,1,20,-66,0,35.158613,136.924697


In [8]:
ver = mergeAP(ver0,ap_info)
ver.head()

Unnamed: 0,No.,UnixTime,Latitude,Longitude,SSID,Channel,RSSI(dBm),Obstacle,ap_Latitude,ap_Longitude
0,1,2022-07-20 11:14:37+00:00,35.158649,136.924393,1,20,-75,0,35.158613,136.924697
1,2,2022-07-20 11:14:37+00:00,35.158649,136.924393,1,20,-75,0,35.158613,136.924697
2,3,2022-07-20 11:14:37+00:00,35.158649,136.924393,1,20,-75,0,35.158613,136.924697
3,4,2022-07-20 11:14:37+00:00,35.158649,136.924393,1,20,-75,0,35.158613,136.924697
4,5,2022-07-20 11:14:38+00:00,35.158649,136.924393,1,20,-75,0,35.158613,136.924697


# preprocessing: data cleaning/outliers correction/data augmentation

In [9]:
train = datapreprocessing(train)
train

Unnamed: 0,UnixTime,Latitude,Longitude,SSID,RSSI(dBm),Obstacle,ap_Latitude,ap_Longitude
0,2022-07-20 10:53:10+00:00,35.158622,136.924466,1,-66,0,35.158613,136.924697
1,2022-07-20 10:53:11+00:00,35.158622,136.924466,1,-66,0,35.158613,136.924697
2,2022-07-20 10:53:12+00:00,35.158622,136.924466,1,-66,0,35.158613,136.924697
3,2022-07-20 10:53:13+00:00,35.158622,136.924466,1,-66,0,35.158613,136.924697
4,2022-07-20 10:53:14+00:00,35.158622,136.924466,1,-66,0,35.158613,136.924697
...,...,...,...,...,...,...,...,...
704,2022-07-20 11:11:43+00:00,35.159011,136.924433,4,-71,0,35.158918,136.924765
705,2022-07-20 11:11:44+00:00,35.159011,136.924433,4,-71,0,35.158918,136.924765
706,2022-07-20 11:11:45+00:00,35.159011,136.924433,4,-71,0,35.158918,136.924765
707,2022-07-20 11:11:46+00:00,35.159011,136.924433,4,-71,0,35.158918,136.924765


In [10]:
ver = datapreprocessing(ver)
ver

Unnamed: 0,UnixTime,Latitude,Longitude,SSID,RSSI(dBm),Obstacle,ap_Latitude,ap_Longitude
0,2022-07-20 11:14:37+00:00,35.158649,136.924393,1,-75,0,35.158613,136.924697
1,2022-07-20 11:14:38+00:00,35.158649,136.924393,1,-75,0,35.158613,136.924697
2,2022-07-20 11:14:39+00:00,35.158649,136.924393,1,-75,0,35.158613,136.924697
3,2022-07-20 11:14:40+00:00,35.158649,136.924393,1,-75,0,35.158613,136.924697
4,2022-07-20 11:14:41+00:00,35.158649,136.924393,1,-75,0,35.158613,136.924697
...,...,...,...,...,...,...,...,...
731,2022-07-20 11:42:56+00:00,35.158887,136.924521,4,-59,0,35.158918,136.924765
732,2022-07-20 11:42:57+00:00,35.158887,136.924521,4,-59,0,35.158918,136.924765
733,2022-07-20 11:42:58+00:00,35.158887,136.924521,4,-59,0,35.158918,136.924765
734,2022-07-20 11:42:59+00:00,35.158887,136.924521,4,-59,0,35.158918,136.924765


In [11]:
train_aug = AugArrange(train)
train_aug

Unnamed: 0,UnixTime,Latitude,Longitude,SSID_1,RSSI(dBm)_1,Obstacle_1,latitude_1,longitude_1,SSID_2,RSSI(dBm)_2,Obstacle_2,latitude_2,longitude_2,SSID_3,RSSI(dBm)_3,Obstacle_3,latitude_3,longitude_3
0,2022-07-20 10:53:10+00:00,35.158622,136.924466,1,-66,0,35.158613,136.924697,2,-62,0,35.158638,136.924374,3,-63,0,35.158979,136.924477
1,2022-07-20 10:53:10+00:00,35.158622,136.924466,1,-66,0,35.158613,136.924697,3,-63,0,35.158979,136.924477,2,-62,0,35.158638,136.924374
2,2022-07-20 10:53:10+00:00,35.158622,136.924466,2,-62,0,35.158638,136.924374,1,-66,0,35.158613,136.924697,3,-63,0,35.158979,136.924477
3,2022-07-20 10:53:10+00:00,35.158622,136.924466,2,-62,0,35.158638,136.924374,3,-63,0,35.158979,136.924477,1,-66,0,35.158613,136.924697
4,2022-07-20 10:53:10+00:00,35.158622,136.924466,3,-63,0,35.158979,136.924477,1,-66,0,35.158613,136.924697,2,-62,0,35.158638,136.924374
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1531,2022-07-20 11:18:32+00:00,35.158771,136.924543,2,-61,0,35.158638,136.924374,3,-72,0,35.158979,136.924477,2,-61,0,35.158638,136.924374
1532,2022-07-20 11:18:32+00:00,35.158771,136.924543,3,-72,0,35.158979,136.924477,2,-61,0,35.158638,136.924374,2,-61,0,35.158638,136.924374
1533,2022-07-20 11:18:32+00:00,35.158771,136.924543,3,-72,0,35.158979,136.924477,3,-72,0,35.158979,136.924477,2,-61,0,35.158638,136.924374
1534,2022-07-20 11:18:32+00:00,35.158771,136.924543,2,-61,0,35.158638,136.924374,3,-72,0,35.158979,136.924477,3,-72,0,35.158979,136.924477


In [12]:
ver_aug = AugArrange(ver)
ver_aug

Unnamed: 0,UnixTime,Latitude,Longitude,SSID_1,RSSI(dBm)_1,Obstacle_1,latitude_1,longitude_1,SSID_2,RSSI(dBm)_2,Obstacle_2,latitude_2,longitude_2,SSID_3,RSSI(dBm)_3,Obstacle_3,latitude_3,longitude_3
0,2022-07-20 11:14:37+00:00,35.158649,136.924393,1,-75,0,35.158613,136.924697,2,-63,0,35.158638,136.924374,3,-72,0,35.158979,136.924477
1,2022-07-20 11:14:37+00:00,35.158649,136.924393,1,-75,0,35.158613,136.924697,3,-72,0,35.158979,136.924477,2,-63,0,35.158638,136.924374
2,2022-07-20 11:14:37+00:00,35.158649,136.924393,2,-63,0,35.158638,136.924374,1,-75,0,35.158613,136.924697,3,-72,0,35.158979,136.924477
3,2022-07-20 11:14:37+00:00,35.158649,136.924393,2,-63,0,35.158638,136.924374,3,-72,0,35.158979,136.924477,1,-75,0,35.158613,136.924697
4,2022-07-20 11:14:37+00:00,35.158649,136.924393,3,-72,0,35.158979,136.924477,1,-75,0,35.158613,136.924697,2,-63,0,35.158638,136.924374
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1537,2022-07-20 11:48:32+00:00,35.158593,136.924688,2,-68,0,35.158638,136.924374,1,-61,0,35.158613,136.924697,1,-61,0,35.158613,136.924697
1538,2022-07-20 11:48:32+00:00,35.158593,136.924688,1,-61,0,35.158613,136.924697,1,-61,0,35.158613,136.924697,2,-68,0,35.158638,136.924374
1539,2022-07-20 11:48:32+00:00,35.158593,136.924688,1,-61,0,35.158613,136.924697,2,-68,0,35.158638,136.924374,2,-68,0,35.158638,136.924374
1540,2022-07-20 11:48:32+00:00,35.158593,136.924688,2,-68,0,35.158638,136.924374,1,-61,0,35.158613,136.924697,2,-68,0,35.158638,136.924374


# model

# 1. random forest

In [13]:
la_model, lo_model = rfm(train_aug)

In [14]:
test(ver_aug,la_model,lo_model)

latitude MAE: 5.369152942182182e-05
latitude MSE: 4.182667688732198e-09
latitude Maximum error: 0.00013189614062270039
latitude r2 score: 0.7371381388498461
longitude MAE: 9.723840085247111e-05
longitude MSE: 1.2381983366624822e-08
longitude Maximum error: 0.00021416364967308255
longitude r2 score: -0.015371057734653526
actual latitude: [35.15859294] actual longitude: [136.9246883]
estimated latitude: 35.15860911002628 estimated longitude: 136.92448906225206
estimated distance error: 18.221852101060364 average distance error: 18.543960541631307
actual latitude: [35.1586185] actual longitude: [136.9244435]
estimated latitude: 35.15859341613687 estimated longitude: 136.9245609632483
estimated distance error: 11.049047754974698 average distance error: 11.07070748957955
actual latitude: [35.15863558] actual longitude: [136.9244597]
estimated latitude: 35.15860844133333 estimated longitude: 136.92437606233307
estimated distance error: 8.189498988537823 average distance error: 8.194638935067

# 2. extra tree regression model

In [15]:
la_model, lo_model = etrm(train_aug)

In [16]:
test(ver_aug,la_model,lo_model)

latitude MAE: 4.223817102560019e-05
latitude MSE: 3.1838338661039287e-09
latitude Maximum error: 0.00014380490001286717
latitude r2 score: 0.7999103543674915
longitude MAE: 6.623959165335234e-05
longitude MSE: 7.250602056426717e-09
longitude Maximum error: 0.00022280663418428048
longitude r2 score: 0.40542227676616116
actual latitude: [35.15859294] actual longitude: [136.9246883]
estimated latitude: 35.15859225653707 estimated longitude: 136.92461706014564
estimated distance error: 6.484021459269321 average distance error: 6.501057521138336
actual latitude: [35.1586185] actual longitude: [136.9244435]
estimated latitude: 35.15863681693231 estimated longitude: 136.9245398961929
estimated distance error: 9.006901389723305 average distance error: 9.016941894247346
actual latitude: [35.15863558] actual longitude: [136.9244597]
estimated latitude: 35.158634895363285 estimated longitude: 136.92445434759242
estimated distance error: 0.4930510159258418 average distance error: 1.063963416152091

# 3. xgboost

In [17]:
la_model, lo_model = xgbm(train_aug)

In [18]:
test(ver_aug,la_model,lo_model)

latitude MAE: 4.037186399261437e-05
latitude MSE: 2.488131903218072e-09
latitude Maximum error: 9.890176757920699e-05
latitude r2 score: 0.8436320952226495
longitude MAE: 8.245387636247191e-05
longitude MSE: 1.0237625128104054e-08
longitude Maximum error: 0.0002504707031221187
longitude r2 score: 0.16047470367040018
actual latitude: [35.15859294] actual longitude: [136.9246883]
estimated latitude: 35.1586799621582 estimated longitude: 136.92466735839844
estimated distance error: 9.872969377026617 average distance error: 15.123628544367316
actual latitude: [35.1586185] actual longitude: [136.9244435]
estimated latitude: 35.15868377685547 estimated longitude: 136.92465209960938
estimated distance error: 20.327904577372443 average distance error: 10.53709359961084
actual latitude: [35.15863558] actual longitude: [136.9244597]
estimated latitude: 35.15868377685547 estimated longitude: 136.9246063232422
estimated distance error: 14.382446058593919 average distance error: 6.060551707198461
a