Datasets from (Link): https://zenodo.org/record/1167595 were used in order to study the speed of the vessels, in addition to the weather conditions of each time period. More specifically, the location for the vessels is known on each time period, so the closest weather station can be found, that will provide us with the weather condition present each time period for each vessel. Also a DecisionTreeRegressor prediction algorithm is used in order to predict the beaufort at a certain time.

In [1]:
import pandas as pd
import numpy as np
import math
import time
import pyspark

In [2]:
df_1 = pd.read_csv('C:/Users/haris/Desktop/erg/nari_dynamic.csv')

In [3]:
df_1.head()

Unnamed: 0,sourcemmsi,navigationalstatus,rateofturn,speedoverground,courseoverground,trueheading,lon,lat,t
0,245257000,0.0,0.0,0.1,13.1,36,-4.465718,48.38249,1443650402
1,227705102,15.0,-127.0,0.0,262.7,511,-4.496571,48.38242,1443650403
2,228131600,15.0,-127.0,8.5,263.7,511,-4.644325,48.092247,1443650404
3,228051000,0.0,-127.0,0.0,295.0,511,-4.485108,48.38132,1443650405
4,227574020,15.0,-127.0,0.1,248.6,511,-4.495441,48.38366,1443650406


In [4]:
df_2 = pd.read_csv('C:/Users/haris/Desktop/erg/table_wheatherObservation.csv')

In [5]:
df_2.head()

Unnamed: 0,id_station,local_time,T,Tn,Tx,P,U,id_windDirection,Ff,ff10,ff3,VV,Td,RRR,tR
0,3803,1459551600,9.5,-65536.0,-65536.0,758.1,97,9,15,-65536,20,6.0,9.0,-65536.0,-65536
1,3803,1459548000,9.4,-65536.0,-65536.0,758.2,96,9,15,-65536,20,5.0,8.8,-65536.0,-65536
2,3803,1459544400,9.8,-65536.0,-65536.0,758.2,94,9,16,-65536,20,8.0,8.8,-65536.0,-65536
3,3803,1459540800,9.7,-65536.0,-65536.0,758.4,94,9,15,-65536,19,6.0,8.8,-65536.0,-65536
4,3803,1459537200,9.9,-65536.0,10.9,758.6,92,9,15,-65536,20,7.0,8.6,0.2,12


In [6]:
df_3 = pd.read_csv('C:/Users/haris/Desktop/erg/table_weatherStation.csv')

In [7]:
df_3.head()

Unnamed: 0,id_station,station_name,latitude,longitude,elevation
0,7107,BRIGNOGAN,48.6833,-4.0333,20
1,7100,OUESSANT,48.4733,-5.0569,68
2,7117,PERROS GUIREC,48.8258,-3.47295,48
3,7207,BELLE ILE,47.3,-3.2167,37
4,7120,SAINT BRIEUX,48.53333,-2.0833,138


In [8]:
df_1.drop(['navigationalstatus','rateofturn','courseoverground','trueheading'],axis = 1, inplace=True)

In [9]:
df_1.head()

Unnamed: 0,sourcemmsi,speedoverground,lon,lat,t
0,245257000,0.1,-4.465718,48.38249,1443650402
1,227705102,0.0,-4.496571,48.38242,1443650403
2,228131600,8.5,-4.644325,48.092247,1443650404
3,228051000,0.0,-4.485108,48.38132,1443650405
4,227574020,0.1,-4.495441,48.38366,1443650406


In [10]:
weather_data = pd.merge(df_2,df_3, on = 'id_station', how='outer')

In [11]:
weather_data.head()

Unnamed: 0,id_station,local_time,T,Tn,Tx,P,U,id_windDirection,Ff,ff10,ff3,VV,Td,RRR,tR,station_name,latitude,longitude,elevation
0,3803,1459551600,9.5,-65536.0,-65536.0,758.1,97,9,15,-65536,20,6.0,9.0,-65536.0,-65536,ISLES OF SCILLY,49.9167,-6.3,31
1,3803,1459548000,9.4,-65536.0,-65536.0,758.2,96,9,15,-65536,20,5.0,8.8,-65536.0,-65536,ISLES OF SCILLY,49.9167,-6.3,31
2,3803,1459544400,9.8,-65536.0,-65536.0,758.2,94,9,16,-65536,20,8.0,8.8,-65536.0,-65536,ISLES OF SCILLY,49.9167,-6.3,31
3,3803,1459540800,9.7,-65536.0,-65536.0,758.4,94,9,15,-65536,19,6.0,8.8,-65536.0,-65536,ISLES OF SCILLY,49.9167,-6.3,31
4,3803,1459537200,9.9,-65536.0,10.9,758.6,92,9,15,-65536,20,7.0,8.6,0.2,12,ISLES OF SCILLY,49.9167,-6.3,31


In [12]:
weather_data.drop(['T','Tn','Tx','P','U','id_windDirection','ff10','ff3','VV','Td','RRR','tR','elevation'],axis = 1, inplace=True)

In [13]:
weather_data.head()

Unnamed: 0,id_station,local_time,Ff,station_name,latitude,longitude
0,3803,1459551600,15,ISLES OF SCILLY,49.9167,-6.3
1,3803,1459548000,15,ISLES OF SCILLY,49.9167,-6.3
2,3803,1459544400,16,ISLES OF SCILLY,49.9167,-6.3
3,3803,1459540800,15,ISLES OF SCILLY,49.9167,-6.3
4,3803,1459537200,15,ISLES OF SCILLY,49.9167,-6.3


In [14]:
def calculateDistance(x1,y1,x2,y2):  
    dist = math.sqrt((x2 - x1)**2 + (y2 - y1)**2)  
    return dist 

In [15]:
def wind_bft(ms):
    if (0<= ms and ms<=0.2):
        ms=0
        return(ms)
    elif (0.3<=ms and ms<=1.5):
        ms=1
        return(ms)
    elif (1.6<=ms and ms<=3.3):
        ms=2
        return(ms)
    elif (3.4<=ms and ms<=5.4):
        ms=3
        return(ms)
    elif (5.5<=ms and ms<=7.9):
        ms=4
        return(ms)
    elif (8.0<=ms and ms<=10.7):
        ms=5
        return(ms)
    elif (10.8<=ms and ms<=13.8):
        ms=6
        return(ms)
    elif (13.9<=ms and ms<=17.1):
        ms=7
        return(ms)
    elif (17.2<=ms and ms<=20.7):
        ms=8
        return(ms)
    elif (20.8<=ms and ms<=24.4):
        ms=9
        return(ms)
    elif (24.5<=ms and ms<=28.4):
        ms=10
        return(ms)
    elif (28.5<=ms and ms<=32.6):
        ms=11
        return(ms)
    elif (32.7<=ms):
        ms=12
        return(ms)

In [16]:
a = df_1['t'].unique().tolist()
b = weather_data['local_time'].unique().tolist()

In [17]:
c = list(set(b) & set(a))

In [18]:
c

[1457971200,
 1456128000,
 1450598400,
 1452441600,
 1454284800,
 1448755200,
 1446912000,
 1459011600,
 1457168400,
 1451638800,
 1449795600,
 1447952400,
 1446109200,
 1444266000,
 1458208800,
 1454522400,
 1452679200,
 1450836000,
 1448992800,
 1447149600,
 1459249200,
 1457406000,
 1455562800,
 1453719600,
 1450033200,
 1458446400,
 1454760000,
 1449230400,
 1447387200,
 1445544000,
 1443700800,
 1457643600,
 1455800400,
 1450270800,
 1448427600,
 1446584400,
 1456840800,
 1453154400,
 1451311200,
 1445781600,
 1443938400,
 1457881200,
 1456038000,
 1454194800,
 1448665200,
 1446822000,
 1444978800,
 1458921600,
 1455235200,
 1453392000,
 1447862400,
 1446019200,
 1444176000,
 1458118800,
 1456275600,
 1454432400,
 1448902800,
 1447059600,
 1457316000,
 1455472800,
 1453629600,
 1446256800,
 1454670000,
 1452826800,
 1449140400,
 1447297200,
 1445454000,
 1455710400,
 1453867200,
 1450180800,
 1448337600,
 1451221200,
 1449378000,
 1447534800,
 1445691600,
 1443848400,
 1458594000,

In [19]:
#grhgoros tropos 20 lepta
start_time=time.time()
k={}
for i in c:
    df_ships = df_1.loc[df_1['t']==i]
    df_weather = weather_data.loc[weather_data['local_time']==i]
    df_ships['station_name'] = 0
    df_ships['Ff'] = 0
    ship_coor = df_ships[['lon','lat']].values
    weather_coor = df_weather[['longitude','latitude']].values
    dist_matrix = np.linalg.norm(ship_coor[:, np.newaxis] - weather_coor, axis = 2)
    for row in range(len(dist_matrix)):
        mini = np.argmin(dist_matrix[row])
        df_ships.iloc[row,-2]=df_weather.iloc[mini]['station_name']
        df_ships.iloc[row,-1]=wind_bft(df_weather.iloc[mini]['Ff'])
    k[i]=df_ships
print(time.time()-start_time)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


86.06441688537598


In [20]:
final_data=[]
for i in k.values():
    print(i)
    final_data.append(i)

          sourcemmsi  speedoverground       lon        lat           t  \
13602158   227592820             16.9 -4.483425  48.350903  1457971200   
13602159   228762000              0.2 -4.506792  48.374588  1457971200   

         station_name  Ff  
13602158    BRIGNOGAN   7  
13602159     OUESSANT   6  
          sourcemmsi  speedoverground       lon       lat           t  \
11624163   226178000              0.0 -4.325843  48.09822  1456128000   

         station_name  Ff  
11624163    BRIGNOGAN   4  
          sourcemmsi  speedoverground       lon        lat           t  \
6457456    228051000              0.0 -4.485086  48.381320  1450598400   
6457457    227580520              0.0 -4.497093  48.379566  1450598400   
16195655   226318000             19.6 -4.759283  48.212704  1450598400   

         station_name  Ff  
6457456     BRIGNOGAN   3  
6457457     BRIGNOGAN   3  
16195655     OUESSANT   5  
          sourcemmsi  speedoverground       lon      lat           t  \
16625356 

         sourcemmsi  speedoverground       lon        lat           t  \
8502524   228186700            102.3 -4.698999  48.321835  1453302000   

        station_name  Ff  
8502524     OUESSANT   5  
         sourcemmsi  speedoverground       lon       lat           t  \
6968005   228017700              0.1 -4.485722  48.38107  1451458800   
6968006   227574020             15.4 -4.413442  48.29532  1451458800   

        station_name  Ff  
6968005    BRIGNOGAN   6  
6968006    BRIGNOGAN   6  
          sourcemmsi  speedoverground       lon        lat           t  \
3856393    227574020              2.2 -4.495892  48.352936  1447772400   
15567076   227008170             12.2 -4.738165  48.314934  1447772400   

         station_name  Ff  
3856393     BRIGNOGAN   6  
15567076     OUESSANT   6  
         sourcemmsi  speedoverground       lon      lat           t  \
2189816   228017700              0.0 -4.485743  48.3811  1445929200   

        station_name  Ff  
2189816    BRIGNOGAN   2

4727173    BRIGNOGAN   4  
          sourcemmsi  speedoverground       lon        lat           t  \
14338518   227580520              0.0 -4.497119  48.379585  1458799200   

         station_name  Ff  
14338518    BRIGNOGAN   3  
          sourcemmsi  speedoverground       lon        lat           t  \
12443726   228762000              0.5 -4.520793  48.334965  1456956000   
12727012   228762000              0.5 -4.520793  48.334965  1456956000   

         station_name  Ff  
12443726     OUESSANT   7  
12727012     OUESSANT   7  
         sourcemmsi  speedoverground       lon        lat           t  \
8455770   246497000             13.4 -4.895814  48.227467  1453269600   

        station_name  Ff  
8455770     OUESSANT   4  
         sourcemmsi  speedoverground       lon        lat           t  \
5643745   577228000              5.5 -4.369060  48.165080  1449583200   
5643746   228051000              0.0 -4.485077  48.381350  1449583200   
5643747   227380000             10.7 -4.6

        sourcemmsi  speedoverground       lon        lat           t  \
742398   228160000              0.5 -4.799332  47.914474  1444348800   
742399   305137000              8.3 -5.268665  48.241585  1444348800   

       station_name  Ff  
742398     OUESSANT   4  
742399     OUESSANT   4  
          sourcemmsi  speedoverground       lon       lat           t  \
13909887   227705102              0.0 -4.495471  48.38364  1458291600   
18969601   211707600              0.0 -4.589345  48.27980  1458291600   

         station_name  Ff  
13909887    BRIGNOGAN   3  
18969601     OUESSANT   5  
          sourcemmsi  speedoverground       lon       lat           t  \
10100941   234056000              1.8 -4.434371  48.17744  1454605200   

         station_name  Ff  
10100941    BRIGNOGAN   4  
         sourcemmsi  speedoverground       lon        lat           t  \
7874513   228236700              0.0 -4.325692  48.098213  1452762000   

        station_name  Ff  
7874513    BRIGNOGAN   6

          sourcemmsi  speedoverground       lon        lat           t  \
11186831   228109000              4.3 -4.687597  48.121906  1455703200   
11186832   227635210              0.0 -4.495445  48.383680  1455703200   

         station_name  Ff  
11186831     OUESSANT   6  
11186832    BRIGNOGAN   3  
          sourcemmsi  speedoverground       lon        lat           t  \
16925662   256765000              6.8 -5.051548  48.185932  1453860000   

         station_name  Ff  
16925662     OUESSANT   7  
          sourcemmsi  speedoverground       lon        lat           t  \
7312403    227003050              0.0 -4.485952  48.380730  1452016800   
16493238   636091933             14.8 -4.450742  48.377808  1452016800   

         station_name  Ff  
7312403     BRIGNOGAN   5  
16493238    BRIGNOGAN   5  
         sourcemmsi  speedoverground       lon       lat           t  \
6105996   999999999              0.0 -4.513325  48.36992  1450173600   

        station_name  Ff  
6105996  

13629728     OUESSANT   7  
          sourcemmsi  speedoverground       lon       lat           t  \
11640627   227941000              0.0 -4.327222  48.10009  1456146000   

         station_name  Ff  
11640627    BRIGNOGAN   4  
         sourcemmsi  speedoverground      lon       lat           t  \
9761217   228394000              1.9 -5.04398  48.10244  1454302800   

        station_name  Ff  
9761217     OUESSANT   5  
          sourcemmsi  speedoverground      lon        lat           t  \
16628854   356071000              0.4 -4.32296  48.118793  1452459600   

         station_name  Ff  
16628854    BRIGNOGAN   3  
         sourcemmsi  speedoverground       lon        lat           t  \
6469534   228051000              0.0 -4.485060  48.381332  1450616400   
6469535   227580520              0.0 -4.497123  48.379560  1450616400   

        station_name  Ff  
6469534    BRIGNOGAN   4  
6469535    BRIGNOGAN   4  
          sourcemmsi  speedoverground       lon        lat          

         sourcemmsi  speedoverground       lon        lat           t  \
1439443   228186700              0.0 -4.512498  48.370834  1445144400   
1439444   228727000              0.0 -4.513979  48.369410  1445144400   
1439445   305886000             16.2 -5.099165  47.911167  1445144400   

        station_name  Ff  
1439443     OUESSANT   5  
1439444     OUESSANT   5  
1439445     OUESSANT   5  
          sourcemmsi  speedoverground       lon       lat           t  \
14578346   227574020              0.0 -4.495335  48.38368  1459087200   

         station_name  Ff  
14578346    BRIGNOGAN   4  
          sourcemmsi  speedoverground       lon        lat           t  \
12979612   228186700            102.3 -4.512592  48.370846  1457244000   
12979613   228762000              0.1 -4.506780  48.374622  1457244000   

         station_name  Ff  
12979612     OUESSANT   4  
12979613     OUESSANT   4  
          sourcemmsi  speedoverground       lon        lat           t  \
10836084   2270

2666234     OUESSANT   5  
          sourcemmsi  speedoverground       lon        lat           t  \
14122700   228736000              0.0 -4.514655  48.369926  1458579600   
14122701   234056000             10.4 -4.640118  48.295166  1458579600   
14122702   228144000              9.6 -4.441556  48.128210  1458579600   

         station_name  Ff  
14122700     OUESSANT   5  
14122701     OUESSANT   5  
14122702    BRIGNOGAN   5  
          sourcemmsi  speedoverground       lon        lat           t  \
17495747   228015700              9.9 -4.467125  48.126106  1456736400   
18842545   227315110              6.0 -4.439383  48.363890  1456736400   

         station_name  Ff  
17495747     OUESSANT   4  
18842545    BRIGNOGAN   4  
          sourcemmsi  speedoverground       lon       lat           t  \
17147300   228267900             10.7 -4.734507  48.30184  1454893200   

         station_name  Ff  
17147300     OUESSANT   8  
         sourcemmsi  speedoverground       lon       l

         sourcemmsi  speedoverground       lon        lat           t  \
9646418   227148000              7.2 -4.914667  47.928772  1454148000   

        station_name  Ff  
9646418     OUESSANT   6  
         sourcemmsi  speedoverground       lon        lat           t  \
6361352   636015106              1.2 -4.454190  48.380005  1450461600   
6361353   228109000              2.5 -4.693402  48.120434  1450461600   

        station_name  Ff  
6361352    BRIGNOGAN   4  
6361353     OUESSANT   5  
         sourcemmsi  speedoverground       lon        lat           t  \
2970831   228186700              0.0 -4.512832  48.370667  1446775200   

        station_name  Ff  
2970831     OUESSANT   5  
         sourcemmsi  speedoverground       lon        lat           t  \
5719287   227380000              9.6 -4.855615  48.116882  1449658800   

        station_name  Ff  
5719287     OUESSANT   5  
          sourcemmsi  speedoverground       lon        lat           t  \
14406407   234056000  

          sourcemmsi  speedoverground       lon        lat           t  \
10716701   227003050              0.0 -4.485960  48.380753  1455246000   
10716702   228051000              0.0 -4.485093  48.381300  1455246000   

         station_name  Ff  
10716701    BRIGNOGAN   4  
10716702    BRIGNOGAN   4  
         sourcemmsi  speedoverground       lon        lat           t  \
8678556   249297000             13.1 -5.092092  48.304726  1453402800   

        station_name  Ff  
8678556     OUESSANT   6  
         sourcemmsi  speedoverground       lon       lat           t  \
7028641   228051000              0.0 -4.484560  48.38114  1451559600   
7028642   227574020              0.0 -4.495345  48.38367  1451559600   

        station_name  Ff  
7028641    BRIGNOGAN   4  
7028642    BRIGNOGAN   4  
         sourcemmsi  speedoverground       lon       lat           t  \
2313548   228041600              0.0 -4.473323  48.38236  1446030000   
2313549   246497000              0.3 -4.462735  48

13584413     OUESSANT   6  
          sourcemmsi  speedoverground       lon        lat           t  \
11602761   227941000              0.0 -4.327225  48.100082  1456106400   
11602762   305600000             13.3 -4.771665  48.277832  1456106400   

         station_name  Ff  
11602761    BRIGNOGAN   4  
11602762     OUESSANT   4  
         sourcemmsi  speedoverground       lon        lat           t  \
1371416   228186700              0.0 -4.512498  48.370834  1445047200   

        station_name  Ff  
1371416     OUESSANT   5  
          sourcemmsi  speedoverground       lon        lat           t  \
14506009   477820000              0.1 -4.485786  48.354454  1458990000   
14506010   234056000              0.0 -4.518463  48.367313  1458990000   

         station_name  Ff  
14506009    BRIGNOGAN   5  
14506010     OUESSANT   7  
          sourcemmsi  speedoverground       lon        lat           t  \
12904970   228051000              0.0 -4.485097  48.381320  1457146800   
12904971 

6959534    BRIGNOGAN   6  
         sourcemmsi  speedoverground       lon       lat           t  \
5657837   227380000              0.0 -4.325932  48.09825  1449594000   

        station_name  Ff  
5657837    BRIGNOGAN   6  
         sourcemmsi  speedoverground       lon       lat           t  \
2167441   228799000              0.0 -4.490473  48.38133  1445907600   

        station_name  Ff  
2167441    BRIGNOGAN   3  
          sourcemmsi  speedoverground       lon        lat           t  \
3837832    228817100             10.4 -4.538565  48.352665  1447750800   
18346326   227362110              0.1 -4.489954  48.379265  1447750800   

         station_name  Ff  
3837832      OUESSANT   5  
18346326    BRIGNOGAN   5  
         sourcemmsi  speedoverground       lon        lat           t  \
9781727   226338000             17.7 -4.879697  48.529934  1454320800   

        station_name  Ff  
9781727     OUESSANT   5  
          sourcemmsi  speedoverground       lon        lat         

15286574    BRIGNOGAN   3  
          sourcemmsi  speedoverground       lon        lat           t  \
4181765    235050609              9.4 -4.834498  47.863500  1448046000   
15629827   259019000              0.0 -4.514329  48.368973  1448046000   

         station_name  Ff  
4181765      OUESSANT   6  
15629827     OUESSANT   6  
         sourcemmsi  speedoverground       lon        lat           t  \
1673647   227705102              0.1 -4.495462  48.383663  1445400000   

        station_name  Ff  
1673647    BRIGNOGAN   3  
          sourcemmsi  speedoverground      lon        lat           t  \
10115801   227574020              0.0 -4.49669  48.382492  1454616000   

         station_name  Ff  
10115801    BRIGNOGAN   4  
          sourcemmsi  speedoverground      lon        lat           t  \
13920589   227635210             18.3 -4.44785  48.298794  1458302400   

         station_name  Ff  
13920589    BRIGNOGAN   5  
         sourcemmsi  speedoverground       lon       lat  

15820511     OUESSANT   6  
         sourcemmsi  speedoverground       lon        lat           t  \
6558400   227705102              0.0 -4.497046  48.382206  1450749600   

        station_name  Ff  
6558400    BRIGNOGAN   5  
          sourcemmsi  speedoverground       lon        lat           t  \
13750383   227705102              0.0 -4.495359  48.383656  1458122400   

         station_name  Ff  
13750383    BRIGNOGAN   7  
          sourcemmsi  speedoverground       lon        lat           t  \
16658651   227002630             13.5 -4.601465  48.088776  1452592800   

         station_name  Ff  
16658651     OUESSANT   7  
        sourcemmsi  speedoverground       lon       lat           t  \
820889   228017700             13.6 -4.596965  48.33753  1444417200   

       station_name  Ff  
820889     OUESSANT   5  
         sourcemmsi  speedoverground       lon       lat           t  \
2467857   227705102              0.0 -4.495343  48.38368  1446260400   

        station_name 

11283498     OUESSANT   5  
          sourcemmsi  speedoverground       lon        lat           t  \
9413717    228302900              8.0 -4.407102  48.210700  1453928400   
9413718    228186700            102.3 -4.512498  48.370834  1453928400   
16937518   227142200              8.2 -4.472265  48.162770  1453928400   

         station_name  Ff  
9413717     BRIGNOGAN   5  
9413718      OUESSANT   5  
16937518     OUESSANT   5  
         sourcemmsi  speedoverground       lon       lat           t  \
7364156   227574020              0.0 -4.495333  48.38366  1452085200   

        station_name  Ff  
7364156    BRIGNOGAN   4  
         sourcemmsi  speedoverground       lon        lat           t  \
2738114   228813000              8.9 -4.814529  48.042645  1446555600   

        station_name  Ff  
2738114     OUESSANT   3  
         sourcemmsi  speedoverground       lon        lat           t  \
1048351   227005550             17.8 -4.487735  48.373184  1444712400   

        station_

6986183    BRIGNOGAN   5  
          sourcemmsi  speedoverground       lon        lat           t  \
5701595    577228000              2.2 -4.787382  48.162647  1449644400   
5701596    227580520             18.3 -4.418570  48.299053  1449644400   
15986830   227635210             18.0 -4.499557  48.297200  1449644400   
15986831   228336000              1.9 -4.771282  48.140343  1449644400   
18483475   227578460              7.8 -4.446570  48.362570  1449644400   

         station_name  Ff  
5701595      OUESSANT   4  
5701596     BRIGNOGAN   3  
15986830     OUESSANT   4  
15986831     OUESSANT   4  
18483475    BRIGNOGAN   3  
          sourcemmsi  speedoverground       lon        lat           t  \
3879791    228051000              0.0 -4.485097  48.381306  1447801200   
3879792    226263000              0.0 -4.485727  48.379830  1447801200   
15573037   226318000             16.8 -4.785665  48.318832  1447801200   

         station_name  Ff  
3879791     BRIGNOGAN   5  
3879792

         sourcemmsi  speedoverground       lon       lat           t  \
6764170   227002630              0.1 -4.848587  48.03908  1451127600   
6764171   228051000              0.0 -4.485086  48.38134  1451127600   

        station_name  Ff  
6764170     OUESSANT   6  
6764171    BRIGNOGAN   4  
         sourcemmsi  speedoverground       lon        lat           t  \
5431254   538002973              0.0 -4.471899  48.381500  1449284400   
5431255   226318000             17.1 -4.812332  48.372334  1449284400   

        station_name  Ff  
5431254    BRIGNOGAN   4  
5431255     OUESSANT   6  
       sourcemmsi  speedoverground      lon        lat           t  \
86656   227574020              0.0 -4.49547  48.383633  1443754800   

      station_name  Ff  
86656    BRIGNOGAN   3  
          sourcemmsi  speedoverground       lon        lat           t  \
13412258   227612860             18.5 -4.442552  48.299828  1457697600   
13412259   227574020              0.0 -4.495360  48.383606  14

9978108    BRIGNOGAN   6  
         sourcemmsi  speedoverground       lon        lat           t  \
7792931   227705102              0.2 -4.416161  48.284214  1452668400   
7792932   226263000              8.9 -4.794603  48.268635  1452668400   

        station_name  Ff  
7792931    BRIGNOGAN   3  
7792932     OUESSANT   3  
         sourcemmsi  speedoverground       lon        lat           t  \
6613472   228793000              0.0 -4.516713  48.368885  1450825200   
6613473   228051000              0.0 -4.485068  48.381317  1450825200   

        station_name  Ff  
6613472     OUESSANT   6  
6613473    BRIGNOGAN   5  
         sourcemmsi  speedoverground       lon        lat           t  \
5144521   228344800              0.0 -4.478273  48.383114  1448982000   

        station_name  Ff  
5144521    BRIGNOGAN   2  
          sourcemmsi  speedoverground      lon        lat           t  \
15444010   227612860              8.8 -4.50965  48.295883  1447138800   

         station_name  

11090500    BRIGNOGAN   3  
          sourcemmsi  speedoverground       lon        lat           t  \
16482558   518866000              0.2 -4.479948  48.217632  1451941200   

         station_name  Ff  
16482558     OUESSANT   6  
         sourcemmsi  speedoverground       lon        lat           t  \
6039827   228186700              0.0 -4.442498  48.321667  1450098000   

        station_name  Ff  
6039827    BRIGNOGAN   4  
         sourcemmsi  speedoverground       lon        lat           t  \
2594756   227904000              9.1 -4.498675  48.122593  1446411600   

        station_name  Ff  
2594756     OUESSANT   2  
          sourcemmsi  speedoverground       lon        lat           t  \
10340927   234056000              0.0 -4.518408  48.367325  1454824800   
10340928   228015700              0.1 -4.325120  48.098175  1454824800   
10340929   226178000              0.1 -4.323957  48.098026  1454824800   

         station_name  Ff  
10340927     OUESSANT   7  
10340928    

          sourcemmsi  speedoverground       lon        lat           t  \
15000873   227306100              0.1 -4.588415  48.279854  1444298400   

         station_name  Ff  
15000873     OUESSANT   1  
          sourcemmsi  speedoverground       lon        lat           t  \
13864454   228186700            102.3 -4.512589  48.370834  1458241200   
17870824   227142200              8.3 -4.621540  48.083633  1458241200   

         station_name  Ff  
13864454     OUESSANT   6  
17870824     OUESSANT   6  
          sourcemmsi  speedoverground       lon        lat           t  \
11908402   227705102              0.0 -4.496690  48.382460  1456398000   
11908403   227270000             10.6 -4.648802  48.105915  1456398000   
11908404   228762000              8.9 -4.498172  48.341385  1456398000   
11908405   227635210              0.0 -4.495327  48.383675  1456398000   

         station_name  Ff  
11908402    BRIGNOGAN   2  
11908403     OUESSANT   2  
11908404     OUESSANT   2  
11908

In [None]:
#deyterh lysh xronos 60 lepta
start_time=time.time()
o={}
for k in c:
    df_ships = df_1.loc[df_1['t']==k]
    df_weather = weather_data.loc[weather_data['local_time']==k]
    df_ships['station_name']=0
    df_ships['Ff'] = 0
    for i in range(len(df_ships)):
        x1 = df_ships.iloc[i]['lon']
        y1 = df_ships.iloc[i]['lat']
        dist=10000000
        for j in range(len(df_weather)):
            x2 = df_weather.iloc[j]['longitude']
            y2 = df_weather.iloc[j]['latitude']
            dist_i = calculateDistance(x1,y1,x2,y2)
        
            if dist_i<dist:
                dist = dist_i
                df_ships.iloc[i,-2]=df_weather.iloc[j]['station_name']
                df_ships.iloc[i,-1]=wind_bft(df_weather.iloc[j]['Ff'])
    o[k]= df_ships
print(time.time()-start_time)

In [None]:
final_data=[]
for i in o.values():
    print(i)
    final_data.append(i)

In [21]:
data = pd.concat(final_data)

In [22]:
teliko=pd.DataFrame(data)

In [23]:
pd.DataFrame(teliko)

Unnamed: 0,sourcemmsi,speedoverground,lon,lat,t,station_name,Ff
13602158,227592820,16.9,-4.483425,48.350903,1457971200,BRIGNOGAN,7.0
13602159,228762000,0.2,-4.506792,48.374588,1457971200,OUESSANT,6.0
11624163,226178000,0.0,-4.325843,48.098220,1456128000,BRIGNOGAN,4.0
6457456,228051000,0.0,-4.485086,48.381320,1450598400,BRIGNOGAN,3.0
6457457,227580520,0.0,-4.497093,48.379566,1450598400,BRIGNOGAN,3.0
...,...,...,...,...,...,...,...
5615944,227019400,0.1,-4.475972,48.294502,1449558000,BRIGNOGAN,3.0
18468762,227611930,7.1,-4.485503,48.361100,1449558000,BRIGNOGAN,3.0
3812502,227297000,0.0,-4.482032,48.380880,1447714800,BRIGNOGAN,7.0
329126,227730220,14.6,-4.487790,48.374786,1444028400,BRIGNOGAN,4.0


In [24]:
teliko_pred=pd.DataFrame(teliko)

In [25]:
map_r1 = spark.createDataFrame(teliko)

In [26]:
map_r1.show()

+----------+---------------+-------------------+------------------+----------+------------+---+
|sourcemmsi|speedoverground|                lon|               lat|         t|station_name| Ff|
+----------+---------------+-------------------+------------------+----------+------------+---+
| 227592820|           16.9|          -4.483425|         48.350903|1457971200|   BRIGNOGAN|7.0|
| 228762000|            0.2|         -4.5067916|         48.374588|1457971200|    OUESSANT|6.0|
| 226178000|            0.0|         -4.3258433|          48.09822|1456128000|   BRIGNOGAN|4.0|
| 228051000|            0.0|         -4.4850864|          48.38132|1450598400|   BRIGNOGAN|3.0|
| 227580520|            0.0|-4.4970930000000005|         48.379566|1450598400|   BRIGNOGAN|3.0|
| 226318000|           19.6|         -4.7592835|48.212703999999995|1450598400|    OUESSANT|5.0|
| 356071000|            0.2| -4.322503599999999|           48.1183|1452441600|   BRIGNOGAN|4.0|
| 228394000|            2.1|          -4

In [27]:
map_r1 = map_r1.rdd

In [28]:
map_r1.getNumPartitions()

8

In [29]:
map_r1.map(lambda x: tuple([x[0], (x[6])])).reduceByKey(max).collect()

[(228762000, 9.0),
 (226178000, 9.0),
 (228051000, 9.0),
 (227580520, 9.0),
 (226318000, 9.0),
 (356071000, 7.0),
 (228394000, 7.0),
 (234056000, 9.0),
 (220540000, 7.0),
 (227904000, 6.0),
 (226263000, 9.0),
 (228041600, 7.0),
 (228793000, 7.0),
 (228336000, 7.0),
 (227999400, 6.0),
 (227142200, 7.0),
 (249297000, 6.0),
 (228109000, 7.0),
 (314255000, 4.0),
 (228796000, 6.0),
 (311166000, 3.0),
 (227941000, 8.0),
 (227091000, 9.0),
 (219124000, 7.0),
 (259019000, 9.0),
 (228032800, 7.0),
 (245181000, 5.0),
 (367852000, 4.0),
 (227143600, 7.0),
 (227148000, 6.0),
 (228144000, 6.0),
 (228293000, 5.0),
 (314237000, 6.0),
 (228208800, 7.0),
 (228068600, 6.0),
 (228150000, 5.0),
 (249959000, 4.0),
 (227392000, 8.0),
 (228344800, 8.0),
 (227019400, 6.0),
 (304927000, 6.0),
 (305600000, 7.0),
 (228043800, 9.0),
 (227485000, 2.0),
 (227297000, 7.0),
 (228240000, 4.0),
 (226177000, 7.0),
 (227577000, 6.0),
 (227270000, 6.0),
 (249616000, 6.0),
 (227641920, 7.0),
 (228236600, 9.0),
 (226338000,

In [30]:
map_r1.map(lambda x: tuple([x[0], (x[6])])).reduceByKey(min).collect()

[(228762000, 0.0),
 (226178000, 0.0),
 (228051000, 2.0),
 (227580520, 2.0),
 (226318000, 2.0),
 (356071000, 2.0),
 (228394000, 2.0),
 (234056000, 1.0),
 (220540000, 3.0),
 (227904000, 2.0),
 (226263000, 2.0),
 (228041600, 2.0),
 (228793000, 2.0),
 (228336000, 2.0),
 (227999400, 3.0),
 (227142200, 2.0),
 (249297000, 2.0),
 (228109000, 0.0),
 (314255000, 4.0),
 (228796000, 3.0),
 (311166000, 2.0),
 (227941000, 1.0),
 (227091000, 2.0),
 (219124000, 5.0),
 (259019000, 0.0),
 (228032800, 3.0),
 (245181000, 0.0),
 (367852000, 3.0),
 (227143600, 3.0),
 (227148000, 3.0),
 (228144000, 2.0),
 (228293000, 2.0),
 (314237000, 5.0),
 (228208800, 2.0),
 (228068600, 2.0),
 (228150000, 4.0),
 (249959000, 4.0),
 (227392000, 2.0),
 (228344800, 2.0),
 (227019400, 2.0),
 (304927000, 6.0),
 (305600000, 3.0),
 (228043800, 2.0),
 (227485000, 2.0),
 (227297000, 0.0),
 (228240000, 4.0),
 (226177000, 2.0),
 (227577000, 2.0),
 (227270000, 2.0),
 (249616000, 6.0),
 (227641920, 2.0),
 (228236600, 6.0),
 (226338000,

In [31]:
map_r1.map(lambda x: tuple([x[0]])).countByKey()

defaultdict(int,
            {227592820: 15,
             228762000: 209,
             226178000: 66,
             228051000: 417,
             227580520: 230,
             226318000: 45,
             356071000: 22,
             228394000: 27,
             228064900: 38,
             228186700: 307,
             234056000: 209,
             227009310: 16,
             227003050: 198,
             227574020: 288,
             220540000: 11,
             227904000: 7,
             226263000: 84,
             228041600: 14,
             227306100: 16,
             227319570: 8,
             228793000: 13,
             228336000: 45,
             227999400: 21,
             227705102: 231,
             227142200: 24,
             228155700: 3,
             249297000: 14,
             228109000: 75,
             314255000: 1,
             228796000: 3,
             227362110: 10,
             227005550: 21,
             311166000: 2,
             227941000: 171,
             227091000: 24,


In [32]:
teliko_pred=teliko_pred.loc[teliko_pred['sourcemmsi']==228051000]

In [33]:
teliko_pred.drop(['sourcemmsi','lon','lat','station_name'],axis = 1, inplace=True)

In [34]:
pd.DataFrame(teliko_pred)

Unnamed: 0,speedoverground,t,Ff
6457456,0.0,1450598400,3.0
7092209,0.0,1451638800,5.0
6619363,0.0,1450836000,5.0
5381644,0.0,1449230400,3.0
6886940,0.0,1451311200,5.0
...,...,...,...
3178036,0.0,1447002000,5.0
4177828,0.4,1448042400,6.0
6688401,0.1,1450926000,4.0
6938257,0.0,1451401200,4.0


In [35]:
map_r2 = spark.createDataFrame(teliko_pred)

In [36]:
map_r2.show()

+---------------+----------+---+
|speedoverground|         t| Ff|
+---------------+----------+---+
|            0.0|1450598400|3.0|
|            0.0|1451638800|5.0|
|            0.0|1450836000|5.0|
|            0.0|1449230400|3.0|
|            0.0|1451311200|5.0|
|            0.1|1448665200|7.0|
|            0.0|1455235200|4.0|
|            0.8|1448902800|5.0|
|            0.0|1449140400|4.0|
|            0.0|1448337600|3.0|
|            0.0|1447534800|5.0|
|            0.0|1450893600|3.0|
|            0.0|1447207200|2.0|
|            0.0|1448247600|4.0|
|            0.1|1451131200|4.0|
|            0.0|1447444800|6.0|
|            0.0|1448722800|5.0|
|            0.0|1455530400|5.0|
|            0.0|1459454400|4.0|
|            0.0|1452081600|5.0|
+---------------+----------+---+
only showing top 20 rows



In [37]:
from pyspark.sql import Row
from pyspark.ml.linalg import Vectors
def transData(data):
    return data.rdd.map(lambda r: [Vectors.dense(r[:-1]),r[-1]]).toDF(['features','label'])

In [38]:
transformed = transData(map_r2)
transformed.show()

+-----------------+-----+
|         features|label|
+-----------------+-----+
|[0.0,1.4505984E9]|  3.0|
|[0.0,1.4516388E9]|  5.0|
| [0.0,1.450836E9]|  5.0|
|[0.0,1.4492304E9]|  3.0|
|[0.0,1.4513112E9]|  5.0|
|[0.1,1.4486652E9]|  7.0|
|[0.0,1.4552352E9]|  4.0|
|[0.8,1.4489028E9]|  5.0|
|[0.0,1.4491404E9]|  4.0|
|[0.0,1.4483376E9]|  3.0|
|[0.0,1.4475348E9]|  5.0|
|[0.0,1.4508936E9]|  3.0|
|[0.0,1.4472072E9]|  2.0|
|[0.0,1.4482476E9]|  4.0|
|[0.1,1.4511312E9]|  4.0|
|[0.0,1.4474448E9]|  6.0|
|[0.0,1.4487228E9]|  5.0|
|[0.0,1.4555304E9]|  5.0|
|[0.0,1.4594544E9]|  4.0|
|[0.0,1.4520816E9]|  5.0|
+-----------------+-----+
only showing top 20 rows



In [39]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorIndexer
featureIndexer = VectorIndexer(inputCol="features", \
                               outputCol="indexedFeatures",\
                               maxCategories=4).fit(transformed)

data = featureIndexer.transform(transformed)

In [40]:
data.show()

+-----------------+-----+-----------------+
|         features|label|  indexedFeatures|
+-----------------+-----+-----------------+
|[0.0,1.4505984E9]|  3.0|[0.0,1.4505984E9]|
|[0.0,1.4516388E9]|  5.0|[0.0,1.4516388E9]|
| [0.0,1.450836E9]|  5.0| [0.0,1.450836E9]|
|[0.0,1.4492304E9]|  3.0|[0.0,1.4492304E9]|
|[0.0,1.4513112E9]|  5.0|[0.0,1.4513112E9]|
|[0.1,1.4486652E9]|  7.0|[0.1,1.4486652E9]|
|[0.0,1.4552352E9]|  4.0|[0.0,1.4552352E9]|
|[0.8,1.4489028E9]|  5.0|[0.8,1.4489028E9]|
|[0.0,1.4491404E9]|  4.0|[0.0,1.4491404E9]|
|[0.0,1.4483376E9]|  3.0|[0.0,1.4483376E9]|
|[0.0,1.4475348E9]|  5.0|[0.0,1.4475348E9]|
|[0.0,1.4508936E9]|  3.0|[0.0,1.4508936E9]|
|[0.0,1.4472072E9]|  2.0|[0.0,1.4472072E9]|
|[0.0,1.4482476E9]|  4.0|[0.0,1.4482476E9]|
|[0.1,1.4511312E9]|  4.0|[0.1,1.4511312E9]|
|[0.0,1.4474448E9]|  6.0|[0.0,1.4474448E9]|
|[0.0,1.4487228E9]|  5.0|[0.0,1.4487228E9]|
|[0.0,1.4555304E9]|  5.0|[0.0,1.4555304E9]|
|[0.0,1.4594544E9]|  4.0|[0.0,1.4594544E9]|
|[0.0,1.4520816E9]|  5.0|[0.0,1.

In [41]:
(trainingData, testData) = data.randomSplit([0.6, 0.4])

In [42]:
trainingData.show()
testData.show()

+-----------------+-----+-----------------+
|         features|label|  indexedFeatures|
+-----------------+-----+-----------------+
|[0.0,1.4472072E9]|  2.0|[0.0,1.4472072E9]|
|[0.0,1.4474448E9]|  6.0|[0.0,1.4474448E9]|
|[0.0,1.4475348E9]|  5.0|[0.0,1.4475348E9]|
|[0.0,1.4477976E9]|  5.0|[0.0,1.4477976E9]|
|[0.0,1.4482728E9]|  2.0|[0.0,1.4482728E9]|
|[0.0,1.4483376E9]|  3.0|[0.0,1.4483376E9]|
|[0.0,1.4487228E9]|  5.0|[0.0,1.4487228E9]|
|[0.0,1.4487804E9]|  6.0|[0.0,1.4487804E9]|
| [0.0,1.448928E9]|  5.0| [0.0,1.448928E9]|
| [0.0,1.449018E9]|  3.0| [0.0,1.449018E9]|
|[0.0,1.4492304E9]|  3.0|[0.0,1.4492304E9]|
|[0.0,1.4493456E9]|  4.0|[0.0,1.4493456E9]|
|[0.0,1.4494032E9]|  5.0|[0.0,1.4494032E9]|
|[0.0,1.4495508E9]|  4.0|[0.0,1.4495508E9]|
|[0.0,1.4495832E9]|  5.0|[0.0,1.4495832E9]|
| [0.0,1.450836E9]|  5.0| [0.0,1.450836E9]|
|[0.0,1.4508936E9]|  3.0|[0.0,1.4508936E9]|
|[0.0,1.4509188E9]|  4.0|[0.0,1.4509188E9]|
|[0.0,1.4511564E9]|  4.0|[0.0,1.4511564E9]|
|[0.0,1.4513364E9]|  4.0|[0.0,1.

In [43]:
from pyspark.ml.regression import DecisionTreeRegressor

dt = DecisionTreeRegressor(featuresCol="indexedFeatures")

In [44]:
pipeline = Pipeline(stages=[featureIndexer, dt])

model = pipeline.fit(trainingData)

In [46]:
predictions = model.transform(testData)

In [47]:
predictions.show(predictions.count(), False)

+-----------------+-----+-----------------+-----------------+
|features         |label|indexedFeatures  |prediction       |
+-----------------+-----+-----------------+-----------------+
|[0.0,1.4471748E9]|3.0  |[0.0,1.4471748E9]|3.2              |
|[0.0,1.4479776E9]|5.0  |[0.0,1.4479776E9]|4.976190476190476|
|[0.0,1.4482476E9]|4.0  |[0.0,1.4482476E9]|4.976190476190476|
|[0.0,1.4486904E9]|6.0  |[0.0,1.4486904E9]|4.976190476190476|
|[0.0,1.4491404E9]|4.0  |[0.0,1.4491404E9]|4.271604938271605|
|[0.0,1.4494356E9]|3.0  |[0.0,1.4494356E9]|4.271604938271605|
|[0.0,1.4494932E9]|5.0  |[0.0,1.4494932E9]|4.271604938271605|
|[0.0,1.4505984E9]|3.0  |[0.0,1.4505984E9]|4.271604938271605|
|[0.0,1.4508612E9]|3.0  |[0.0,1.4508612E9]|4.271604938271605|
|[0.0,1.4513112E9]|5.0  |[0.0,1.4513112E9]|4.875            |
|[0.0,1.451574E9] |4.0  |[0.0,1.451574E9] |4.363636363636363|
|[0.0,1.4516316E9]|4.0  |[0.0,1.4516316E9]|4.363636363636363|
|[0.0,1.4520816E9]|5.0  |[0.0,1.4520816E9]|4.948717948717949|
|[0.0,1.

In [48]:
from pyspark.ml.evaluation import RegressionEvaluator

evaluator = RegressionEvaluator(labelCol="label",
                                predictionCol="prediction",
                                metricName="rmse")

rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

Root Mean Squared Error (RMSE) on test data = 1.36246


In [49]:
treeModel = model.stages[1]
print(treeModel)

DecisionTreeRegressionModel (uid=DecisionTreeRegressor_4cd38c80fda17e41428a) of depth 5 with 35 nodes
