In [1]:
import pandas as pd 
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import pickle
from sklearn.model_selection import train_test_split

# Load the voyage table. The single feature is `begin_port_id` and the target is
# `end_port_id`; both are kept as one-column DataFrames.
voyages = pd.read_csv("voyages.csv")
X = voyages.loc[:, ['begin_port_id']]
y = voyages.loc[:, ['end_port_id']]

In [2]:
# NOTE: no train/test split is made here -- the grid search below is fitted on
# the full X / y (GridSearchCV still performs its own 3-fold cross-validation
# internally to pick the best candidate).

# Search space: 3 values of n_estimators x 3 values of max_depth = 9 candidates;
# min_samples_split and min_samples_leaf are held fixed.
n_estimators = [600, 800, 1000]
max_depth = [10, 15, 20]
min_samples_split = [2]
min_samples_leaf = [5]

hyperF = dict(
    n_estimators=n_estimators,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
)

# random_state pins the forest's internal randomness so that re-running the
# cell selects the same candidate and produces the same model.
gridF = GridSearchCV(
    RandomForestClassifier(random_state=42),
    hyperF,
    cv=3,
    verbose=1,
    n_jobs=-1,
)
# y is a one-column DataFrame; ravel() flattens it to the 1-D array fit() expects.
model = gridF.fit(X, y.values.ravel())

Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:   33.1s finished


In [4]:
# NOTE: X and y are the same data the grid search was fitted on, so this is a
# training-set accuracy, not a held-out estimate.
print(model.score(X, y))

# Use a context manager so the file handle is flushed and closed even if
# pickling fails (the bare open() previously leaked the handle).
with open('model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

0.27502921698480715


In [5]:
# For every vessel, start from the end port of its last row in `voyages` and
# chain three model predictions: each predicted end port becomes the begin port
# of the next voyage.
predict_list = []

unique_vessels = voyages.drop_duplicates(subset=['vessel'])['vessel'].values
for vessel in unique_vessels:
    vessel_history = voyages[voyages.vessel == vessel]
    origin = vessel_history.iloc[-1]['end_port_id']

    for voyage_number in range(1, 4):
        destination = model.predict([[origin]])[0]
        predict_list.append({
            'vessel': vessel,
            'begin_port_id': origin,
            'end_port_id': destination,
            'voyage': voyage_number,
        })
        # The predicted destination is the starting point of the next hop.
        origin = destination

predict = pd.DataFrame(predict_list)

In [6]:
# Write the predictions to disk without the DataFrame index column.
predict.to_csv(path_or_buf='predict.csv', index=False)


# Experiment: predicting the destination port from tracking data (lat, long, heading) plus the begin port

In [2]:
# NOTE(review): this globally silences pandas' SettingWithCopyWarning. The loop
# below no longer needs it (columns are added with .assign on a new frame); the
# setting is kept only so other cells that may rely on it behave as before.
pd.options.mode.chained_assignment = None
tracking_voyages_list = []

# De-duplicate the tracking rows and order them by their `datetime` column.
tracking = pd.read_csv("tracking.csv").drop_duplicates().sort_values(by='datetime', ascending=True)

# Label every tracking row that falls inside a voyage's [begin_date, end_date]
# window with that voyage's begin and end port.
for vessel in voyages.drop_duplicates(subset=['vessel'])['vessel'].values:
    # Filter the tracking table once per vessel instead of once per voyage.
    vessel_tracking = tracking[tracking.vessel == vessel]
    for _, row in voyages[voyages.vessel == vessel].iterrows():
        in_voyage = (
            (vessel_tracking.datetime >= row['begin_date'])
            & (vessel_tracking.datetime <= row['end_date'])
        )
        # .assign returns a new frame, so nothing is written onto a slice of
        # `tracking` and no SettingWithCopyWarning can be raised.
        tracking_voyages_list.append(
            vessel_tracking[in_voyage].assign(
                begin_port_id=row['begin_port_id'],
                end_port_id=row['end_port_id'],
            )
        )
final_df = pd.concat(tracking_voyages_list)
final_df.to_csv('MLTracking.csv', index=False)

In [4]:
# Features: position, heading and the voyage's begin port; target: its end port.
X = final_df[['lat', 'long', 'heading', 'begin_port_id']]
y = final_df['end_port_id']
# Replace missing feature values with the column mean.
# NOTE(review): the means are computed over all rows before the split, so the
# test rows contribute to the imputation values.
X = X.fillna(X.mean())

# A fixed random_state makes the 70/30 split -- and therefore the reported test
# score -- reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [6]:
# A single hyper-parameter combination, so GridSearchCV evaluates one candidate
# with 3-fold cross-validation.
n_estimators = [800]
max_depth = [20]
min_samples_split = [2]
min_samples_leaf = [5]

hyperF = dict(
    n_estimators=n_estimators,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
)

# random_state pins the forest's internal randomness so the fitted model (and
# any score computed from it) is the same on every run.
gridF = GridSearchCV(
    RandomForestClassifier(random_state=42),
    hyperF,
    cv=3,
    verbose=1,
    n_jobs=-1,
)
model = gridF.fit(X_train, y_train.values.ravel())

Fitting 3 folds for each of 1 candidates, totalling 3 fits


In [7]:
# Score of the fitted search object on the held-out 30% split.
model.score(X=X_test, y=y_test)

0.7296535769367659

In [8]:
# Persist the fitted search object. The context manager guarantees the file is
# flushed and closed (the bare open() previously leaked the handle).
with open('model_multiinput.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

In [5]:
# Reload the saved model so prediction cells can run without re-training.
# SECURITY: pickle.load executes arbitrary code from the file -- only load a
# pickle that this notebook (or another trusted source) produced.
with open('model_multiinput.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

In [32]:
def _most_frequent_destination(origin_port):
    """Return the `end_port_id` that occurs most often in `voyages` for rows
    whose `begin_port_id` equals `origin_port`."""
    destinations = voyages[voyages.begin_port_id == origin_port]['end_port_id']
    return destinations.value_counts().idxmax()


# Voyage 1 comes from the model (last tracking row of the vessel + the end port
# of its last voyage row); voyages 2 and 3 use the most frequent destination
# seen in `voyages` for the previous port.
predict_list = []

for vessel in voyages.drop_duplicates(subset=['vessel'])['vessel'].values:
    last_track = tracking[tracking.vessel == vessel].iloc[-1]
    lat, long, heading = last_track['lat'], last_track['long'], last_track['heading']
    port = voyages[voyages.vessel == vessel].iloc[-1]['end_port_id']

    voyage_1 = model.predict([[lat, long, heading, port]])[0]
    voyage_2 = _most_frequent_destination(voyage_1)
    voyage_3 = _most_frequent_destination(voyage_2)

    legs = [(port, voyage_1), (voyage_1, voyage_2), (voyage_2, voyage_3)]
    for voyage_number, (origin, destination) in enumerate(legs, start=1):
        predict_list.append({
            'vessel': vessel,
            'begin_port_id': origin,
            'end_port_id': destination,
            'voyage': voyage_number,
        })
predict = pd.DataFrame(predict_list)

In [36]:
# Fix the column order of the output file, then write it without the index.
output_columns = ['vessel', 'begin_port_id', 'end_port_id', 'voyage']
predict = predict.loc[:, output_columns]
predict.to_csv(path_or_buf='predict.csv', index=False)


In [37]:
# Read the file back to check what was written.
saved_predictions = pd.read_csv('predict.csv')
saved_predictions

Unnamed: 0,vessel,begin_port_id,end_port_id,voyage
0,131,149,104,1
1,131,104,14,2
2,131,14,104,3
3,15,10,138,1
4,15,138,120,2
5,15,120,138,3
6,129,99,154,1
7,129,154,99,2
8,129,99,154,3
9,127,154,99,1
