In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import csv

In [2]:
predict = pd.read_csv("voyages.csv")
predict

Unnamed: 0,vessel,begin_date,end_date,begin_port_id,end_port_id
0,1,2019-01-01 02:47:00,2019-01-17 04:14:00,138,102.0
1,1,2019-01-17 04:14:00,2019-01-17 13:56:00,102,102.0
2,1,2019-01-17 13:56:00,2019-02-04 07:57:00,102,138.0
3,1,2019-02-04 07:57:00,2019-02-05 07:45:00,138,138.0
4,1,2019-02-05 07:45:00,2019-02-09 07:48:00,138,138.0
...,...,...,...,...,...
998,176,2019-03-27 12:59:00,2019-03-27 21:55:00,117,117.0
999,176,2019-03-27 21:55:00,2019-06-20 04:30:36,117,64.0
1000,176,2019-06-20 04:30:36,2019-06-26 10:35:11,64,20.0
1001,176,2019-06-26 10:35:11,2019-12-02 21:48:41,20,136.0


In [3]:
predict = predict.drop(["begin_date", "end_date"], axis=1)
predict

Unnamed: 0,vessel,begin_port_id,end_port_id
0,1,138,102.0
1,1,102,102.0
2,1,102,138.0
3,1,138,138.0
4,1,138,138.0
...,...,...,...
998,176,117,117.0
999,176,117,64.0
1000,176,64,20.0
1001,176,20,136.0


In [4]:
#Split the data into independent "X" and dependent "Y" variables
X = predict.iloc[:, 0:2].values
Y = predict.iloc[:, 2].values

In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20, random_state=0)

In [6]:
#scale data
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.fit_transform(X_test)

In [7]:
#Create a function with many machine learning models
def models(X_train, Y_train):
    
    #Logistic Regression
    from sklearn.linear_model import LogisticRegression
    log = LogisticRegression(random_state = 0)
    log.fit(X_train, Y_train)
    
    #Use KNeighbors
    from sklearn.neighbors import KNeighborsClassifier
    knn = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p=2)
    knn.fit(X_train, Y_train)
    
    #use SVC (linear kernel)
    from sklearn.svm import SVC
    svc_lin = SVC(kernel="linear", random_state = 0)
    svc_lin.fit(X_train, Y_train)
    
    #Use SVC (RBF kernel)
    from sklearn.svm import SVC
    svc_rbf = SVC(kernel="rbf", random_state = 0)
    svc_rbf.fit(X_train, Y_train)
    
    #Use GaussianNB
    from sklearn.naive_bayes import GaussianNB
    gauss = GaussianNB()
    gauss.fit(X_train, Y_train)
    
    #Use Decision Tree
    from sklearn.tree import DecisionTreeClassifier
    tree = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
    tree.fit(X_train, Y_train)
    
    #use the RandomForestClassifier
    from sklearn.ensemble import RandomForestClassifier
    forest = RandomForestClassifier(n_estimators=20, criterion = 'entropy', random_state = 0)
    forest.fit(X_train, Y_train)
    
    #print the training accuracy for each model
    print('[0]Logistic Regression Training Accuracy: ', log.score(X_train, Y_train))
    print('[1]K Neighbors Training Accuracy: ', knn.score(X_train, Y_train))
    print('[2]SVC Linear Training Accuracy: ', svc_lin.score(X_train, Y_train))
    print('[3]SVC RBF Training Accuracy: ', svc_rbf.score(X_train, Y_train))
    print('[4]Gaussian NB Training Accuracy: ', gauss.score(X_train, Y_train))
    print('[5]Decision Tree Training Accuracy: ', tree.score(X_train, Y_train))
    print('[6]Random Forest Training Accuracy: ', forest.score(X_train, Y_train))
    
    return log, knn, svc_lin, svc_rbf, gauss, tree, forest

In [8]:
#Get and train all of the models
model = models(X_train, Y_train)

[0]Logistic Regression Training Accuracy:  0.11970074812967581
[1]K Neighbors Training Accuracy:  0.39276807980049877
[2]SVC Linear Training Accuracy:  0.1371571072319202
[3]SVC RBF Training Accuracy:  0.17955112219451372
[4]Gaussian NB Training Accuracy:  0.19077306733167082
[5]Decision Tree Training Accuracy:  0.7543640897755611
[6]Random Forest Training Accuracy:  0.7506234413965087


In [71]:
#print prediction for random forest classifier
predictions = model[6].predict(X_test) 
print(predictions)


[ 92.  32. 115.  10.  58.  38. 138.  10.  91. 156. 156. 152. 169.  10.
 156.  97.  30. 138. 138. 138.  33.  10.  10.  10. 101. 156. 138. 134.
 169. 138. 151. 112.  58. 100. 156. 115.  99.   7. 154. 104. 147. 117.
 156. 115.  10. 149.  38. 140. 115. 115.  10.  91.  10. 156.  51.  19.
 104. 120.  14.  36.  99. 169. 140. 165. 151. 134.  10. 156.  99. 111.
 105. 115. 154. 154. 115.  99.  11.  10.  98.  73.  10. 156.  11.  11.
  99.  51.  10.  39.  57. 114. 115.  10.  99.  35.  70. 143.  11. 104.
  30. 139. 151.  90.  91.  99.  72.  99. 169. 169. 169. 134. 120.  90.
  73.  10.  10. 115. 156. 115.  64. 121. 151.  22.  57.  51. 138. 173.
  34. 104. 154.  11. 173.  32. 115.  58.  58.  93.  10.  83.  34. 139.
 138.  10.  39.  99.  19.  99. 115.  51. 138. 138. 104. 138.  93.  99.
 169. 169. 100.  10. 154.  92.  39.  38.  11. 100.  54.  46. 100.  90.
 165. 115. 154. 108. 138.  38.   7.  30.  10.  48.  22. 138.  51. 177.
 151.  82.  76.  99. 157.  92.  99. 165.  90. 156.  76. 138. 146.  76.
 115. 

In [11]:
predict_test = predict
predict_test

Unnamed: 0,vessel,begin_port_id,end_port_id
0,1,138,102.0
1,1,102,102.0
2,1,102,138.0
3,1,138,138.0
4,1,138,138.0
...,...,...,...
998,176,117,117.0
999,176,117,64.0
1000,176,64,20.0
1001,176,20,136.0


In [23]:
predictions

array([ 92.,  32., 115.,  10.,  58.,  38., 138.,  10.,  91., 156., 156.,
       152., 169.,  10., 156.,  97.,  30., 138., 138., 138.,  33.,  10.,
        10.,  10., 101., 156., 138., 134., 169., 138., 151., 112.,  58.,
       100., 156., 115.,  99.,   7., 154., 104., 147., 117., 156., 115.,
        10., 149.,  38., 140., 115., 115.,  10.,  91.,  10., 156.,  51.,
        19., 104., 120.,  14.,  36.,  99., 169., 140., 165., 151., 134.,
        10., 156.,  99., 111., 105., 115., 154., 154., 115.,  99.,  11.,
        10.,  98.,  73.,  10., 156.,  11.,  11.,  99.,  51.,  10.,  39.,
        57., 114., 115.,  10.,  99.,  35.,  70., 143.,  11., 104.,  30.,
       139., 151.,  90.,  91.,  99.,  72.,  99., 169., 169., 169., 134.,
       120.,  90.,  73.,  10.,  10., 115., 156., 115.,  64., 121., 151.,
        22.,  57.,  51., 138., 173.,  34., 104., 154.,  11., 173.,  32.,
       115.,  58.,  58.,  93.,  10.,  83.,  34., 139., 138.,  10.,  39.,
        99.,  19.,  99., 115.,  51., 138., 138., 10

In [28]:
predict_test = predict_test.assign(voyages='NAN')

In [73]:
def random_forest(data):
    

SyntaxError: unexpected EOF while parsing (Temp/ipykernel_4080/3653967114.py, line 2)

In [74]:
predict.to_csv('predict.csv')