## Naive Bayes on Mobility

In [3]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [4]:
# Import the dataset for mobility
mobility = pd.read_csv("../data/usitc/border_crossing_entry_data_trade.csv")

In [5]:
mobility.head()

Unnamed: 0,Port Name,State,Port Code,Border,Date,Measure,Value,imports,exports
0,Alexandria Bay,NY,708,US-Canada Border,12/1/2019 0:00,Bus Passengers,740,,
1,Baudette,MN,3424,US-Canada Border,12/1/2019 0:00,Bus Passengers,83,,
2,Blaine,WA,3004,US-Canada Border,12/1/2019 0:00,Bus Passengers,17029,,
3,Brownsville,TX,2301,US-Mexico Border,12/1/2019 0:00,Bus Passengers,4668,358000000000.0,196000000000.0
4,Buffalo-Niagara Falls,NY,901,US-Canada Border,12/1/2019 0:00,Bus Passengers,9910,,


In [6]:
# Include only mobility on the US-MX border
borderMX = mobility[(mobility['Border']=="US-Mexico Border")]

In [7]:
borderMX.head()

Unnamed: 0,Port Name,State,Port Code,Border,Date,Measure,Value,imports,exports
3,Brownsville,TX,2301,US-Mexico Border,12/1/2019 0:00,Bus Passengers,4668,358000000000.0,196000000000.0
6,Calexico East,CA,2507,US-Mexico Border,12/1/2019 0:00,Bus Passengers,4080,358000000000.0,196000000000.0
8,Columbus,NM,2406,US-Mexico Border,12/1/2019 0:00,Bus Passengers,1452,358000000000.0,196000000000.0
11,Douglas,AZ,2601,US-Mexico Border,12/1/2019 0:00,Bus Passengers,107,358000000000.0,196000000000.0
13,Eagle Pass,TX,2303,US-Mexico Border,12/1/2019 0:00,Bus Passengers,8525,358000000000.0,196000000000.0


In [8]:
# Extract year and consolidate year
import datetime
borderMX['year']= pd.DatetimeIndex(borderMX['Date']).year

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


## Encoding the Categorical Variable State

In [9]:
df = borderMX[["year","State","Port Code","Value"]].copy()

In [10]:
X1 = df.iloc[:,:-1].values
y = df.iloc[:,3].values

In [11]:
# Encoding categorical data
# Encoding the independent variable
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [12]:
labelencoder = LabelEncoder()
X1[:, 1] = labelencoder.fit_transform(X1[:, 1])
onehotencoder = OneHotEncoder(categorical_features = [1])
X1 = onehotencoder.fit_transform(X1).toarray()

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [13]:
# California is Index 1, KEEP
print(X1.shape)

(82288, 6)


In [14]:
print(X1)

[[0.000e+00 0.000e+00 0.000e+00 1.000e+00 2.019e+03 2.301e+03]
 [0.000e+00 1.000e+00 0.000e+00 0.000e+00 2.019e+03 2.507e+03]
 [0.000e+00 0.000e+00 1.000e+00 0.000e+00 2.019e+03 2.406e+03]
 ...
 [0.000e+00 0.000e+00 1.000e+00 0.000e+00 1.996e+03 2.408e+03]
 [0.000e+00 0.000e+00 0.000e+00 1.000e+00 1.996e+03 2.403e+03]
 [0.000e+00 1.000e+00 0.000e+00 0.000e+00 1.996e+03 2.505e+03]]


In [15]:
# Re-establishing the X matrix
X = X1[:,[1,4,5]]
print(X)

[[0.000e+00 2.019e+03 2.301e+03]
 [1.000e+00 2.019e+03 2.507e+03]
 [0.000e+00 2.019e+03 2.406e+03]
 ...
 [0.000e+00 1.996e+03 2.408e+03]
 [0.000e+00 1.996e+03 2.403e+03]
 [1.000e+00 1.996e+03 2.505e+03]]


## Starting Naive Bayes

In [16]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)


In [17]:
# Fitting Naive Bayes to the Training set
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [18]:
# Predicting the Test set results
y_pred = classifier.predict(X_test)

In [19]:
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [20]:
# Visualizing the Training set results
from matplotlib.colors import ListedColormap
X_set, y_set = X_train, y_train
X1, X2 = np.meshgrid(np.arange(start = X_set[:, 1].min() - 1, stop = X_set[:, 1].max() + 1, step = 0.01),
                     np.arange(start = X_set[:, 2].min() - 1, stop = X_set[:, 2].max() + 1, step = 0.01))
plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
             alpha = 0.75, cmap = ListedColormap(('red', 'green')))
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                c = ListedColormap(('red', 'green'))(i), label = j)
plt.title('Naive Bayes (Training set)')
plt.xlabel('Year')
plt.ylabel('Port COde')
plt.legend()
plt.show()

ValueError: operands could not be broadcast together with shapes (77250000,2) (3,) 

In [28]:

# Visualising the Test set results
from matplotlib.colors import ListedColormap
X_set, y_set = X_test, y_test
X1, X2 = np.meshgrid(np.arange(start = X_set[:, 1].min() - 1, stop = X_set[:, 1].max() + 1, step = 0.01),
                      np.arange(start = X_set[:, 2].min() - 1, stop = X_set[:, 2].max() + 1, step = 0.01))
plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
             alpha = 0.75, cmap = ListedColormap(('red', 'green')))
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                c = ListedColormap(('red', 'green'))(i), label = j)
plt.title('Naive Bayes (Test set)')
plt.xlabel('Age')
plt.ylabel('Port Code')
plt.legend()
plt.show()

ValueError: operands could not be broadcast together with shapes (77250000,2) (3,) 

In [None]:
print("hello")

In [None]:
print("hello!")

In [22]:
y_test.shape

(16458,)

In [24]:
X_train.shape

(65830, 3)

In [25]:
X.shape

(82288, 3)

In [26]:
X_test.shape

(16458, 3)

In [27]:
y_pred.shape

(16458,)