In [23]:
import pandas as pd
import numpy as np
import os
import random as rd
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import random
import sklearn

#change working directory
os.chdir("/Volumes/Seagate/Datamyne Data")
#set pandas defaults for number of rows and columns to display
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 20)

#read in data
#dataframe with text description being the features, and hs code being the target
df = pd.read_csv('/Volumes/Seagate/Datamyne Data/export_not00_trainset.csv')

df = df[pd.notnull(df['Short Container Description'])] #drop missing
# df['ProductID'] = df['Product'].factorize()[0] #translate category into integer variables
df = df[:10000] #grab subset to reduce compute time

# Generate the 6 and 2 digits version of HS
df['HS6'] = df.HS.str[:6]
df['HS2'] = df.HS6.str[:2]
df.fillna(0)
df

Unnamed: 0,Short Container Description,HS,counter,HS6,HS2
0,APPLES,08,2529.0,08,08
1,WOODPULP,47,1776.0,47,47
2,WASTEPAPER,47,1284.0,47,47
3,1 UNPACKED OR UNPACKAGED OF EMPTY CONTAINER 1 ...,EC,1277.0,EC,EC
4,(001) ONE 40' X 8' X 9'6 HIGH CUBE SLAC E M P ...,EC,1049.0,EC,EC
...,...,...,...,...,...
9995,EDIBLE PREPARATIONS TEMPERATURE SET AT 19.0 C ...,210690,3.0,210690,21
9996,USED CLOTHING HS CODE: 6309.00.0000 ; ALL FRE ...,630900,3.0,630900,63
9997,"SHIPPER'S LOAD, STOW, WEIGHT, COUNT AND SEAL 1...",940370,3.0,940370,94
9998,34 SKID MOTORCYCLES ALL PRODUCT PACKAGING IS I...,8711,3.0,8711,87


In [28]:
# Utility function to report best scores
def report(results, n_top=20):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(np.array(results['rank_test_score']) == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")
    #need to make a ranking
    n=0
    for i in results['rank_test_score']:
        if i==1:
            top=n
        n=n+1
    return top


In [33]:
#split data
from sklearn.model_selection import train_test_split

# seed = rd.randint(0,1000000) #set the seed randomly, but then hold it constant: 445134
# np.savetxt('seed.csv', [seed]) #this will ensure that we are working on the same split each time
seed = int(np.loadtxt('seed.csv'))

X = df['Short Container Description'][:100000]
y = df['HS'][:100000]
z = df['HS2'][:100000]
xTrain, xTest, yTrain, yTest,zTrain,zTest = train_test_split(X, y,z, test_size=0.33, random_state=seed)

In [34]:
#write a function that does all of our text transformation
#this will be called within the following pipelines
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

class cleaner(BaseEstimator, TransformerMixin):
    #Class Constructor 
    
    def __init__(self, train):
        self.train = train
        
    #Return self nothing else to do here    
    def fit(self, X, y = None):
        return self
    
    
    def transform(self, X):
        #any cleaning we want to do can go here
        
        #transform text data into vector fit on trained data (from vec)
        vec=TfidfVectorizer().fit(self.train)
        xClean = vec.transform(X)
        
        return xClean



In [41]:
#fit base model
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

#contruct a pipeline that first cleans/transforms the data, then estimates the model, in this case LinerSVC 
model = Pipeline([('clean',cleaner(xTrain)),('est',LinearSVC())])

#fit the model
model.fit(xTrain,yTrain)

#predict y using fitted model for in and out of traing sample
inSample = model.predict(xTrain)
outSample = model.predict(xTest)

# We calculate two types of accuracy, the first one is the accuracy based on full HS code, and the second one only consider first two digits of HS code
print('6 digits Accuracy (in sample): %.2f' % accuracy_score(yTrain, inSample))
print()
print('6 digits Accuracy (out of sample): %.2f' % accuracy_score(yTest, outSample))


print()
inSample_2 = inSample.astype('U2')
outSample_2 = outSample.astype('U2')
print('2 digits Accuracy (in sample): %.2f' % accuracy_score(zTrain, inSample_2))
print()
print('2 digits Accuracy (out of sample): %.2f' % accuracy_score(zTest, outSample_2))

6 digits Accuracy (in sample): 0.98

6 digits Accuracy (out of sample): 0.67
2 digits Accuracy (in sample): 0.99

2 digits Accuracy (out of sample): 0.84


In [6]:
#hyperparameter tuning, test a bunch of sets of parameters using bayes search algorithm
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
import warnings
warnings.filterwarnings(action='ignore', category=FutureWarning)
from time import time
from sklearn.model_selection import KFold

model = Pipeline([('clean',cleaner(xTrain)),('est',LinearSVC())])

#number of splits for cross validation for each set of parameters
kf = KFold(n_splits=3)

#parameters and bounds to search over
search_spaces = {'est__C': Real(0,10)}

#number of iterations over the search space
n_iter_search = 10

bayes_search = BayesSearchCV(model,
                    search_spaces,
                    cv=kf,
                    n_iter=n_iter_search,
                    n_jobs=1,
                    optimizer_kwargs={'base_estimator': 'GP'})
start = time()
bayes_search.fit(xTrain,yTrain)

print("BayesSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), len(bayes_search.cv_results_['rank_test_score'])))
print()
report(bayes_search.cv_results_)

inSample = bayes_search.best_estimator_.predict(xTrain)
outSample = bayes_search.best_estimator_.predict(xTest)

print('__________________________________')
print()
print('Best Params: ', bayes_search.best_params_)
print()

print('Accuracy (in sample): %.2f' % accuracy_score(yTrain, inSample))
print()
print('Accuracy (out of sample): %.2f' % accuracy_score(yTest, outSample))

BayesSearchCV took 170.33 seconds for 10 candidates parameter settings.

Model with rank: 1
Mean validation score: 0.630 (std: 0.004)
Parameters: OrderedDict([('est__C', 3.802444997808073)])

Model with rank: 2
Mean validation score: 0.630 (std: 0.005)
Parameters: OrderedDict([('est__C', 3.913782691013028)])

Model with rank: 3
Mean validation score: 0.630 (std: 0.006)
Parameters: OrderedDict([('est__C', 2.1156409859325227)])

Model with rank: 4
Mean validation score: 0.628 (std: 0.002)
Parameters: OrderedDict([('est__C', 8.345388824738793)])

Model with rank: 5
Mean validation score: 0.627 (std: 0.003)
Parameters: OrderedDict([('est__C', 7.40997941626861)])

Model with rank: 6
Mean validation score: 0.627 (std: 0.002)
Parameters: OrderedDict([('est__C', 7.669323426813788)])

Model with rank: 7
Mean validation score: 0.627 (std: 0.002)
Parameters: OrderedDict([('est__C', 9.486275109553032)])

Model with rank: 8
Mean validation score: 0.627 (std: 0.002)
Parameters: OrderedDict([('est__C

OrderedDict([('est__C', 3.802444997808073)])

In [11]:
# Fit the model to our HS=00 dataset
df_00 = pd.read_csv('/Volumes/Seagate/Datamyne Data/Export_00.csv')
df_00 = df_00[pd.notnull(df_00['Short Container Description'])]
df_00 = df_00[:1000] #grab subset to reduce compute time
df_00

Unnamed: 0,Short Container Description,counter
0,FAK,11673.0
1,1 40' MILITARY DRY NO EEI PER 30.39 FREIGHT PR...,10320.0
2,AUTOMOBILE,8753.0
3,OFFICE AND LABORATORY SUPPLIES,6844.0
4,OFFICE AND LABORATORY SUPPLIES FREIGHT COLLECT,6616.0
...,...,...
995,"POULTRY, CHICKEN, TURKEY, DUCK, FOWL, FROZENN /A",78.0
996,GDSM GENERAL DEPT STORE MERCHANDISE,78.0
997,FABRIC TO BE SEWN AND RETURNED TO USA TELA PAR...,78.0
998,NON HAZ SYNTHETIC RESIN LEXANN/A,78.0


In [12]:
x = df_00['Short Container Description']
y = bayes_search.best_estimator_.predict(x)
export00_result = pd.DataFrame(
    {'descrption': x,
     'HS_pred': y,
    })
export00_result


Unnamed: 0,descrption,HS_pred
0,FAK,48
1,1 40' MILITARY DRY NO EEI PER 30.39 FREIGHT PR...,98
2,AUTOMOBILE,8703
3,OFFICE AND LABORATORY SUPPLIES,392690
4,OFFICE AND LABORATORY SUPPLIES FREIGHT COLLECT,392690
...,...,...
995,"POULTRY, CHICKEN, TURKEY, DUCK, FOWL, FROZENN /A",0207
996,GDSM GENERAL DEPT STORE MERCHANDISE,950640
997,FABRIC TO BE SEWN AND RETURNED TO USA TELA PAR...,61
998,NON HAZ SYNTHETIC RESIN LEXANN/A,39


In [13]:
# Get the corresponding HS descrption from predicted HS code

# Load the HS codebook
hscode_table=pd.read_csv("/Users/zhaomengshan/Desktop/Export_Import_project/htsdata.csv")
hscode_table['HS'] =hscode_table['HS'].str.replace(r'.', '')
hscode_table

Unnamed: 0,HS,Indent,Description,Unit of Quantity,General Rate of Duty,Special Rate of Duty,Column 2 Rate of Duty,Quota Quantity,Additional Duties
0,101,0,"Live horses, asses, mules and hinnies:",,,,,,
1,,1,Horses:,,,,,,
2,01012100,2,Purebred breeding animals,,Free,,Free,,
3,0101210010,3,Males,"[""No.""]",,,,,
4,0101210020,3,Females,"[""No.""]",,,,,
...,...,...,...,...,...,...,...,...,...
35217,99225209,2,Goods of Mexico or goods of the United States ...,,,Free (S+),,,
35218,99225210,2,Goods of Canada provided for in note 2(b) to t...,,,3.9¢/kg (S+),,,
35219,,1,Goods provided for in subheading 5203.00.30:,,,,,,
35220,99225211,2,Goods of Mexico or goods of the United States ...,,,Free (S+),,,


In [59]:
hscode_table.merge(export00_result, left_on='HS', right_on='HS_pred')[['Description','descrption']]

Unnamed: 0,Description,descrption


In [55]:
# Try other methods rather than pipeline model

# Logistic/ RandomForest/ Linear Support Vector Machine/ Naive Bayes


from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB


from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english')
features = tfidf.fit_transform(df['Short Container Description']).toarray()
labels = df.HS
features.shape


count_vect = CountVectorizer()
xTrain_counts = count_vect.fit_transform(xTrain)
tfidf_transformer = TfidfTransformer()
xTrain_tfidf = tfidf_transformer.fit_transform(xTrain_counts)
clf = MultinomialNB().fit(xTrain_tfidf, yTrain)


In [56]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
models = [
    RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0),
    LinearSVC(),
    MultinomialNB(),
    LogisticRegression(random_state=0),
]
CV = 5
cv_df = pd.DataFrame(index=range(CV * len(models)))
entries = []
for model in models:
  model_name = model.__class__.__name__
  accuracies = cross_val_score(model, features, labels, scoring='accuracy', cv=CV)
  for fold_idx, accuracy in enumerate(accuracies):
    entries.append((model_name, fold_idx, accuracy))
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])
cv_df.groupby('model_name').accuracy.mean()



model_name
LinearSVC                 0.6714
LogisticRegression        0.5064
MultinomialNB             0.3552
RandomForestClassifier    0.2162
Name: accuracy, dtype: float64