In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw export into a DataFrame.
# low_memory=False makes pandas parse each column in a single pass, so column
# dtypes are inferred from the whole file instead of chunk by chunk (chunked
# inference is what raises DtypeWarning on mixed-type columns).
file_csv = pd.read_csv('export.csv', low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [7]:
# Keep only the two columns used in the rest of the notebook.
data = file_csv[["DESCR", "PRODUCT"]]

In [8]:
# (rows, columns) of `data` before de-duplication.
data.shape

(499999, 2)

In [9]:
# keep=False removes every row whose (DESCR, PRODUCT) pair occurs more than
# once -- no representative of a repeated pair is kept.
# NOTE(review): if the intent was only to collapse repeats, keep="first" would
# retain one row per pair -- confirm which behaviour is wanted.
data = data.drop_duplicates(subset=["DESCR", "PRODUCT"], keep=False)

In [10]:
# (rows, columns) of `data` after de-duplication.
data.shape

(92100, 2)

In [11]:
# Convert the frame to a 2-D NumPy array. DataFrame.as_matrix() was deprecated
# in pandas 0.23 and removed in 1.0; .values yields the same array and is
# available in every pandas release.
numpy_array = data.values
X = numpy_array[:, 0]  # first column of `data` (DESCR): input text
Y = numpy_array[:, 1]  # second column of `data` (PRODUCT): target label

In [12]:
from sklearn.model_selection import train_test_split
# Hold out 40% of the samples for testing; the fixed seed makes the split
# repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.4,
    random_state=42,
)

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.naive_bayes import MultinomialNB

from sklearn.pipeline import Pipeline
# Token counts (English stop words removed) -> TF-IDF weighting ->
# multinomial naive Bayes classifier.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

In [14]:
# Train on the training split. Pipeline.fit returns the pipeline itself, so
# the name does not need to be re-bound.
text_clf.fit(X_train, Y_train)

# Fraction of test samples whose predicted label equals the true label.
predicted = text_clf.predict(X_test)
np.mean(predicted == Y_test)

0.8524972855591748

In [15]:
# Display the held-out input texts (NumPy abbreviates long arrays in its repr).
X_test

array(['1807 @4409@ Dexamethasone 4mg/ml  per ml (SA)',
       '6211:Prednisone  5mg #51759 @8440@ Prednisone  5mg #51759',
       'inventory:CERENIA INJECTION @11560@ CERENIA INJECTION', ...,
       '2388 @9211@ Cerenia 60Mg Tabs',
       'CLAV62.5:Clavamox 62.5mg @4264@ Clavamox 62.5mg',
       '3637:Prednisone 5mg tablet #121100 @9629@ Prednisone 5mg tablet #121100'],
      dtype=object)

In [16]:
# Indexing with a list keeps the result a one-element array, which is the
# shape predict() is called with in the next cell.
X_test[[8]]

array(['{B4838741-8A21-4A6D-9223-56F963D4591B}:Cerenia 24 mg tabs box of 4 @2209@ Cerenia 24 mg tabs box of 4'],
      dtype=object)

In [17]:
# Naive Bayes pipeline's predicted label for the single test sample at index 8.
text_clf.predict(X_test[[8]])

array(['Cerenia Tablet'], dtype='|S39')

In [18]:
from sklearn.linear_model import SGDClassifier
# Linear SVM (hinge loss, L2 penalty) trained with stochastic gradient descent
# on TF-IDF features.
# `n_iter` was deprecated in scikit-learn 0.19 and removed in 0.21; its
# replacement is `max_iter`, and `tol=None` disables early stopping so that
# exactly 5 epochs are run, as before.
text_clf_svm = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf-svm', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                              max_iter=5, tol=None, random_state=42)),
])

In [19]:
# Train the SVM pipeline. fit returns the pipeline itself, so the name does
# not need to be re-bound.
text_clf_svm.fit(X_train, Y_train)

# Fraction of test samples whose predicted label equals the true label.
predicted_svm = text_clf_svm.predict(X_test)
np.mean(predicted_svm == Y_test)



0.8785830618892508

In [20]:
# One-element array holding the test sample at index 8.
X_test[[8]]

array(['{B4838741-8A21-4A6D-9223-56F963D4591B}:Cerenia 24 mg tabs box of 4 @2209@ Cerenia 24 mg tabs box of 4'],
      dtype=object)

In [21]:
# Linear-SVM pipeline's predicted label for the single test sample at index 8.
text_clf_svm.predict(X_test[[8]])

array(['Cerenia Tablet'], dtype='|S39')

In [22]:
# Grid Search
from sklearn.model_selection import GridSearchCV
# Keys follow the "<pipeline step>__<parameter>" convention, so they address
# the 'vect', 'tfidf' and 'clf' steps of the pipeline being searched.
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],  # unigrams vs. unigrams + bigrams
    'tfidf__use_idf': (True, False),
    'clf__alpha': (1e-2, 1e-3),
}

In [None]:
# Exhaustive search over `parameters` for the naive Bayes pipeline, using all
# available CPU cores (n_jobs=-1).
gs_clf = GridSearchCV(estimator=text_clf, param_grid=parameters, n_jobs=-1)
gs_clf = gs_clf.fit(X_train, Y_train)



In [21]:
# Mean cross-validated score of the best parameter combination (per the
# scikit-learn GridSearchCV documentation).
gs_clf.best_score_

0.6890459363957597

In [23]:
# One-element array holding the test sample at index 8.
X_test[[8]]

array(['17552:Baytril Suspension 22.7mg/mL #888 @6168@ Baytril Suspension 22.7mg/mL #888'],
      dtype=object)

In [22]:
# Grid-searched pipeline's predicted label for the test sample at index 8.
gs_clf.predict(X_test[[8]])

array(['Baytril Injectable Compound'], dtype='|S39')

In [25]:
# Stemming Code

import nltk
# nltk.download() 

from nltk.stem.snowball import SnowballStemmer
# Shared English Snowball stemmer, referenced by StemmedCountVectorizer below.
# ignore_stopwords=True is documented by NLTK to leave stop words unstemmed.
stemmer = SnowballStemmer("english", ignore_stopwords=True)

class StemmedCountVectorizer(CountVectorizer):
    """CountVectorizer whose analyzer stems every token it produces.

    Tokens come from the parent class's analyzer and are then passed through
    the module-level ``stemmer``.
    """

    def build_analyzer(self):
        """Return a callable mapping a document to its list of stemmed tokens."""
        base_analyzer = super(StemmedCountVectorizer, self).build_analyzer()

        def stemmed_analyzer(doc):
            return [stemmer.stem(token) for token in base_analyzer(doc)]

        return stemmed_analyzer
    
stemmed_count_vect = StemmedCountVectorizer(stop_words='english')

# Stemmed token counts -> TF-IDF -> multinomial naive Bayes with a uniform
# class prior (fit_prior=False).
text_mnb_stemmed = Pipeline(steps=[
    ('vect', stemmed_count_vect),
    ('tfidf', TfidfTransformer()),
    ('mnb', MultinomialNB(fit_prior=False)),
])

# fit returns the pipeline itself, so the name does not need to be re-bound.
text_mnb_stemmed.fit(X_train, Y_train)

predicted_mnb_stemmed = text_mnb_stemmed.predict(X_test)

# Fraction of test samples whose predicted label equals the true label.
np.mean(predicted_mnb_stemmed == Y_test)

0.6

In [27]:
# One-element array holding the test sample at index 8.
X_test[[8]]

array(['17552:Baytril Suspension 22.7mg/mL #888 @6168@ Baytril Suspension 22.7mg/mL #888'],
      dtype=object)

In [26]:
# NOTE(review): this queries gs_clf (the grid-searched pipeline), not the
# text_mnb_stemmed pipeline fitted in the stemming cell -- confirm which model
# was meant to be inspected here.
gs_clf.predict(X_test[[8]])

array(['Baytril Injectable Compound'], dtype='|S39')