In [1]:
import sklearn

In [19]:
categories = ['Case_Study', 
              'Editorial',
              'Letter',
              'Commentary',
              'Review',
              'Clinical_Trial',
              'Randomized_Controlled_Trial',
              'Research',
              'Systematic_Review']

In [20]:
from sklearn.datasets import load_files
descriptors = sklearn.datasets.load_files('C:\Users\dclynch\Desktop\MTI_CIN\Pub_Type_Classifier\pub_types', description=None, categories=categories, load_content=True, shuffle=True, encoding='utf-8', decode_error='strict', random_state=0)

In [21]:
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
count_vect = CountVectorizer(stop_words=stopwords.words('english'))
tfidf_transformer = TfidfTransformer()

# tokenizing build a dictionary of features and transform documents to feature vectors
X_train_counts = count_vect.fit_transform(descriptors.data)
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(2560, 21420)

In [22]:
from sklearn.linear_model import SGDClassifier
clf = SGDClassifier(loss='hinge', penalty='l2',
                                         alpha=1e-3, n_iter=5,
                                         random_state=42).fit(X_train_tfidf, descriptors.target)

In [23]:
docs_new = ['A letter to editor is presented', 'case report of year old patient presented with symptoms', 'this study investigates']
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tfidf)

for doc, category in zip(docs_new, predicted):
    print('%r => %s' % (doc, descriptors.target_names[category]))

'A letter to editor is presented' => Letter
'case report of year old patient presented with symptoms' => Case_Study
'this study investigates' => Research


In [24]:
from sklearn.pipeline import Pipeline
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', SGDClassifier()),
])

In [25]:
text_clf = text_clf.fit(descriptors.data, descriptors.target)

In [33]:
import numpy as np
descriptors_test = sklearn.datasets.load_files('C:\Users\dclynch\Desktop\MTI_CIN\Pub_Type_Classifier\pub_types_test', description=None, categories=categories, load_content=True, shuffle=True, encoding='utf-8', decode_error='strict', random_state=0)
docs_test = descriptors_test.data


In [34]:
predicted = text_clf.predict(docs_test)
np.mean(predicted == descriptors_test.target)

0.74131274131274127

In [36]:
from sklearn import metrics
print(metrics.classification_report(descriptors_test.target, predicted,
    target_names=descriptors_test.target_names))


print "Confusion matrix:"
metrics.confusion_matrix(descriptors_test.target, predicted)


                             precision    recall  f1-score   support

                 Case_Study       0.68      0.53      0.59        40
             Clinical_Trial       0.12      0.24      0.16        29
                 Commentary       0.25      0.17      0.20        35
                  Editorial       0.91      0.88      0.89       233
                     Letter       0.56      0.41      0.47        22
Randomized_Controlled_Trial       0.79      0.75      0.77       195
                   Research       0.79      0.76      0.77       227
                     Review       0.28      0.68      0.40        28
          Systematic_Review       0.87      0.81      0.84       227

                avg / total       0.77      0.74      0.75      1036

Confusion matrix:


array([[ 21,   1,   0,   2,   3,   1,   3,   6,   3],
       [  0,   7,   0,   0,   0,  13,   8,   0,   1],
       [  2,   0,   6,   9,   0,   0,   4,  12,   2],
       [  0,   2,  13, 204,   2,   0,   5,   5,   2],
       [  3,   1,   1,   5,   9,   0,   0,   3,   0],
       [  0,  23,   0,   1,   0, 147,  13,   2,   9],
       [  3,  16,   1,   2,   0,  23, 172,   2,   8],
       [  0,   0,   2,   1,   1,   0,   2,  19,   3],
       [  2,   7,   1,   0,   1,   2,  12,  19, 183]])