# Training DBpedia type classifiers

In [16]:
import pandas as pd
df_dbp = pd.read_csv('dbp_types_en.csv')
df_dbp.head()

Unnamed: 0,person,organisation,location,other,ocr
0,1,0,0,0,Viktor Igorevich Sysoyev (Russian: Виктор Игор...
1,1,0,0,0,Kanlaya Sysomvang (born 3 November 1990) in La...
2,1,0,0,0,Valentin Vasilyevich Sysoyev (Russian: Валенти...
3,1,0,0,0,Alfred Syson (6 April 1880 – 2 August 1952) wa...
4,1,0,0,0,"Dana Syslová (born November 11, 1945) is a Cze..."


In [17]:
types = ['person', 'organisation', 'location', 'other']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df_dbp['ocr'], df_dbp[types].as_matrix(), random_state=0)

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((3000,), (3000, 4), (1000,), (1000, 4))

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

count_vect = TfidfVectorizer(min_df=5, max_df=0.9, ngram_range=(2,5), analyzer='char_wb', max_features=10000)
X_train_counts = count_vect.fit_transform(X_train)
X_test_counts = count_vect.transform(X_test)

len(count_vect.vocabulary_.keys())

10000

In [19]:
from sklearn.externals import joblib
joblib.dump(count_vect, 'dbp_types_en_vct.pkl') 

['dbp_types_en_vct.pkl']

In [20]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC

clf = OneVsRestClassifier(SVC(probability=True, kernel='linear', class_weight='balanced', C=1.0, verbose=True))
clf.fit(X_train_counts, y_train)

[LibSVM][LibSVM][LibSVM][LibSVM]

OneVsRestClassifier(estimator=SVC(C=1.0, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=True),
          n_jobs=1)

In [21]:
joblib.dump(clf, 'dbp_types_en_clf.pkl') 

['dbp_types_en_clf.pkl']

In [22]:
pred = clf.predict(X_test_counts)
pred

array([[0, 0, 1, 0],
       [1, 0, 0, 0],
       [0, 0, 0, 1],
       ..., 
       [0, 0, 0, 1],
       [1, 0, 0, 0],
       [0, 1, 0, 0]])

In [23]:
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test, pred)

0.95455553338317978

In [24]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, pred)

0.89000000000000001

In [25]:
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

print('average precision', precision_score(y_test, pred, average='macro'))
print('average recall', recall_score(y_test, pred, average='macro'))
print('average f1', f1_score(y_test, pred, average='macro'))

average precision 0.924444276333
average recall 0.934385507579
average f1 0.929249323727


In [26]:
scores = {}
scores['precision'] = precision_score(y_test, pred, average=None)
scores['recall'] = recall_score(y_test, pred, average=None)
scores['f1'] = f1_score(y_test, pred, average=None)

pd.DataFrame(data=scores, index=types)

Unnamed: 0,f1,precision,recall
person,0.962406,0.969697,0.955224
organisation,0.868476,0.870293,0.866667
location,0.964211,0.958159,0.970339
other,0.921905,0.899628,0.945312
