In [71]:
import pandas as pd
import numpy as np
import numpy as np
from tqdm import tqdm
import sqlalchemy
from sqlalchemy import text
from pandarallel import pandarallel
import regex as re
pandarallel.initialize(progress_bar=True)
tqdm.pandas()
import pprint as pr

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.

https://nalepae.github.io/pandarallel/troubleshooting/


In [102]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import multilabel_confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn import metrics

In [3]:
from sqlalchemy import create_engine
engine = create_engine('postgresql+psycopg2://postgres:4750@192.168.0.137:5432/ba1')

In [5]:
def sql_read(topics,lim):
    stat= "SELECT record_id, collectiontitle_token,abstract_token,title_token,class FROM  ke_stage.ba_corpus_2 WHERE class LIKE "+str(topics)+" LIMIT " +str(lim)
    df = pd.read_sql_query(sqlalchemy.text(str(stat)),engine)
    return df

In [32]:
lim=500000
df_med = sql_read("'Medizin'",lim)                 
df_land = sql_read("'Landwirtschaft'",lim)          
df_umwelt = sql_read("'Umweltwissenschaften'",lim)     
df_ern = sql_read("'ErnÃ¤hrung'",lim)     
#df_rest = sql_read("'Rest'",lim)
df = pd.concat([df_med, df_land,df_umwelt,df_ern])

In [33]:
#df_dataset = df['title_token'] + df['abstract_token'] + df['collectiontitle_token']
df['combined'] = df[df.columns[1:3]].parallel_apply(lambda x: ','.join(x.astype(str)),axis=1)
df = df.drop(['title_token',
              'abstract_token','collectiontitle_token'
              ],axis =1 )


In [39]:
#df_test,df_train = np.split(df, [int(.25*len(df))])
df_train, df_test = train_test_split(df, test_size=0.25)

In [40]:
print("df")
print(df['class'].value_counts())
print("df_test")
print(df_test['class'].value_counts())
print("df_train")
print(df_train['class'].value_counts())

df
Medizin                 500000
Landwirtschaft          500000
Umweltwissenschaften    500000
ErnÃ¤hrung              500000
Name: class, dtype: int64
df_test
Umweltwissenschaften    125427
Medizin                 125301
ErnÃ¤hrung              124831
Landwirtschaft          124441
Name: class, dtype: int64
df_train
Landwirtschaft          375559
ErnÃ¤hrung              375169
Medizin                 374699
Umweltwissenschaften    374573
Name: class, dtype: int64


In [41]:
count_vect = CountVectorizer(lowercase=False,stop_words=None)
X_train_counts = count_vect.fit_transform(df_train.combined)
X_train_counts.shape

(1500000, 852256)

In [42]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(1500000, 852256)

In [43]:
X_new_counts = count_vect.transform(df_train['combined'])
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

In [44]:
clf = MultinomialNB().fit(X_train_tfidf, df_train['class'])

In [45]:
predicted = clf.predict(X_new_tfidf)

In [47]:
np.mean(predicted == df_train['class'])

0.5402973333333333

In [94]:
text_clf = Pipeline([
     ('vect', CountVectorizer(lowercase=False,stop_words=None,tokenizer=None,min_df=3)),
     ('tfidf', TfidfTransformer(use_idf=True)),
     ('clf', SGDClassifier(loss='hinge', penalty='l2',
                           alpha=1e-3, random_state=42,
                           max_iter=5, tol=None)),
 ])

In [95]:
text_clf.fit(df_train['combined'], df_train['class'])

In [96]:
text_clf.get_params(deep=True)


{'memory': None,
 'steps': [('vect', CountVectorizer(lowercase=False, min_df=3)),
  ('tfidf', TfidfTransformer()),
  ('clf', SGDClassifier(alpha=0.001, max_iter=5, random_state=42, tol=None))],
 'verbose': False,
 'vect': CountVectorizer(lowercase=False, min_df=3),
 'tfidf': TfidfTransformer(),
 'clf': SGDClassifier(alpha=0.001, max_iter=5, random_state=42, tol=None),
 'vect__analyzer': 'word',
 'vect__binary': False,
 'vect__decode_error': 'strict',
 'vect__dtype': numpy.int64,
 'vect__encoding': 'utf-8',
 'vect__input': 'content',
 'vect__lowercase': False,
 'vect__max_df': 1.0,
 'vect__max_features': None,
 'vect__min_df': 3,
 'vect__ngram_range': (1, 1),
 'vect__preprocessor': None,
 'vect__stop_words': None,
 'vect__strip_accents': None,
 'vect__token_pattern': '(?u)\\b\\w\\w+\\b',
 'vect__tokenizer': None,
 'vect__vocabulary': None,
 'tfidf__norm': 'l2',
 'tfidf__smooth_idf': True,
 'tfidf__sublinear_tf': False,
 'tfidf__use_idf': True,
 'clf__alpha': 0.001,
 'clf__average': Fals

In [97]:
predicted = text_clf.predict(df_test['combined'])

In [98]:
np.mean(predicted == df_test['class'])

0.503194

In [99]:
print(multilabel_confusion_matrix(df_test['class'], predicted, labels=['Medizin', 'Landwirtschaft', 
                                                                       'Umweltwissenschaften','ErnÃ¤hrung']))

[[[187441 187258]
  [ 23080 102221]]

 [[370692   4867]
  [104666  19775]]

 [[361070  13503]
  [ 79972  45455]]

 [[332394  42775]
  [ 40685  84146]]]


In [100]:
print(f1_score(df_test['class'], predicted, average="macro"))   
print(f1_score(df_test['class'], predicted, average="micro"))   


0.4799271988856988
0.503194


In [101]:
print(precision_recall_fscore_support(df_test['class'], predicted, average='macro',))
print(precision_recall_fscore_support(df_test['class'], predicted, average='micro',))

(0.6473910443843156, 0.5027988958553017, 0.4799271988856988, None)
(0.503194, 0.503194, 0.503194, None)


In [104]:
print(metrics.classification_report(df_test['class'], predicted,
   target_names=['Medizin', 'Landwirtschaft','Umweltwissenschaften','ErnÃ¤hrung']))

                      precision    recall  f1-score   support

             Medizin       0.66      0.67      0.67    124831
      Landwirtschaft       0.80      0.16      0.27    124441
Umweltwissenschaften       0.35      0.82      0.49    125301
          ErnÃ¤hrung       0.77      0.36      0.49    125427

            accuracy                           0.50    500000
           macro avg       0.65      0.50      0.48    500000
        weighted avg       0.65      0.50      0.48    500000

