In [2]:
import pandas as pd
import numpy as np

from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from IPython.display import display
def fit_NB(statements, labels):
    """Fit a bag-of-n-grams Multinomial Naive Bayes text classifier.

    The model is a two-step pipeline: a ``CountVectorizer`` producing
    unigram and bigram counts (terms with a document frequency above 0.3
    are dropped via ``max_df``), followed by a ``MultinomialNB`` classifier
    with smoothing ``alpha=1.3`` and class priors learned from the data.

    Parameters
    ----------
    statements : iterable of str
        Texts to train on; passed unchanged to the pipeline.
    labels : array-like
        Target label for each statement.

    Returns
    -------
    tuple
        ``(pipeline, score)`` where ``pipeline`` is the fitted ``Pipeline``
        and ``score`` is ``pipeline.score`` evaluated on the very same
        ``statements``/``labels`` used for fitting (i.e. a training score,
        not a held-out estimate).
    """
    steps = [
        ('vectorizer', CountVectorizer(ngram_range=(1, 2), max_df=0.3)),
        ('classifier', MultinomialNB(fit_prior=True, alpha=1.3)),
    ]
    # Pipeline.fit returns the pipeline itself, so construction and fitting
    # can be chained.
    model = Pipeline(steps=steps).fit(statements, labels)

    # Scored on the training data on purpose: callers get a quick
    # goodness-of-fit number, not a generalisation estimate.
    training_score = model.score(statements, labels)
    return model, training_score






if __name__ == '__main__':
    # Project-local data helpers and the eli5 explanation tooling are only
    # needed when this is run as a script, so they are imported here.
    from Dataanalysis import readData_addSentiment, CleanText
    import eli5
    from eli5.lime import TextExplainer

    # Load the data set.
    df = readData_addSentiment()

    # Select the training rows: everything whose index label is NOT in the
    # exclusion list below.
    # NOTE(review): the excluded labels are strings; confirm df.index holds
    # strings as well, otherwise isin() matches nothing and no row is dropped.
    mask = df.index.isin(['98', '7', '57', '96', '103', '207'])
    keep = ~mask
    stmts_train = df["clean_text"][keep]
    labels_train = df["Sentiment"][keep]
    print(len(stmts_train))

    # Fit the Naive Bayes pipeline on the selected (non-excluded) rows.
    pip_nb, score_nb = fit_NB(stmts_train, labels_train)

    # Hand-picked sentences whose predictions are inspected below.
    sentences = [
        "Poor tolerability of the combination is manifest with high rates of discontinuations due to AEs and dose modifications.",
        "This means that further evidence on this medicinal product is awaited",
        "More detailed data on injection site reactions, hypersensitivity and anaphylactic reactions were requested in order to allow a thorough assessment of this issue both in subjects with and without ADAs.",
        "However, precaution is warranted given a small size of safety database, a limited information on long-term toxicity, and a limited data on PK/PD interactions together with indication on potential for worsening toxicity for VEGFRi -mTOR inhibitor combinations in general.",
        "Study 010 will assess in vitro lenvatinib protein binding, determine the unbound drug concentrations in order to define correctly the dose-adjustment in patients with severe hepatic and renal impairment.",
        "Study EFC12404 provides relevant information on the contribution of the mono-components to the effect of the FRC.",
    ]
    data_test = {'Sentence': sentences}
    df_test = pd.DataFrame(data_test)

    # Run the raw sentences through the project's text cleaner before
    # handing them to the pipeline.
    ct = CleanText()
    stmts_test = ct.fit_transform(df_test['Sentence'])

    # Class predictions and per-class probabilities for the test sentences.
    prediction_1 = pip_nb.predict(stmts_test)
    prediction_1_proba = pip_nb.predict_proba(stmts_test)
    print(prediction_1_proba)

    # Explain the prediction for element 1 of the cleaned test statements
    # with a LIME-style TextExplainer.
    # NOTE(review): the order of these display names has to match the class
    # order used by the classifier -- confirm against pip_nb.classes_.
    class_names = ['Positive', 'Negative', 'Neutral']
    te = TextExplainer(random_state=42)
    te.fit(stmts_test[1], pip_nb.predict_proba)

    # Quality metrics of the explainer's fit, then the explanation itself
    # rendered with the display names above.
    display(te.metrics_)
    display(te.show_prediction(target_names=class_names))
    print('finished')
    

230
[[1.56292187e-01 7.18619842e-01 1.25087971e-01]
 [4.73634801e-01 4.29486263e-03 5.22070336e-01]
 [2.02110621e-01 2.11220465e-03 7.95777175e-01]
 [4.34357183e-01 5.65610454e-01 3.23635927e-05]
 [3.70307896e-01 1.06205741e-01 5.23486363e-01]
 [7.14698245e-01 5.94337833e-02 2.25867972e-01]]


{'mean_KL_divergence': 0.005519570659449363, 'score': 0.9804327937533083}

Contribution?,Feature,Unnamed: 2_level_0
Contribution?,Feature,Unnamed: 2_level_1
Contribution?,Feature,Unnamed: 2_level_2
+0.610,product,
+0.263,<BIAS>,
+0.192,evid,
+0.082,await,
-0.010,mean evid,
-0.011,product await,
-0.130,evid medicin,
-0.152,mean,
-0.173,medicin,
-0.706,medicin product,

Contribution?,Feature
0.61,product
0.263,<BIAS>
0.192,evid
0.082,await
-0.01,mean evid
-0.011,product await
-0.13,evid medicin
-0.152,mean
-0.173,medicin
-0.706,medicin product

Contribution?,Feature
0.309,mean evid
0.211,evid medicin
-0.272,medicin product
-0.588,await
-0.601,<BIAS>
-0.731,evid
-0.944,mean
-1.456,medicin
-1.854,product

Contribution?,Feature
0.578,medicin product
0.356,medicin
0.23,evid medicin
0.145,mean evid
0.139,product await
0.122,mean
-0.129,await
-0.345,evid
-0.367,product
-0.571,<BIAS>


finished
