In [78]:
import pandas as pd
# https://www.geeksforgeeks.org/bag-of-word-and-frequency-count-in-text-using-sklearn/
from sklearn.feature_extraction.text import CountVectorizer


In [49]:
impression_raw = pd.read_csv("Final_Impressions.csv")
labels_raw = pd.read_csv("Final_Impressions_labels.csv")

In [75]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

stemmer = WordNetLemmatizer()

In [79]:
for i in range(len(impression_raw)):
    words = nltk.word_tokenize(impression_raw.loc[i, "impressions"])
    words = [stemmer.lemmatize(word) for word in words]
    impression_raw.loc[i, "impressions"] = " ".join(words)

In [80]:
impression_raw

Unnamed: 0,impression_id,impressions
0,PE9f3aab,"impress : 1 . probabl , small , subsegment sup..."
1,PE45260c6,impress : 1 . no evid of pulmonari embol . 2 ....
2,PE4527d8b,impress : 1 . no evid of pulmonari embolus or ...
3,PE45284c7,impress : 1 . no pulmonari embol . 2 . redemon...
4,PE9f6f88,impress : 1 . mild to moder atherosclerot calc...
...,...,...
23260,PE45299f7,: 1 . no evid of pulmonari embol . no focal ai...
23261,PE45254fe,impress : 1 . no pulmonari embolus or acut int...
23262,PE9f4698,impress : 1 . pulmonari embolus involv the rig...
23263,PE452a586,impress : 1 . no pulmonari embol . 2 . innumer...


In [81]:
labels_df_1 = labels_raw[['impression_id','pe_acute', 'pe_subsegmentalonly', 'pe_positive']]

In [82]:
# need to obtain all the data that has valid impressions need to remove unusable data
merged_data = pd.merge(impression_raw, labels_df_1, on ="impression_id")
merged_data.head()

Unnamed: 0,impression_id,impressions,pe_acute,pe_subsegmentalonly,pe_positive
0,PE9f3aab,"impress : 1 . probabl , small , subsegment sup...",1,1,1
1,PE45260c6,impress : 1 . no evid of pulmonari embol . 2 ....,0,0,0
2,PE4527d8b,impress : 1 . no evid of pulmonari embolus or ...,0,0,0
3,PE45284c7,impress : 1 . no pulmonari embol . 2 . redemon...,0,0,0
4,PE9f6f88,impress : 1 . mild to moder atherosclerot calc...,0,0,0


In [83]:
vectorizer = CountVectorizer(lowercase = True, stop_words = "english", max_features = 1000)
x = vectorizer.fit_transform(merged_data['impressions'])
y = merged_data[['pe_acute', 'pe_subsegmentalonly', 'pe_positive']]

In [84]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, random_state = 0)

In [85]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

model = OneVsRestClassifier(LogisticRegression(max_iter=1000))
model.fit(X_train, y_train)

y_preds = model.predict(X_test)

model.score(X_test, y_test)

0.9094034725803679

In [86]:
# taking too much time to run
# from sklearn.svm import SVC
# model_1 = OneVsRestClassifier(SVC(max_iter=1000))
# model_1.fit(X_train, y_train)
# model_1.score(X_test, y_test)

In [87]:
from sklearn.tree import DecisionTreeClassifier
model_2 = OneVsRestClassifier(DecisionTreeClassifier(max_depth=6))
model_2.fit(X_train, y_train)
model_2.score(X_test, y_test)

0.8892900120336944

In [46]:
from sklearn.neighbors import KNeighborsClassifier
model_3 = OneVsRestClassifier(KNeighborsClassifier(n_neighbors=10))
model_3.fit(X_train, y_train)
model_3.score(X_test, y_test)


KeyboardInterrupt



In [70]:
from sklearn.metrics  import classification_report
print("Classification Report:\n")
print(classification_report(y_test, y_preds, target_names=['pe_acute', 'pe_subsegmentalonly', 'pe_positive']))

Classification Report:

                     precision    recall  f1-score   support

           pe_acute       0.88      0.83      0.85      1035
pe_subsegmentalonly       0.70      0.57      0.63       247
        pe_positive       0.89      0.86      0.87      1164

          micro avg       0.87      0.81      0.84      2446
          macro avg       0.82      0.75      0.78      2446
       weighted avg       0.87      0.81      0.84      2446
        samples avg       0.17      0.16      0.16      2446



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [88]:
non_zero_data = merged_data[(merged_data[['pe_acute', 'pe_subsegmentalonly', 'pe_positive']].sum(axis=1)) > 0]

In [89]:
non_zero_data.describe()

Unnamed: 0,pe_acute,pe_subsegmentalonly,pe_positive
count,4701.0,4701.0,4701.0
mean,0.866624,0.205063,0.998086
std,0.340017,0.40379,0.043718
min,0.0,0.0,0.0
25%,1.0,0.0,1.0
50%,1.0,0.0,1.0
75%,1.0,0.0,1.0
max,1.0,1.0,1.0


In [90]:
# testing data with all of non-zero outputs
X_non_zero = vectorizer.transform(non_zero_data["impressions"])
y_preds_non_zero = non_zero_data[['pe_acute', 'pe_subsegmentalonly', 'pe_positive']]

model.score(X_non_zero, y_preds_non_zero)

0.7409061901723037

In [74]:
model_2.score(X_non_zero, y_preds_non_zero)

0.6719846841097639

In [75]:
model_3.score(X_non_zero, y_preds_non_zero)

0.5675388215273346