In [1]:
import pandas as pd
# https://www.geeksforgeeks.org/bag-of-word-and-frequency-count-in-text-using-sklearn/
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
impression_raw = pd.read_csv("Final_Impressions.csv")
labels_raw = pd.read_csv("Final_Impressions_labels.csv")

In [3]:
impression_raw["impressions"]

0        IMPRESSION: 1. PROBABLE, SMALL, SUBSEGMENTAL S...
1        IMPRESSION: 1.  No evidence of pulmonary embol...
2        IMPRESSION: 1.  No evidence of pulmonary embol...
3        IMPRESSION: 1.  No pulmonary embolism. 2.  Red...
4        IMPRESSION:   1.  MILD TO MODERATE ATHEROSCLER...
                               ...                        
23260    :\n1.  No evidence of pulmonary embolism. No f...
23261    IMPRESSION:\n1.  No pulmonary embolus or acute...
23262    IMPRESSION:\n1. PULMONARY EMBOLUS INVOLVING TH...
23263    IMPRESSION:\n1.  No pulmonary embolism.\n2.  I...
23264    IMPRESSION:\n1.  No pulmonary embolism.\n2.  F...
Name: impressions, Length: 23265, dtype: object

In [4]:
labels_df_1 = labels_raw[['impression_id','pe_acute', 'pe_subsegmentalonly', 'pe_positive']]

In [5]:
# need to obtain all the data that has valid impressions need to remove unusable data
merged_data = pd.merge(impression_raw, labels_df_1, on ="impression_id")
merged_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23265 entries, 0 to 23264
Data columns (total 5 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   impression_id        23265 non-null  object
 1   impressions          23265 non-null  object
 2   pe_acute             23265 non-null  int64 
 3   pe_subsegmentalonly  23265 non-null  int64 
 4   pe_positive          23265 non-null  int64 
dtypes: int64(3), object(2)
memory usage: 908.9+ KB


In [9]:
vectorizer = CountVectorizer(lowercase = True, stop_words = "english", max_features = 1000)
x = vectorizer.fit_transform(merged_data['impressions'])
y = merged_data[['pe_acute', 'pe_subsegmentalonly', 'pe_positive']]

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, random_state = 0)

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

model = OneVsRestClassifier(LogisticRegression(max_iter=1000))
model.fit(X_train, y_train)

y_preds = model.predict(X_test)

model.score(X_test, y_test)

0.913013580883617

In [71]:
# taking too much time to run
# from sklearn.svm import SVC
# model_1 = OneVsRestClassifier(SVC(max_iter=1000))
# model_1.fit(X_train, y_train)
# model_1.score(X_test, y_test)

In [69]:
from sklearn.tree import DecisionTreeClassifier
model_2 = OneVsRestClassifier(DecisionTreeClassifier(max_depth=6))
model_2.fit(X_train, y_train)
model_2.score(X_test, y_test)

0.8951349492865739

In [72]:
from sklearn.neighbors import KNeighborsClassifier
model_3 = OneVsRestClassifier(KNeighborsClassifier(n_neighbors=10))
model_3.fit(X_train, y_train)
model_3.score(X_test, y_test)

0.8793192367199587

In [37]:
from sklearn.metrics  import classification_report
print("Classification Report:\n")
print(classification_report(y_test, y_preds, target_names=['pe_acute', 'pe_subsegmentalonly', 'pe_positive']))

Classification Report:

                     precision    recall  f1-score   support

           pe_acute       0.88      0.84      0.86      1035
pe_subsegmentalonly       0.73      0.67      0.70       247
        pe_positive       0.88      0.86      0.87      1164

          micro avg       0.87      0.83      0.85      2446
          macro avg       0.83      0.79      0.81      2446
       weighted avg       0.87      0.83      0.85      2446
        samples avg       0.17      0.17      0.17      2446



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [51]:
non_zero_data = merged_data[(merged_data[['pe_acute', 'pe_subsegmentalonly', 'pe_positive']].sum(axis=1)) > 0]

In [59]:
non_zero_data.describe()

Unnamed: 0,pe_acute,pe_subsegmentalonly,pe_positive
count,4701.0,4701.0,4701.0
mean,0.866624,0.205063,0.998086
std,0.340017,0.40379,0.043718
min,0.0,0.0,0.0
25%,1.0,0.0,1.0
50%,1.0,0.0,1.0
75%,1.0,0.0,1.0
max,1.0,1.0,1.0


In [63]:
# testing data with all of non-zero outputs
X_non_zero = vectorizer.transform(non_zero_data["impressions"])
y_preds_non_zero = non_zero_data[['pe_acute', 'pe_subsegmentalonly', 'pe_positive']]

model.score(X_non_zero, y_preds_non_zero)

0.7704743671559243

In [74]:
model_2.score(X_non_zero, y_preds_non_zero)

0.6719846841097639

In [75]:
model_3.score(X_non_zero, y_preds_non_zero)

0.5675388215273346