In [None]:
# Standard-library modules used by the cells below.
import re
import sys

# Show which Python interpreter backs this kernel, so packages get installed
# into the matching environment.
print(sys.executable)
# Optional dependency; uncomment to install it from inside the notebook.
# !pip install trl

In [None]:
# The numpy alias was misspelled as "no"; use the conventional "np" so that
# code referring to np.<...> works on a fresh kernel.
import numpy as np
import FalconTrainer as ft
import pandas as pd
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, T5Tokenizer, T5ForConditionalGeneration

In [None]:
data = pd.read_excel("data3.xlsx")
data.head()

## Statistics

### Age

In [None]:
# Function to convert age to years
def convert_age_to_years(age_str):
    """Convert an age string such as ``"45y"`` or ``"6m"`` to years.

    Parameters
    ----------
    age_str : str
        Digits followed by a unit letter: ``y`` for years or ``m`` for months.
        Leading whitespace, whitespace between the number and the unit, and
        upper-case units are accepted. Anything after the unit letter is
        ignored (only the start of the string is matched).

    Returns
    -------
    int, float or None
        The integer number of years for a ``y`` value, ``months / 12`` as a
        float for an ``m`` value, or ``None`` when the input is not a string
        or does not start with a recognisable age.
    """
    # Non-string cells (e.g. NaN for an empty spreadsheet cell, or None) would
    # make re.match raise TypeError; treat them as "age unknown" instead.
    if not isinstance(age_str, str):
        return None

    matches = re.match(r"\s*(\d+)\s*([ym])", age_str, flags=re.IGNORECASE)
    if matches is None:
        return None

    age, unit = matches.groups()
    age = int(age)
    if unit.lower() == 'm':  # Convert months to years
        age /= 12
    return age

# Derive a numeric age (in years) for every patient from the raw Pt_Age text.
data['Age_in_Years'] = data['Pt_Age'].apply(convert_age_to_years)

# Summarise the age distribution; rows whose age could not be parsed are
# missing values and are skipped by pandas' mean/std.
age_years = data['Age_in_Years']
mean_age, std_age = age_years.mean(), age_years.std()
print(f"Mean age: {mean_age:.2f} years")
print(f"Standard deviation of age: {std_age:.2f} years")

### Gender

In [None]:
# Share of patients per gender, as proportions rounded to 4 decimals.
data['Pt_Gender'].value_counts(normalize=True).round(4)

### Report length

In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize
import nltk

# Ensure you have the necessary NLTK data.
# Older NLTK releases load the sentence tokenizer from 'punkt'; recent releases
# look for 'punkt_tab' instead and raise LookupError in sent_tokenize /
# word_tokenize when only 'punkt' is present. Fetch both so the cells below
# work with either.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

In [None]:
def report_lengths(reports):
    """Summarise how long a collection of report texts is.

    Each report is tokenised with NLTK into sentences and into words, and the
    mean and (pandas default, sample) standard deviation of the per-report
    counts are returned.

    Parameters
    ----------
    reports : iterable of str
        The report texts to measure.

    Returns
    -------
    pandas.Series
        Entries ``mean_sentence_count``, ``std_sentence_count``,
        ``mean_word_count`` and ``std_word_count``, in that order.
    """
    n_sentences = pd.Series([len(sent_tokenize(text)) for text in reports])
    n_words = pd.Series([len(word_tokenize(text)) for text in reports])

    summary = {}
    for unit, counts in (('sentence', n_sentences), ('word', n_words)):
        summary[f'mean_{unit}_count'] = counts.mean()
        summary[f'std_{unit}_count'] = counts.std()
    return pd.Series(summary)

# Missing reports are dropped before counting, and both columns are prepared
# the same way. Passing a missing cell straight to the tokenizer raises, while
# stringifying it first turns it into the literal text "nan", which would be
# counted as a one-sentence, one-word report and skew the statistics.

# Calculate for Radiology Report
rad_report_lengths = report_lengths(data['Rad_Report'].dropna().astype(str))

# Calculate for Operative Report
op_report_lengths = report_lengths(data['Op_Report'].dropna().astype(str))

print("Radiology Report Lengths (mean±std):")
print("Sentence:", rad_report_lengths[['mean_sentence_count', 'std_sentence_count']].round(2))
print("Word:", rad_report_lengths[['mean_word_count', 'std_word_count']].round(2))

print("\nOperative Report Lengths (mean±std):")
print("Sentence:", op_report_lengths[['mean_sentence_count', 'std_sentence_count']].round(2))
print("Word:", op_report_lengths[['mean_word_count', 'std_word_count']].round(2))


### Cosine similarity

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer 

# cosine similarity between data['Rad_Report'] and data['Op_Report']
rad_text = data['Rad_Report'].astype(str)
op_text = data['Op_Report'].astype(str)

# NOTE: despite the variable names, this is a bag-of-words CountVectorizer
# (raw term counts), not a TF-IDF weighting.
tfidf = CountVectorizer()

# Learn the vocabulary from BOTH corpora. Fitting on the radiology reports
# alone silently drops every term that only occurs in operative reports, which
# shrinks the operative vectors and biases the similarity.
tfidf.fit(pd.concat([rad_text, op_text], ignore_index=True))
rad_tfidf = tfidf.transform(rad_text)
op_tfidf = tfidf.transform(op_text)

# Only the diagonal is of interest: radiology report i vs operative report i.
cosine_sim = cosine_similarity(rad_tfidf, op_tfidf)
cosine_sim = pd.Series(cosine_sim.diagonal(), index=data.index)
cosine_sim.describe()

In [None]:
import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd


# Load pre-trained model tokenizer (vocabulary) and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Function to encode text to BERT embeddings
def encode_text_to_bert_embeddings(text):
    """Embed text as the mean of BERT's last-layer token vectors.

    Parameters
    ----------
    text : str or list of str
        Text to embed. Input longer than 512 tokens is truncated by the
        tokenizer, so only the beginning of a long report contributes.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(batch, hidden_size)``; ``batch`` is 1 for a single
        string.
    """
    with torch.no_grad():
        inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True, padding=True)
        outputs = model(**inputs)
        hidden = outputs.last_hidden_state

        # Average over real tokens only. For a single string the mask is all
        # ones, so this equals a plain mean; for a padded batch it keeps the
        # padding positions from diluting the embedding.
        mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
        token_counts = mask.sum(dim=1).clamp(min=1)
        return ((hidden * mask).sum(dim=1) / token_counts).numpy()

# Vectorize the reports.
# Both columns are coerced to str first, matching the bag-of-words cell above:
# the tokenizer raises on non-string input such as a missing (NaN) report.
# NOTE(review): a missing report therefore becomes the text "nan"; consider
# excluding such rows from the similarity instead.
data['Rad_Report_Vector'] = data['Rad_Report'].astype(str).apply(encode_text_to_bert_embeddings)
data['Op_Report_Vector'] = data['Op_Report'].astype(str).apply(encode_text_to_bert_embeddings)

# Calculate Cosine Similarity
# Each stored vector is a 2-D array with a single row, which is the shape
# cosine_similarity expects, so the result is a 1x1 matrix.
data['Cosine_Similarity'] = data.apply(
    lambda row: cosine_similarity(row['Rad_Report_Vector'], row['Op_Report_Vector'])[0][0],
    axis=1,
)

print(data[['Cosine_Similarity']])


### Visualize

In [None]:
# Reload the spreadsheet so cleaning starts from the raw columns (the
# statistics cells above added derived columns to `data`).
data = pd.read_excel(io="data3.xlsx")

# NOTE(review): the meaning of the second positional argument (False) is
# defined by FalconTrainer.clean_data; confirm it there.
df = ft.clean_data(data, False)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import PCA
# `df` is the cleaned frame produced by ft.clean_data in the previous cell.
structures = ['Supraspinatus', 'Infraspinatus', 'Subscapularis', 'Biceps', 'Labrum']

# Column names in the order Rad_<structure>, Op_<structure> for each structure.
paired_columns = [
    f'{source}_{structure}'
    for structure in structures
    for source in ('Rad', 'Op')
]

# Keep only rows where every structure has both a radiology and an operative text.
df = df[paired_columns].dropna()
cols = df.columns

# Stack the texts column by column, so rows of the embedding matrix are grouped
# as: structure 1 Rad, structure 1 Op, structure 2 Rad, ...
tfidf = TfidfVectorizer()
all_text = [text for column in paired_columns for text in df[column].tolist()]

all_embeddings = tfidf.fit_transform(all_text).toarray()

In [None]:
# (rows, columns) of the frame left after selecting the Rad_/Op_ columns and dropping incomplete rows.
df.shape

In [None]:
# Project the TF-IDF vectors down to two dimensions for plotting; the random
# state is fixed so repeated runs use the same seed.
tsne = TSNE(
    n_components=2,
    random_state=42,
)
reduced_embeddings_tsne = tsne.fit_transform(all_embeddings)

In [None]:

# Step 4: Visualization using t-SNE embeddings
# Rows of reduced_embeddings_tsne follow the order in which all_text was built:
# for each structure, all radiology texts followed by all operative texts.
# There are therefore 2 * len(structures) equally sized groups of n rows.
n_groups = 2 * len(structures)
n = reduced_embeddings_tsne.shape[0] // n_groups
assert n * n_groups == reduced_embeddings_tsne.shape[0], (
    "embedding rows are not an exact multiple of the Rad/Op groups per structure"
)

fig, axs = plt.subplots(1, len(structures), figsize=(11, 3), squeeze=False)
axs = axs.ravel()  # one axis per structure, also when there is a single structure

legend_labels = ['Radiology report', 'Operative report']
legend_handles = []
start = 0
for i, structure in enumerate(structures):
    # Split the embeddings into 'Rad' and 'Op'
    rad_embeddings = reduced_embeddings_tsne[start:start + n, :2]
    start += n
    op_embeddings = reduced_embeddings_tsne[start:start + n, :2]
    start += n

    # Plot 'Rad' (red) and 'Op' (blue) embeddings for this structure
    rad_points = axs[i].scatter(rad_embeddings[:, 0], rad_embeddings[:, 1], color='red')
    op_points = axs[i].scatter(op_embeddings[:, 0], op_embeddings[:, 1], color='blue')
    # The negative y places the structure name below the panel
    axs[i].set_title(structure, fontsize=12, y=-0.16)

    # Remove axes, ticks and borders
    axs[i].axis('off')

    # Colours are identical in every panel, so one pair of handles is enough
    if not legend_handles:
        legend_handles = [rad_points, op_points]

# Create a single legend for the entire figure. Handles are passed explicitly
# so the label-to-colour mapping does not depend on artist creation order.
fig.legend(legend_handles, legend_labels, loc='upper left', fontsize=12, bbox_to_anchor=(0, 1.13))

plt.tight_layout()
# Save before showing: some backends release the figure once it has been shown.
fig.savefig('embeddings.png', dpi=300, bbox_inches='tight')
plt.show()