# In [None]:
import warnings
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from datasets import Dataset, load_dataset
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.exceptions import ConvergenceWarning
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix, roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.utils.class_weight import compute_class_weight

# Ignore UserWarning and ConvergenceWarning for modules matching 'sklearn'
for _category in (UserWarning, ConvergenceWarning):
    warnings.filterwarnings("ignore", category=_category, module='sklearn')

# Load the dataset
dataset = load_dataset("medalpaca/medical_meadow_medical_flashcards")

# Convert the 'train' entry to a Pandas DataFrame for analysis
df = dataset['train'].to_pandas()

# Briefly describe the dataset
print(f"Dataset contains {len(df)} documents.")
print(df.head())

# Tokenize the 'output' column to find frequent terms.
# Answers are joined with a single space so that the last word of one answer
# is not fused with the first word of the next, which would distort the counts.
output_text = ' '.join(df['output'].astype(str).tolist())
tokens = output_text.lower().split()
common_terms = [term for term, _ in Counter(tokens).most_common(50)]
print(f"Most common terms: {common_terms}")

# Define relevant common medical terms based on frequent terms analysis.
# 'patients' is deliberately listed before 'patient': labels are assigned by the
# first term found as a substring of the answer, so the longer term has to be
# tried first or it could never be selected.
relevant_terms = ['treatment', 'symptoms', 'patients', 'patient', 'blood', 'cause', 'condition', 'risk', 'cells', 'associated']

# Keep only the terms that occur (case-insensitively) in at least one answer
present_terms = [term for term in relevant_terms if df['output'].str.contains(term, case=False, na=False).any()]
print(f"Present terms: {present_terms}")

# Filter the dataset to rows whose answer mentions at least one present term
filtered_df = df[df['output'].str.contains('|'.join(present_terms), case=False, na=False)]

# Update X and y after filtering
X = filtered_df['input']
y = filtered_df['output']

# Label each answer with the first present term it contains; an answer that
# contains none of them is labelled 'other'
y = y.apply(lambda ans: next((term for term in present_terms if term in ans.lower()), 'other'))

# Print the class distribution before further filtering
print("Class distribution before filtering 'other':")
print(y.value_counts())

# Drop rows labelled 'other' so y only contains relevant terms; the same mask
# is applied to X to keep features and labels aligned
is_relevant = y != 'other'
X = X[is_relevant]
y = y[is_relevant]

# Check the final class distribution to ensure the dataset is ready for training
print("Final class distribution:")
print(y.value_counts())

# Encode the string labels as integers using LabelEncoder
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Compute balanced class weights and index them by encoded label
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_encoded), y=y_encoded)
class_weights_dict = dict(enumerate(class_weights))

# Split the data into training and testing sets.
# The split is stratified so every class is represented in both parts; without
# it a rare class can be missing from the test set, which breaks the later
# reports that pass target_names=le.classes_. Stratification needs at least
# two samples per class, so fall back to a plain random split otherwise.
stratify_labels = y_encoded if np.bincount(y_encoded).min() >= 2 else None
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=stratify_labels
)

# Assemble the pipeline piece by piece: TF-IDF features feeding a
# sigmoid-calibrated, class-balanced Logistic Regression
tfidf_step = TfidfVectorizer(max_features=5000, ngram_range=(1, 2), stop_words='english')
calibrated_lr = CalibratedClassifierCV(
    LogisticRegression(max_iter=5000, class_weight='balanced'),
    cv=5,
    method='sigmoid',
)
pipeline = Pipeline(steps=[('vectorizer', tfidf_step), ('clf', calibrated_lr)])

# Hyperparameters to explore, addressed through the pipeline step names
param_grid = dict(
    vectorizer__max_features=[3000, 5000, 7000],
    clf__estimator__C=[0.1, 1, 10],
    clf__estimator__penalty=['l2'],
    clf__estimator__solver=['lbfgs', 'liblinear'],
)

# Exhaustive 5-fold search over the grid, scored by macro F1
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, scoring='f1_macro', cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Report the winning configuration and its cross-validated score
print("Best hyperparameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

# Evaluate the best model on the test set
y_pred = grid_search.best_estimator_.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='macro')
print(f"Accuracy: {accuracy:.2f}")
print(f"F1-score: {f1:.2f}")

# Detailed evaluation
unique_labels = np.unique(y_test)  # labels actually present in the test set
# Report over every encoded class, not just those present in y_test, so the
# rows/columns always line up one-to-one with le.classes_.
all_labels = np.arange(len(le.classes_))
print("Classification Report:")
print(classification_report(y_test, y_pred, labels=all_labels, target_names=le.classes_))

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred, labels=all_labels)
plt.figure(figsize=(12, 8))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=le.classes_, yticklabels=le.classes_)
plt.title('Confusion Matrix for Logistic Regression')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# Alternative models: Random Forest and a calibrated SVM, each behind its own
# TF-IDF vectorizer
rf_steps = [
    ('vectorizer', TfidfVectorizer(max_features=5000, ngram_range=(1, 2), stop_words='english')),
    ('clf', RandomForestClassifier(random_state=42, class_weight='balanced')),
]
rf_pipeline = Pipeline(rf_steps)

calibrated_svc = CalibratedClassifierCV(
    SVC(kernel='linear', class_weight='balanced'),
    cv=5,
    method='sigmoid',
)
svm_steps = [
    ('vectorizer', TfidfVectorizer(max_features=5000, ngram_range=(1, 2), stop_words='english')),
    ('clf', calibrated_svc),
]
svm_pipeline = Pipeline(svm_steps)

# Search space for the Random Forest pipeline
rf_param_grid = dict(
    vectorizer__max_features=[3000, 5000, 7000],
    clf__n_estimators=[10, 50, 100],
    clf__max_depth=[None, 5, 10],
    clf__min_samples_split=[2, 5, 10],
)

# Search space for the SVM pipeline
svm_param_grid = dict(
    vectorizer__max_features=[3000, 5000, 7000],
    clf__estimator__C=[0.1, 1, 10],
    clf__estimator__kernel=['linear', 'rbf'],
)

rf_grid_search = GridSearchCV(estimator=rf_pipeline, param_grid=rf_param_grid, scoring='f1_macro', cv=5, n_jobs=-1)
svm_grid_search = GridSearchCV(estimator=svm_pipeline, param_grid=svm_param_grid, scoring='f1_macro', cv=5, n_jobs=-1)

# Fit Random Forest first, then SVM, on the same training split
for _search in (rf_grid_search, svm_grid_search):
    _search.fit(X_train, y_train)

def _plot_confusion_matrix(matrix, class_names, title):
    """Draw *matrix* as an annotated heatmap with *class_names* on both axes."""
    plt.figure(figsize=(12, 8))
    sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
    plt.title(title)
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.show()


print("Random Forest Best hyperparameters:", rf_grid_search.best_params_)
print("Random Forest Best score:", rf_grid_search.best_score_)

print("SVM Best hyperparameters:", svm_grid_search.best_params_)
print("SVM Best score:", svm_grid_search.best_score_)

# Evaluate Random Forest on the test set
rf_y_pred = rf_grid_search.best_estimator_.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_y_pred)
rf_f1 = f1_score(y_test, rf_y_pred, average='macro')
print(f"Random Forest Accuracy: {rf_accuracy:.2f}")
print(f"Random Forest F1-score: {rf_f1:.2f}")

# Evaluate SVM on the test set
svm_y_pred = svm_grid_search.best_estimator_.predict(X_test)
svm_accuracy = accuracy_score(y_test, svm_y_pred)
svm_f1 = f1_score(y_test, svm_y_pred, average='macro')
print(f"SVM Accuracy: {svm_accuracy:.2f}")
print(f"SVM F1-score: {svm_f1:.2f}")

# Report over every encoded class rather than only those present in y_test,
# so report rows and matrix axes always line up one-to-one with le.classes_.
encoded_labels = np.arange(len(le.classes_))

# Detailed evaluation for Random Forest
print("Random Forest Classification Report:")
print(classification_report(y_test, rf_y_pred, labels=encoded_labels, target_names=le.classes_))

# Confusion Matrix for Random Forest
rf_conf_matrix = confusion_matrix(y_test, rf_y_pred, labels=encoded_labels)
_plot_confusion_matrix(rf_conf_matrix, le.classes_, 'Confusion Matrix for Random Forest')

# Detailed evaluation for SVM
print("SVM Classification Report:")
print(classification_report(y_test, svm_y_pred, labels=encoded_labels, target_names=le.classes_))

# Confusion Matrix for SVM
svm_conf_matrix = confusion_matrix(y_test, svm_y_pred, labels=encoded_labels)
_plot_confusion_matrix(svm_conf_matrix, le.classes_, 'Confusion Matrix for SVM')
