In [None]:
from google.colab import drive

# Mount Google Drive into the Colab filesystem at /content/gdrive.
drive.mount('/content/gdrive')
# Relative prefix of the Drive root. Being relative, it only resolves when the
# working directory is /content (assumed to be the Colab default — TODO confirm).
root_path = 'gdrive/My Drive/'

Mounted at /content/gdrive


In [None]:
import numpy as np
from sklearn import svm
from sklearn.metrics import classification_report
from tensorflow.keras.applications import VGG16
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.vgg16 import preprocess_input
from tensorflow.keras.models import Model

In [None]:
# Load VGG16 with ImageNet weights and without its fully connected "top"
# layers, so only the convolutional part of the network is instantiated.
base_model = VGG16(weights='imagenet', include_top=False)

# Wrap the network in a Model whose output is the 'block5_pool' layer;
# every feature vector used below is the flattened activation of that layer.
feature_extractor = Model(inputs=base_model.input, outputs=base_model.get_layer('block5_pool').output)


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5


In [None]:
# Function to extract features from image using VGG16
def extract_features(img_path):
    """Return the flattened VGG16 'block5_pool' activations for one image.

    Args:
        img_path: Path of the image file to load.

    Returns:
        1-D numpy array holding the flattened output of ``feature_extractor``
        for the image, resized to 224x224 and passed through the VGG16
        ``preprocess_input`` step.
    """
    # Imported locally under an alias: later cells in this notebook rebind the
    # global name `image` to a pixel array (`image = imread(...)`), which would
    # otherwise break this function if it is called after those cells ran.
    from tensorflow.keras.preprocessing import image as keras_image

    img = keras_image.load_img(img_path, target_size=(224, 224))
    # The model expects a batch, so add a leading axis of size 1.
    batch = np.expand_dims(keras_image.img_to_array(img), axis=0)
    batch = preprocess_input(batch)
    # verbose=0 keeps predict() from printing a progress bar on every call,
    # which floods the output when this runs once per image.
    features = feature_extractor.predict(batch, verbose=0)
    return features.flatten()

# Smoke test: run the extractor on a single training image.
# The resulting `features` value is not read by any later cell.
features = extract_features('gdrive/My Drive/images_final/images_train/IMG_ENC00854_00001.jpg')



In [None]:
import joblib


In [None]:
import pandas as pd
import os
from skimage.transform import resize
from skimage.io import imread
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report


In [None]:
import pandas as pd
# Load the M3G training corpus; each row carries a stringified list of image
# file names (`image_ids`) plus the response text in en/zh/es columns.
data1 = pd.read_csv('gdrive/My Drive/mediqa-m3-clinicalnlp2024/trainingCorpusM3G.csv')  # add nrows=20 for a quick debugging subset

# Show column names, non-null counts and dtypes.
data1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 898 entries, 0 to 897
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   encounter_id      898 non-null    object
 1   image_ids         898 non-null    object
 2   query_title_en    875 non-null    object
 3   query_content_en  895 non-null    object
 4   author_id         898 non-null    object
 5   content_en        896 non-null    object
 6   content_zh        898 non-null    object
 7   content_es        896 non-null    object
dtypes: object(8)
memory usage: 56.2+ KB


# **M3G-VGG16-CNN-SVM**

In [None]:
# Discard encounters that have no Chinese response text, then re-inspect the frame.
data1 = data1.dropna(subset=['content_zh'])

data1.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 898 entries, 0 to 897
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   encounter_id      898 non-null    object
 1   image_ids         898 non-null    object
 2   query_title_en    875 non-null    object
 3   query_content_en  895 non-null    object
 4   author_id         898 non-null    object
 5   content_en        896 non-null    object
 6   content_zh        898 non-null    object
 7   content_es        896 non-null    object
dtypes: object(8)
memory usage: 56.2+ KB


In [None]:
# Keep only the image list and the Chinese response text of each row.
train_data = data1[['image_ids', 'content_zh']]
# Bare last expression so the notebook renders the frame.
train_data

Unnamed: 0,image_ids,content_zh
0,"['IMG_ENC00852_00001.jpg', 'IMG_ENC00852_00002...",是继发性白斑。多为暂时的，一般不需治疗，经半年至一年左右，可自行恢复。
1,"['IMG_ENC00853_00001.jpg', 'IMG_ENC00853_00002...",应该是双侧发病，考虑湿疹
2,"['IMG_ENC00854_00001.jpg', 'IMG_ENC00854_00002...",第一附图考虑手癣，下面是剥脱性角质松解症
3,"['IMG_ENC00855_00001.jpg', 'IMG_ENC00855_00002...",带状疱疹
4,['IMG_ENC00856_00001.jpg'],湿疹，可以外用康复新液，口服甘草酸
...,...,...
893,['IMG_ENC00847_00001.jpg'],怀疑肿瘤
894,"['IMG_ENC00848_00001.jpg', 'IMG_ENC00848_00002...",建议病理检查\n除外MF
895,"['IMG_ENC00849_00001.jpg', 'IMG_ENC00849_00002...",银屑病
896,['IMG_ENC00850_00001.jpg'],鱼鳞病，可以临床缓解


In [None]:
import pandas as pd

df = train_data

# Preprocessing: expand every row into one row per image. `image_ids` is a
# stringified list, so strip the brackets and quotes and split on ", ";
# the row's response text is repeated for each of its images.
flat_ids = []
flat_texts = []
for raw_ids, text in zip(df['image_ids'], df['content_zh']):
    names = raw_ids.strip("[]").replace("'", "").split(", ")
    flat_ids.extend(name.strip() for name in names)
    flat_texts.extend([text] * len(names))

new_data = {'image_ids': flat_ids, 'content_zh': flat_texts}
new_df = pd.DataFrame(new_data)

print(new_df)


                   image_ids  \
0     IMG_ENC00852_00001.jpg   
1     IMG_ENC00852_00002.jpg   
2     IMG_ENC00853_00001.jpg   
3     IMG_ENC00853_00002.jpg   
4     IMG_ENC00854_00001.jpg   
...                      ...   
2625  IMG_ENC00850_00001.jpg   
2626  IMG_ENC00851_00001.jpg   
2627  IMG_ENC00851_00002.jpg   
2628  IMG_ENC00851_00003.jpg   
2629  IMG_ENC00851_00004.jpg   

                                             content_zh  
0                   是继发性白斑。多为暂时的，一般不需治疗，经半年至一年左右，可自行恢复。  
1                   是继发性白斑。多为暂时的，一般不需治疗，经半年至一年左右，可自行恢复。  
2                                          应该是双侧发病，考虑湿疹  
3                                          应该是双侧发病，考虑湿疹  
4                                  第一附图考虑手癣，下面是剥脱性角质松解症  
...                                                 ...  
2625                                         鱼鳞病，可以临床缓解  
2626  现在这种情况应该l口服抗过敏药，钙剂和维生素C，如果全身泛发酌情使用免疫调节药物。皮肤干燥平...  
2627  现在这种情况应该l口服抗过敏药，钙剂和维生素C，如果全身泛发酌情使用免疫调节药物。皮肤干燥平...  
2628  现在这种情况应该l口服抗过敏药，钙剂和维生素C，如果全身泛

In [None]:
# Count the distinct Chinese response texts (NaN excluded); these strings are
# what the SVM below receives as class labels.
new_df.content_zh.nunique(dropna=True)


810

In [None]:
import os

df = new_df
# Directory on the mounted Drive that contains the training images
data_dir = 'gdrive/My Drive/images_final/images_train'

# X: full path of every image; y: the response text paired with that image
X = [os.path.join(data_dir, file_name) for file_name in df['image_ids']]
y = df['content_zh'].tolist()

In [None]:
from tensorflow.keras.preprocessing import image
import numpy as np

# X holds the image paths and y the matching response texts.
# extract_features() already performs the load -> resize -> preprocess ->
# predict -> flatten pipeline, so call it instead of repeating those steps here.
X_train_features = np.array([extract_features(img_path) for img_path in X])
y_train = np.array(y)




In [None]:
# Train a linear-kernel SVM on the flattened VGG16 features.
# The labels in y_train are the raw content_zh response strings, so every
# distinct response text is treated as its own class.
clf = svm.SVC(kernel='linear')
clf.fit(X_train_features, y_train)





In [None]:
import joblib
# Persist the fitted classifier to Drive; this is the model trained on the
# content_zh labels, hence the '-zh' suffix in the file name.
joblib.dump(clf, 'gdrive/My Drive/CNNsvm_model-zh.pkl')

['gdrive/My Drive/CNNsvm_model-zh.pkl']

In [None]:
from tensorflow.keras.preprocessing import image
import numpy as np

# NOTE: X and y are still the training paths/labels built earlier, so the
# "test" arrays below describe the training set and the resulting report
# measures fit on seen data, not generalisation.
# extract_features() already performs the load -> resize -> preprocess ->
# predict -> flatten pipeline, so call it instead of repeating those steps here.
X_test_features = np.array([extract_features(img_path) for img_path in X])
y_test = np.array(y)




In [None]:
import joblib
from sklearn.metrics import classification_report

# Load the classifier that the training cell saved. The file name must match
# the one passed to joblib.dump ('CNNsvm_model-zh.pkl'); loading a different
# pickle scores another model's labels against the content_zh labels in y_test.
clf = joblib.load('gdrive/My Drive/CNNsvm_model-zh.pkl')

# Predict a response text for every feature vector in X_test_features.
test_predictions = clf.predict(X_test_features)

# Per-class precision/recall/F1, where each class is one distinct response text.
print(classification_report(y_test, test_predictions))


                                                                                                                                                                                                                                                                                                                    precision    recall  f1-score   support

                                                                                                                    Based on the picture, it is urticaria.  Treatment: Yupingfeng granules (A traditional Chinese medicine that nourishes the bodiy and improves immunity), Desloratadine Citrate Disodium tablet.       1.00      1.00      1.00         1
                              Based the shape, form and locations of distribution, it looks like eczema.  Use topically Eloson.  Avoid scratching if possible.  May use medical skin lotions to nourish and revitalize the skin barrier.  If no obvious improvement, make it a medical case for further diagnos

# **Chinese SVM**

In [None]:
# Rebuild the (image_ids, content_zh) frame for the raw-pixel SVM section;
# this is the same column selection used in the VGG16 section.
train_data = data1[['image_ids', 'content_zh']]
# Bare last expression so the notebook renders the frame.
train_data

Unnamed: 0,image_ids,content_zh
0,"['IMG_ENC00852_00001.jpg', 'IMG_ENC00852_00002...",是继发性白斑。多为暂时的，一般不需治疗，经半年至一年左右，可自行恢复。
1,"['IMG_ENC00853_00001.jpg', 'IMG_ENC00853_00002...",应该是双侧发病，考虑湿疹
2,"['IMG_ENC00854_00001.jpg', 'IMG_ENC00854_00002...",第一附图考虑手癣，下面是剥脱性角质松解症
3,"['IMG_ENC00855_00001.jpg', 'IMG_ENC00855_00002...",带状疱疹
4,['IMG_ENC00856_00001.jpg'],湿疹，可以外用康复新液，口服甘草酸
...,...,...
893,['IMG_ENC00847_00001.jpg'],怀疑肿瘤
894,"['IMG_ENC00848_00001.jpg', 'IMG_ENC00848_00002...",建议病理检查\n除外MF
895,"['IMG_ENC00849_00001.jpg', 'IMG_ENC00849_00002...",银屑病
896,['IMG_ENC00850_00001.jpg'],鱼鳞病，可以临床缓解


In [None]:
import pandas as pd

df = train_data

# Preprocessing: turn each row into one row per image. The stringified list in
# `image_ids` is unwrapped by removing brackets and quotes and splitting on
# ", "; the response text is duplicated for every image of the row.
flat_ids = []
flat_texts = []
for raw_ids, text in zip(df['image_ids'], df['content_zh']):
    names = raw_ids.strip("[]").replace("'", "").split(", ")
    flat_ids.extend(name.strip() for name in names)
    flat_texts.extend([text] * len(names))

new_data = {'image_ids': flat_ids, 'content_zh': flat_texts}
new_df = pd.DataFrame(new_data)

print(new_df)


                   image_ids  \
0     IMG_ENC00852_00001.jpg   
1     IMG_ENC00852_00002.jpg   
2     IMG_ENC00853_00001.jpg   
3     IMG_ENC00853_00002.jpg   
4     IMG_ENC00854_00001.jpg   
...                      ...   
2625  IMG_ENC00850_00001.jpg   
2626  IMG_ENC00851_00001.jpg   
2627  IMG_ENC00851_00002.jpg   
2628  IMG_ENC00851_00003.jpg   
2629  IMG_ENC00851_00004.jpg   

                                             content_zh  
0                   是继发性白斑。多为暂时的，一般不需治疗，经半年至一年左右，可自行恢复。  
1                   是继发性白斑。多为暂时的，一般不需治疗，经半年至一年左右，可自行恢复。  
2                                          应该是双侧发病，考虑湿疹  
3                                          应该是双侧发病，考虑湿疹  
4                                  第一附图考虑手癣，下面是剥脱性角质松解症  
...                                                 ...  
2625                                         鱼鳞病，可以临床缓解  
2626  现在这种情况应该l口服抗过敏药，钙剂和维生素C，如果全身泛发酌情使用免疫调节药物。皮肤干燥平...  
2627  现在这种情况应该l口服抗过敏药，钙剂和维生素C，如果全身泛发酌情使用免疫调节药物。皮肤干燥平...  
2628  现在这种情况应该l口服抗过敏药，钙剂和维生素C，如果全身泛

In [None]:
# Count the distinct Chinese response texts (NaN excluded); these strings are
# the class labels given to the SVM in the next cell.
new_df.content_zh.nunique(dropna=True)


810

In [None]:
import pandas as pd
import os
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from skimage.io import imread
from skimage.transform import resize

df = new_df
# Data directory
data_dir = 'gdrive/My Drive/images_final/images_train'  # Directory containing your images

# Load images and labels
X = []
y = []

for index, row in df.iterrows():
    image_path = os.path.join(data_dir, row['image_ids'])
    try:
        # Use `pixels`, not `image`: the name `image` is the Keras preprocessing
        # module imported at the top of the notebook, and rebinding it here
        # would break the VGG16 cells if they are re-run afterwards.
        pixels = imread(image_path)
        pixels = resize(pixels, (100, 100))  # Resize image to reduce computation time
        # NOTE(review): resize keeps the channel count, so an image with a
        # different number of channels would yield a feature vector of a
        # different length — confirm all inputs share one channel layout.
        X.append(pixels.flatten())  # Flatten image and append to features
        y.append(row['content_zh'])  # Append label
    except Exception as e:
        # Best effort: report the unreadable image and skip it. Both appends sit
        # after the load, so X and y stay aligned when an image is skipped.
        print(f"Error loading image {row['image_ids']}: {e}")

# Define and train the SVM model; each distinct response text is one class.
model = SVC(kernel='linear')
model.fit(X, y)




In [None]:
import os
import pickle


In [None]:
# Save the trained SVM model to a file.
# Relies on `pickle` from the import cell just above and on `model` fitted in
# the training cell.
model_file_path = 'gdrive/My Drive/svm_modelM3G-zh.pkl'
with open(model_file_path, 'wb') as file:
    pickle.dump(model, file)

print("Model trained and saved successfully.")

In [None]:
import os
import pickle
from skimage.io import imread
from skimage.transform import resize

# Load the saved SVM model from the file.
# pickle.load executes code embedded in the file; only load pickles that this
# notebook wrote itself.
model_file_path = 'gdrive/My Drive/svm_modelM3G-zh.pkl'
with open(model_file_path, 'rb') as file:
    loaded_model = pickle.load(file)

# Load the image you want to make predictions on
image_path = 'gdrive/My Drive/images_final/images_train/IMG_ENC00852_00002.jpg'  # Replace with the path to your image
try:
    # Use `pixels`, not `image`, so the Keras `image` module imported at the
    # top of the notebook is not shadowed.
    pixels = imread(image_path)
    pixels = resize(pixels, (100, 100))  # Resize image to match the size used during training
except Exception as e:
    print(f"Error loading image: {e}")
    # Re-raise instead of calling exit(): in a notebook exit() asks the kernel
    # to shut down, discarding all state, whereas raising just stops this cell.
    raise

# Flatten the image and make predictions
flattened_image = pixels.flatten().reshape(1, -1)  # Reshape to 1 sample
predicted_class = loaded_model.predict(flattened_image)

print("Predicted class:", predicted_class[0])


# **Spanish-SVM**

In [None]:
# Discard encounters that have no Spanish response text, then re-inspect the frame.
data1 = data1.dropna(subset=['content_es'])

data1.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 896 entries, 0 to 897
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   encounter_id      896 non-null    object
 1   image_ids         896 non-null    object
 2   query_title_en    873 non-null    object
 3   query_content_en  893 non-null    object
 4   author_id         896 non-null    object
 5   content_en        896 non-null    object
 6   content_zh        896 non-null    object
 7   content_es        896 non-null    object
dtypes: object(8)
memory usage: 63.0+ KB


In [None]:
# Keep only the image list and the Spanish response text of each row.
train_data = data1[['image_ids', 'content_es']]
# Bare last expression so the notebook renders the frame.
train_data

Unnamed: 0,image_ids,content_es
0,"['IMG_ENC00852_00001.jpg', 'IMG_ENC00852_00002...",Debería ser leucoplasia secundaria. Es tempora...
1,"['IMG_ENC00853_00001.jpg', 'IMG_ENC00853_00002...",Debería ocurrir en ambos lados. Piensa en ecc...
2,"['IMG_ENC00854_00001.jpg', 'IMG_ENC00854_00002...","En la primera imagen, considere la tiña manus,..."
3,"['IMG_ENC00855_00001.jpg', 'IMG_ENC00855_00002...",herpes zoster
4,['IMG_ENC00856_00001.jpg'],"eczema, puede aplicar kangfuxin externamente. ..."
...,...,...
893,['IMG_ENC00847_00001.jpg'],Sospecha de tumor
894,"['IMG_ENC00848_00001.jpg', 'IMG_ENC00848_00002...",Sugiere examen patológico\nExcluir MF
895,"['IMG_ENC00849_00001.jpg', 'IMG_ENC00849_00002...",Psoriasis
896,['IMG_ENC00850_00001.jpg'],"La ictiosis, puede ser aliviada clínicamente."


In [None]:
import pandas as pd

df = train_data

# Preprocessing: expand every row into one row per image. `image_ids` is a
# stringified list, so strip the brackets and quotes and split on ", ";
# the row's Spanish response text is repeated for each of its images.
flat_ids = []
flat_texts = []
for raw_ids, text in zip(df['image_ids'], df['content_es']):
    names = raw_ids.strip("[]").replace("'", "").split(", ")
    flat_ids.extend(name.strip() for name in names)
    flat_texts.extend([text] * len(names))

new_data = {'image_ids': flat_ids, 'content_es': flat_texts}
new_df = pd.DataFrame(new_data)

print(new_df)


                   image_ids  \
0     IMG_ENC00852_00001.jpg   
1     IMG_ENC00852_00002.jpg   
2     IMG_ENC00853_00001.jpg   
3     IMG_ENC00853_00002.jpg   
4     IMG_ENC00854_00001.jpg   
...                      ...   
2621  IMG_ENC00850_00001.jpg   
2622  IMG_ENC00851_00001.jpg   
2623  IMG_ENC00851_00002.jpg   
2624  IMG_ENC00851_00003.jpg   
2625  IMG_ENC00851_00004.jpg   

                                             content_es  
0     Debería ser leucoplasia secundaria. Es tempora...  
1     Debería ser leucoplasia secundaria. Es tempora...  
2     Debería ocurrir en ambos lados.  Piensa en ecc...  
3     Debería ocurrir en ambos lados.  Piensa en ecc...  
4     En la primera imagen, considere la tiña manus,...  
...                                                 ...  
2621      La ictiosis, puede ser aliviada clínicamente.  
2622  En esta situación, se deben tomar antihistamín...  
2623  En esta situación, se deben tomar antihistamín...  
2624  En esta situación, se deben t

In [None]:
# Count the distinct Spanish response texts (NaN excluded); these strings are
# the class labels given to the SVM in the next cell.
new_df.content_es.nunique(dropna=True)


803

In [None]:
import pandas as pd
import os
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from skimage.io import imread
from skimage.transform import resize
import pickle

df = new_df
# Data directory
data_dir = 'gdrive/My Drive/images_final/images_train'  # Directory containing your images

# Load images and labels
X = []
y = []

for index, row in df.iterrows():
    image_path = os.path.join(data_dir, row['image_ids'])
    try:
        # Use `pixels`, not `image`: the name `image` is the Keras preprocessing
        # module imported at the top of the notebook, and rebinding it here
        # would break the VGG16 cells if they are re-run afterwards.
        pixels = imread(image_path)
        pixels = resize(pixels, (100, 100))  # Resize image to reduce computation time
        # NOTE(review): resize keeps the channel count, so an image with a
        # different number of channels would yield a feature vector of a
        # different length — confirm all inputs share one channel layout.
        X.append(pixels.flatten())  # Flatten image and append to features
        y.append(row['content_es'])  # Append label
    except Exception as e:
        # Best effort: report the unreadable image and skip it. Both appends sit
        # after the load, so X and y stay aligned when an image is skipped.
        print(f"Error loading image {row['image_ids']}: {e}")

# Define and train the SVM model; each distinct response text is one class.
model = SVC(kernel='linear')
model.fit(X, y)

# Save the trained SVM model to a file
model_file_path = 'gdrive/My Drive/svm_modelM3G-es.pkl'
with open(model_file_path, 'wb') as file:
    pickle.dump(model, file)

print("Model trained and saved successfully.")


Model trained and saved successfully.


In [None]:
import os
import pickle
from skimage.io import imread
from skimage.transform import resize

# Load the saved SVM model from the file.
# pickle.load executes code embedded in the file; only load pickles that this
# notebook wrote itself.
model_file_path = 'gdrive/My Drive/svm_modelM3G-es.pkl'
with open(model_file_path, 'rb') as file:
    loaded_model = pickle.load(file)

# Load the image you want to make predictions on
image_path = 'gdrive/My Drive/images_final/images_train/IMG_ENC00852_00002.jpg'  # Replace with the path to your image
try:
    # Use `pixels`, not `image`, so the Keras `image` module imported at the
    # top of the notebook is not shadowed.
    pixels = imread(image_path)
    pixels = resize(pixels, (100, 100))  # Resize image to match the size used during training
except Exception as e:
    print(f"Error loading image: {e}")
    # Re-raise instead of calling exit(): in a notebook exit() asks the kernel
    # to shut down, discarding all state, whereas raising just stops this cell.
    raise

# Flatten the image and make predictions
flattened_image = pixels.flatten().reshape(1, -1)  # Reshape to 1 sample
predicted_class = loaded_model.predict(flattened_image)

print("Predicted class:", predicted_class[0])


Predicted class: Debería ser leucoplasia secundaria. Es temporal y no necesita tratamiento. Debería curarse sola en 6 meses a un año.
