In [4]:
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from tensorflow.keras.preprocessing import image
from tensorflow.keras.models import Model
import pandas as pd
import numpy as np
import requests
from io import BytesIO
from sklearn.metrics.pairwise import cosine_similarity
from fuzzywuzzy import process

# Build the feature extractor: VGG16 with ImageNet weights, truncated so that
# the network's output is the activation of its 'fc1' layer.
base_model = VGG16(weights='imagenet')
model = Model(
    inputs=base_model.input,
    outputs=base_model.get_layer('fc1').output,
)

In [5]:
# Function to preprocess and extract features from an image URL
def extract_features(img_url, timeout=10):
    """Download an image and return its feature vector from the module-level ``model``.

    The image is fetched over HTTP, resized to 224x224, converted to an array,
    given a leading batch axis, passed through ``preprocess_input`` and then
    through ``model.predict``. The prediction is flattened to a 1-D array.

    Args:
        img_url: URL of the image to download.
        timeout: Seconds to wait for the HTTP request before giving up, so a
            single unresponsive host cannot stall the whole run.

    Returns:
        A 1-D numpy array of features, or ``None`` when the image could not be
        downloaded, decoded or processed (the failure is logged as a warning).
    """
    import logging

    try:
        response = requests.get(img_url, timeout=timeout)
        # Fail fast on 4xx/5xx instead of trying to decode an error page as an image.
        response.raise_for_status()
        img = image.load_img(BytesIO(response.content), target_size=(224, 224))
        img_data = image.img_to_array(img)
        img_data = np.expand_dims(img_data, axis=0)
        img_data = preprocess_input(img_data)
        # verbose=0: this runs once per image, so the progress bar would flood the output.
        features = model.predict(img_data, verbose=0)
        return features.flatten()
    except Exception as exc:
        # Best effort: a bad URL or image must not abort the whole dataset.
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate, so a long run can still be interrupted.
        logging.getLogger(__name__).warning(
            "Could not extract features from %r: %s", img_url, exc
        )
        return None

# Function to prepare image similarity matrix
def prepare_image_similarity(df):
    """Extract image features for every row and build the pairwise cosine-similarity matrix.

    Args:
        df: DataFrame with an 'image' column that is passed, value by value,
            to ``extract_features``. The frame itself is not modified.

    Returns:
        A tuple ``(image_cosine_sim, df_with_features)``:

        * ``df_with_features`` is a copy of ``df`` with a 'features' column,
          restricted to the rows whose features could be extracted and
          re-indexed 0..n-1.
        * ``image_cosine_sim`` is the n x n cosine-similarity matrix of those
          features; row/column ``i`` corresponds to row ``i`` of
          ``df_with_features``. When no features could be extracted it is an
          empty array of shape (0, 0).
    """
    # assign() returns a new frame, so the caller's DataFrame does not gain a column.
    with_features = df.assign(features=df['image'].apply(extract_features))

    # Drop rows without features, then renumber the index so that index labels
    # equal row positions in the similarity matrix built below.
    usable = with_features[with_features['features'].notnull()].reset_index(drop=True)

    if usable.empty:
        # np.vstack raises on an empty sequence; return an empty matrix instead.
        return np.empty((0, 0)), usable

    feature_matrix = np.vstack(usable['features'].to_list())
    image_cosine_sim = cosine_similarity(feature_matrix)

    return image_cosine_sim, usable

# Function to get similar images
def get_similar_images(title, df, image_cosine_sim):
    """Rank the rows of ``df`` by image similarity to the row with the given title.

    Args:
        title: Title to look up in ``df['title']``. When there is no exact
            match, the closest fuzzy match is used and a message is printed.
        df: DataFrame with a 'title' column. Row position ``i`` must
            correspond to row/column ``i`` of ``image_cosine_sim``.
        image_cosine_sim: Square similarity matrix with one row per row of ``df``.

    Returns:
        A new DataFrame with a fresh 0..n-1 index, holding every row of ``df``
        except the query row, sorted by descending similarity. Columns are
        'title', 'image_similarity', then the remaining columns of ``df``
        (an existing 'image_similarity' column in ``df`` is replaced).

    Raises:
        ValueError: If ``image_cosine_sim`` does not have one row per row of ``df``.
    """
    if len(image_cosine_sim) != len(df):
        raise ValueError(
            f"image_cosine_sim has {len(image_cosine_sim)} rows but df has "
            f"{len(df)}; they must describe the same items in the same order"
        )

    other_columns = [c for c in df.columns if c not in ('title', 'image_similarity')]
    result_columns = ['title', 'image_similarity'] + other_columns

    if df.empty:
        # Nothing to match against or rank.
        return pd.DataFrame(columns=result_columns)

    if title not in df['title'].values:
        # Only the best candidate is used, so ask for a single match.
        matches = process.extract(title, df['title'], limit=1)
        best_match = matches[0][0]
        print(f"Exact title not found. Using closest match: '{best_match}'")
        title = best_match

    # Use the row *position*, not the index label: the matrix is positional
    # and df's index may be non-contiguous after filtering.
    idx = int(np.flatnonzero((df['title'] == title).to_numpy())[0])

    row = np.asarray(image_cosine_sim[idx])
    # Stable sort on the negated scores: descending order, ties keep row order.
    order = np.argsort(-row, kind='stable')
    # Exclude the query by position. Filtering on `score < 1` is unreliable:
    # rounding can leave the self-similarity just below 1, and genuine
    # duplicates scoring exactly 1 would be dropped.
    order = order[order != idx]

    # Attach scores positionally rather than merging on 'title', which would
    # multiply rows when titles repeat.
    result_df = df.iloc[order].copy()
    result_df['image_similarity'] = row[order]

    return result_df[result_columns].reset_index(drop=True)

In [6]:
# Load the Westmanga dataset. The functions defined earlier in this notebook
# read its 'image' column (image URLs) and its 'title' column.
df_westmanga = pd.read_csv("./data/westmanga.csv")

In [7]:
# Extract features for every image URL and build the pairwise similarity
# matrix. This makes one HTTP request and one model prediction per row, so it
# can take a while on a large dataset.
image_cosine_sim_westmanga, df_westmanga = prepare_image_similarity(df=df_westmanga)



In [None]:
# Rank the dataset by image similarity to 'Fairy Tail'.
# NOTE(review): this rebinds df_westmanga to the result frame, so re-running
# the cell passes the results (not the original dataset) back in alongside the
# unchanged similarity matrix. Consider binding the result to a new name.
df_westmanga = get_similar_images(
    title='Fairy Tail',
    df=df_westmanga,
    image_cosine_sim=image_cosine_sim_westmanga,
)