In [None]:
import pandas as pd
# Load the outfit metadata table. Later cells read its 'outfit_id', 'new_tags',
# 'gender' and 'cluster' columns.
meta_csv_path = '../../data/new_final_meta.csv'
gpt_df = pd.read_csv(meta_csv_path)

In [None]:
# Outfit ids in row order; position i in this list corresponds to row i of gpt_df.
outfit_id_list = gpt_df['outfit_id'].tolist()

In [None]:
# Independent (deep) copy of the metadata, so later changes to gpt_df do not show up in outfit_df.
outfit_df = gpt_df.copy(deep=True)

In [None]:
# Display the copied metadata frame for a visual check.
outfit_df

## TF-IDF, cosine 유사도 구하기

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Tokenize the 'new_tags' column by comma into a list of tags per outfit.
# The isinstance guard keeps this cell re-runnable: after a first run the column
# already holds lists, and calling .split(',') on a list raises AttributeError.
gpt_df['new_tags'] = gpt_df['new_tags'].apply(
    lambda tags: tags.split(',') if isinstance(tags, str) else tags
)

# Create the TfidfVectorizer and fit it on one comma-joined document per outfit
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(gpt_df['new_tags'].apply(','.join))

# Compute the pairwise cosine similarity between all documents
cosine_similarities = cosine_similarity(tfidf_matrix)

# Store similarities in a nested list: similarities[i][j] is the similarity
# between document i and document j. tolist() performs the same copy as an
# element-by-element double loop, without n*n Python-level appends.
num_documents = len(gpt_df['new_tags'])
similarities = cosine_similarities.tolist()


In [None]:
# Peek at the first few similarity scores of the document at index 1
# (the full row has one entry per document).
similarities[1][:10]

In [None]:
import numpy as np
import csv

# ... (previous code)

# Number of nearest neighbours stored per outfit.
# NOTE(review): presumably oversized so that the later gender/cluster filter can
# still find enough matches - confirm the intended value.
NUM_SIMILAR = 300

# Rows of [outfit_id, "id1,id2,..."] that will be written to the CSV
data = []

# Iterate over all items
for sentence_idx, outfit_id in enumerate(outfit_id_list):
    # Similarities of the current item to every item
    similarities_for_sentence = similarities[sentence_idx]

    # Indices of all items, ordered by descending similarity
    sorted_indices = np.argsort(similarities_for_sentence)[::-1]

    # Exclude the item itself by index rather than by assuming it is ranked
    # first: with ties (e.g. identical tag sets) or an all-zero TF-IDF row the
    # item is not guaranteed to sort to position 0, and skipping position 0
    # would then drop a real neighbour while keeping the item itself.
    neighbour_indices = sorted_indices[sorted_indices != sentence_idx][:NUM_SIMILAR]

    # Map the neighbour indices back to outfit ids
    similar_outfits = [int(outfit_id_list[idx]) for idx in neighbour_indices]

    # Store the neighbours as a comma-separated string (no brackets)
    data.append([outfit_id, ",".join(str(item) for item in similar_outfits)])

# Save the data to a CSV file
csv_filename = 'similar-gpt.csv'
with open(csv_filename, 'w', newline='') as csvfile:
    csv_writer = csv.writer(csvfile)
    csv_writer.writerow(['outfit_id', 'similar_outfits'])
    csv_writer.writerows(data)

print(f"Data saved to {csv_filename}.")


## PostProcessing

In [None]:
# Reload the neighbour table (columns 'outfit_id' and 'similar_outfits') from disk.
similar_csv_path = './similar-gpt.csv'
df = pd.read_csv(similar_csv_path)

In [None]:
import ast

# The 'similar_outfits' column of similar-gpt.csv is a plain comma-separated
# string without brackets, e.g. "101,102,103". ast.literal_eval turns that into
# a tuple, but yields a bare int when a row holds a single id, and missing cells
# stay NaN - neither can be iterated by the clean-up that follows. Parse the
# cells explicitly so that every row ends up as a list of ints.
def _parse_similar_outfits(value):
    """Convert one 'similar_outfits' cell into a list of integer outfit ids.

    Accepts a comma-separated string (with or without surrounding brackets or
    parentheses), an already parsed list/tuple (so this cell can be re-run),
    a single number, or a missing value (returned as an empty list).
    """
    if isinstance(value, str):
        tokens = value.strip('[]() ').split(',')
        return [int(token) for token in tokens if token.strip()]
    if isinstance(value, (list, tuple)):
        return list(value)
    if pd.isna(value):
        return []
    return [int(value)]

df['similar_outfits'] = df['similar_outfits'].apply(_parse_similar_outfits)

# Helper used to drop unwanted ids from a list of similar outfits
def remove_outfit_from_similar(outfit_id_list, similar_list):
    """Return the elements of ``outfit_id_list`` that are not in ``similar_list``.

    Despite the parameter names, the first argument is the sequence being
    filtered and the second is the collection of ids to remove. Order and
    duplicates of the kept elements are preserved.
    """
    kept = []
    for candidate in outfit_id_list:
        if candidate in similar_list:
            continue
        kept.append(candidate)
    return kept

# Row-wise clean-up: strip each row's own 'outfit_id' out of its
# 'similar_outfits' list, leaving all other ids in their original order.
def _drop_own_outfit_id(row):
    """Return the row's similar outfits without the row's own outfit_id."""
    own_id = row['outfit_id']
    return remove_outfit_from_similar(row['similar_outfits'], [own_id])

df['similar_outfits'] = df.apply(_drop_own_outfit_id, axis=1)


In [None]:
# Display the neighbour table for a visual check.
df

In [None]:
# Maximum number of similar outfits kept per outfit after filtering.
MAX_SIMILAR = 10

# Build id -> gender / cluster lookups once instead of scanning gpt_df with a
# boolean mask for every candidate. drop_duplicates keeps the first row per
# outfit_id, matching the `.values[0][0]` "first match" semantics of a mask lookup.
meta_first = gpt_df.drop_duplicates(subset='outfit_id')
meta_ids = meta_first['outfit_id'].tolist()
gender_by_id = dict(zip(meta_ids, meta_first['gender'].tolist()))
cluster_by_id = dict(zip(meta_ids, meta_first['cluster'].tolist()))

# id -> candidate list, again taking the first row when an id occurs more than once
similar_by_id = {}
for row_id, row_similar in zip(df['outfit_id'].tolist(), df['similar_outfits'].tolist()):
    similar_by_id.setdefault(row_id, row_similar)

data = []
outfit_id_list = df['outfit_id'].tolist()
cnt = 0  # number of outfits that reached MAX_SIMILAR matches
for outfit_id in outfit_id_list:
    main_gender = gender_by_id[outfit_id]
    main_cluster = cluster_by_id[outfit_id]
    similar_list = []
    # Candidates are visited in their stored order; keep those that share the
    # outfit's gender and cluster (and are not the outfit itself).
    for similar_id in similar_by_id[outfit_id]:
        if (
            similar_id != outfit_id
            and gender_by_id[similar_id] == main_gender
            and cluster_by_id[similar_id] == main_cluster
        ):
            similar_list.append(similar_id)
            if len(similar_list) == MAX_SIMILAR:
                cnt += 1
                break

    data.append([outfit_id, similar_list])

# One summary line instead of printing the running counter for every outfit
print(f"{cnt} of {len(outfit_id_list)} outfits reached {MAX_SIMILAR} similar outfits.")

# Save the data to a CSV file; each list is written in its Python text form,
# e.g. "[101, 102, 103]"
csv_filename = 'similar-gpt-filter.csv'
with open(csv_filename, 'w', newline='') as csvfile:
    csv_writer = csv.writer(csvfile)
    csv_writer.writerow(['outfit_id', 'similar_outfits'])
    csv_writer.writerows(data)

print(f"Data saved to {csv_filename}.")


## 중복 확인

In [None]:
# Load the filtered neighbour table.
# NOTE: the name `filter` shadows Python's built-in filter() from here on; it is
# kept because the following cells refer to this DataFrame by that name.
filtered_csv_path = './similar-gpt-filter.csv'
filter = pd.read_csv(filtered_csv_path)

In [None]:
# Display the filtered table as loaded from the CSV.
filter

In [None]:
import pandas as pd

# 'filter' is the DataFrame whose 'similar_outfits' column holds the text form
# of a list, e.g. "[101, 102, 103]"; turn each cell back into a list of ints.
def _parse_bracketed_ids(cell):
    """Parse a "[1, 2, 3]"-style string into a list of ints.

    A cell that is already a list is returned unchanged, so re-running this
    cell after a first pass does not fail on list.split.
    """
    if isinstance(cell, list):
        return cell
    # Drop the surrounding brackets, then split; empty tokens (from "[]") are skipped
    return [int(outfit_id) for outfit_id in cell[1:-1].split(',') if outfit_id.strip()]

filter['similar_outfits'] = filter['similar_outfits'].apply(_parse_bracketed_ids)

# New column 'length_of_similar_outfits': number of ids in each row's list
filter['length_of_similar_outfits'] = filter['similar_outfits'].apply(len)


In [None]:
# Summary statistics of the table; the 'length_of_similar_outfits' column shows
# how many similar outfits each row kept.
filter.describe()

In [None]:
# Check whether a row's own outfit_id appears in its similar_outfits list
def is_outfit_id_in_similar(row):
    """Return True if ``row['outfit_id']`` is contained in ``row['similar_outfits']``."""
    own_id = row['outfit_id']
    neighbours = row['similar_outfits']
    return own_id in neighbours

# Create the new column 'is_in_similar' holding the per-row check result,
# then print how many rows contain their own outfit_id.
filter['is_in_similar'] = filter.apply(is_outfit_id_in_similar, axis=1)
self_reference_count = filter['is_in_similar'].sum()
print(self_reference_count)