In [None]:
!pip install pandas

In [None]:
import pandas as pd

In [None]:
outfit_df = pd.read_csv('../../data/outfit.csv')

In [None]:
# Build two parallel lists from outfit_df: one with the outfit ids and one with
# the reporter texts (the same position in both lists refers to the same row).
outfit_id_list = outfit_df['outfit_id'].tolist()
outfit_reporter_list = outfit_df['reporter'].tolist()
print(outfit_id_list)
print(outfit_reporter_list)


In [None]:
!pip install --upgrade jpype1

In [None]:
import jpype
print(jpype.isJVMStarted()) # True if a JVM is already running in this process, False otherwise

In [None]:
!apt-get update

!apt-get install g++ openjdk-8-jdk python-dev python3-dev -y

!pip3 install JPype1-py3

!pip3 install konlpy

!JAVA_HOME="/usr/lib/jvm/java-8-openjdk-amd64"

In [None]:
!pip install scikit-learn

In [None]:
!pip install tqdm

In [None]:
from konlpy.tag import Kkma
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

# Initialize the Kkma morphological analyzer
kkma = Kkma()

# Collects every tokenized string produced by tokenize(), in call order
kkma_list=[]

# Split a sentence into morphemes
def tokenize(text):
    """Return `text` as one string of Kkma morphemes separated by single spaces.

    Side effect: the resulting string is also appended to the module-level
    `kkma_list`.

    Note that the return value is a single string, not a list of tokens; a
    caller that needs individual morphemes has to split it on whitespace.
    """
    morphemes = kkma.morphs(text)
    joined = ' '.join(morphemes)
    kkma_list.append(joined)
    return joined

# Vectorize the reporter texts with TF-IDF over Kkma morphemes.
# TfidfVectorizer expects `tokenizer` to return a sequence of tokens, but
# tokenize() returns one space-joined string. A bare string is iterated
# character by character, so every single character (spaces included) would
# become a feature. Splitting on whitespace yields one feature per morpheme.
vectorizer = TfidfVectorizer(tokenizer=lambda text: tokenize(text).split())
tfidf_matrix = vectorizer.fit_transform(outfit_reporter_list)

# Pairwise cosine similarity between all sentences (square matrix, row i vs. row j)
similarities = cosine_similarity(tfidf_matrix)

# Walk over every unordered pair (i < j) while showing progress with tqdm
num_sentences = len(outfit_reporter_list)
with tqdm(total=num_sentences*(num_sentences-1)//2) as pbar:
    for i in range(num_sentences):
        for j in range(i + 1, num_sentences):
            similarity = similarities[i][j]
            # print(f"문장 {i+1}과 문장 {j+1}의 유사도: {similarity}")
            pbar.update(1)
            



## 결과

In [None]:
# Print every collected tokenized sentence on its own line
for tokenized_sentence in kkma_list:
    print("".join(tokenized_sentence))

In [None]:
import numpy as np

# ... (previous code)

# Outfit whose nearest neighbours should be listed; replace with any id
# present in outfit_id_list (list.index raises ValueError otherwise).
outfit_number = 87450

# Row of the similarity matrix that belongs to this outfit
outfit_index = outfit_id_list.index(outfit_number)
outfit_similarity_scores = similarities[outfit_index]

# Rank all rows by descending similarity, then drop the outfit's own row by
# index. Slicing off position 0 is not reliable: when another outfit ties
# with the maximum score, the outfit itself is not guaranteed to sort first.
ranked_indices = np.argsort(outfit_similarity_scores)[::-1]
similar_outfit_indices = ranked_indices[ranked_indices != outfit_index][:10]

# (outfit id, reporter text, similarity score) for the 10 most similar outfits
similar_outfits_info = [
    (outfit_id_list[idx], outfit_reporter_list[idx], outfit_similarity_scores[idx])
    for idx in similar_outfit_indices
]

print(f"메인 코디 번호: {outfit_number}")
print("Reporter:",outfit_reporter_list[outfit_index])
# f-string so that the outfit number is actually interpolated
print(f"\nOutfit {outfit_number} is most similar to the following outfits:")

# Print similar outfits, their texts, and similarity scores
for similar_outfit_number, similar_outfit_text, similarity_score in similar_outfits_info:
    print(f"유사한 코디 번호: {similar_outfit_number}")
    print("Reporter:",similar_outfit_text)
    print(f"Similarity Score: {similarity_score}\n")

In [None]:
import numpy as np
import csv

# ... (previous code)

# Number of nearest neighbours stored per outfit
num_neighbors = 300

# One [outfit_id, "id1,id2,..."] row per outfit
data = []

for sentence_idx in range(len(outfit_reporter_list)):
    # Similarity of this outfit to every outfit, ranked in descending order
    similarities_for_sentence = similarities[sentence_idx]
    sorted_indices = np.argsort(similarities_for_sentence)[::-1]

    # Remove the outfit's own row by index rather than by assuming it sorts
    # first (ties with the maximum score can put another row at position 0),
    # then keep the best `num_neighbors` remaining rows.
    neighbor_indices = sorted_indices[sorted_indices != sentence_idx][:num_neighbors]

    outfit_id = outfit_id_list[sentence_idx]
    similar_outfits = [int(outfit_id_list[idx]) for idx in neighbor_indices]

    # Neighbours are stored as one comma-separated string per outfit
    data.append([outfit_id, ",".join(str(item) for item in similar_outfits)])

# Save the data to a CSV file
csv_filename = 'similarwithkkma.csv'
with open(csv_filename, 'w', newline='') as csvfile:
    csv_writer = csv.writer(csvfile)
    csv_writer.writerow(['outfit_id', 'similar_outfits'])
    csv_writer.writerows(data)

print(f"Data saved to {csv_filename}.")


In [None]:
# import webbrowser

# outfit_ids = [87349,85964,84352,87326,85777,82985,78318,91079,87196,89012,86647]


# base_url = 'https://stylesjourney.com/detail/'

# for outfit_id in outfit_ids:
#     url = base_url + str(outfit_id)
#     webbrowser.open(url)


In [None]:
df = pd.read_csv('./similarwithkkma.csv')

In [None]:
df

In [None]:
# Derive a season code from the text of the 'tags' column
def map_tags_to_season(tags_text):
    """Map tag text to a season code.

    The keywords are checked in the fixed order '봄' (spring), '여름' (summer),
    '가을' (autumn), '겨울' (winter); the first keyword found decides the result.

    Args:
        tags_text: Tag text (or any container supporting `in`) to search.
            A missing value (None or a float such as NaN) is treated as
            containing no season keyword instead of raising TypeError.

    Returns:
        1, 2, 3 or 4 for spring, summer, autumn and winter respectively,
        or None when no keyword is present.
    """
    # Missing cells arrive as None or float NaN, neither of which supports `in`
    if tags_text is None or isinstance(tags_text, float):
        return None

    season_keywords = (('봄', 1), ('여름', 2), ('가을', 3), ('겨울', 4))
    for keyword, season_code in season_keywords:
        if keyword in tags_text:
            return season_code
    return None

# Create the 'season' column by applying map_tags_to_season to the 'tags' column
outfit_df['season'] = outfit_df['tags'].apply(map_tags_to_season)

In [None]:
outfit_df.isna().sum()

In [None]:
# outfit_ids of the rows whose 'season' value is missing
outfit_ids_with_nan_season = outfit_df.loc[outfit_df['season'].isna(), 'outfit_id'].tolist()
outfit_ids_with_nan_season

In [None]:
# Hand-assigned season codes for individual outfit_ids
manual_seasons = {86019: 3, 86018: 3, 91266: 1, 85922: 4}
for patched_id, season_code in manual_seasons.items():
    outfit_df.loc[outfit_df['outfit_id'] == patched_id, 'season'] = season_code

In [None]:
outfit_df.nunique()

In [None]:
import pandas as pd
import csv

# Maximum number of filtered neighbours kept per outfit
max_similar = 5

# (gender, season) per outfit_id, built once. The previous version ran a full
# boolean scan of outfit_df for every candidate. drop_duplicates keeps the
# first row per id, which matches the former `.values[0]` lookup.
outfit_attrs = {
    row.outfit_id: (row.gender, row.season)
    for row in outfit_df.drop_duplicates(subset='outfit_id')[
        ['outfit_id', 'gender', 'season']
    ].itertuples(index=False)
}

# One [outfit_id, [similar ids...]] row per outfit
data = []
outfit_id_list = df['outfit_id'].tolist()
cnt = 0  # number of outfits that reached max_similar matches

for outfit_id, similar_outfits_str in zip(outfit_id_list, df['similar_outfits']):
    main_gender, main_season = outfit_attrs[outfit_id]

    # An empty neighbour cell is read back by pandas as NaN, not as a string
    if isinstance(similar_outfits_str, str):
        similar_outfits_list = [int(item) for item in similar_outfits_str.split(',')]
    else:
        similar_outfits_list = []

    similar_list = []
    for similar_id in similar_outfits_list:
        # Skip the outfit itself and ids that have no row in outfit_df
        if similar_id == outfit_id or similar_id not in outfit_attrs:
            continue
        similar_gender, similar_season = outfit_attrs[similar_id]
        # Compared field by field on purpose: a missing (NaN) season never
        # equals anything, so such outfits get no matches.
        if main_gender == similar_gender and main_season == similar_season:
            similar_list.append(similar_id)
            if len(similar_list) == max_similar:
                cnt += 1
                break

    data.append([outfit_id, similar_list])

# One summary line instead of printing the running counter for every outfit
print(f"{cnt} of {len(data)} outfits reached {max_similar} similar outfits.")

# Save the data to a CSV file. The list is written as its Python repr
# ("[1, 2, 3]"), which is the format the later parsing cell expects.
csv_filename = 'similar_kkma-filter.csv'
with open(csv_filename, 'w', newline='') as csvfile:
    csv_writer = csv.writer(csvfile)
    csv_writer.writerow(['outfit_id', 'similar_outfits'])
    csv_writer.writerows(data)

print(f"Data saved to {csv_filename}.")


In [None]:
outfit_df['reporter']

In [None]:
# NOTE(review): this name shadows Python's built-in filter() for the rest of the session.
filter = pd.read_csv('./similar_kkma-filter.csv')

In [None]:
filter

In [None]:
import pandas as pd

# 'filter' holds a 'similar_outfits' column whose cells are list reprs such as "[1, 2, 3]".

def _parse_id_list(cell):
    """Turn a cell like "[1, 2, 3]" into a list of ints.

    An empty list repr ("[]") yields an empty list instead of failing on
    int(''). A cell that is already a list is returned unchanged, so this
    cell can be re-run safely.
    """
    if isinstance(cell, list):
        return cell
    inner = cell.strip().strip('[]')
    return [int(token) for token in inner.split(',') if token.strip()]

# Convert the list reprs to real lists of integers
filter['similar_outfits'] = filter['similar_outfits'].apply(_parse_id_list)

# Number of similar outfits kept for each row
filter['length_of_similar_outfits'] = filter['similar_outfits'].apply(len)


In [None]:
filter.describe()

In [None]:
filter

In [None]:
# Check whether a row's own outfit_id appears in its similar_outfits list
def is_outfit_id_in_similar(row):
    """Return True when row['outfit_id'] is contained in row['similar_outfits']."""
    own_id = row['outfit_id']
    candidates = row['similar_outfits']
    return own_id in candidates

# Create a new column 'is_in_similar' holding the result of the check for each row
filter['is_in_similar'] = filter.apply(is_outfit_id_in_similar, axis=1)

In [None]:
filter['is_in_similar'].sum()

In [None]:
filter['outfit_id']