<a href="https://colab.research.google.com/github/Haeunoo/Data-Analysis-with-Open-Source/blob/main/%EC%98%A4%ED%94%88%EC%86%8C%EC%8A%A4_%EB%8D%B0%EC%9D%B4%ED%84%B0_%EB%B6%84%EC%84%9D_14%EA%B0%95_%EC%8B%A4%EC%8A%B5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 14강 비정형 데이터 분석 : 패션 사진 데이터 활용

### 목표

- 비정형 데이터를 인공지능 모델로 분석하여 실무에서 활용 가능한 보고서 형태로 가공

- 패션 트렌드라는 구체적인 주제를 통해, 비정형 데이터 분석의 실질적인 활용 방안을 경험하고자 함


### 분석 프로세스 개요

1. 데이터 수집
  - requests를 이용한 RSS 데이터 수집
  - lxml을 이용한 XML 파싱
  - 이미지 데이터 추출
2. VLM을 이용한 이미지 분석
  - 프롬프트를 이용한 이미지 필터링
  - 프롬프트를 이용한 스타일 분석
3. LLM을 이용한 키워드 분석 및 보고서 작성
  - 텍스트 전처리
  - 색상 및 스타일 키워드 추출
  - 워드 클라우드 분석
  - 보고서 작성

# 주의: 런타임 유형을 GPU로 설정해야 합니다

In [1]:
# Install bitsandbytes (needed to run the 4-bit VLM) and vLLM (used for the LLM step),
# with all three packages pinned to exact versions.
# This takes a long time (>5 minutes), so run it ahead of time.
!pip install bitsandbytes==0.45.3 vllm==0.7.3 transformers==4.48.2
# Restart the session afterwards if needed.

Collecting bitsandbytes==0.45.3
  Downloading bitsandbytes-0.45.3-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting vllm==0.7.3
  Downloading vllm-0.7.3-cp38-abi3-manylinux1_x86_64.whl.metadata (25 kB)
Collecting transformers==4.48.2
  Downloading transformers-4.48.2-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
Collecting numpy>=1.17 (from bitsandbytes==0.45.3)
  Downloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
Collecting blake3 (from vllm==0.7.3)
  Downloading blake3-1.0.8-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.6 kB)
Collecting prometheus-fastapi-instrumentator>=7.0.0 (from vllm==0.7.3)
  Downloading prometheus_fastapi_instrumentator-7.1.0-py3-none-any.whl.metadat

In [1]:
# 한글 처리를 위한 matplotlib 설정 (1)

!sudo apt-get install -y fonts-nanum
!sudo fc-cache –fv
!rm ~/.cache/matplotlib -rf

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following NEW packages will be installed:
  fonts-nanum
0 upgraded, 1 newly installed, 0 to remove and 38 not upgraded.
Need to get 10.3 MB of archives.
After this operation, 34.1 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 fonts-nanum all 20200506-1 [10.3 MB]
Fetched 10.3 MB in 2s (4,880 kB/s)
debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 78, <> line 1.)
debconf: falling back to frontend: Readline
debconf: unable to initialize frontend: Readline
debconf: (This frontend requires a controlling tty.)
debconf: falling back to frontend: Teletype
dpkg-preconfigure: unable to re-open stdin: 
Selecting previously unselected package fonts-nanum.
(Reading database ... 126675 files and dire

- 런타임 -> 세션 다시 시작

In [1]:
# Matplotlib setup for Korean text (2): make NanumBarunGothic the default
# font family so Hangul labels render instead of empty boxes.

import matplotlib.pyplot as plt
plt.rcParams['font.family'] = 'NanumBarunGothic'

# 1. 데이터 수집 및 전처리

## 14-1 RSS 피드에서 이미지 URL 추출

In [2]:
import requests
from lxml import etree
from lxml.html import fromstring
import pandas as pd

def extract_unique_images(rss_url):
    """Collect the distinct image URLs referenced by an RSS feed.

    For every ``<item>`` in the feed, the text of its ``<description>`` is
    parsed as HTML and the ``src`` of the first ``<img>`` tag is taken.

    Args:
        rss_url: URL of the RSS feed to download.

    Returns:
        A list of unique image URLs, in the order they first appear in the
        feed. If the feed cannot be downloaded or parsed, the error is
        printed and an empty list is returned.
    """
    try:
        # A timeout keeps a stalled server from hanging the notebook, and
        # raise_for_status() stops an HTTP error page from being parsed as a feed.
        response = requests.get(rss_url, timeout=30)
        response.raise_for_status()
        root = etree.fromstring(response.content)

        # Dict keys de-duplicate while preserving feed order, so the resulting
        # rows are reproducible between runs (a set has no defined order).
        image_urls = {}

        for item in root.xpath('//item'):
            description = item.find('description')
            if description is None or not description.text:
                continue
            try:
                html_tree = fromstring(description.text)
            except etree.ParserError:
                # e.g. a whitespace-only description: skip this item rather
                # than discarding every URL collected so far.
                continue
            # string(...) evaluates to the first matching @src, or '' when
            # the description contains no <img>.
            img_url = html_tree.xpath('string(//img/@src)')
            if img_url:
                image_urls[img_url] = None

        return list(image_urls)

    except Exception as e:
        # Best-effort by design: report the failure and let the notebook
        # continue with an empty result.
        print(f"Error occurred: {e}")
        return []

rss_url = "https://glltn.com/feed/"
# Collect the distinct image URLs found in the feed.
unique_images = extract_unique_images(rss_url)

# Build a DataFrame with a single 'image' column from the URL list.
df = pd.DataFrame(unique_images, columns=["image"])

In [None]:
# Show the DataFrame of collected image URLs as the cell output.
df

## 14-2 수집 데이터 확인

In [None]:
from IPython.display import display, HTML

def path_to_image_html(path):
    """Render an image URL as an HTML ``<img>`` tag displayed 300px wide.

    The URL is HTML-escaped before being placed in the ``src`` attribute, so
    a value containing quotes or angle brackets cannot break out of the
    attribute, and ``&`` in query strings is emitted as ``&amp;``.

    Args:
        path: Image URL or path; converted to ``str`` before escaping.

    Returns:
        The ``<img>`` tag as a string.
    """
    from html import escape

    return f'<img src="{escape(str(path), quote=True)}" width="300" />'

# Render the DataFrame as an HTML table, formatting each 'image' cell with
# path_to_image_html. escape=False keeps the generated <img> tags as markup.
# No Styler is configured here: df.to_html() does not consult df.style, and
# the 300px width comes from the width attribute path_to_image_html emits.
display(HTML(df.to_html(escape=False, formatters={'image': path_to_image_html})))

# 2. VLM을 이용한 이미지 분석

## 14-3 VLM 모델 로드

In [4]:
import torch
from PIL import Image
from transformers import AutoModel, AutoTokenizer

# Hugging Face repository that provides both the model weights and the tokenizer.
_VLM_REPO = 'openbmb/MiniCPM-V-2_6-int4'

# trust_remote_code=True allows the custom modelling/tokenizer code shipped
# in the hub repository to be executed.
model = AutoModel.from_pretrained(_VLM_REPO, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(_VLM_REPO, trust_remote_code=True)
# Switch to evaluation mode (disables training-only behaviour such as dropout).
model.eval()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

configuration_minicpm.py: 0.00B [00:00, ?B/s]

modeling_navit_siglip.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/openbmb/MiniCPM-V-2_6-int4:
- modeling_navit_siglip.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/openbmb/MiniCPM-V-2_6-int4:
- configuration_minicpm.py
- modeling_navit_siglip.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_minicpmv.py: 0.00B [00:00, ?B/s]

resampler.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/openbmb/MiniCPM-V-2_6-int4:
- resampler.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/openbmb/MiniCPM-V-2_6-int4:
- modeling_minicpmv.py
- resampler.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
`low_cpu_mem_usage` was None, now default to True since model is quantized.


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.45G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/121 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenization_minicpmv_fast.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/openbmb/MiniCPM-V-2_6-int4:
- tokenization_minicpmv_fast.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

MiniCPMV(
  (llm): Qwen2ForCausalLM(
    (model): Qwen2Model(
      (embed_tokens): Embedding(151666, 3584)
      (layers): ModuleList(
        (0-27): 28 x Qwen2DecoderLayer(
          (self_attn): Qwen2Attention(
            (q_proj): Linear4bit(in_features=3584, out_features=3584, bias=True)
            (k_proj): Linear4bit(in_features=3584, out_features=512, bias=True)
            (v_proj): Linear4bit(in_features=3584, out_features=512, bias=True)
            (o_proj): Linear4bit(in_features=3584, out_features=3584, bias=False)
          )
          (mlp): Qwen2MLP(
            (gate_proj): Linear4bit(in_features=3584, out_features=18944, bias=False)
            (up_proj): Linear4bit(in_features=3584, out_features=18944, bias=False)
            (down_proj): Linear4bit(in_features=18944, out_features=3584, bias=False)
            (act_fn): SiLU()
          )
          (input_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
          (post_attention_layernorm): Qwen2RMSNorm((3584,), eps=

![](https://farm3.staticflickr.com/2677/4434956914_6e95a22940_z.jpg)

## 14-4 이미지 질문 응답 예시

In [5]:
from transformers import set_seed

# Fix the random seed at 42 for reproducibility.
set_seed(42)
# Example image to ask about.
image_url = 'https://farm3.staticflickr.com/2677/4434956914_6e95a22940_z.jpg'
# Download the image, open it with PIL and convert it to RGB.
# NOTE(review): this reads the raw response stream without checking the HTTP
# status or setting a timeout — confirm that is acceptable for the demo.
image = Image.open(requests.get(image_url, stream=True).raw).convert('RGB')
# Question to ask about the image.
question = 'how many cats in the photo?'
# A single user turn whose content is the image followed by the question.
msgs = [{'role': 'user', 'content': [image, question]}]
# The image travels inside msgs, so the separate image argument is None.
result = model.chat(image=None, msgs=msgs, tokenizer=tokenizer)
# Print the model's answer.
print(result)

preprocessor_config.json:   0%|          | 0.00/714 [00:00<?, ?B/s]

processing_minicpmv.py: 0.00B [00:00, ?B/s]

image_processing_minicpmv.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/openbmb/MiniCPM-V-2_6-int4:
- image_processing_minicpmv.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/openbmb/MiniCPM-V-2_6-int4:
- processing_minicpmv.py
- image_processing_minicpmv.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


1


In [6]:
set_seed(42)
# Refine the question: explicitly ask to count the cat on the book cover as well.
question = 'how many cats in the photo? including the books cover.'
# Reuse the previously loaded image with the updated question.
msgs = [{'role': 'user', 'content': [image, question]}]
# Ask the model again with the updated question.
result = model.chat(image=None, msgs=msgs, tokenizer=tokenizer)
# Print the model's answer.
print(result)

1


In [7]:
set_seed(42)
# Ask for a free-form description of the image instead of a count.
question = 'describe the photo'
# Reuse the previously loaded image with the description request.
msgs = [{'role': 'user', 'content': [image, question]}]
# Generate the description.
result = model.chat(image=None, msgs=msgs, tokenizer=tokenizer)
# Print the model's description of the image.
print(result)

The photo shows a book with the title "why dogs are better than cats" and an image of a cat sitting on top of a dog's head. The book is placed on a flat surface, and next to it stands a real cat that appears to be looking at the book cover with some curiosity or disapproval.


## 14-5 의류 이미지 여부 판단

In [10]:
def is_picture_of_clothing(image_url):
    """Ask the VLM whether the image at ``image_url`` is a picture of clothing.

    The model's raw answer is printed so the decision can be inspected.

    Args:
        image_url: URL of the image to classify.

    Returns:
        True if the first standalone "yes"/"no" word in the model's answer is
        "yes"; False otherwise, including when the answer contains neither.

    Raises:
        requests.RequestException: If the download fails, times out, or
            returns an HTTP error status.
    """
    import re
    from io import BytesIO

    question = 'Is this a picture of clothing? MUST say yes or no.'

    # Download the complete, content-decoded body and check the status first,
    # so an error page or a stalled server fails with a clear HTTP error
    # instead of an image-decoding error or a hang.
    response = requests.get(image_url, timeout=30)
    response.raise_for_status()
    image = Image.open(BytesIO(response.content)).convert('RGB')

    msgs = [{'role': 'user', 'content': [image, question]}]
    # Low temperature keeps the yes/no answer close to deterministic.
    result = model.chat(image=None, msgs=msgs, tokenizer=tokenizer, temperature=0.1)
    print(result)

    # Decide on the first whole-word yes/no. A plain substring test would also
    # match words such as "eyes", or a "yes" appearing later in a "No, ..." answer.
    verdict = re.search(r'\b(yes|no)\b', result.lower())
    return verdict is not None and verdict.group(1) == 'yes'

# Classify every image URL and store the boolean result in 'is_clothing'.
df['is_clothing'] = df['image'].apply(is_picture_of_clothing)

Yes.
Yes, this image is of clothing. It appears to be a fashion shoot showcasing winter outerwear, specifically parkas or coats designed for cold weather. The focus on the garments and the models' poses suggest that the purpose of the photograph is to display these items of clothing in a way that highlights their design and functionality.
Yes, this image is of clothing. It appears to be a fashion photograph showcasing the individual's attire, which includes a black blazer and tie-dye trousers. The focus on the outfit suggests that it may be used for promotional or retail purposes, highlighting the style and design elements of the garments.
No, the image is not of clothing. It appears to be a book cover with an abstract art design on it. The presence of text and what seems to be a signature suggests that this item could indeed be a piece of literature or a journal rather than apparel.
Yes, this image is of clothing. It features a person modeling what appears to be a casual shirt and jea

## 14-6 의류 판단 결과 시각화

In [9]:
# Show the table with each 'image' cell rendered as an <img> tag.
display(HTML(df.to_html(escape=False, formatters={'image': path_to_image_html})))

Unnamed: 0,image,is_clothing
0,,True
1,,False
2,,False
3,,False
4,,False
5,,False
6,,False
7,,False
8,,False
9,,False


## 14-7 의류 이미지 필터링

In [11]:
# Keep only the rows whose 'is_clothing' value is True.
# .copy() makes the result an independent DataFrame, so the columns added to
# it later are not assignments on a slice of the original frame (which would
# trigger pandas' SettingWithCopyWarning). The original index labels are kept.
df = df[df['is_clothing']].copy()

In [None]:
# Show the filtered table with each 'image' cell rendered as an <img> tag.
display(HTML(df.to_html(escape=False, formatters={'image': path_to_image_html})))

## 14-8 의류 스타일 분석

In [13]:
def describe_style(image_url):
    """Ask the VLM to analyse the clothing style shown in an image.

    Args:
        image_url: URL of the image to analyse.

    Returns:
        The model's answer as returned by ``model.chat``.

    Raises:
        requests.RequestException: If the download fails, times out, or
            returns an HTTP error status.
    """
    from io import BytesIO

    # NOTE(review): "Please let me explain" reads like it was meant to be
    # "Please explain"; left unchanged because rewording the prompt changes
    # the model's output — confirm the intended wording.
    question = 'Analyze the style of the clothes. Please let me explain the colors and trend changes.'

    # Download the complete, content-decoded body and check the status first,
    # so an error page or a stalled server fails with a clear HTTP error
    # instead of an image-decoding error or a hang.
    response = requests.get(image_url, timeout=30)
    response.raise_for_status()
    image = Image.open(BytesIO(response.content)).convert('RGB')

    msgs = [{'role': 'user', 'content': [image, question]}]
    # The image travels inside msgs, so the separate image argument is None.
    result = model.chat(image=None, msgs=msgs, tokenizer=tokenizer)
    return result

# Run describe_style on every image URL of the filtered DataFrame
# and store the answers in a new 'style' column.
df['style'] = df['image'].apply(describe_style)

In [14]:
# Show the table, now including the 'style' column, with images rendered inline.
display(HTML(df.to_html(escape=False, formatters={'image': path_to_image_html})))

Unnamed: 0,image,is_clothing,style
0,,True,"The style of the clothes in the image suggests a casual, yet layered look that is both functional and fashionable. The olive green jacket has a utilitarian design with its quilted pattern, which is often associated with outdoor or military-inspired fashion. This color choice leans towards earth tones, which are popular for their versatility and timeless appeal.\n\nUnderneath, the layering with a brown vest adds depth to the outfit, creating visual interest and warmth without bulk. The use of neutral colors like brown and beige in this combination further emphasizes a classic and understated aesthetic. The light blue shirt introduces a subtle contrast, breaking up the dominance of darker hues while maintaining a cohesive palette.\n\nOverall, the clothing style appears to be influenced by contemporary trends that favor comfort and practicality, combined with an appreciation for muted, natural colors that can easily transition between seasons. The relaxed fit of the pants complements the laid-back vibe of the ensemble, suggesting a preference for comfortable yet stylish attire suitable for various casual settings."
1,,True,"The style of the clothes in the image leans towards a minimalist and utilitarian aesthetic, which is characterized by simple lines, functional design, and neutral colors. The olive green jackets are likely designed for practicality and warmth, possibly indicating that they are intended for cold weather use. The choice of muted colors such as olive green and grey suggests a preference for understated elegance over bold fashion statements.\n\nIn terms of trend changes, this look reflects a shift away from overly embellished or trendy clothing to more classic and timeless pieces. There's an emphasis on comfort and functionality, which has been a growing trend in recent years, especially with the influence of streetwear and casual fashion becoming mainstream. This style is often seen in urban settings where practicality meets a sense of personal expression through subtle details like the color coordination and layering.\n\nOverall, the garments appear to be part of a collection that prioritizes durability, comfort, and versatility, making them suitable for everyday wear across various seasons."
2,,True,"The style of the clothes in the image suggests a blend of classic and modern fashion trends. The black blazer is a timeless piece, often associated with formal or semi-formal attire. Its high collar and large buttons add a contemporary twist to the traditional design, indicating a preference for updated, stylish interpretations of classic garments.\n\nThe tie-dye pants introduce an element of casualness and creativity into the outfit. Tie-dye patterns are often linked to bohemian and artistic styles, suggesting that the wearer might have an inclination towards expressive and unconventional fashion choices. This combination of a structured blazer with relaxed, colorful pants could be seen as a nod to streetwear influences within more formal wear.\n\nOverall, the clothing style depicted here appears to be a deliberate mix of formal and informal elements, reflecting a trend where traditional boundaries between different fashion categories are being blurred."
4,,True,"The style of the clothes worn by the individual in the image reflects a casual, yet thoughtfully put-together look. The off-white button-up shirt is a classic piece that has seen consistent popularity due to its versatility and timeless appeal. Such shirts are often associated with a relaxed, approachable aesthetic, suitable for both everyday wear and more semi-formal occasions when paired appropriately.\n\nThe light blue denim shorts add to the laid-back vibe, suggesting comfort and ease while maintaining a certain level of style. Denim as a material is enduringly fashionable, offering durability and a sense of ruggedness. The combination of these two pieces indicates an understanding of color coordination, where neutral tones like off-white and soft blues can be mixed together to create a harmonious look.\n\nIn terms of trend changes, the shirt's slightly oversized fit aligns with contemporary fashion trends that favor looser silhouettes over tight-fitting garments. This shift towards comfort and a more effortless style has been prominent in recent years. Moreover, the choice of materials—appearing to be lightweight fabrics for the shirt and possibly a softer denim for the shorts—points to a preference for breathable, comfortable clothing, which is increasingly valued in modern wardrobes.\n\nOverall, the outfit presented in the image suggests a blend of traditional fashion elements with current trends, resulting in a stylish yet uncomplicated appearance."
5,,True,"The sweater worn by the individual in the image exhibits a style that is often associated with Scandinavian or Nordic fashion, characterized by its use of natural colors and geometric patterns. The color palette includes muted earth tones such as beige, dark brown, and shades of green, which are commonly found in traditional Nordic sweaters known as ""Västgötatunnan"" or ""Sámi sweaters."" These garments typically feature intricate designs and motifs that have cultural significance, often representing elements of nature like trees, snowflakes, and reindeer.\n\nIn terms of trend changes, this style has seen a resurgence in popularity due to its cozy and rustic appeal, especially during colder seasons. The use of chunky knit patterns not only provides warmth but also adds texture to the garment, making it both functional and fashionable. This type of clothing can be considered timeless and versatile, suitable for various casual settings. It reflects a preference for comfort and simplicity over more elaborate or trendy designs, aligning with contemporary trends that favor sustainability and minimalism in fashion choices."
7,,True,"The style of the clothes in the image leans towards a minimalist and possibly utilitarian aesthetic. The dark color palette, consisting of black and shades of brown, is often associated with understated elegance and versatility. This choice of colors can be seen as timeless and can easily transition between seasons.\n\nThe rolled-up sleeves on the jacket suggest a casual yet intentional approach to fashion, indicating that the wearer may favor comfort without compromising on style. The lack of visible logos or patterns reinforces the minimalist trend, which has been popular for its clean lines and simplicity.\n\nThe belt cinching at the waist adds a functional element to the outfit, potentially serving both a practical purpose and an aesthetic one by breaking up the silhouette of the garment. This detail could also imply a nod to military-inspired fashion, where belts are commonly used not only for utility but also as a design feature.\n\nOverall, the clothing appears to blend functionality with a modern, sleek look, which aligns with contemporary fashion trends that value simplicity, quality materials, and subtle sophistication."
8,,True,"The clothing style in the image leans towards a casual, possibly bohemian or vintage-inspired aesthetic. The beige sweater with a textured finish suggests comfort and is reminiscent of styles popular in past decades such as the 1970s or 1980s. This type of sweater could be seen as part of a 'heritage' trend where classic garments are reimagined for contemporary wear.\n\nThe layered look with a striped undershirt adds depth to the outfit and plays into the current trend of layering for both style and functionality. Layering allows for versatility in dressing according to weather conditions and personal preference.\n\nThe navy blue shorts provide a stark contrast to the lighter tones of the sweater and hat, creating a balanced color palette that is both striking and harmonious. Shorts can indicate a transitional season like spring or early fall, when temperatures might be mild enough for such attire but not warm enough for lighter fabrics like shorts.\n\nOverall, the combination of these pieces reflects a laid-back yet thoughtfully put-together look that blends modern casual fashion with hints of retro influences."
9,,True,"The style of the clothes in the image leans towards a minimalist and possibly vintage-inspired aesthetic. The dark green jacket has a classic button-up design with a pointed collar, which is timeless and often associated with both casual and semi-formal wear. The color choice of deep green can be seen as versatile, offering a balance between boldness and subtlety.\n\nThe maroon turtleneck adds a layer of sophistication and could suggest an influence from fashion trends that favor rich, solid colors for depth and contrast. Turtlenecks have been popular at various times due to their ability to provide warmth while also being stylish, especially when paired with outerwear like jackets or coats.\n\nThe patterned pants introduce an element of texture and interest without overwhelming the overall look. Patterns such as dots or checks are often used in fashion to add visual appeal and break up solid blocks of color, suggesting a trend-conscious approach to styling.\n\nOverall, the outfit combines elements that could be seen as contemporary yet timeless, indicating a blend of modern fashion sensibilities with classic styles. This mix suggests a deliberate fashion choice aimed at creating a cohesive and fashionable appearance that respects traditional aesthetics while incorporating current trends."
10,,True,"The style of the shoes depicted in the image is indicative of outdoor or hiking footwear, which often prioritizes durability and comfort for rugged activities. The color palette consists of earth tones—greens and browns—which are commonly used in outdoor gear due to their natural appearance in forested environments and their ability to blend with nature.\n\nIn terms of clothing trends, earth tones have seen a resurgence in popularity, especially within the context of sustainable fashion where using natural colors without dyeing reduces environmental impact. This trend can be linked to broader movements towards minimalism and eco-consciousness in fashion. Additionally, the design elements such as reinforced stitching and sturdy soles suggest functionality over fashion, aligning with current consumer preferences for practicality and longevity in apparel choices.\n\nOverall, these shoes reflect a trend that values both aesthetic appeal inspired by nature and functional design suited for active use."
11,,True,"The clothing style depicted in the image is minimalist and monochromatic, which is often associated with contemporary fashion trends. The black color of the shirt suggests a preference for neutral tones that are versatile and can be easily paired with other colors or accessories. The Henley-style neckline adds a casual yet sophisticated touch to the outfit, while the long sleeves and relaxed fit indicate comfort as a priority.\n\nMonochromatic outfits like this one have seen a resurgence in popularity due to their ability to create a sleek and streamlined look without the distraction of multiple colors or patterns. This trend reflects a broader shift towards simplicity and subtlety in modern fashion, where less is more, and the focus is on quality materials and understated elegance rather than bold statements.\n\nIn terms of color trends, black has remained a staple in wardrobes for its timeless appeal and versatility. It's a color that never goes out of style because it works well with any skin tone and pairs effortlessly with almost any other hue. The choice of black here aligns with current fashion sensibilities that favor classic pieces with a modern twist.\n\nOverall, the attire in the image is indicative of a contemporary, urban aesthetic that values both style and comfort."


# 3. LLM을 이용한 키워드 분석 및 보고서 작성

## 14-9 언어 모델(LLM) 로드

In [None]:
from vllm import LLM, SamplingParams

# Load the 'LGAI-EXAONE/EXAONE-3.5-2.4B-Instruct' model with vLLM.
# gpu_memory_utilization=0.5 caps vLLM at half of the GPU memory
# (presumably leaving room for the VLM loaded earlier — confirm).
# max_model_len=10000 limits the context length to 10000 tokens.
llm = LLM(
    model='LGAI-EXAONE/EXAONE-3.5-2.4B-Instruct',
    gpu_memory_utilization=0.5,
    max_model_len=10000,
)

## 14-10 색상 정보 추출

In [None]:
from vllm import SamplingParams ## SamplingParams 임포트가 필요

def extract_color(style):
    """Ask the LLM to list, in Korean, the colours mentioned in a style description.

    The prompt instructs the model to output nothing but the colours. The
    generated text is printed and then returned unchanged.

    Args:
        style: Style description text (written by the VLM).

    Returns:
        The text generated by the LLM.
    """
    system_turn = {
        "role": "system",
        "content": "You are EXAONE model from LG AI Research, a helpful assistant.",
    }
    # The instruction is in Korean so that the extracted colours are translated as well.
    user_turn = {
        "role": "user",
        "content": f"다음의 글에서 색상을 한글로 추출해주세요. 색상 외에 다른 정보는 적지 말아주세요. \n{style}",
    }
    decoding = SamplingParams(temperature=0.2, top_p=0.95, max_tokens=1024)

    completions = llm.chat([system_turn, user_turn], decoding)
    answer = completions[0].outputs[0].text
    print(answer)
    return answer

# Run extract_color on every style description
# and store the extracted colours in a new 'color' column.
df['color'] = df['style'].apply(extract_color)

## 14-11 스타일 키워드 추출

In [None]:
from vllm import SamplingParams ## SamplingParams 임포트가 필요

def extract_style(style):
    """Ask the LLM to list, in Korean, the style keywords in a style description.

    The prompt instructs the model to output nothing but the style keywords.
    The generated text is printed and then returned unchanged.

    Args:
        style: Style description text (written by the VLM).

    Returns:
        The text generated by the LLM.
    """
    system_turn = {
        "role": "system",
        "content": "You are EXAONE model from LG AI Research, a helpful assistant.",
    }
    # The instruction is in Korean so that the extracted keywords are translated as well.
    user_turn = {
        "role": "user",
        "content": f"다음의 글에서 스타일 키워드를 한글로 추출해주세요. 스타일 키워드 외에 다른 정보는 적지 말아주세요.\n{style}",
    }
    decoding = SamplingParams(temperature=0.2, top_p=0.95, max_tokens=1024)

    completions = llm.chat([system_turn, user_turn], decoding)
    answer = completions[0].outputs[0].text
    print(answer)
    return answer

# Run extract_style on every style description
# and store the extracted style keywords in a new 'keyword' column.
df['keyword'] = df['style'].apply(extract_style)

In [None]:
# Show the table, now including 'color' and 'keyword', with images rendered inline.
display(HTML(df.to_html(escape=False, formatters={'image': path_to_image_html})))

## 14-12 텍스트 데이터 정제

In [None]:
import re

def clean_text(text):
    """Normalise a piece of text for word counting.

    HTML tags are removed first; then every character other than ASCII
    letters, digits, Hangul syllables and whitespace is dropped; finally the
    text is lower-cased.

    Args:
        text: The value to clean.

    Returns:
        The cleaned string, or ``""`` when ``text`` is not a ``str``.
    """
    if not isinstance(text, str):
        return ""
    # Tags must be stripped before the character filter: the filter deletes
    # '<' and '>', after which a tag can no longer be recognised and its name
    # would leak into the text. Requiring a letter after '<' (or '</') keeps
    # stray comparison signs such as "1 < 2" from being treated as a tag.
    text = re.sub(r'</?[A-Za-z][^>]*>', '', text)
    # Keep only ASCII letters, digits, Hangul syllables and whitespace.
    text = re.sub(r'[^a-zA-Z0-9가-힣\s]', '', text)
    return text.lower()

# Clean the text in the 'color' column.
df['color'] = df['color'].apply(clean_text)
# Clean the text in the 'keyword' column.
df['keyword'] = df['keyword'].apply(clean_text)

## 14-13 워드 클라우드 생성 및 시각화

In [None]:
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt

def get_word_count(df):
    """Count whitespace-separated words across the 'color' and 'keyword' columns.

    The word '색상' is left out of the counts.

    Args:
        df: DataFrame with string columns 'color' and 'keyword'.

    Returns:
        A ``Counter`` mapping each word to its frequency; an empty ``Counter``
        when the DataFrame is empty.
    """
    if df.empty:
        return Counter()

    # Gather the words of 'color' first, then those of 'keyword'.
    tokens = []
    for column in ('color', 'keyword'):
        for cell in df[column]:
            tokens.extend(str.split(cell))

    excluded = ('색상',)
    return Counter(token for token in tokens if token not in excluded)

def create_wordcloud(word_count):
    """Draw a word cloud from a word-frequency mapping.

    Prints a message and returns without drawing when ``word_count`` is empty.

    Args:
        word_count: Mapping of word to frequency (e.g. a ``Counter``).

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    if not word_count:
        print("No words to generate word cloud.")
        return

    cloud_options = {
        'width': 800,
        'height': 400,
        'background_color': 'white',
        'colormap': 'viridis',
        # Korean-capable font so that Hangul words can be rendered.
        'font_path': '/usr/share/fonts/truetype/nanum/NanumBarunGothic.ttf',
    }
    rendered = WordCloud(**cloud_options).generate_from_frequencies(word_count)

    plt.figure(figsize=(10, 5))
    plt.imshow(rendered, interpolation='bilinear')
    plt.axis("off")  # hide the axes
    plt.show()

# Count the word frequencies in the DataFrame.
word_count = get_word_count(df)
# Draw the word cloud from those frequencies.
create_wordcloud(word_count)

## 14-14 트렌드 분석 보고서 생성 프롬프트 구성 및 실행

## 14-15 분석 보고서 시각화

In [None]:
from vllm import SamplingParams ## SamplingParams 임포트가 필요

# Conversation sent to the LLM: the system message, then one user message per
# DataFrame row carrying that row's style note and image URL, and finally the
# request for an overall trend report (fixed title, professional and clear
# tone, markdown output).
prompt = [
    {
        "role": "system",
        "content": "You are EXAONE model from LG AI Research, a helpful assistant."
    },
    *(
        {"role": "user", "content": f"스타일 노트: {row.style}\n이미지 url:{row.image}"}
        for row in df.itertuples()
    ),
    {
        "role": "user",
        "content": "주어진 스타일 노트를 토대로 종합적인 트랜드 방향의 분석 보고서를 작성해주세요. 보고서의 제목은 해외 룩북 스타일 분석입니다. 내용은 전문적이면서 명확하게 작성해주세요. 문서 형식은 markdown으로 만들어주세요."
    },
]

# Sampling settings: temperature, top_p and the maximum number of generated tokens.
sampling_params = SamplingParams(temperature=0.2, top_p=0.95, max_tokens=4096)
# Generate the report and keep the text of the first completion.
result = llm.chat(prompt, sampling_params)[0].outputs[0].text

In [None]:
from IPython.display import display, Markdown

# Render the LLM-generated report (markdown text) in the notebook.
display(Markdown(result))