From 66e6b6da71d806f5e5975b2599d6b2e93977aadf Mon Sep 17 00:00:00 2001 From: GulSam00 Date: Sun, 8 Mar 2026 15:17:36 +0900 Subject: [PATCH 1/7] =?UTF-8?q?fix=20:=20repo/query=20=EC=A0=9C=EA=B1=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- apps/web/package.json | 1 - packages/query/package.json | 27 -------------------- packages/query/src/index.ts | 3 --- packages/query/src/open-api/index.ts | 7 ------ packages/query/src/open-api/useComposer.ts | 29 ---------------------- packages/query/src/open-api/useLyricist.ts | 29 ---------------------- packages/query/src/open-api/useNo.ts | 29 ---------------------- packages/query/src/open-api/usePopular.ts | 27 -------------------- packages/query/src/open-api/useRelease.ts | 29 ---------------------- packages/query/src/open-api/useSinger.ts | 29 ---------------------- packages/query/src/open-api/useSong.ts | 29 ---------------------- packages/query/src/types.ts | 8 ------ packages/query/tsconfig.json | 19 -------------- pnpm-lock.yaml | 19 -------------- 14 files changed, 285 deletions(-) delete mode 100644 packages/query/package.json delete mode 100644 packages/query/src/index.ts delete mode 100644 packages/query/src/open-api/index.ts delete mode 100644 packages/query/src/open-api/useComposer.ts delete mode 100644 packages/query/src/open-api/useLyricist.ts delete mode 100644 packages/query/src/open-api/useNo.ts delete mode 100644 packages/query/src/open-api/usePopular.ts delete mode 100644 packages/query/src/open-api/useRelease.ts delete mode 100644 packages/query/src/open-api/useSinger.ts delete mode 100644 packages/query/src/open-api/useSong.ts delete mode 100644 packages/query/src/types.ts delete mode 100644 packages/query/tsconfig.json diff --git a/apps/web/package.json b/apps/web/package.json index 40214b9..e34844e 100644 --- a/apps/web/package.json +++ b/apps/web/package.json @@ -31,7 +31,6 @@ "@radix-ui/react-tabs": "^1.1.3", "@radix-ui/react-tooltip": "^1.2.6", "@repo/open-api": "workspace:*", - "@repo/query": "workspace:*", "@supabase/ssr": "^0.6.1", "@supabase/supabase-js": "^2.49.1", "@tanstack/react-query": "^5.68.0", diff --git a/packages/query/package.json b/packages/query/package.json deleted file mode 100644 index 92f5e60..0000000 --- a/packages/query/package.json +++ /dev/null @@ -1,27 +0,0 @@ -{ - "name": "@repo/query", - "version": "1.0.0", - "description": "", - "main": "./src/index.ts", - "exports": { - ".": { - "import": "./dist/index.js", - "types": "./dist/index.d.ts" - } - }, - "type": "module", - "scripts": { - "build": "tsup src/index.ts --format esm,cjs --dts" - }, - "keywords": [], - "author": "", - "license": "ISC", - "dependencies": { - "@repo/open-api": "workspace:*", - "@tanstack/react-query": "^5.68.0" - }, - "devDependencies": { - "tsup": "^8.4.0", - "typescript": "^5.8.2" - } -} diff --git a/packages/query/src/index.ts b/packages/query/src/index.ts deleted file mode 100644 index 9a0d4ed..0000000 --- a/packages/query/src/index.ts +++ /dev/null @@ -1,3 +0,0 @@ -export * from './open-api'; - -export type { OpenAPIResponse } from './types'; diff --git a/packages/query/src/open-api/index.ts b/packages/query/src/open-api/index.ts deleted file mode 100644 index 637af92..0000000 --- a/packages/query/src/open-api/index.ts +++ /dev/null @@ -1,7 +0,0 @@ -export { default as useSong } from './useSong'; -export { default as useSinger } from './useSinger'; -export { default as useComposer } from './useComposer'; -export { default as useLyricist } from './useLyricist'; -export { default as useNo } from './useNo'; -export { default as useRelease } from './useRelease'; -export { default as usePopular } from './usePopular'; diff --git a/packages/query/src/open-api/useComposer.ts b/packages/query/src/open-api/useComposer.ts deleted file mode 100644 index 110439e..0000000 --- a/packages/query/src/open-api/useComposer.ts +++ /dev/null @@ -1,29 +0,0 @@ -import { useQuery } from '@tanstack/react-query'; -import { getComposer, Brand } from '@repo/open-api'; -import { OpenAPIResponse } from '../types'; - -interface GetComposerProps { - composer: string; - brand?: Brand; -} - -const useComposer = (props: GetComposerProps): OpenAPIResponse => { - const { composer, brand } = props; - - // queryKey를 위한 brandKey 생성 (없으면 'all' 사용) - const brandKey = brand || 'all'; - - const { data, isLoading, isError, error } = useQuery({ - queryKey: ['open', 'composer', composer, brandKey], - queryFn: () => getComposer({ composer, brand }), - }); - - return { - data, - isLoading, - isError, - error, - }; -}; - -export default useComposer; diff --git a/packages/query/src/open-api/useLyricist.ts b/packages/query/src/open-api/useLyricist.ts deleted file mode 100644 index c708905..0000000 --- a/packages/query/src/open-api/useLyricist.ts +++ /dev/null @@ -1,29 +0,0 @@ -import { useQuery } from '@tanstack/react-query'; -import { getLyricist, Brand } from '@repo/open-api'; -import { OpenAPIResponse } from '../types'; - -interface GetLyricistProps { - lyricist: string; - brand?: Brand; -} - -const useLyricist = (props: GetLyricistProps): OpenAPIResponse => { - const { lyricist, brand } = props; - - // queryKey를 위한 brandKey 생성 (없으면 'all' 사용) - const brandKey = brand || 'all'; - - const { data, isLoading, isError, error } = useQuery({ - queryKey: ['open', 'lyricist', lyricist, brandKey], - queryFn: () => getLyricist({ lyricist, brand }), - }); - - return { - data, - isLoading, - isError, - error, - }; -}; - -export default useLyricist; diff --git a/packages/query/src/open-api/useNo.ts b/packages/query/src/open-api/useNo.ts deleted file mode 100644 index a39c382..0000000 --- a/packages/query/src/open-api/useNo.ts +++ /dev/null @@ -1,29 +0,0 @@ -import { useQuery } from '@tanstack/react-query'; -import { getNo, Brand } from '@repo/open-api'; -import { OpenAPIResponse } from '../types'; - -interface GetNoProps { - no: string; - brand?: Brand; -} - -const useNo = (props: GetNoProps): OpenAPIResponse => { - const { no, brand } = props; - - // queryKey를 위한 brandKey 생성 (없으면 'all' 사용) - const brandKey = brand || 'all'; - - const { data, isLoading, isError, error } = useQuery({ - queryKey: ['open', 'no', no, brandKey], - queryFn: () => getNo({ no, brand }), - }); - - return { - data, - isLoading, - isError, - error, - }; -}; - -export default useNo; diff --git a/packages/query/src/open-api/usePopular.ts b/packages/query/src/open-api/usePopular.ts deleted file mode 100644 index fe1141c..0000000 --- a/packages/query/src/open-api/usePopular.ts +++ /dev/null @@ -1,27 +0,0 @@ -import { useQuery } from '@tanstack/react-query'; -import { getPopular, Brand, Period } from '@repo/open-api'; -import { OpenAPIResponse } from '../types'; - -interface GetPopularProps { - brand: Brand; - period: Period; -} - -const usePopular = (props: GetPopularProps): OpenAPIResponse => { - const { brand, period } = props; - - const { data, isLoading, isError, error } = useQuery({ - queryKey: ['open', 'popular', brand, period], - queryFn: () => getPopular({ brand, period }), - enabled: Boolean(brand) && Boolean(period), - }); - - return { - data, - isLoading, - isError, - error, - }; -}; - -export default usePopular; diff --git a/packages/query/src/open-api/useRelease.ts b/packages/query/src/open-api/useRelease.ts deleted file mode 100644 index 21d58e2..0000000 --- a/packages/query/src/open-api/useRelease.ts +++ /dev/null @@ -1,29 +0,0 @@ -import { useQuery } from '@tanstack/react-query'; -import { getRelease, Brand } from '@repo/open-api'; -import { OpenAPIResponse } from '../types'; - -interface GetReleaseProps { - release: string; - brand?: Brand; -} - -const useRelease = (props: GetReleaseProps): OpenAPIResponse => { - const { release, brand } = props; - - // queryKey를 위한 brandKey 생성 (없으면 'all' 사용) - const brandKey = brand || 'all'; - - const { data, isLoading, isError, error } = useQuery({ - queryKey: ['open', 'release', release, brandKey], - queryFn: () => getRelease({ release, brand }), - }); - - return { - data, - isLoading, - isError, - error, - }; -}; - -export default useRelease; diff --git a/packages/query/src/open-api/useSinger.ts b/packages/query/src/open-api/useSinger.ts deleted file mode 100644 index d0c00eb..0000000 --- a/packages/query/src/open-api/useSinger.ts +++ /dev/null @@ -1,29 +0,0 @@ -import { useQuery } from '@tanstack/react-query'; -import { getSinger, Brand } from '@repo/open-api'; -import { OpenAPIResponse } from '../types'; - -interface GetSingerProps { - singer: string; - brand?: Brand; -} - -const useSinger = (props: GetSingerProps): OpenAPIResponse => { - const { singer, brand } = props; - - // queryKey를 위한 brandKey 생성 (없으면 'all' 사용) - const brandKey = brand || 'all'; - - const { data, isLoading, isError, error } = useQuery({ - queryKey: ['open', 'singer', singer, brandKey], - queryFn: () => getSinger({ singer, brand }), - }); - - return { - data, - isLoading, - isError, - error, - }; -}; - -export default useSinger; diff --git a/packages/query/src/open-api/useSong.ts b/packages/query/src/open-api/useSong.ts deleted file mode 100644 index 98c8881..0000000 --- a/packages/query/src/open-api/useSong.ts +++ /dev/null @@ -1,29 +0,0 @@ -import { useQuery } from '@tanstack/react-query'; -import { getSong, Brand, ResponseType } from '@repo/open-api'; -import { OpenAPIResponse } from '../types'; - -interface GetSongProps { - title: string; - brand?: Brand; -} - -const useSong = (props: GetSongProps): OpenAPIResponse => { - const { title, brand } = props; - - // queryKey를 위한 brandKey 생성 (없으면 'all' 사용) - const brandKey = brand || 'all'; - - const { data, isLoading, isError, error } = useQuery({ - queryKey: ['open', 'song', title, brandKey], - queryFn: () => getSong({ title, brand }), - }); - - return { - data, - isLoading, - isError, - error, - }; -}; - -export default useSong; diff --git a/packages/query/src/types.ts b/packages/query/src/types.ts deleted file mode 100644 index 9193aae..0000000 --- a/packages/query/src/types.ts +++ /dev/null @@ -1,8 +0,0 @@ -import { ResponseType } from '@repo/open-api'; - -export interface OpenAPIResponse { - data: ResponseType[] | null | undefined; - isLoading: boolean; - isError: boolean; - error: Error | null; -} diff --git a/packages/query/tsconfig.json b/packages/query/tsconfig.json deleted file mode 100644 index 8b9089c..0000000 --- a/packages/query/tsconfig.json +++ /dev/null @@ -1,19 +0,0 @@ -{ - "compilerOptions": { - "target": "ESNext", - "module": "ESNext", - "lib": ["ESNext", "esnext.asynciterable"], - "moduleResolution": "node", - "esModuleInterop": true, - "skipLibCheck": true, - "strict": true, - "forceConsistentCasingInFileNames": true, - "outDir": "dist", - "rootDir": "src", - "declaration": true, - "declarationDir": "./dist/types", // 디버깅 편의를 위해 추가 - "declarationMap": true // 디버깅 편의를 위해 추가 - }, - "include": ["src"], - "exclude": ["node_modules", "dist"] -} diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index e55eaf2..dc36869 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -187,9 +187,6 @@ importers: '@repo/open-api': specifier: workspace:* version: link:../../packages/open-api - '@repo/query': - specifier: workspace:* - version: link:../../packages/query '@supabase/ssr': specifier: ^0.6.1 version: 0.6.1(@supabase/supabase-js@2.89.0) @@ -425,22 +422,6 @@ importers: specifier: ^5.8.2 version: 5.9.3 - packages/query: - dependencies: - '@repo/open-api': - specifier: workspace:* - version: link:../open-api - '@tanstack/react-query': - specifier: ^5.68.0 - version: 5.90.16(react@19.2.3) - devDependencies: - tsup: - specifier: ^8.4.0 - version: 8.5.1(jiti@2.6.1)(postcss@8.5.6)(tsx@4.21.0)(typescript@5.9.3) - typescript: - specifier: ^5.8.2 - version: 5.9.3 - packages/typescript-config: {} packages/ui: From 8bd72f8d8adba31f59c35c9804cbf5700eb58c3d Mon Sep 17 00:00:00 2001 From: GulSam00 Date: Sun, 8 Mar 2026 16:35:41 +0900 Subject: [PATCH 2/7] =?UTF-8?q?chore=20:=20crawlYoutubeTemp,=20crawlYoutub?= =?UTF-8?q?eUbuntu=20=EC=9E=84=EC=8B=9C=20=ED=8C=8C=EC=9D=BC=20=EB=B0=8F?= =?UTF-8?q?=20=EA=B4=80=EB=A0=A8=20=EC=8A=A4=ED=81=AC=EB=A6=BD=ED=8A=B8=20?= =?UTF-8?q?=EC=A0=9C=EA=B1=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Sonnet 4.6 --- packages/crawling/package.json | 1 - .../crawling/src/crawling/crawlYoutubeTemp.ts | 109 --------------- .../src/crawling/crawlYoutubeUbuntu.ts | 127 ------------------ 3 files changed, 237 deletions(-) delete mode 100644 packages/crawling/src/crawling/crawlYoutubeTemp.ts delete mode 100644 packages/crawling/src/crawling/crawlYoutubeUbuntu.ts diff --git a/packages/crawling/package.json b/packages/crawling/package.json index 5f43ebd..92b0417 100644 --- a/packages/crawling/package.json +++ b/packages/crawling/package.json @@ -9,7 +9,6 @@ "scripts": { "ky-open": "tsx src/findKYByOpen.ts", "ky-youtube": "tsx src/crawling/crawlYoutube.ts", - "ky-youtube-ubuntu": "tsx src/crawling/crawlYoutubeUbuntu.ts", "ky-valid": "tsx src/crawling/crawlYoutubeValid.ts", "ky-update": "pnpm run ky-youtube & pnpm run ky-valid", "trans": "tsx src/postTransDictionary.ts", diff --git a/packages/crawling/src/crawling/crawlYoutubeTemp.ts b/packages/crawling/src/crawling/crawlYoutubeTemp.ts deleted file mode 100644 index 2cf7075..0000000 --- a/packages/crawling/src/crawling/crawlYoutubeTemp.ts +++ /dev/null @@ -1,109 +0,0 @@ -import * as cheerio from 'cheerio'; -import puppeteer from 'puppeteer'; - -import { getInvalidKYSongsDB, getSongsKyNullDB } from '@/supabase/getDB'; -import { postInvalidKYSongsDB } from '@/supabase/postDB'; -import { updateSongsKyDB } from '@/supabase/updateDB'; -import { Song } from '@/types'; -import { saveCrawlYoutubeFailedKYSongs, updateDataLog } from '@/utils/logData'; - -import { isValidKYExistNumber } from './isValidKYExistNumber'; - -// youtube에서 KY 노래방 번호 크롤링 -// crawlYoutubeValid에서 진행하는 실제 사이트 검증도 포함 - -// action 우분투 환경에서의 호환을 위해 추가 -const browser = await puppeteer.launch({ - headless: true, -}); - -const page = await browser.newPage(); - -const baseUrl = 'https://www.youtube.com/@KARAOKEKY/search'; - -const scrapeSongNumber = async (query: string) => { - const searchUrl = `${baseUrl}?query=${encodeURIComponent(query)}`; - - // page.goto의 waitUntil 문제였음! - await page.goto(searchUrl, { - waitUntil: 'networkidle2', - // timeout: 0, - }); - - const html = await page.content(); - const $ = cheerio.load(html); - - // id contents 의 첫번째 ytd-item-section-renderer 찾기 - // const firstItem = $("#contents ytd-item-section-renderer").first(); - - const firstItem = $('ytd-video-renderer').first(); - - // yt-formatted-string 찾기 - const title = firstItem.find('yt-formatted-string').first().text().trim(); - - const karaokeNumber = extractKaraokeNumber(title); - - return karaokeNumber; -}; - -const extractKaraokeNumber = (title: string) => { - // KY. 찾고 ) 가 올때까지 찾기 - const matchResult = title.match(/KY\.\s*(\d{2,5})\)/); - const karaokeNumber = matchResult ? matchResult[1] : null; - return karaokeNumber; -}; - -const updateData = async (data: Song) => { - const result = await updateSongsKyDB(data); - console.log(result); - updateDataLog(result.success, 'crawlYoutubeSuccess.txt'); - updateDataLog(result.failed, 'crawlYoutubeFailed.txt'); -}; - -// failedSongs을 가져와서 실패한 노래를 건너뛰는 게 아니라 실패 시 update_date를 수정해 작업 순위를 뒤로 미룬다면? -const data = await getSongsKyNullDB(); -const failedSongs = await getInvalidKYSongsDB(); - -console.log('getSongsKyNullDB : ', data.length); -console.log('failedSongs : ', failedSongs.length); -let index = 0; -let successCount = 0; - -for (const song of data) { - if (failedSongs.find(failedSong => failedSong.id === song.id)) { - continue; - } - const query = song.title + '-' + song.artist; - - let resultKyNum = null; - try { - resultKyNum = await scrapeSongNumber(query); - } catch (error) { - continue; - } - - if (resultKyNum) { - let isValid = true; - try { - isValid = await isValidKYExistNumber(page, resultKyNum, song.title, song.artist); - } catch (error) { - continue; - } - - if (!isValid) { - await postInvalidKYSongsDB(song); - continue; - } else { - await updateData({ ...song, num_ky: resultKyNum }); - console.log('update song : ', resultKyNum); - successCount++; - } - } else await postInvalidKYSongsDB(song); - - index++; - console.log(query); - console.log('scrapeSongNumber : ', index); - console.log('successCount : ', successCount); -} - -browser.close(); diff --git a/packages/crawling/src/crawling/crawlYoutubeUbuntu.ts b/packages/crawling/src/crawling/crawlYoutubeUbuntu.ts deleted file mode 100644 index 4e7c223..0000000 --- a/packages/crawling/src/crawling/crawlYoutubeUbuntu.ts +++ /dev/null @@ -1,127 +0,0 @@ -import * as cheerio from 'cheerio'; -import puppeteer from 'puppeteer'; - -import { getSongsKyNullDB } from '@/supabase/getDB'; -import { updateSongsKyDB } from '@/supabase/updateDB'; -import { Song } from '@/types'; -import { - loadCrawlYoutubeFailedKYSongs, - saveCrawlYoutubeFailedKYSongs, - updateDataLog, -} from '@/utils/logData'; - -import { isValidKYExistNumber } from './isValidKYExistNumber'; - -// youtube에서 KY 노래방 번호 크롤링 -// crawlYoutubeValid에서 진행하는 실제 사이트 검증도 포함 - -// action 우분투 환경에서의 호환을 위해 추가 -// const browser = await puppeteer.launch({ -// headless: true, -// executablePath: '/usr/bin/chromium-browser', // 또는 "/usr/bin/chromium" -// args: [ -// '--no-sandbox', -// '--disable-setuid-sandbox', -// '--disable-dev-shm-usage', // 리눅스 메모리 제한 대응 -// '--disable-gpu', -// '--disable-infobars', -// '--single-process', -// '--window-size=1920,1080', -// ], -// }); - -const browser = await puppeteer.launch({ - headless: true, - args: ['--no-sandbox', '--disable-setuid-sandbox'], -}); - -const page = await browser.newPage(); - -const baseUrl = 'https://www.youtube.com/@KARAOKEKY/search'; - -const scrapeSongNumber = async (query: string) => { - const searchUrl = `${baseUrl}?query=${encodeURIComponent(query)}`; - - // page.goto의 waitUntil 문제였음! - await page.goto(searchUrl, { - waitUntil: 'networkidle2', - timeout: 0, - }); - - const html = await page.content(); - const $ = cheerio.load(html); - - // id contents 의 첫번째 ytd-item-section-renderer 찾기 - // const firstItem = $("#contents ytd-item-section-renderer").first(); - - const firstItem = $('ytd-video-renderer').first(); - - // yt-formatted-string 찾기 - const title = firstItem.find('yt-formatted-string').first().text().trim(); - - const karaokeNumber = extractKaraokeNumber(title); - - return karaokeNumber; -}; - -const extractKaraokeNumber = (title: string) => { - // KY. 찾고 ) 가 올때까지 찾기 - const matchResult = title.match(/KY\.\s*(\d{2,5})\)/); - const karaokeNumber = matchResult ? matchResult[1] : null; - return karaokeNumber; -}; - -const updateData = async (data: Song) => { - const result = await updateSongsKyDB(data); - updateDataLog(result.success, 'crawlYoutubeSuccess.txt'); - updateDataLog(result.failed, 'crawlYoutubeFailed.txt'); -}; - -const data = await getSongsKyNullDB(); -const failedSongs = loadCrawlYoutubeFailedKYSongs(); - -console.log('getSongsKyNullDB : ', data.length); -let index = 0; - -for (const song of data) { - // 테스트를 위해 100회 반복 후 종료시키기 - if (index >= 100) { - break; - } - - const query = song.title + '-' + song.artist; - - if (failedSongs.has(query)) { - continue; - } - - console.log(song.title, ' - ', song.artist); - - let resultKyNum = null; - try { - resultKyNum = await scrapeSongNumber(query); - } catch (error) { - continue; - } - - if (resultKyNum) { - let isValid = true; - try { - isValid = await isValidKYExistNumber(page, resultKyNum, song.title, song.artist); - } catch (error) { - continue; - } - - if (!isValid) { - saveCrawlYoutubeFailedKYSongs(song.title, song.artist); - continue; - } else { - await updateData({ ...song, num_ky: resultKyNum }); - } - } else saveCrawlYoutubeFailedKYSongs(song.title, song.artist); - - index++; - console.log('scrapeSongNumber : ', index); -} - -browser.close(); From 35bd2419a35e007d456b0f1e405661a8931b3a3c Mon Sep 17 00:00:00 2001 From: GulSam00 Date: Sun, 8 Mar 2026 16:35:50 +0900 Subject: [PATCH 3/7] =?UTF-8?q?feat=20:=20isValidKYExistNumber=20AI=20?= =?UTF-8?q?=EA=B8=B0=EB=B0=98=20=EA=B3=A1=20=EA=B2=80=EC=A6=9D=EC=9C=BC?= =?UTF-8?q?=EB=A1=9C=20=EA=B5=90=EC=B2=B4=20=EB=B0=8F=20validateSongMatch?= =?UTF-8?q?=20=EC=9C=A0=ED=8B=B8=20=EC=B6=94=EA=B0=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Sonnet 4.6 --- .../src/crawling/isValidKYExistNumber.ts | 40 ++++++---------- .../crawling/src/utils/validateSongMatch.ts | 46 +++++++++++++++++++ 2 files changed, 60 insertions(+), 26 deletions(-) create mode 100644 packages/crawling/src/utils/validateSongMatch.ts diff --git a/packages/crawling/src/crawling/isValidKYExistNumber.ts b/packages/crawling/src/crawling/isValidKYExistNumber.ts index 3ba8446..3b8dc58 100644 --- a/packages/crawling/src/crawling/isValidKYExistNumber.ts +++ b/packages/crawling/src/crawling/isValidKYExistNumber.ts @@ -1,20 +1,13 @@ import * as cheerio from 'cheerio'; import { Page } from 'puppeteer'; - -const parseText = (text: string) => { - // 모두 소문자로 - // 공백은 제거 - // 괄호 제거 - - return text.toLowerCase().replace(/\s/g, '').replace(/\(/g, '').replace(/\)/g, ''); -}; +import { validateSongMatch } from '../utils/validateSongMatch'; export const isValidKYExistNumber = async ( page: Page, number: string, title: string, artist: string, -) => { +): Promise => { const kyUrl = 'https://kysing.kr/search/?category=1&keyword='; const searchUrl = kyUrl + number; @@ -26,25 +19,20 @@ export const isValidKYExistNumber = async ( const html = await page.content(); const $ = cheerio.load(html); - const parsedTitle = parseText(title); - const parsedArtist = parseText(artist); + const titleResult = $('.search_chart_tit').find('.tit').eq(0).text().trim(); + const artistResult = $('.search_chart_tit').find('.tit').eq(1).text().trim(); - const titleResult = parseText($('.search_chart_tit').find('.tit').eq(0).text().trim()); - const artistResult = parseText($('.search_chart_tit').find('.tit').eq(1).text().trim()); + if (!titleResult || !artistResult) { + console.log('❌ KY 검색 결과 없음'); + return false; + } - // artistResult가 parsedArtist를 포함하는지 검증 - // 표기의 오류가 있을 수 있기에 parsedTitle, parsedArtist를 (0, 2) / (-2)로 slice하여 비교 + console.log(`검색 쿼리 : ${title} - ${artist}`); + console.log(`KY 노래방 검색 결과 : ${titleResult} - ${artistResult}`); - if ( - (titleResult.includes(parsedTitle.slice(0, 2)) || - titleResult.includes(parsedTitle.slice(-2))) && - (artistResult.includes(parsedArtist.slice(0, 2)) || - artistResult.includes(parsedArtist.slice(-2))) - ) { - return true; - } + const isValid = await validateSongMatch(title, artist, titleResult, artistResult); + + console.log(`AI 검증 결과 : ${isValid ? '✅ 일치' : '❌ 불일치'}`); - console.log('검색 쿼리 : ', parsedTitle, ' - ', parsedArtist); - console.log('KY 노래방 검색 결과 : ', titleResult, ' - ', artistResult); - return false; + return isValid; }; diff --git a/packages/crawling/src/utils/validateSongMatch.ts b/packages/crawling/src/utils/validateSongMatch.ts new file mode 100644 index 0000000..a355ee2 --- /dev/null +++ b/packages/crawling/src/utils/validateSongMatch.ts @@ -0,0 +1,46 @@ +import dotenv from 'dotenv'; +import OpenAI from 'openai'; + +dotenv.config(); + +const client = new OpenAI({ + apiKey: process.env.OPENAI_API_KEY, +}); + +/** + * AI를 활용해 두 (제목, 아티스트) 쌍이 같은 곡인지 판단한다. + * 표기 차이(띄어쓰기, 영문/한글 혼용, 특수문자 등)는 같은 곡으로 허용한다. + */ +export const validateSongMatch = async ( + inputTitle: string, + inputArtist: string, + foundTitle: string, + foundArtist: string, +): Promise => { + // 완전 일치 시 API 호출 없이 즉시 반환 + if (inputTitle === foundTitle && inputArtist === foundArtist) return true; + + const response = await client.chat.completions.create({ + model: 'gpt-4o-mini', + messages: [ + { + role: 'system', + content: + 'Decide if two (title, artist) pairs refer to the same song. Allow spelling variants (spaces, en/kr, case). Return JSON: {"isValid":boolean}', + }, + { + role: 'user', + content: `"${inputTitle}"(${inputArtist}) vs "${foundTitle}"(${foundArtist})`, + }, + ], + response_format: { type: 'json_object' }, + temperature: 0, + max_tokens: 20, + }); + + const content = response.choices[0].message.content; + if (!content) return false; + + const result: { isValid: boolean } = JSON.parse(content); + return result.isValid; +}; From ef80587c92c30839f8b0d611fdb844db6a4db1aa Mon Sep 17 00:00:00 2001 From: GulSam00 Date: Sun, 8 Mar 2026 17:05:50 +0900 Subject: [PATCH 4/7] =?UTF-8?q?refactor=20:=20crawling=20log=20=EC=8B=9C?= =?UTF-8?q?=EC=8A=A4=ED=85=9C=20=EC=A0=95=EB=A6=AC=20=EB=B0=8F=20replaceSu?= =?UTF-8?q?pabaseFailed=20=EB=B2=84=EA=B7=B8=20=EC=88=98=EC=A0=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - updateDataLog 및 log/ 폴더 관련 코드 전체 제거 (DB가 성공/실패 상태 관리) - saveCrawlYoutubeFailedKYSongs 제거 (crawlYoutube가 DB 방식으로 전환되어 미사용) - replaceSupabaseFailed에서 loadValidKYSongs → loadCrawlYoutubeFailedKYSongs 버그 수정 Co-Authored-By: Claude Sonnet 4.6 --- .../crawling/src/crawling/crawlRecentTJ.ts | 4 +- .../src/crawling/crawlYoutubeValid.ts | 6 +-- .../src/crawling/crawlYoutubeVerify.ts | 52 +++++++++++++++++++ .../src/crawling/replaceSupabaseFailed.ts | 4 +- packages/crawling/src/findKYByOpen.ts | 3 -- packages/crawling/src/postAllOpenSongs.ts | 18 ++----- packages/crawling/src/postByRelease.ts | 8 +-- packages/crawling/src/postTransDictionary.ts | 8 +-- packages/crawling/src/updateJpnSongs.ts | 9 ---- packages/crawling/src/utils/logData.ts | 50 ------------------ 10 files changed, 65 insertions(+), 97 deletions(-) create mode 100644 packages/crawling/src/crawling/crawlYoutubeVerify.ts diff --git a/packages/crawling/src/crawling/crawlRecentTJ.ts b/packages/crawling/src/crawling/crawlRecentTJ.ts index 8df2fa3..9cfbf65 100644 --- a/packages/crawling/src/crawling/crawlRecentTJ.ts +++ b/packages/crawling/src/crawling/crawlRecentTJ.ts @@ -5,7 +5,6 @@ import puppeteer from 'puppeteer'; import { postSongsDB } from '@/supabase/postDB'; import { LogData, Song } from '@/types'; -import { updateDataLog } from '@/utils/logData'; import { parseNumber } from '@/utils/parseNumber'; import { parseText } from '@/utils/parseString'; @@ -61,7 +60,6 @@ console.log('실패 개수 : ', result.failed.length); console.log('성공 데이터 : ', result.success); console.log('실패 데이터 : ', result.failed); -updateDataLog(result.success, 'postByRecentTJSuccess.txt'); -updateDataLog(result.failed, 'postByRecentTJFailed.txt'); + await browser.close(); diff --git a/packages/crawling/src/crawling/crawlYoutubeValid.ts b/packages/crawling/src/crawling/crawlYoutubeValid.ts index f9e166b..0bce9e0 100644 --- a/packages/crawling/src/crawling/crawlYoutubeValid.ts +++ b/packages/crawling/src/crawling/crawlYoutubeValid.ts @@ -3,7 +3,7 @@ import puppeteer from 'puppeteer'; import { getSongsKyNotNullDB } from '@/supabase/getDB'; import { updateSongsKyDB } from '@/supabase/updateDB'; import { Song } from '@/types'; -import { loadValidKYSongs, saveValidKYSongs, updateDataLog } from '@/utils/logData'; +import { loadValidKYSongs, saveValidKYSongs } from '@/utils/logData'; import { isValidKYExistNumber } from './isValidKYExistNumber'; @@ -14,9 +14,7 @@ const browser = await puppeteer.launch(); const page = await browser.newPage(); const updateData = async (data: Song) => { - const result = await updateSongsKyDB(data); - updateDataLog(result.success, 'crawlYoutubeSuccess.txt'); - updateDataLog(result.failed, 'crawlYoutubeFailed.txt'); + await updateSongsKyDB(data); }; const data = await getSongsKyNotNullDB(); diff --git a/packages/crawling/src/crawling/crawlYoutubeVerify.ts b/packages/crawling/src/crawling/crawlYoutubeVerify.ts new file mode 100644 index 0000000..0bce9e0 --- /dev/null +++ b/packages/crawling/src/crawling/crawlYoutubeVerify.ts @@ -0,0 +1,52 @@ +import puppeteer from 'puppeteer'; + +import { getSongsKyNotNullDB } from '@/supabase/getDB'; +import { updateSongsKyDB } from '@/supabase/updateDB'; +import { Song } from '@/types'; +import { loadValidKYSongs, saveValidKYSongs } from '@/utils/logData'; + +import { isValidKYExistNumber } from './isValidKYExistNumber'; + +// 기존에 등록된 KY 노래방 번호가 실제로 KY 노래방과 일치하는지 검증 +// crawlYoutube와는 다르게 끝이 정해진 작업 + +const browser = await puppeteer.launch(); +const page = await browser.newPage(); + +const updateData = async (data: Song) => { + await updateSongsKyDB(data); +}; + +const data = await getSongsKyNotNullDB(); +const vaildSongs = loadValidKYSongs(); + +console.log('getSongsKyNotNullDB : ', data.length); +let index = 0; + +for (const song of data) { + const query = song.title + '-' + song.artist; + + if (vaildSongs.has(query)) { + continue; + } + + console.log(song.title, ' - ', song.artist + ' : ', song.num_ky); + let isValid = true; + try { + isValid = await isValidKYExistNumber(page, song.num_ky, song.title, song.artist); + } catch (error) { + index++; + continue; + } + + if (!isValid) { + // stackData.push({ ...song, num_ky: null }); + // totalData.push({ ...song, num_ky: null }); + await updateData({ ...song, num_ky: null }); + } else saveValidKYSongs(song.title, song.artist); + + index++; + console.log('crawlYoutubeValid : ', index); +} + +browser.close(); diff --git a/packages/crawling/src/crawling/replaceSupabaseFailed.ts b/packages/crawling/src/crawling/replaceSupabaseFailed.ts index 247414e..fe14aa8 100644 --- a/packages/crawling/src/crawling/replaceSupabaseFailed.ts +++ b/packages/crawling/src/crawling/replaceSupabaseFailed.ts @@ -1,10 +1,10 @@ import { getSongsKyNullDB } from '@/supabase/getDB'; import { postInvalidKYSongsDB } from '@/supabase/postDB'; import { Song } from '@/types'; -import { loadCrawlYoutubeFailedKYSongs, loadValidKYSongs } from '@/utils/logData'; +import { loadCrawlYoutubeFailedKYSongs } from '@/utils/logData'; const data: Song[] = await getSongsKyNullDB(); -const failedSongs = loadValidKYSongs(); +const failedSongs = loadCrawlYoutubeFailedKYSongs(); console.log('getSongsKyNullDB : ', data.length); console.log('size : ', failedSongs.size); diff --git a/packages/crawling/src/findKYByOpen.ts b/packages/crawling/src/findKYByOpen.ts index d76c52e..5177f39 100644 --- a/packages/crawling/src/findKYByOpen.ts +++ b/packages/crawling/src/findKYByOpen.ts @@ -3,7 +3,6 @@ import { getSong } from '@repo/open-api'; import { getSongsKyNullDB } from '@/supabase/getDB'; import { updateSongsKyDB } from '@/supabase/updateDB'; import { Song } from '@/types'; -import { updateDataLog } from '@/utils/logData'; const resultsLog = { success: [] as Song[], @@ -78,5 +77,3 @@ console.log(` - 실패: ${resultsLog.failed.length}곡 `); -updateDataLog(resultsLog.success, 'findKYByOpenSuccess.txt'); -updateDataLog(resultsLog.failed, 'findKYByOpenFailed.txt'); diff --git a/packages/crawling/src/postAllOpenSongs.ts b/packages/crawling/src/postAllOpenSongs.ts index 0456f41..68956ce 100644 --- a/packages/crawling/src/postAllOpenSongs.ts +++ b/packages/crawling/src/postAllOpenSongs.ts @@ -4,8 +4,7 @@ import path from 'path'; import { getSong } from '@repo/open-api'; import { postSongsDB } from '@/supabase/postDB'; -import { LogData, Song } from '@/types'; -import { updateDataLog } from '@/utils/logData'; +import { Song } from '@/types'; const START_CODE = 0xac00; // '가' const END_CODE = 0xd7a3; // '힣' @@ -68,10 +67,7 @@ async function getHangulSongs() { num_ky: null, release: item.release === '0000-00-00' ? null : item.release, })); - const result: LogData = await postSongsDB(songs); - - updateDataLog(result.success, 'postByAllOpenSuccess.txt'); - updateDataLog(result.failed, 'postByAllOpenFailed.txt'); + await postSongsDB(songs); saveProgress(index); index++; @@ -99,10 +95,7 @@ async function getAlphaSongs() { num_ky: null, release: item.release === '0000-00-00' ? null : item.release, })); - const result: LogData = await postSongsDB(songs); - - updateDataLog(result.success, 'postByAllOpenSuccess.txt'); - updateDataLog(result.failed, 'postByAllOpenFailed.txt'); + await postSongsDB(songs); saveAlphaProgress(index); index++; @@ -130,10 +123,7 @@ async function getNumberSongs() { num_ky: null, release: item.release === '0000-00-00' ? null : item.release, })); - const result: LogData = await postSongsDB(songs); - - updateDataLog(result.success, 'postByAllOpenSuccess.txt'); - updateDataLog(result.failed, 'postByAllOpenFailed.txt'); + await postSongsDB(songs); saveAlphaProgress(index); index++; diff --git a/packages/crawling/src/postByRelease.ts b/packages/crawling/src/postByRelease.ts index 1e4d469..2a747bd 100644 --- a/packages/crawling/src/postByRelease.ts +++ b/packages/crawling/src/postByRelease.ts @@ -1,8 +1,7 @@ import { getRelease } from '@repo/open-api'; import { postSongsDB } from '@/supabase/postDB'; -import { LogData, Song } from '@/types'; -import { updateDataLog } from '@/utils/logData'; +import { Song } from '@/types'; const parseMonth = (month: number) => { return month < 10 ? `0${month}` : month; @@ -38,7 +37,4 @@ console.log('songs', songs.length); // TJ 2007~2025 38519곡 -const result: LogData = await postSongsDB(songs); - -updateDataLog(result.success, 'postByReleaseSuccess.txt'); -updateDataLog(result.failed, 'postByReleaseFailed.txt'); +await postSongsDB(songs); diff --git a/packages/crawling/src/postTransDictionary.ts b/packages/crawling/src/postTransDictionary.ts index 7457a6b..318f229 100644 --- a/packages/crawling/src/postTransDictionary.ts +++ b/packages/crawling/src/postTransDictionary.ts @@ -3,7 +3,7 @@ import { sleep } from 'openai/core'; import { getSongsJpnDB, getTransDictionariesDBByOriginal } from '@/supabase/getDB'; import { postTransDictionariesDB } from '@/supabase/postDB'; import { TransDictionary, TransSong } from '@/types'; -import { loadDictionariesLog, saveDictionariesLog, updateDataLog } from '@/utils/logData'; +import { loadDictionariesLog, saveDictionariesLog } from '@/utils/logData'; import { transChatGPT } from '@/utils/transChatGPT'; const data: TransSong[] = await getSongsJpnDB(); @@ -18,15 +18,11 @@ const transData: TransDictionary[] = []; const refreshData = async () => { console.log('refreshData'); - const result = await postTransDictionariesDB(transData); + await postTransDictionariesDB(transData); for (const song of transData) { saveDictionariesLog(song.original_japanese); } - updateDataLog(result.success, 'postTransDictionarySuccess.txt'); - updateDataLog(result.failed, 'postTransDictionaryFailed.txt'); - unknownData.length > 0 && updateDataLog(unknownData, 'postTransDictionaryUnknown.txt'); - transData.length = 0; unknownData.length = 0; }; diff --git a/packages/crawling/src/updateJpnSongs.ts b/packages/crawling/src/updateJpnSongs.ts index 347ed67..a43a5de 100644 --- a/packages/crawling/src/updateJpnSongs.ts +++ b/packages/crawling/src/updateJpnSongs.ts @@ -3,7 +3,6 @@ import { sleep } from 'openai/core'; import { getSongsJpnDB } from '@/supabase/getDB'; import { updateSongsJpnDB } from '@/supabase/updateDB'; import { TransSong } from '@/types'; -import { updateDataLog } from '@/utils/logData'; import { transChatGPT } from '@/utils/transChatGPT'; const data = await getSongsJpnDB(); @@ -55,11 +54,3 @@ for (const song of transData) { } } -// 만약 unknownData가 있다면 해당 데이터를 배열에 담아서 끝났을 때 error.txt에 저장 -if (unknownData.length > 0) { - updateDataLog(unknownData, 'errorLog.txt'); -} - -if (transData.length > 0) { - updateDataLog(transData, 'transDataLog.txt'); -} diff --git a/packages/crawling/src/utils/logData.ts b/packages/crawling/src/utils/logData.ts index 7a01d8d..e326647 100644 --- a/packages/crawling/src/utils/logData.ts +++ b/packages/crawling/src/utils/logData.ts @@ -1,47 +1,6 @@ import fs from 'fs'; import path from 'path'; -export function updateDataLog(unknownData: T[] | T, filename: string) { - if (!unknownData) return; - if (unknownData instanceof Array && unknownData.length === 0) return; - - const now = new Date(); - const timeString = now.toLocaleString('ko-KR', { - timeZone: 'Asia/Seoul', - year: 'numeric', - month: '2-digit', - day: '2-digit', - hour: '2-digit', - minute: '2-digit', - second: '2-digit', - hour12: false, - }); - - const logPath = path.join('log', filename); - const logDir = path.dirname(logPath); // 디렉터리 경로 추출 - - // 디렉터리가 없으면 생성 - if (!fs.existsSync(logDir)) { - fs.mkdirSync(logDir, { recursive: true }); - } - - if (unknownData instanceof Array) { - // 로그 문자열 생성 - const logString = - `\n[${timeString}]\n` + - `\n[총 ${unknownData.length}개의 데이터]\n` + - unknownData.map(item => JSON.stringify(item)).join('\n') + - '\n'; - - fs.appendFileSync(logPath, logString, 'utf-8'); - } else { - // 로그 문자열 생성 - const logString = `\n[${timeString}]\n` + JSON.stringify(unknownData) + '\n'; - - fs.appendFileSync(logPath, logString, 'utf-8'); - } -} - export function saveDictionariesLog(japanese: string) { const logPath = path.join('src', 'assets', 'transList.txt'); const logDir = path.dirname(logPath); @@ -62,15 +21,6 @@ export function loadDictionariesLog(): Set { return new Set(lines); } -export function saveCrawlYoutubeFailedKYSongs(title: string, artist: string) { - const logPath = path.join('src', 'assets', 'crawlKYYoutubeFailedList.txt'); - const logDir = path.dirname(logPath); - if (!fs.existsSync(logDir)) { - fs.mkdirSync(logDir, { recursive: true }); - } - fs.appendFileSync(logPath, `${title}-${artist}\n`, 'utf-8'); -} - export function loadCrawlYoutubeFailedKYSongs(): Set { const logPath = path.join('src', 'assets', 'crawlKYYoutubeFailedList.txt'); if (!fs.existsSync(logPath)) return new Set(); From 928d98b7a764c3f6c5b2ab20ac494f0a0f75cbc0 Mon Sep 17 00:00:00 2001 From: GulSam00 Date: Sun, 8 Mar 2026 18:21:38 +0900 Subject: [PATCH 5/7] =?UTF-8?q?doc=20:=20packages/crawling=20CLAUDE.md=20?= =?UTF-8?q?=EC=B6=94=EA=B0=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Sonnet 4.6 --- packages/crawling/CLAUDE.md | 111 ++++++++++++++++++++++++++++++++++++ 1 file changed, 111 insertions(+) create mode 100644 packages/crawling/CLAUDE.md diff --git a/packages/crawling/CLAUDE.md b/packages/crawling/CLAUDE.md new file mode 100644 index 0000000..832b531 --- /dev/null +++ b/packages/crawling/CLAUDE.md @@ -0,0 +1,111 @@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. + +## Overview + +일회성 데이터 수집/처리 스크립트 모음. 빌드 결과물을 배포하지 않으며, `tsx`로 스크립트를 직접 실행한다. + +## Commands + +```bash +pnpm ky-open # Open API(금영)로 KY 번호 수집 +pnpm ky-youtube # YouTube 크롤링으로 KY 번호 수집 + AI 검증 +pnpm ky-valid # 기존 KY 번호의 실제 존재 여부 재검증 +pnpm ky-update # ky-youtube + ky-valid 병렬 실행 +pnpm trans # 일본어 아티스트명 → 한국어 번역 후 DB 저장 +pnpm test # vitest 실행 +pnpm lint # ESLint +``` + +스크립트는 반드시 **`packages/crawling/`** 디렉토리에서 실행해야 한다. 로그 파일 및 assets 경로가 상대 경로 기준이기 때문. + +## Environment Variables + +`.env` 파일 필요 (루트가 아닌 `packages/crawling/`에 위치): + +``` +SUPABASE_URL= +SUPABASE_KEY= +OPENAI_API_KEY= +``` + +## Architecture + +### 데이터 흐름 + +모든 스크립트는 **Supabase `songs` 테이블**을 중심으로 동작한다. + +``` +[songs 테이블] + title, artist, num_tj(TJ번호), num_ky(KY번호) + +주요 목표: num_ky가 null인 곡에 KY 번호를 채우는 것 +``` + +**KY 번호 수집 (메인 파이프라인)** + +``` +crawlYoutube.ts + └─ getSongsKyNullDB() # num_ky가 null인 곡 조회 + └─ YouTube @KARAOKEKY 채널 검색 # puppeteer + cheerio로 번호 스크래핑 + └─ isValidKYExistNumber() # kysing.kr에서 번호 실존 여부 확인 + └─ validateSongMatch() # OpenAI gpt-4o-mini로 제목/아티스트 일치 판단 + └─ updateSongsKyDB() # 성공 시 DB 업데이트 + └─ postInvalidKYSongsDB() # 실패 시 invalid_ky_songs 테이블에 기록 +``` + +**KY 번호 검증 (기존 데이터 재확인)** + +``` +crawlYoutubeValid.ts + └─ getSongsKyNotNullDB() # num_ky가 있는 곡 조회 + └─ isValidKYExistNumber() # KY 사이트에서 실존 여부 재확인 + └─ 유효하지 않으면 num_ky = null로 초기화 +``` + +**Open API 방식 (보조)** + +``` +findKYByOpen.ts + └─ @repo/open-api의 getSong()으로 금영 API 직접 조회 + └─ 제목 + 아티스트 문자열 비교로 KY 번호 매칭 +``` + +**일본어 번역** + +``` +postTransDictionary.ts + └─ getSongsJpnDB() # 일본어 포함된 곡 필터링 + └─ transChatGPT() # GPT-4-turbo로 아티스트명 번역 + └─ postTransDictionariesDB() # trans_dictionaries 테이블에 저장 +``` + +### 핵심 패턴: 진행 상태 저장 (체크포인트) + +장시간 실행되는 스크립트가 중단됐을 때 재시작하면 처음부터 다시 하지 않도록, `src/assets/`에 텍스트 파일로 진행 상태를 기록한다. + +| 파일 | 용도 | +|------|------| +| `src/assets/transList.txt` | 이미 번역 시도한 일본어 아티스트명 | +| `src/assets/crawlKYValidList.txt` | 검증 완료된 (제목-아티스트) 쌍 | +| `src/assets/crawlKYYoutubeFailedList.txt` | YouTube 크롤링 실패 목록 | + +`logData.ts`의 `save*` / `load*` 함수로 관리. 스크립트 시작 시 로드해 `Set`으로 변환 후 O(1) 검색으로 스킵 처리. + +### Path Alias + +`@/` → `src/` (tsconfig의 paths 설정) + +### Supabase 테이블 + +| 테이블 | 용도 | +|--------|------| +| `songs` | 메인 곡 데이터 (TJ/KY 번호 포함) | +| `invalid_ky_songs` | KY 번호 수집 실패 목록 | +| `trans_dictionaries` | 일본어 → 한국어 번역 사전 | + +### AI 유틸 + +- `utils/validateSongMatch.ts` — `gpt-4o-mini`로 두 (제목, 아티스트) 쌍이 같은 곡인지 판단. `temperature: 0`, `max_tokens: 20`, 완전 일치 시 API 호출 생략. +- `utils/transChatGPT.ts` — `gpt-4-turbo`로 일본어 → 한국어 번역. From b654aa804144a3b179386d8fc56ddbc289721e13 Mon Sep 17 00:00:00 2001 From: GulSam00 Date: Sun, 8 Mar 2026 18:21:55 +0900 Subject: [PATCH 6/7] =?UTF-8?q?refactor=20:=20crawlYoutubeVerify=20?= =?UTF-8?q?=ED=8C=8C=EC=9D=BC=20=EC=B2=B4=ED=81=AC=ED=8F=AC=EC=9D=B8?= =?UTF-8?q?=ED=8A=B8=EB=A5=BC=20verify=5Fky=5Fsongs=20DB=20=EA=B8=B0?= =?UTF-8?q?=EB=B0=98=EC=9C=BC=EB=A1=9C=20=EC=A0=84=ED=99=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Sonnet 4.6 --- packages/crawling/package.json | 3 +- .../src/crawling/crawlYoutubeVerify.ts | 30 ++++++++----------- packages/crawling/src/supabase/getDB.ts | 10 +++++++ packages/crawling/src/supabase/postDB.ts | 18 ++++++++++- 4 files changed, 41 insertions(+), 20 deletions(-) diff --git a/packages/crawling/package.json b/packages/crawling/package.json index 92b0417..05d202f 100644 --- a/packages/crawling/package.json +++ b/packages/crawling/package.json @@ -10,7 +10,8 @@ "ky-open": "tsx src/findKYByOpen.ts", "ky-youtube": "tsx src/crawling/crawlYoutube.ts", "ky-valid": "tsx src/crawling/crawlYoutubeValid.ts", - "ky-update": "pnpm run ky-youtube & pnpm run ky-valid", + "ky-verify": "tsx src/crawling/crawlYoutubeVerify.ts", + "ky-update": "pnpm run ky-youtube & pnpm run ky-valid & pnpm run ky-verify", "trans": "tsx src/postTransDictionary.ts", "recent-tj": "tsx src/crawling/crawlRecentTJ.ts", "lint": "eslint . --ext .ts,.js", diff --git a/packages/crawling/src/crawling/crawlYoutubeVerify.ts b/packages/crawling/src/crawling/crawlYoutubeVerify.ts index 0bce9e0..0ce5653 100644 --- a/packages/crawling/src/crawling/crawlYoutubeVerify.ts +++ b/packages/crawling/src/crawling/crawlYoutubeVerify.ts @@ -1,32 +1,26 @@ import puppeteer from 'puppeteer'; -import { getSongsKyNotNullDB } from '@/supabase/getDB'; +import { getSongsKyNotNullDB, getVerifyKySongsDB } from '@/supabase/getDB'; +import { postVerifyKySongsDB } from '@/supabase/postDB'; import { updateSongsKyDB } from '@/supabase/updateDB'; -import { Song } from '@/types'; -import { loadValidKYSongs, saveValidKYSongs } from '@/utils/logData'; import { isValidKYExistNumber } from './isValidKYExistNumber'; // 기존에 등록된 KY 노래방 번호가 실제로 KY 노래방과 일치하는지 검증 -// crawlYoutube와는 다르게 끝이 정해진 작업 +// 유효한 곡은 verify_ky_songs 테이블에 insert const browser = await puppeteer.launch(); const page = await browser.newPage(); -const updateData = async (data: Song) => { - await updateSongsKyDB(data); -}; - const data = await getSongsKyNotNullDB(); -const vaildSongs = loadValidKYSongs(); +const verifiedIds = await getVerifyKySongsDB(); console.log('getSongsKyNotNullDB : ', data.length); +console.log('이미 검증된 곡 수 : ', verifiedIds.size); let index = 0; for (const song of data) { - const query = song.title + '-' + song.artist; - - if (vaildSongs.has(query)) { + if (verifiedIds.has(song.id!)) { continue; } @@ -39,14 +33,14 @@ for (const song of data) { continue; } - if (!isValid) { - // stackData.push({ ...song, num_ky: null }); - // totalData.push({ ...song, num_ky: null }); - await updateData({ ...song, num_ky: null }); - } else saveValidKYSongs(song.title, song.artist); + if (isValid) { + await postVerifyKySongsDB(song); + } else { + await updateSongsKyDB({ ...song, num_ky: null }); + } index++; - console.log('crawlYoutubeValid : ', index); + console.log('crawlYoutubeVerify : ', index); } browser.close(); diff --git a/packages/crawling/src/supabase/getDB.ts b/packages/crawling/src/supabase/getDB.ts index 20906bf..abeb58e 100644 --- a/packages/crawling/src/supabase/getDB.ts +++ b/packages/crawling/src/supabase/getDB.ts @@ -101,3 +101,13 @@ export async function getInvalidKYSongsDB(): Promise< return data; } + +export async function getVerifyKySongsDB(): Promise> { + const supabase = getClient(); + + const { data, error } = await supabase.from('verify_ky_songs').select('id'); + + if (error) throw error; + + return new Set(data.map(row => row.id)); +} diff --git a/packages/crawling/src/supabase/postDB.ts b/packages/crawling/src/supabase/postDB.ts index 6197103..72e5cc2 100644 --- a/packages/crawling/src/supabase/postDB.ts +++ b/packages/crawling/src/supabase/postDB.ts @@ -72,12 +72,28 @@ export async function postTransDictionariesDB(dictionaries: TransDictionary[]) { return results; } +export async function postVerifyKySongsDB(song: Song) { + const supabase = getClient(); + + try { + const { id, title, artist } = song; + const { error } = await supabase.from('verify_ky_songs').insert({ id, title, artist }).select(); + if (error) { + console.error('postVerifyKySongsDB error : ', error); + } + return true; + } catch (error) { + console.error('catch - postVerifyKySongsDB error : ', error); + return error; + } +} + export async function postInvalidKYSongsDB(song: Song) { const supabase = getClient(); try { const { id, title, artist } = song; - const { data, error } = await supabase + const { error } = await supabase .from('invalid_ky_songs') .insert({ id, title, artist }) .select(); From f183afb545977584b9686661fe1ca447256085e3 Mon Sep 17 00:00:00 2001 From: GulSam00 Date: Sun, 8 Mar 2026 18:22:04 +0900 Subject: [PATCH 7/7] =?UTF-8?q?refactor=20:=20logData=20=ED=8C=8C=EC=9D=BC?= =?UTF-8?q?=20=EC=B2=B4=ED=81=AC=ED=8F=AC=EC=9D=B8=ED=8A=B8=20=EC=82=AC?= =?UTF-8?q?=EC=9A=A9=20=EC=A3=BC=EC=84=9D=EC=B2=98=EB=A6=AC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Sonnet 4.6 --- .../src/crawling/crawlYoutubeValid.ts | 14 ++++++------- .../src/crawling/isValidKYExistNumber.ts | 5 +++-- .../src/crawling/replaceSupabaseFailed.ts | 21 +++++++++---------- packages/crawling/src/postTransDictionary.ts | 18 ++++++++-------- 4 files changed, 29 insertions(+), 29 deletions(-) diff --git a/packages/crawling/src/crawling/crawlYoutubeValid.ts b/packages/crawling/src/crawling/crawlYoutubeValid.ts index 0bce9e0..c74ad57 100644 --- a/packages/crawling/src/crawling/crawlYoutubeValid.ts +++ b/packages/crawling/src/crawling/crawlYoutubeValid.ts @@ -3,7 +3,7 @@ import puppeteer from 'puppeteer'; import { getSongsKyNotNullDB } from '@/supabase/getDB'; import { updateSongsKyDB } from '@/supabase/updateDB'; import { Song } from '@/types'; -import { loadValidKYSongs, saveValidKYSongs } from '@/utils/logData'; +// import { loadValidKYSongs, saveValidKYSongs } from '@/utils/logData'; import { isValidKYExistNumber } from './isValidKYExistNumber'; @@ -18,17 +18,17 @@ const updateData = async (data: Song) => { }; const data = await getSongsKyNotNullDB(); -const vaildSongs = loadValidKYSongs(); +// const vaildSongs = loadValidKYSongs(); console.log('getSongsKyNotNullDB : ', data.length); let index = 0; for (const song of data) { - const query = song.title + '-' + song.artist; + // const query = song.title + '-' + song.artist; - if (vaildSongs.has(query)) { - continue; - } + // if (vaildSongs.has(query)) { + // continue; + // } console.log(song.title, ' - ', song.artist + ' : ', song.num_ky); let isValid = true; @@ -43,7 +43,7 @@ for (const song of data) { // stackData.push({ ...song, num_ky: null }); // totalData.push({ ...song, num_ky: null }); await updateData({ ...song, num_ky: null }); - } else saveValidKYSongs(song.title, song.artist); + } // else saveValidKYSongs(song.title, song.artist); index++; console.log('crawlYoutubeValid : ', index); diff --git a/packages/crawling/src/crawling/isValidKYExistNumber.ts b/packages/crawling/src/crawling/isValidKYExistNumber.ts index 3b8dc58..1de92df 100644 --- a/packages/crawling/src/crawling/isValidKYExistNumber.ts +++ b/packages/crawling/src/crawling/isValidKYExistNumber.ts @@ -1,5 +1,6 @@ import * as cheerio from 'cheerio'; import { Page } from 'puppeteer'; + import { validateSongMatch } from '../utils/validateSongMatch'; export const isValidKYExistNumber = async ( @@ -27,8 +28,8 @@ export const isValidKYExistNumber = async ( return false; } - console.log(`검색 쿼리 : ${title} - ${artist}`); - console.log(`KY 노래방 검색 결과 : ${titleResult} - ${artistResult}`); + console.log(`(TJ 기준) 검색 쿼리 : ${title} - ${artist}`); + console.log(`(KY 노래방) 검색 결과 : ${titleResult} - ${artistResult}`); const isValid = await validateSongMatch(title, artist, titleResult, artistResult); diff --git a/packages/crawling/src/crawling/replaceSupabaseFailed.ts b/packages/crawling/src/crawling/replaceSupabaseFailed.ts index fe14aa8..3d9a163 100644 --- a/packages/crawling/src/crawling/replaceSupabaseFailed.ts +++ b/packages/crawling/src/crawling/replaceSupabaseFailed.ts @@ -1,26 +1,25 @@ import { getSongsKyNullDB } from '@/supabase/getDB'; import { postInvalidKYSongsDB } from '@/supabase/postDB'; import { Song } from '@/types'; -import { loadCrawlYoutubeFailedKYSongs } from '@/utils/logData'; +// import { loadCrawlYoutubeFailedKYSongs } from '@/utils/logData'; const data: Song[] = await getSongsKyNullDB(); -const failedSongs = loadCrawlYoutubeFailedKYSongs(); +// const failedSongs = loadCrawlYoutubeFailedKYSongs(); console.log('getSongsKyNullDB : ', data.length); -console.log('size : ', failedSongs.size); +// console.log('size : ', failedSongs.size); let index = 0; let successCount = 0; for (const song of data) { - const query = song.title + '-' + song.artist; + // const query = song.title + '-' + song.artist; - if (failedSongs.has(query)) { - console.log('post song : ', song); - await postInvalidKYSongsDB(song); - successCount++; - - continue; - } + // if (failedSongs.has(query)) { + // console.log('post song : ', song); + // await postInvalidKYSongsDB(song); + // successCount++; + // continue; + // } index++; } diff --git a/packages/crawling/src/postTransDictionary.ts b/packages/crawling/src/postTransDictionary.ts index 318f229..7bcb59e 100644 --- a/packages/crawling/src/postTransDictionary.ts +++ b/packages/crawling/src/postTransDictionary.ts @@ -3,7 +3,7 @@ import { sleep } from 'openai/core'; import { getSongsJpnDB, getTransDictionariesDBByOriginal } from '@/supabase/getDB'; import { postTransDictionariesDB } from '@/supabase/postDB'; import { TransDictionary, TransSong } from '@/types'; -import { loadDictionariesLog, saveDictionariesLog } from '@/utils/logData'; +// import { loadDictionariesLog, saveDictionariesLog } from '@/utils/logData'; import { transChatGPT } from '@/utils/transChatGPT'; const data: TransSong[] = await getSongsJpnDB(); @@ -19,9 +19,9 @@ const refreshData = async () => { console.log('refreshData'); await postTransDictionariesDB(transData); - for (const song of transData) { - saveDictionariesLog(song.original_japanese); - } + // for (const song of transData) { + // saveDictionariesLog(song.original_japanese); + // } transData.length = 0; unknownData.length = 0; @@ -29,7 +29,7 @@ const refreshData = async () => { let count = 0; -const tryLogs = loadDictionariesLog(); +// const tryLogs = loadDictionariesLog(); for (const song of data) { if (count >= 10) { @@ -39,14 +39,14 @@ for (const song of data) { console.log('count : ', count++); await sleep(150); // 0.15초(150ms) 대기 - if (tryLogs.has(song.artist)) { - continue; - } + // if (tryLogs.has(song.artist)) { + // continue; + // } const dupArtistTrans = await getTransDictionariesDBByOriginal(song.artist); if (dupArtistTrans) { - saveDictionariesLog(song.artist); + // saveDictionariesLog(song.artist); continue; }