In [None]:
import pandas as pd
from collections import Counter
import re

# Paths of the track CSV and the lyrics CSV to combine
tracks_file = './output/split_csv_files/tracks_rapper_part1.csv'
lyrics_file = './output/lyrics_output1.csv'

# Read both files (tracks first, then lyrics)
tracks_df, lyrics_df = (pd.read_csv(csv_path) for csv_path in (tracks_file, lyrics_file))

# Inner join: keep only rows whose (track_id, track_name) pair exists in both tables
merged_df = tracks_df.merge(lyrics_df, on=['track_id', 'track_name'], how='inner')

# Build a {word: count} mapping for one lyrics value
def calculate_word_frequency(lyrics):
    r"""Count how often each word occurs in ``lyrics``.

    The text is lower-cased and tokenized with the regex ``\b\w+\b``, so
    punctuation is dropped while digits and underscores count as word
    characters.

    Args:
        lyrics: The lyrics text, or a missing value (anything for which
            ``pd.isna`` is true, e.g. ``NaN`` or ``None``).

    Returns:
        A plain ``dict`` mapping each lower-cased word to its number of
        occurrences; an empty dict when ``lyrics`` is missing.
    """
    if not pd.isna(lyrics):
        tokens = re.findall(r'\b\w+\b', lyrics.lower())
        return dict(Counter(tokens))
    # Missing lyrics contribute no words
    return {}

# Attach a per-track {word: count} dict as a new column
merged_df['word_frequency'] = merged_df['song_lyrics'].apply(calculate_word_frequency)

# Identifying columns carried over into the output
id_columns = ['track_id', 'track_name', 'artist_id', 'artist_name']

# Expand the dicts into one integer column per distinct word (optional wide layout).
# The whole count matrix is built in a single constructor call instead of assigning
# one column at a time to a column-subset of merged_df: the per-column assignment
# raised SettingWithCopyWarning, fragmented the frame (PerformanceWarning) and ran a
# full .apply pass for every distinct word.
word_matrix = (
    pd.DataFrame(merged_df['word_frequency'].tolist(), index=merged_df.index)
    .fillna(0)     # a word that never occurs in a track gets count 0
    .astype(int)   # NaN padding made the columns float; restore integer counts
)

# Final table: id columns followed by the word-count columns, in order of first
# appearance. The intermediate 'word_frequency' dict column is left out.
result_df = pd.concat([merged_df[id_columns], word_matrix], axis=1)

# Save to CSV
output_file = 'tracks_with_word_frequency.csv'
result_df.to_csv(output_file, index=False)

print(f"処理が完了しました。結果は {output_file} に保存されています。")


# 12/16時点での単語頻出度のデータ作成

In [3]:
# Import libraries
import pandas as pd
from collections import Counter
import os

# Input locations
lyrics_file = "./output/lyrics_output_all.csv"  # Replace with your file path
tracks_file = "./wrapper_data/tracks_rapper.csv"  # Replace with your file path

# Load lyrics first, then tracks
lyrics_df, tracks_df = (pd.read_csv(csv_path) for csv_path in (lyrics_file, tracks_file))

# Left join on track_id: every track row is kept, with or without lyrics
merged_df = tracks_df.merge(lyrics_df, on="track_id", how="left")

# Report how many rows have no lyrics
missing_count = merged_df['song_lyrics'].isna().sum()
print(f"Missing lyrics: {missing_count}")

# Missing lyrics become empty strings for processing
merged_df = merged_df.assign(song_lyrics=merged_df['song_lyrics'].fillna(""))

# Whitespace tokenizer + tally
def count_word_frequencies(text):
    """Tally the whitespace-separated tokens of ``text``.

    The text is lower-cased and split on runs of whitespace only, so
    punctuation stays attached to its token (``"yo,"`` and ``"yo"`` are
    counted separately).

    Args:
        text: The string to tokenize.

    Returns:
        A ``collections.Counter`` mapping each token to its count.
    """
    tally = Counter()
    for token in text.lower().split():
        tally[token] += 1
    return tally

# One group of rows per artist
artist_groups = merged_df.groupby("artist_id")

# Destination folder for the per-artist CSVs (created if it does not exist)
output_dir = "lyrics_by_artist"
os.makedirs(output_dir, exist_ok=True)

# Write one word-frequency CSV per artist
for artist_id, group in artist_groups:
    # Tally words over all of the artist's lyrics, joined with single spaces
    all_lyrics = " ".join(group["song_lyrics"])
    word_counts = count_word_frequencies(all_lyrics)

    # Two-column table, most frequent words first
    word_counts_df = pd.DataFrame(
        word_counts.items(), columns=["word", "frequency"]
    ).sort_values(by="frequency", ascending=False)

    # File name is derived from the artist id
    output_file = os.path.join(output_dir, f"lyrics_{artist_id}.csv")
    word_counts_df.to_csv(output_file, index=False)
    print(f"Saved word counts for artist_id {artist_id} to {output_file}")

print("Processing complete.")


Missing lyrics: 8926
Saved word counts for artist_id 0Y5tJX1MQlPlqiwlOH1tJY to lyrics_by_artist/lyrics_0Y5tJX1MQlPlqiwlOH1tJY.csv
Saved word counts for artist_id 0c173mlxpT3dSFRgMO8XPh to lyrics_by_artist/lyrics_0c173mlxpT3dSFRgMO8XPh.csv
Saved word counts for artist_id 0hCNtLu0JehylgoiP8L4Gh to lyrics_by_artist/lyrics_0hCNtLu0JehylgoiP8L4Gh.csv
Saved word counts for artist_id 137W8MRPWKqSmrBGDBFSop to lyrics_by_artist/lyrics_137W8MRPWKqSmrBGDBFSop.csv
Saved word counts for artist_id 13ubrt8QOOCPljQ2FL1Kca to lyrics_by_artist/lyrics_13ubrt8QOOCPljQ2FL1Kca.csv
Saved word counts for artist_id 181bsRPaVXVlUKXrxwZfHK to lyrics_by_artist/lyrics_181bsRPaVXVlUKXrxwZfHK.csv
Saved word counts for artist_id 1RyvyyTE3xzB2ZywiAwp0i to lyrics_by_artist/lyrics_1RyvyyTE3xzB2ZywiAwp0i.csv
Saved word counts for artist_id 1URnnhqYAYcrqrcwql10ft to lyrics_by_artist/lyrics_1URnnhqYAYcrqrcwql10ft.csv
Saved word counts for artist_id 1ZwdS5xdxEREPySFridCfh to lyrics_by_artist/lyrics_1ZwdS5xdxEREPySFridCfh.cs

# tracks_rapper_with_lyricsからの歌詞取得

In [None]:
# Prints a fixed completion message; nothing else runs in this cell.
# NOTE(review): this repeats the final print of the per-artist export cell and
# looks like a leftover — confirm whether the cell is still needed.
print("Processing complete.")

In [2]:
from collections import Counter
import pandas as pd
import os
import re

In [11]:
import os
import pandas as pd
import re
from collections import Counter

# Input file
input_file = "./output/tracks_rapper_with_lyrics.csv"

# Output directory for the per-artist word counts (created if it does not exist)
output_dir = "artist_lyrics_counts"
os.makedirs(output_dir, exist_ok=True)

# Load the CSV and turn missing lyrics into empty strings
df = pd.read_csv(input_file)
df = df.assign(lyrics=df['lyrics'].fillna(""))

# One group of rows per artist
artist_groups = df.groupby('artist_id')

# Words excluded from the counts (articles, pronouns, common verbs, ...).
# NOTE(review): 'work', 'love', 'hope', 'dream' and 'nine' also appear in the
# category lists returned by get_word_lists(); because they are filtered out here,
# they never reach the files written to artist_lyrics_counts — confirm this is intended.
stop_words = {
    # Articles
    'a', 'an', 'the',

    # Pronouns
    'he', 'she', 'it', 'they', 'your', 'you', 'i', 'we', 'his', 'her', 'their', 'its',
    'me', 'him', 'them', 'us', 'my', 'mine', 'yours', 'ours', 'hers', 'theirs',
    'that', 'this', 'these', 'those', 'who', 'whom', 'which', 'what',

    # Common verbs (forms of "be")
    'be', 'am', 'is', 'are', 'was', 'were', 'been', 'being',

    # Common verbs (others)
    'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'shall', 'should',
    'can', 'could', 'may', 'might', 'must', 'get', 'got', 'go', 'goes', 'gone',
    'going', 'see', 'saw', 'seen', 'make', 'made', 'making', 'say', 'said', 'saying',
    'take', 'took', 'taken', 'taking', 'come', 'came', 'coming', 'think', 'thought',
    'know', 'knew', 'known', 'knowing', 'want', 'wanted', 'wanting', 'give', 'gave',
    'given', 'giving', 'use', 'used', 'using', 'find', 'found', 'finding', 'tell',
    'told', 'telling', 'ask', 'asked', 'asking', 'work', 'worked', 'working', 'call',
    'called', 'calling', 'try', 'tried', 'trying', 'leave', 'left', 'leaving',

    # Other auxiliaries and verbs
    'need', 'ought', 'dare',

    # Prepositions
    'about', 'above', 'across', 'after', 'against', 'along', 'among', 'around',
    'at', 'before', 'behind', 'below', 'beneath', 'beside', 'between', 'beyond',
    'by', 'despite', 'down', 'during', 'except', 'for', 'from', 'in', 'inside',
    'into', 'near', 'of', 'off', 'on', 'onto', 'out', 'outside', 'over',
    'past', 'through', 'throughout', 'to', 'toward', 'under', 'underneath',
    'until', 'up', 'upon', 'with', 'within', 'without',

    # Conjunctions
    'and', 'or', 'but', 'because', 'so', 'yet',

    # Interjections
    'oh', 'wow', 'hey', 'hmm',

    # Other filler words
    'yeah', 'yeahs', 'uh', 'um', 'like', 'just',

    # Overly generic nouns
    'thing', 'things', 'time', 'year', 'day', 'people', 'way',
    'world', 'life', 'hand', 'part', 'child', 'eyes', 'music', 'song',
    'love', 'heart', 'night', 'dream', 'mind', 'light', 'dark', 'voice',
    'sound', 'feel', 'feels', 'feeling', 'feelings', 'hope', 'hopes', 'hopeful',
    'hopefulness', 'hopefully',

    # Number words
    'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten',
}

for artist_id, group in artist_groups:
    # Join every song's lyrics with single spaces and lower-case the result
    all_lyrics = " ".join(group['lyrics']).lower()

    # Tokenize: runs of a-z delimited by word boundaries. Letter runs that touch a
    # digit or underscore are not matched at all, and apostrophes split contractions.
    words = re.findall(r'\b[a-z]+\b', all_lyrics)

    # Count every token that is not a stop word
    word_counts = Counter(word for word in words if word not in stop_words)

    # Two-column table, highest frequency first
    word_counts_df = pd.DataFrame(
        word_counts.items(), columns=['word', 'frequency']
    ).sort_values(by='frequency', ascending=False)

    # Write the table to <output_dir>/<artist_id>_lyrics.csv
    output_file = os.path.join(output_dir, f"{artist_id}_lyrics.csv")
    word_counts_df.to_csv(output_file, index=False)

    print(f"Saved word counts for artist_id {artist_id} to {output_file}")


Saved word counts for artist_id 0Y5tJX1MQlPlqiwlOH1tJY to artist_lyrics_counts/0Y5tJX1MQlPlqiwlOH1tJY_lyrics.csv
Saved word counts for artist_id 0c173mlxpT3dSFRgMO8XPh to artist_lyrics_counts/0c173mlxpT3dSFRgMO8XPh_lyrics.csv
Saved word counts for artist_id 0hCNtLu0JehylgoiP8L4Gh to artist_lyrics_counts/0hCNtLu0JehylgoiP8L4Gh_lyrics.csv
Saved word counts for artist_id 137W8MRPWKqSmrBGDBFSop to artist_lyrics_counts/137W8MRPWKqSmrBGDBFSop_lyrics.csv
Saved word counts for artist_id 13ubrt8QOOCPljQ2FL1Kca to artist_lyrics_counts/13ubrt8QOOCPljQ2FL1Kca_lyrics.csv
Saved word counts for artist_id 181bsRPaVXVlUKXrxwZfHK to artist_lyrics_counts/181bsRPaVXVlUKXrxwZfHK_lyrics.csv
Saved word counts for artist_id 1RyvyyTE3xzB2ZywiAwp0i to artist_lyrics_counts/1RyvyyTE3xzB2ZywiAwp0i_lyrics.csv
Saved word counts for artist_id 1URnnhqYAYcrqrcwql10ft to artist_lyrics_counts/1URnnhqYAYcrqrcwql10ft_lyrics.csv
Saved word counts for artist_id 1ZwdS5xdxEREPySFridCfh to artist_lyrics_counts/1ZwdS5xdxEREPySFr

# 単語の属性判断

In [2]:
# ライブラリのインポート
import os
import pandas as pd

# Definition of the category word lists
def get_word_lists():
    """Return the hard-coded vocabulary for each word category.

    Returns:
        A 5-tuple of lists of lower-case strings, in this order:
        ``(money_words, women_words, drugs_words, crime_words, positive_words)``.

    Notes:
        The lists are not disjoint: "green" is in both the money and drugs
        lists, and "joint" is in both the women and crime lists, so one word
        can be flagged in more than one category.
    """
    # Money-related vocabulary
    money_words = [
        "paper", "dough", "bread", "cream", "cheddar", "cheese", "gouda", "loot", "chips", "stack",
        "rack", "band", "guap", "gwap", "scrilla", "scratch", "ends", "fetti", "mula", "moolah",
        "bucks", "franklins", "benjamins", "ducats", "funds", "cake", "greenbacks", "notes", "knot",
        "roll", "stash", "pesos", "gwalla", "grip", "cabbage", "lettuce", "broccoli", "bankroll", "bones",
        "clams", "spondulicks", "wampum", "bank", "bag", "currency", "bean", "buck", "clam", "fin",
        "bill", "note", "hunnids", "mazuma", "simoleons", "dinero", "quid", "gwop", "guwop", "frogskins",
        "butter", "smackers", "buckaroos", "fins", "green", "yaper", "chavo", "plata", "skrilla", "munny",
        "lolly", "bandz", "chedda", "cheeze", "munnie", "ackers", "nicker", "bee", "score", "monkey",
        "pony", "bred", "bigfaces", "orchard", "chits", "chit", "banknote", "pays", "mints", "units",
        "digits", "stacks", "racks", "bands", "cakes", "fetty", "scrill", "papes", "fetta", "cheddah",
        "mulaa", "guala", "rhino"
    ]
    # Vocabulary referring to women.
    # NOTE(review): "ma’am" contains a typographic apostrophe (U+2019); a tokenizer
    # that emits only a-z runs can never produce it — confirm whether it is needed.
    women_words = [
        "shawty", "shorty", "boo", "bae", "ma", "mami", "mommy", "bitch", "hoe", "ho", "chick",
        "broad", "bird", "wifey", "baby", "honey", "hun", "dime", "baddie", "hottie", "fox", "miss",
        "gyal", "chica", "senorita", "mija", "mamacita", "lady", "queen", "goddess", "dimepiece",
        "honeydip", "biddy", "girlie", "chickadee", "shortie", "thottie", "thot", "groupie", "breezy",
        "shordy", "babygirl", "ripper", "jawn", "joint", "kitten", "cougar", "freak", "slut", "skank",
        "cutie", "cutiepie", "femme", "gal", "homegirl", "chiquita", "stallion", "redbone", "yellowbone",
        "snowbunny", "hunnie", "babe", "babes", "gyaldem", "babymama", "shordie", "shotty", "roni",
        "model", "doll", "angel", "vixen", "princess", "duchess", "shortcake", "bunny", "missy", "lass",
        "maiden", "sister", "wench", "chippy", "videovixen", "thickums", "thickie", "cutey", "shortee",
        "flygirl", "shawt", "shawtina", "shawtee", "shottie", "dame", "queenpin", "foxxy", "ma’am",
        "hon", "babycakes", "sugar", "mamita"
    ]
    # Drug-related vocabulary.
    # NOTE(review): "k2" contains a digit, so a letters-only tokenizer can never
    # produce it — confirm whether it is needed.
    drugs_words = [
        "weed", "bud", "green", "grass", "tree", "loud", "ganja", "kush", "purp", "haze", "chronic",
        "indo", "skunk", "maryjane", "herb", "reefer", "zaza", "pack", "gas", "shrooms", "caps",
        "boomers", "molly", "ecstasy", "beans", "rolls", "lean", "wok", "juice", "xanax", "xanny",
        "bars", "percs", "oxy", "roxies", "vic", "codeine", "promethazine", "yay", "yola", "coke",
        "cola", "fishscale", "powder", "whitegirl", "snow", "crack", "rock", "work", "dope", "boy",
        "brown", "tar", "dogfood", "smack", "junk", "blow", "base", "meth", "crystal", "ice", "glass",
        "tina", "ket", "ketamine", "specialk", "ghb", "nitrous", "whippets", "budder", "wax", "shatter",
        "dab", "hash", "keef", "kief", "spice", "k2", "bathsalts", "flakka", "crank", "speed", "uppers",
        "downers", "bennies", "pharmies", "scripts", "syrup", "drank", "xan", "hydro", "mid", "regs",
        "diesel", "runtz", "biscotti", "mids", "loudpack", "fire", "og"
    ]
    # Crime- and weapon-related vocabulary
    crime_words = [
        "gat", "strap", "heat", "piece", "burner", "hammer", "iron", "toolie", "tool", "nine", "choppa",
        "stick", "blicky", "glizzy", "mac", "draco", "extendo", "clip", "hollows", "bust", "pop", "dump",
        "ride", "lick", "jug", "finesse", "trap", "block", "set", "hood", "ops", "snitch", "rat", "plug",
        "jack", "shank", "blade", "cut", "slide", "fold", "code", "bid", "yard", "cell", "joint", "fed",
        "narcs", "fiveo", "boys", "gang", "mob", "cartel", "homicide", "robbery", "theft", "assault",
        "weapon", "shooter"
    ]
    # Positive / uplifting vocabulary
    positive_words = [
        "love", "bless", "real", "rise", "glow", "shine", "peace", "unity", "strength", "hope", "solid",
        "strong", "motivate", "inspire", "encourage", "power", "growth", "success", "focus", "dream",
        "humble", "gratitude", "community", "elevate", "healing", "kindness", "wisdom", "faith", "thanks",
        "advance", "bright", "future", "balance", "calm", "genuine", "mend", "raise", "bloom", "flourish",
        "prosper", "spark", "aware", "mindful", "thankful", "joyful", "smile", "fearless", "mercy", "clarity"
    ]
    return money_words, women_words, drugs_words, crime_words, positive_words

# Category membership check for a single word
def categorize_word(word, categories):
    """Return 1 if ``word`` belongs to ``categories``, otherwise 0.

    Args:
        word: The value to test. Only ``str`` values can match; anything
            else (e.g. ``NaN`` or ``None``) yields 0.
        categories: A container of lower-case words supporting ``in``.

    Returns:
        ``1`` when the lower-cased word is in ``categories``, else ``0``.
    """
    if not isinstance(word, str):
        return 0
    return int(word.lower() in categories)

# Process every CSV file in a directory
def process_files_in_directory(directory):
    """Add 0/1 category flag columns to every CSV file in ``directory``.

    Each file whose name ends in ``.csv`` is read, gains (or has replaced) the
    columns ``money``, ``women``, ``drugs``, ``crime`` and ``positive`` computed
    from its ``word`` column, and is then overwritten in place. One
    ``Processed: <file name>`` line is printed per file.

    Args:
        directory: Path of the directory to scan (not recursive).

    Raises:
        KeyError: If a CSV file has no ``word`` column.
        OSError: If the directory or a file cannot be read or written.
    """
    money_words, women_words, drugs_words, crime_words, positive_words = get_word_lists()

    # Output column -> vocabulary. Frozensets give constant-time membership tests
    # instead of scanning a list for every row.
    category_vocabularies = {
        'money': frozenset(money_words),
        'women': frozenset(women_words),
        'drugs': frozenset(drugs_words),
        'crime': frozenset(crime_words),
        'positive': frozenset(positive_words),
    }

    # Sorted so files are processed (and reported) in a deterministic order
    for file_name in sorted(os.listdir(directory)):
        if not file_name.endswith(".csv"):
            continue

        file_path = os.path.join(directory, file_name)
        # keep_default_na=False: with pandas' default NA handling, words such as
        # "nan" or "null" are parsed as missing values and would then be written
        # back as empty cells, silently losing the word on overwrite.
        df = pd.read_csv(file_path, keep_default_na=False)

        # One flag column per category
        for column, vocabulary in category_vocabularies.items():
            # vocab is bound as a default argument so each lambda keeps its own set
            df[column] = df['word'].apply(
                lambda x, vocab=vocabulary: categorize_word(x, vocab)
            )

        # Overwrite the source file
        df.to_csv(file_path, index=False)
        print(f"Processed: {file_name}")

# Directory whose CSV files receive the category columns
# (same folder name as the output_dir of the per-artist word-count export)
directory_path = "./artist_lyrics_counts"
process_files_in_directory(directory_path)


Processed: 2p1fiYHYiXz9qi0JJyxBzN_lyrics.csv
Processed: 3q7HBObVc0L8jNeTe5Gofh_lyrics.csv
Processed: 3nFkdlSjzX9mRTtwJOzDYB_lyrics.csv
Processed: 2YZyLoL8N0Wb9xBt1NhZWg_lyrics.csv
Processed: 137W8MRPWKqSmrBGDBFSop_lyrics.csv
Processed: 246dkjvS1zLTtiykXe5h60_lyrics.csv
Processed: 1ZwdS5xdxEREPySFridCfh_lyrics.csv
Processed: 7hJcb9fa4alzcOq3EaNPoG_lyrics.csv
Processed: 6Ip8FS7vWT1uKkJSweANQK_lyrics.csv
Processed: 5f7VJjfbwm532GiveGC0ZK_lyrics.csv
Processed: 5cj0lLjcoR7YOSnhnX0Po5_lyrics.csv
Processed: 6DPYiyq5kWVQS4RGwxzPC7_lyrics.csv
Processed: 20qISvAhX20dpIbOOzGK3q_lyrics.csv
Processed: 4MCBfE4596Uoi2O4DtmEMz_lyrics.csv
Processed: 7dGJo4pcD2V6oG8kP0tJRR_lyrics.csv
Processed: 4kYSro6naA4h99UJvo89HB_lyrics.csv
Processed: 4V8LLVI7PbaPR0K2TGSxFF_lyrics.csv
Processed: 55Aa2cqylxrFIXC767Z865_lyrics.csv
Processed: 13ubrt8QOOCPljQ2FL1Kca_lyrics.csv
Processed: 3TVXtAsR1Inumwj472S9r4_lyrics.csv
Processed: 1URnnhqYAYcrqrcwql10ft_lyrics.csv
Processed: 2SrSdSvpminqmStGELCSNd_lyrics.csv
Processed:

# 歌詞データを分類

In [1]:
import os
import pandas as pd

# Input file
input_file = "./output/tracks_rapper_with_lyrics.csv"

# Output directory for the per-artist lyrics files (created if it does not exist)
output_dir = "all_artist_lyrics"
os.makedirs(output_dir, exist_ok=True)

# Load the CSV and turn missing lyrics into empty strings
df = pd.read_csv(input_file)
df = df.assign(lyrics=df['lyrics'].fillna(""))

# One group of rows per artist
artist_groups = df.groupby('artist_id')

# Dump each artist's rows, unchanged, to all_lyrics_{artist_id}.csv
for artist_id, group in artist_groups:
    output_file = os.path.join(output_dir, f"all_lyrics_{artist_id}.csv")
    group.to_csv(output_file, index=False)
    print(f"Saved lyrics for artist_id {artist_id} to {output_file}")


Saved lyrics for artist_id 0Y5tJX1MQlPlqiwlOH1tJY to all_artist_lyrics/all_lyrics_0Y5tJX1MQlPlqiwlOH1tJY.csv
Saved lyrics for artist_id 0c173mlxpT3dSFRgMO8XPh to all_artist_lyrics/all_lyrics_0c173mlxpT3dSFRgMO8XPh.csv
Saved lyrics for artist_id 0hCNtLu0JehylgoiP8L4Gh to all_artist_lyrics/all_lyrics_0hCNtLu0JehylgoiP8L4Gh.csv
Saved lyrics for artist_id 137W8MRPWKqSmrBGDBFSop to all_artist_lyrics/all_lyrics_137W8MRPWKqSmrBGDBFSop.csv
Saved lyrics for artist_id 13ubrt8QOOCPljQ2FL1Kca to all_artist_lyrics/all_lyrics_13ubrt8QOOCPljQ2FL1Kca.csv
Saved lyrics for artist_id 181bsRPaVXVlUKXrxwZfHK to all_artist_lyrics/all_lyrics_181bsRPaVXVlUKXrxwZfHK.csv
Saved lyrics for artist_id 1RyvyyTE3xzB2ZywiAwp0i to all_artist_lyrics/all_lyrics_1RyvyyTE3xzB2ZywiAwp0i.csv
Saved lyrics for artist_id 1URnnhqYAYcrqrcwql10ft to all_artist_lyrics/all_lyrics_1URnnhqYAYcrqrcwql10ft.csv
Saved lyrics for artist_id 1ZwdS5xdxEREPySFridCfh to all_artist_lyrics/all_lyrics_1ZwdS5xdxEREPySFridCfh.csv
Saved lyrics for ar