In [113]:
import pandas as pd


# Location of the line-parsed lyric word table (CSV on the repository's `main` branch).
# NOTE(review): `main` is a moving ref, so a re-run may fetch different data.
url = "https://raw.githubusercontent.com/JunHCha/K-POP-Lyrics-1964-2020/main/data/lyrics_original/lyric_word_parsed_by_line.csv"

# Download and parse the CSV into the frame every later cell starts from.
words_df = pd.read_csv(filepath_or_buffer=url, encoding="utf-8")


In [114]:
# Number of distinct songs per year, used later as the normalisation denominator.
# `song_id` is selected *before* aggregating so that only this one column is
# counted, instead of computing nunique for every column and discarding the rest.
# dropna=False keeps the original behaviour: a missing song_id counts as one
# distinct value.
song_count_df = (
    words_df.groupby("year")["song_id"]
    .nunique(dropna=False)
    .to_frame(name="song_count_by_year")
)
song_count_df


Unnamed: 0_level_0,song_count_by_year
year,Unnamed: 1_level_1
1964,19
1965,10
1966,24
1967,35
1968,26
1969,30
1970,21
1971,26
1972,39
1973,35


In [115]:
# Attach each year's distinct-song total (song_count_df is keyed by "year")
# to every row of words_df.
words_with_song_count_df = words_df.merge(song_count_df, on="year", how="inner")


# For every (word, tag_orig, year):
#   count              - number of rows with that word in that year
#                        (presumably one row per word occurrence - TODO confirm)
#   song_count_by_year - that year's distinct-song total (constant within the group)
#   ratio              - count / song_count_by_year
#   mean_ratio         - mean of `ratio` over the word's rows, broadcast back to each row
# NOTE(review): mean_ratio averages only over years in which the word occurs at
# all; years without the word are not counted as 0, so a word seen in a single
# year can outrank words used steadily over many years. Confirm this is intended.
words_by_year_df = (
    words_with_song_count_df.groupby(["word", "tag_orig", "year"])
    .agg(
        count=("word", "count"),
        song_count_by_year=("song_count_by_year", "last"),
    )
    .reset_index()
    .assign(ratio=lambda df: df["count"] / df["song_count_by_year"])
    .assign(
        mean_ratio=lambda df: df.groupby(["word", "tag_orig"])["ratio"].transform(
            "mean"
        )
    )
    .sort_values("mean_ratio", ascending=False)
)
words_by_year_df


Unnamed: 0,word,tag_orig,year,count,song_count_by_year,ratio,mean_ratio
34360,사랑,NNG,1994,172,83,2.072289,2.475246
34348,사랑,NNG,1982,108,37,2.918919,2.475246
34357,사랑,NNG,1991,181,72,2.513889,2.475246
34356,사랑,NNG,1990,166,78,2.128205,2.475246
34355,사랑,NNG,1989,238,84,2.833333,2.475246
...,...,...,...,...,...,...,...
52831,잊어가나,VV,2005,1,99,0.010101,0.010101
17660,되삼키,VV,2002,1,99,0.010101,0.010101
7034,기복,NNG,2002,1,99,0.010101,0.010101
65785,한목숨,NNG,2002,1,99,0.010101,0.010101


In [125]:
# Collapse the per-year rows to one row per (word, tag_orig) - mean_ratio is
# already constant within each pair - then keep the 100 highest-scoring words
# tagged "NNG" (presumably the general-noun POS tag - TODO confirm) and rank them.
# `rank` uses pandas' default tie handling, so tied words share an averaged
# (possibly fractional) rank.
top_100_words_df = (
    words_by_year_df.groupby(["word", "tag_orig"])
    .agg(mean_ratio=("mean_ratio", "last"))
    .reset_index()
    .sort_values("mean_ratio", ascending=False)
    .loc[lambda df: df["tag_orig"] == "NNG"]
    .nlargest(n=100, columns="mean_ratio")
    .reset_index(drop=True)
    .assign(rank=lambda df: df["mean_ratio"].rank(ascending=False))
)
top_100_words_df


Unnamed: 0,word,tag_orig,mean_ratio,rank
0,사랑,NNG,2.475246,1.0
1,말,NNG,1.096001,2.0
2,목화밭,NNG,0.838710,3.0
3,마음,NNG,0.773363,4.0
4,바야,NNG,0.666667,5.0
...,...,...,...,...
95,물보라,NNG,0.164384,96.0
96,순간,NNG,0.163475,97.0
97,얼굴,NNG,0.163051,98.0
98,종이학,NNG,0.162162,99.0


In [143]:
# Build a year x word table of normalised usage (`ratio`) for the top-100 words.
#
# * Only the key columns of top_100_words_df are merged in: they act as a filter
#   on words_by_year_df, and nothing else from that frame reaches the result.
# * `on` is a list of labels (the documented form) rather than a tuple.
# * pivot_table receives explicit labels. Previously the whole DataFrame was
#   passed as `values` and a Series as `columns`, which only worked because
#   pandas coerces the frame to its column labels and recognises the Series as
#   one of the frame's own columns.
# * `values` stays a list so the result keeps its two-level ("ratio", word)
#   column index, i.e. the exported table layout is unchanged.
# * No pre-sort is needed: pivot_table groups and sorts by year and word itself.
# * Years in which a word does not occur become 0 instead of NaN.
words_by_year_top_100_df = (
    top_100_words_df[["word", "tag_orig"]]
    .merge(words_by_year_df, on=["word", "tag_orig"], how="inner")
    .pivot_table(values=["ratio"], index="year", columns="word")
    .fillna(0)
)


In [146]:
# Export the pivoted year x word table as CSV. The path is relative to the
# notebook's working directory, and the target directory must already exist.
output_csv_path = "../data/lyrics_count/lyric_word_counted_top_100_NORM.csv"
words_by_year_top_100_df.to_csv(output_csv_path)