In [2]:
import pandas as pd
import datetime as dt

# Remote CSV holding one row per (noun, predicate) pair that co-occurs in a lyric line.
url = "https://raw.githubusercontent.com/JunHCha/K-POP-Lyrics-1964-2020/main/data/lyrics_cooccured/cooccured_words_by_line.csv"

raw_words = pd.read_csv(url, encoding="utf-8")
# Drop the first CSV column (presumably a saved row index -- confirm against the file).
words = raw_words.iloc[:, 1:]
words


Unnamed: 0,genre,year,song_id,title,line,noun,predicate
0,성인가요,1964.0,4083218.0,내일또 만납시다,0.0,하루(N),끝내(P)
1,성인가요,1964.0,4083218.0,내일또 만납시다,0.0,하루(N),돌아가(P)
2,성인가요,1964.0,4083218.0,내일또 만납시다,0.0,일(N),끝내(P)
3,성인가요,1964.0,4083218.0,내일또 만납시다,0.0,일(N),돌아가(P)
4,성인가요,1964.0,4083218.0,내일또 만납시다,2.0,하늘(N),반짝이(P)
...,...,...,...,...,...,...,...
182946,발라드,2020.0,32998018.0,힘든 건 사랑이 아니다,15.0,그늘(N),받(P)
182947,발라드,2020.0,32998018.0,힘든 건 사랑이 아니다,15.0,사랑(N),가리(P)
182948,발라드,2020.0,32998018.0,힘든 건 사랑이 아니다,15.0,사랑(N),받(P)
182949,발라드,2020.0,32998018.0,힘든 건 사랑이 아니다,15.0,미안(N),가리(P)


In [4]:
# Build, for every year in `years`, the weighted noun -> predicate edge list and
# keep only the heaviest edges of that year.
#
# Weight of an edge = number of rows in `words` for that year that share the
# same (noun, predicate) pair. An edge is kept when its weight is at or above
# the per-year WEIGHT_QUANTILE quantile of all edge weights.
years = list(range(1985, 2021))
WEIGHT_QUANTILE = 0.95
EDGE_COLUMNS = ["Year", "Source", "Target", "Weight"]

edges_per_year = []
for year in years:
    words_in_year = words[words["year"] == year]

    # Vectorised pair counting. groupby sorts by (noun, predicate) and, by
    # default, ignores rows where either key is missing.
    edges = (
        words_in_year.groupby(["noun", "predicate"])
        .size()
        .reset_index(name="Weight")
        .rename(columns={"noun": "Source", "predicate": "Target"})
    )
    if edges.empty:
        # No usable rows for this year: there is no quantile to compute and
        # nothing to contribute, so skip instead of failing on a missing column.
        continue

    threshold = edges["Weight"].quantile(WEIGHT_QUANTILE)
    edges = edges.loc[edges["Weight"] >= threshold].reset_index(drop=True)
    edges["Year"] = year
    edges = edges[EDGE_COLUMNS]
    # The row index restarts at 0 for every year and is exported as "ids".
    edges.index.names = ["ids"]
    edges_per_year.append(edges)

# Concatenate once at the end; growing a frame with concat inside the loop
# copies all previously accumulated rows on every iteration.
if edges_per_year:
    edges_filtered = pd.concat(edges_per_year)
else:
    edges_filtered = pd.DataFrame(columns=EDGE_COLUMNS)
    edges_filtered.index.names = ["ids"]

edges_filtered


Unnamed: 0_level_0,Year,Source,Target,Weight
ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,1985,가슴(N),그리(P),4
1,1985,가슴(N),남(P),5
2,1985,가슴(N),뜨겁(P),4
3,1985,가슴(N),작(P),6
4,1985,가슴(N),저리(P),4
...,...,...,...,...
119,2020,함성(N),멎(P),4
120,2020,햇살(N),같(P),5
121,2020,회상(N),멀(P),4
122,2020,후회(N),남(P),4


In [5]:
# Write the filtered edge list to disk; the "ids" index is written as the
# first column because index=True is the to_csv default.
edges_csv_path = "../data/lyrics_cooccured/cooccured_weighted_edges_1985_2020.csv"
edges_filtered.to_csv(edges_csv_path, encoding="utf-8-sig")


In [7]:
# For every node (edge endpoint) gather the distinct years in which it appears
# in `edges_filtered`, either as a Source or as a Target.
years_by_node = {}
for edge in edges_filtered.itertuples():
    # Source is registered before Target so first-seen key order is kept.
    for endpoint in (edge.Source, edge.Target):
        years_by_node.setdefault(endpoint, set()).add(edge.Year)

# Final shape: {node: {"Year": [ascending distinct years]}}
node_dict = {
    endpoint: {"Year": sorted(seen_years)}
    for endpoint, seen_years in years_by_node.items()
}


def divide_years_list(list_to_divide):
    """Split a list of years into runs of neighbouring values.

    A new run is opened whenever a value exceeds the value before it by more
    than 1; otherwise the value is appended to the current run.

    Args:
        list_to_divide: Years in the order they should be scanned.

    Returns:
        A list of runs, each run being a list of years. Empty input gives [].
    """
    runs = []
    previous = None
    for position, year in enumerate(list_to_divide):
        opens_new_run = position == 0 or year - previous > 1
        if opens_new_run:
            runs.append([year])
        else:
            runs[-1].append(year)
        previous = year
    return runs


def years_to_interval(divided_list):
    """Render runs of years as a single interval string.

    Each run becomes "[first,last];" (a one-element run repeats its value),
    and the concatenated pieces are wrapped in "<" and ">".

    Args:
        divided_list: List of non-empty runs, e.g. from divide_years_list.

    Returns:
        The interval string, e.g. "<[1985,1986];[1988,1988];>".
    """
    pieces = [f"[{run[0]},{run[-1]}];" for run in divided_list]
    return "<" + "".join(pieces) + ">"


# Replace each node's list of years with its interval-string representation.
for node, value in node_dict.items():
    year_runs = divide_years_list(value["Year"])
    value["Year"] = years_to_interval(year_runs)


In [9]:
def node_dict_to_df(node_dict):
    """Turn the node mapping into a three-column node table.

    Args:
        node_dict: Mapping of node name to a dict holding a "Year" entry.

    Returns:
        DataFrame with columns "Id" and "Label" (both the node name) and
        "Interval" (the node's "Year" entry), one row per node in mapping order.
    """
    node_names = list(node_dict)
    intervals = [entry["Year"] for entry in node_dict.values()]
    return pd.DataFrame(
        {"Id": node_names, "Label": list(node_names), "Interval": intervals}
    )


# Write the node table to disk without the row index.
nodes_csv_path = "../data/lyrics_cooccured/cooccured_nodes_1985_2020.csv"
nodes_table = node_dict_to_df(node_dict)
nodes_table.to_csv(nodes_csv_path, encoding="utf-8-sig", index=False)
