In [1]:
# Print the web UI address reported by `sc` (presumably the SparkContext the
# kernel injects — it is not created anywhere in the visible cells; confirm).
print(sc.uiWebUrl)

http://midway3-0172.rcc.local:4041


In [2]:
! squeue -u yjia2

             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
          41323339   caslake _interac    yjia2  R       6:00      1 midway3-0172


In [3]:
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm

# Register a CJK-capable font with matplotlib so Chinese labels render, and
# keep a FontProperties handle (`zh_font`) for explicit per-text use.
font_path = "/home/yjia2/fonts/NotoSansCJKsc-Regular.otf"
fm.fontManager.addfont(font_path)
zh_font = fm.FontProperties(fname=font_path)

plt.rcParams.update({
    "font.family": zh_font.get_name(),   # make the registered font the default
    "axes.unicode_minus": False,         # draw minus signs as ASCII '-'
})

from pyspark.sql import functions as F, types as T
from pyspark.ml.functions import vector_to_array
from pyspark.storagelevel import StorageLevel
from pyspark.ml.feature import Word2VecModel
from mpl_toolkits.mplot3d import Axes3D
from pyspark.ml.feature import Word2Vec
from pyspark.sql import SparkSession
from pyspark.ml.feature import PCA

In [7]:
# Pre-processing
#
# Load the segmented corpus, keep only the text and year columns, and turn
# each document into a list of whitespace-separated tokens.

BASE_PATH = "cleaned_segmented_parquet"

df_raw = spark.read.parquet(BASE_PATH)

df = (
    df_raw
    .withColumnRenamed("文本内容_清洗", "content_cleaned")
    .withColumnRenamed("年份", "year")
    # Project to the two working columns: trimmed text and integer year.
    .select(
        F.trim(F.col("content_cleaned")).alias("content_cleaned"),
        F.col("year").cast("int").alias("year"),
    )
    # Drop rows whose text is missing or empty after trimming.
    .filter(
        F.col("content_cleaned").isNotNull()
        & (F.length("content_cleaned") > 0)
    )
    # Split on runs of whitespace, then discard empty-string tokens.
    .withColumn("tokens", F.split(F.col("content_cleaned"), r"\s+"))
    .withColumn("tokens", F.expr("filter(tokens, x -> x <> '')"))
    # Keep only documents with at least 3 tokens.
    .filter(F.size("tokens") >= 3)
)

# Per-year corpus size: number of documents and total number of tokens.
year_stats = (
    df.groupBy("year")
      .agg(
          F.count("*").alias("n_docs"),
          F.sum(F.size("tokens")).alias("n_tokens"),
      )
      .orderBy("year")
)

# Sanity check: show only the earliest year.
year_stats.show(1, truncate=False)




+----+------+--------+
|year|n_docs|n_tokens|
+----+------+--------+
|1947|10769 |2711874 |
+----+------+--------+
only showing top 1 row



                                                                                

In [8]:
# Word2Vec hyper-parameters, passed verbatim as keyword arguments to
# pyspark.ml.feature.Word2Vec in the training cell.
W2V_PARAMS = dict(
    vectorSize=150,
    windowSize=6,
    minCount=30,
    maxIter=3,          # epochs
    stepSize=0.025,     # learning rate
    # NOTE(review): Spark's Word2Vec documentation advises a small number of
    # partitions for accuracy; 100 partitions on per-year corpora of roughly
    # 8k-40k documents may hurt embedding quality — confirm this is intended.
    numPartitions=100,
    maxSentenceLength=600,
    seed=42,
    inputCol="tokens",
    outputCol="w2v_features"
)

# Output root for the per-year models. The name is derived from the
# hyper-parameters above so that changing one of them cannot silently write
# new models under a directory labelled with the old settings.
BASE_MODEL_DIR = (
    "w2v_models/pd_w2v"
    f"_vs{W2V_PARAMS['vectorSize']}"
    f"_mc{W2V_PARAMS['minCount']}"
    f"_ep{W2V_PARAMS['maxIter']}"
    f"_ws{W2V_PARAMS['windowSize']}"
    "_by_year"
)

In [9]:
# Sorted list of the distinct years present in the corpus.
# - Rows with a null year (e.g. a value that failed the integer cast) are
#   excluded so the training loop never iterates over `None`.
# - The result is collected straight from the DataFrame; a round trip through
#   `.rdd.map` with a Python lambda is unnecessary for a one-column projection.
years = [
    row["year"]
    for row in (
        df.select("year")
          .filter(F.col("year").isNotNull())
          .distinct()
          .orderBy("year")
          .collect()
    )
]

print("Years to train:", years[:10], "... total =", len(years))

[Stage 24:>                                                         (0 + 1) / 1]

Years to train: [1947, 1948, 1949, 1950, 1951, 1952, 1953, 1954, 1955, 1956] ... total = 78


                                                                                

In [10]:
# Train one Word2Vec model per year and save both the model and its word
# vectors under BASE_MODEL_DIR/year=<y>.

MIN_DOCS = 500  # years with fewer documents than this are skipped

# Partition the training corpus the same way Word2Vec is configured, instead
# of repeating the number as a separate literal.
N_TRAIN_PARTITIONS = W2V_PARAMS.get("numPartitions", 100)

for y in years:
    print(f"\n===== Training year {y} =====")

    df_y = (
        df.filter(F.col("year") == y)
          .select("tokens")
    )

    # Cache first, then count on the cached frame: the count both materialises
    # the cache and gives the document count, so the upstream lineage is
    # evaluated once per year instead of twice.
    df_y_train = (
        df_y.repartition(N_TRAIN_PARTITIONS)
            .persist(StorageLevel.MEMORY_AND_DISK)
    )

    try:
        n_docs = df_y_train.count()
        print(f"Docs in {y} = {n_docs}")

        if n_docs < MIN_DOCS:
            print(f"Skip {y} (n_docs < MIN_DOCS={MIN_DOCS})")
            continue

        print(f"{y}: cached training corpus")

        w2v = Word2Vec(**W2V_PARAMS)
        w2v_model = w2v.fit(df_y_train)
        print(f"{y}: model trained")
    finally:
        # Release the cached corpus on every path (skip, success, failure) so
        # a failed fit does not leave it behind in the session.
        df_y_train.unpersist()

    model_dir = f"{BASE_MODEL_DIR}/year={y}"
    print(f"{y}: saving model to {model_dir}")
    w2v_model.write().overwrite().save(model_dir)

    # Also export the vectors as plain parquet inside the model directory.
    vecs = w2v_model.getVectors()   # DataFrame[word: string, vector: vector]
    vecs.write.mode("overwrite").parquet(f"{model_dir}/vectors_parquet")

    vocab_size = vecs.count()
    print(f"{y}: vocab_size = {vocab_size}")

print("\nAll yearly trainings done.")


===== Training year 1947 =====
Docs in 1947 = 10769


                                                                                

1947: cached training corpus


                                                                                

1947: model trained
1947: saving model to w2v_models/pd_w2v_vs150_mc30_ep3_ws6_by_year/year=1947


                                                                                

1947: vocab_size = 10630

===== Training year 1948 =====


                                                                                

Docs in 1948 = 8246


                                                                                

1948: cached training corpus


                                                                                

1948: model trained
1948: saving model to w2v_models/pd_w2v_vs150_mc30_ep3_ws6_by_year/year=1948
1948: vocab_size = 9372

===== Training year 1949 =====


                                                                                

Docs in 1949 = 18956


                                                                                

1949: cached training corpus


                                                                                

1949: model trained
1949: saving model to w2v_models/pd_w2v_vs150_mc30_ep3_ws6_by_year/year=1949
1949: vocab_size = 17189

===== Training year 1950 =====


                                                                                

Docs in 1950 = 19990


                                                                                

1950: cached training corpus


                                                                                

1950: model trained
1950: saving model to w2v_models/pd_w2v_vs150_mc30_ep3_ws6_by_year/year=1950
1950: vocab_size = 18651

===== Training year 1951 =====


                                                                                

Docs in 1951 = 13756


                                                                                

1951: cached training corpus


                                                                                

1951: model trained
1951: saving model to w2v_models/pd_w2v_vs150_mc30_ep3_ws6_by_year/year=1951
1951: vocab_size = 15638

===== Training year 1952 =====


                                                                                

Docs in 1952 = 13381


                                                                                

1952: cached training corpus


                                                                                

1952: model trained
1952: saving model to w2v_models/pd_w2v_vs150_mc30_ep3_ws6_by_year/year=1952
1952: vocab_size = 15649

===== Training year 1953 =====


                                                                                

Docs in 1953 = 12790


                                                                                

1953: cached training corpus


                                                                                

1953: model trained
1953: saving model to w2v_models/pd_w2v_vs150_mc30_ep3_ws6_by_year/year=1953
1953: vocab_size = 15426

===== Training year 1954 =====


                                                                                

Docs in 1954 = 13968


                                                                                

1954: cached training corpus


                                                                                

1954: model trained
1954: saving model to w2v_models/pd_w2v_vs150_mc30_ep3_ws6_by_year/year=1954
1954: vocab_size = 14829

===== Training year 1955 =====


                                                                                

Docs in 1955 = 15612


                                                                                

1955: cached training corpus


                                                                                

1955: model trained
1955: saving model to w2v_models/pd_w2v_vs150_mc30_ep3_ws6_by_year/year=1955
1955: vocab_size = 15561

===== Training year 1956 =====


                                                                                

Docs in 1956 = 23691


                                                                                

1956: cached training corpus


                                                                                

1956: model trained
1956: saving model to w2v_models/pd_w2v_vs150_mc30_ep3_ws6_by_year/year=1956
1956: vocab_size = 18127

===== Training year 1957 =====


                                                                                

Docs in 1957 = 25363


                                                                                

1957: cached training corpus


                                                                                

1957: model trained
1957: saving model to w2v_models/pd_w2v_vs150_mc30_ep3_ws6_by_year/year=1957
1957: vocab_size = 21444

===== Training year 1958 =====


                                                                                

Docs in 1958 = 28012


                                                                                

1958: cached training corpus


                                                                                

1958: model trained
1958: saving model to w2v_models/pd_w2v_vs150_mc30_ep3_ws6_by_year/year=1958
1958: vocab_size = 21781

===== Training year 1959 =====


                                                                                

Docs in 1959 = 22785


                                                                                

1959: cached training corpus


                                                                                

1959: model trained
1959: saving model to w2v_models/pd_w2v_vs150_mc30_ep3_ws6_by_year/year=1959
1959: vocab_size = 22565

===== Training year 1960 =====


                                                                                

Docs in 1960 = 23386


                                                                                

1960: cached training corpus


                                                                                

1960: model trained
1960: saving model to w2v_models/pd_w2v_vs150_mc30_ep3_ws6_by_year/year=1960
1960: vocab_size = 21979

===== Training year 1961 =====


                                                                                

Docs in 1961 = 21957


                                                                                

1961: cached training corpus


                                                                                

1961: model trained
1961: saving model to w2v_models/pd_w2v_vs150_mc30_ep3_ws6_by_year/year=1961
1961: vocab_size = 19853

===== Training year 1962 =====


                                                                                

Docs in 1962 = 17310


                                                                                

1962: cached training corpus


                                                                                

1962: model trained
1962: saving model to w2v_models/pd_w2v_vs150_mc30_ep3_ws6_by_year/year=1962
1962: vocab_size = 16279

===== Training year 1963 =====


                                                                                

Docs in 1963 = 15189


                                                                                

1963: cached training corpus


                                                                                

1963: model trained
1963: saving model to w2v_models/pd_w2v_vs150_mc30_ep3_ws6_by_year/year=1963
1963: vocab_size = 15311

===== Training year 1964 =====


                                                                                

Docs in 1964 = 16606


                                                                                

1964: cached training corpus


                                                                                

1964: model trained
1964: saving model to w2v_models/pd_w2v_vs150_mc30_ep3_ws6_by_year/year=1964
1964: vocab_size = 16595

===== Training year 1965 =====


                                                                                

Docs in 1965 = 16221


                                                                                

1965: cached training corpus


                                                                                

1965: model trained
1965: saving model to w2v_models/pd_w2v_vs150_mc30_ep3_ws6_by_year/year=1965
1965: vocab_size = 15970

===== Training year 1966 =====


                                                                                

Docs in 1966 = 12171


                                                                                

1966: cached training corpus


                                                                                

1966: model trained
1966: saving model to w2v_models/pd_w2v_vs150_mc30_ep3_ws6_by_year/year=1966
1966: vocab_size = 13148

===== Training year 1967 =====


                                                                                

Docs in 1967 = 9274


                                                                                

1967: cached training corpus


                                                                                

1967: model trained
1967: saving model to w2v_models/pd_w2v_vs150_mc30_ep3_ws6_by_year/year=1967
1967: vocab_size = 9744

===== Training year 1968 =====


                                                                                

Docs in 1968 = 9216


                                                                                

1968: cached training corpus


                                                                                

1968: model trained
1968: saving model to w2v_models/pd_w2v_vs150_mc30_ep3_ws6_by_year/year=1968
1968: vocab_size = 10021

===== Training year 1969 =====


                                                                                

Docs in 1969 = 10124


                                                                                

1969: cached training corpus


                                                                                

1969: model trained
1969: saving model to w2v_models/pd_w2v_vs150_mc30_ep3_ws6_by_year/year=1969
1969: vocab_size = 10851

===== Training year 1970 =====


                                                                                

Docs in 1970 = 10570


                                                                                

1970: cached training corpus


                                                                                

1970: model trained
1970: saving model to w2v_models/pd_w2v_vs150_mc30_ep3_ws6_by_year/year=1970
1970: vocab_size = 11448

===== Training year 1971 =====


                                                                                

Docs in 1971 = 10421


                                                                                

1971: cached training corpus


                                                                                

1971: model trained
1971: saving model to w2v_models/pd_w2v_vs150_mc30_ep3_ws6_by_year/year=1971
1971: vocab_size = 10916

===== Training year 1972 =====


                                                                                

Docs in 1972 = 14333


                                                                                

1972: cached training corpus


                                                                                

1972: model trained
1972: saving model to w2v_models/pd_w2v_vs150_mc30_ep3_ws6_by_year/year=1972
1972: vocab_size = 12454

===== Training year 1973 =====


                                                                                

Docs in 1973 = 14845


                                                                                

1973: cached training corpus


                                                                                

1973: model trained
1973: saving model to w2v_models/pd_w2v_vs150_mc30_ep3_ws6_by_year/year=1973
1973: vocab_size = 13151

===== Training year 1974 =====


                                                                                

Docs in 1974 = 12837


                                                                                

1974: cached training corpus


                                                                                

1974: model trained
1974: saving model to w2v_models/pd_w2v_vs150_mc30_ep3_ws6_by_year/year=1974
1974: vocab_size = 12942

===== Training year 1975 =====


                                                                                

Docs in 1975 = 13362


                                                                                

1975: cached training corpus


                                                                                

1975: model trained
1975: saving model to w2v_models/pd_w2v_vs150_mc30_ep3_ws6_by_year/year=1975
1975: vocab_size = 12445

===== Training year 1976 =====


                                                                                

Docs in 1976 = 12262


                                                                                

1976: cached training corpus


                                                                                

1976: model trained
1976: saving model to w2v_models/pd_w2v_vs150_mc30_ep3_ws6_by_year/year=1976
1976: vocab_size = 11610

===== Training year 1977 =====


                                                                                

Docs in 1977 = 12715


                                                                                

1977: cached training corpus


                                                                                

1977: model trained
1977: saving model to w2v_models/pd_w2v_vs150_mc30_ep3_ws6_by_year/year=1977
1977: vocab_size = 12849

===== Training year 1978 =====


                                                                                

Docs in 1978 = 13431


                                                                                

1978: cached training corpus


                                                                                

1978: model trained
1978: saving model to w2v_models/pd_w2v_vs150_mc30_ep3_ws6_by_year/year=1978
1978: vocab_size = 14525

===== Training year 1979 =====


                                                                                

Docs in 1979 = 17537


                                                                                

1979: cached training corpus


                                                                                

1979: model trained
1979: saving model to w2v_models/pd_w2v_vs150_mc30_ep3_ws6_by_year/year=1979
1979: vocab_size = 17176

===== Training year 1980 =====


                                                                                

Docs in 1980 = 27226


                                                                                

1980: cached training corpus


                                                                                

1980: model trained
1980: saving model to w2v_models/pd_w2v_vs150_mc30_ep3_ws6_by_year/year=1980
1980: vocab_size = 20371

===== Training year 1981 =====


                                                                                

Docs in 1981 = 28360


                                                                                

1981: cached training corpus


                                                                                

1981: model trained
1981: saving model to w2v_models/pd_w2v_vs150_mc30_ep3_ws6_by_year/year=1981
1981: vocab_size = 20841

===== Training year 1982 =====


                                                                                

Docs in 1982 = 28324


                                                                                

1982: cached training corpus


                                                                                

1982: model trained
1982: saving model to w2v_models/pd_w2v_vs150_mc30_ep3_ws6_by_year/year=1982
1982: vocab_size = 20705

===== Training year 1983 =====


                                                                                

Docs in 1983 = 31138


                                                                                

1983: cached training corpus


                                                                                

1983: model trained
1983: saving model to w2v_models/pd_w2v_vs150_mc30_ep3_ws6_by_year/year=1983
1983: vocab_size = 20278

===== Training year 1984 =====


                                                                                

Docs in 1984 = 31964


                                                                                

1984: cached training corpus


                                                                                

1984: model trained
1984: saving model to w2v_models/pd_w2v_vs150_mc30_ep3_ws6_by_year/year=1984
1984: vocab_size = 21164

===== Training year 1985 =====


                                                                                

Docs in 1985 = 34299


                                                                                

1985: cached training corpus


                                                                                

1985: model trained
1985: saving model to w2v_models/pd_w2v_vs150_mc30_ep3_ws6_by_year/year=1985
1985: vocab_size = 22854

===== Training year 1986 =====


                                                                                

Docs in 1986 = 33502


                                                                                

1986: cached training corpus


                                                                                

1986: model trained
1986: saving model to w2v_models/pd_w2v_vs150_mc30_ep3_ws6_by_year/year=1986
1986: vocab_size = 23095

===== Training year 1987 =====


                                                                                

Docs in 1987 = 32663


                                                                                

1987: cached training corpus


                                                                                

1987: model trained
1987: saving model to w2v_models/pd_w2v_vs150_mc30_ep3_ws6_by_year/year=1987
1987: vocab_size = 22589

===== Training year 1988 =====


                                                                                

Docs in 1988 = 33530


                                                                                

1988: cached training corpus


                                                                                

1988: model trained
1988: saving model to w2v_models/pd_w2v_vs150_mc30_ep3_ws6_by_year/year=1988
1988: vocab_size = 22579

===== Training year 1989 =====


                                                                                

Docs in 1989 = 30017


                                                                                

1989: cached training corpus


                                                                                

1989: model trained
1989: saving model to w2v_models/pd_w2v_vs150_mc30_ep3_ws6_by_year/year=1989
1989: vocab_size = 21810

===== Training year 1990 =====


                                                                                

Docs in 1990 = 31702


                                                                                

1990: cached training corpus


                                                                                

1990: model trained
1990: saving model to w2v_models/pd_w2v_vs150_mc30_ep3_ws6_by_year/year=1990
1990: vocab_size = 21883

===== Training year 1991 =====


                                                                                

Docs in 1991 = 33818


                                                                                

1991: cached training corpus


                                                                                

1991: model trained
1991: saving model to w2v_models/pd_w2v_vs150_mc30_ep3_ws6_by_year/year=1991
1991: vocab_size = 21012

===== Training year 1992 =====


                                                                                

Docs in 1992 = 37363


                                                                                

1992: cached training corpus


                                                                                

1992: model trained
1992: saving model to w2v_models/pd_w2v_vs150_mc30_ep3_ws6_by_year/year=1992
1992: vocab_size = 21540

===== Training year 1993 =====


                                                                                

Docs in 1993 = 36495


                                                                                

1993: cached training corpus


                                                                                

1993: model trained
1993: saving model to w2v_models/pd_w2v_vs150_mc30_ep3_ws6_by_year/year=1993
1993: vocab_size = 21416

===== Training year 1994 =====


                                                                                

Docs in 1994 = 33985


                                                                                

1994: cached training corpus


                                                                                

1994: model trained
1994: saving model to w2v_models/pd_w2v_vs150_mc30_ep3_ws6_by_year/year=1994
1994: vocab_size = 22984

===== Training year 1995 =====


                                                                                

Docs in 1995 = 37602


                                                                                

1995: cached training corpus


                                                                                

1995: model trained
1995: saving model to w2v_models/pd_w2v_vs150_mc30_ep3_ws6_by_year/year=1995
25/11/13 16:05:00 WARN TaskSetManager: Stage 1540 contains a task of very large size (1060 KiB). The maximum recommended task size is 1000 KiB.
1995: vocab_size = 25984

===== Training year 1996 =====


                                                                                

Docs in 1996 = 37116


                                                                                

1996: cached training corpus


                                                                                

1996: model trained
1996: saving model to w2v_models/pd_w2v_vs150_mc30_ep3_ws6_by_year/year=1996
25/11/13 16:07:44 WARN TaskSetManager: Stage 1571 contains a task of very large size (1047 KiB). The maximum recommended task size is 1000 KiB.
1996: vocab_size = 25683

===== Training year 1997 =====


                                                                                

Docs in 1997 = 34310


                                                                                

1997: cached training corpus


                                                                                

1997: model trained
1997: saving model to w2v_models/pd_w2v_vs150_mc30_ep3_ws6_by_year/year=1997
25/11/13 16:10:15 WARN TaskSetManager: Stage 1602 contains a task of very large size (1052 KiB). The maximum recommended task size is 1000 KiB.
1997: vocab_size = 25806

===== Training year 1998 =====


                                                                                

Docs in 1998 = 33701


                                                                                

1998: cached training corpus


                                                                                

1998: model trained
1998: saving model to w2v_models/pd_w2v_vs150_mc30_ep3_ws6_by_year/year=1998
25/11/13 16:12:48 WARN TaskSetManager: Stage 1633 contains a task of very large size (1060 KiB). The maximum recommended task size is 1000 KiB.
1998: vocab_size = 25987

===== Training year 1999 =====


                                                                                

Docs in 1999 = 34816


                                                                                

1999: cached training corpus


                                                                                

1999: model trained
1999: saving model to w2v_models/pd_w2v_vs150_mc30_ep3_ws6_by_year/year=1999
25/11/13 16:15:33 WARN TaskSetManager: Stage 1664 contains a task of very large size (1037 KiB). The maximum recommended task size is 1000 KiB.
1999: vocab_size = 25434

===== Training year 2000 =====


                                                                                

Docs in 2000 = 34707


                                                                                

2000: cached training corpus


                                                                                

2000: model trained
2000: saving model to w2v_models/pd_w2v_vs150_mc30_ep3_ws6_by_year/year=2000
25/11/13 16:18:19 WARN TaskSetManager: Stage 1695 contains a task of very large size (1033 KiB). The maximum recommended task size is 1000 KiB.
2000: vocab_size = 25338

===== Training year 2001 =====


                                                                                

Docs in 2001 = 35614


                                                                                

2001: cached training corpus


                                                                                

2001: model trained
2001: saving model to w2v_models/pd_w2v_vs150_mc30_ep3_ws6_by_year/year=2001
25/11/13 16:20:55 WARN TaskSetManager: Stage 1726 contains a task of very large size (1018 KiB). The maximum recommended task size is 1000 KiB.
2001: vocab_size = 24989

===== Training year 2002 =====


                                                                                

Docs in 2002 = 34787


                                                                                

2002: cached training corpus


                                                                                

2002: model trained
2002: saving model to w2v_models/pd_w2v_vs150_mc30_ep3_ws6_by_year/year=2002
25/11/13 16:23:38 WARN TaskSetManager: Stage 1757 contains a task of very large size (1007 KiB). The maximum recommended task size is 1000 KiB.
2002: vocab_size = 24692

===== Training year 2003 =====


                                                                                

Docs in 2003 = 39173


                                                                                

2003: cached training corpus


                                                                                

2003: model trained
2003: saving model to w2v_models/pd_w2v_vs150_mc30_ep3_ws6_by_year/year=2003
25/11/13 16:27:03 WARN TaskSetManager: Stage 1788 contains a task of very large size (1198 KiB). The maximum recommended task size is 1000 KiB.
2003: vocab_size = 29406

===== Training year 2004 =====


                                                                                

Docs in 2004 = 38535


                                                                                

2004: cached training corpus


                                                                                

2004: model trained
2004: saving model to w2v_models/pd_w2v_vs150_mc30_ep3_ws6_by_year/year=2004
25/11/13 16:30:16 WARN TaskSetManager: Stage 1819 contains a task of very large size (1184 KiB). The maximum recommended task size is 1000 KiB.
2004: vocab_size = 29050

===== Training year 2005 =====


                                                                                

Docs in 2005 = 36653


                                                                                

2005: cached training corpus


                                                                                

2005: model trained
2005: saving model to w2v_models/pd_w2v_vs150_mc30_ep3_ws6_by_year/year=2005
25/11/13 16:33:49 WARN TaskSetManager: Stage 1850 contains a task of very large size (1168 KiB). The maximum recommended task size is 1000 KiB.
2005: vocab_size = 28664

===== Training year 2006 =====


                                                                                

Docs in 2006 = 36454


                                                                                

2006: cached training corpus


                                                                                

2006: model trained
2006: saving model to w2v_models/pd_w2v_vs150_mc30_ep3_ws6_by_year/year=2006
25/11/13 16:37:32 WARN TaskSetManager: Stage 1881 contains a task of very large size (1168 KiB). The maximum recommended task size is 1000 KiB.
2006: vocab_size = 28658

===== Training year 2007 =====


                                                                                

Docs in 2007 = 35574


                                                                                

2007: cached training corpus


                                                                                

2007: model trained
2007: saving model to w2v_models/pd_w2v_vs150_mc30_ep3_ws6_by_year/year=2007
25/11/13 16:40:36 WARN TaskSetManager: Stage 1912 contains a task of very large size (1152 KiB). The maximum recommended task size is 1000 KiB.
2007: vocab_size = 28285

===== Training year 2008 =====


                                                                                

Docs in 2008 = 32226


                                                                                

2008: cached training corpus


                                                                                

2008: model trained
2008: saving model to w2v_models/pd_w2v_vs150_mc30_ep3_ws6_by_year/year=2008
25/11/13 16:43:47 WARN TaskSetManager: Stage 1943 contains a task of very large size (1098 KiB). The maximum recommended task size is 1000 KiB.
2008: vocab_size = 26939

===== Training year 2009 =====


                                                                                

Docs in 2009 = 33795


                                                                                

2009: cached training corpus


                                                                                

2009: model trained
2009: saving model to w2v_models/pd_w2v_vs150_mc30_ep3_ws6_by_year/year=2009
25/11/13 16:47:57 WARN TaskSetManager: Stage 1974 contains a task of very large size (1175 KiB). The maximum recommended task size is 1000 KiB.
2009: vocab_size = 28802

===== Training year 2010 =====


                                                                                

Docs in 2010 = 42658


                                                                                

2010: cached training corpus


                                                                                

2010: model trained
2010: saving model to w2v_models/pd_w2v_vs150_mc30_ep3_ws6_by_year/year=2010
25/11/13 16:52:28 WARN TaskSetManager: Stage 2005 contains a task of very large size (1364 KiB). The maximum recommended task size is 1000 KiB.
2010: vocab_size = 33482

===== Training year 2011 =====


                                                                                

Docs in 2011 = 39661


                                                                                

2011: cached training corpus


                                                                                

2011: model trained
2011: saving model to w2v_models/pd_w2v_vs150_mc30_ep3_ws6_by_year/year=2011
25/11/13 16:57:40 WARN TaskSetManager: Stage 2036 contains a task of very large size (1402 KiB). The maximum recommended task size is 1000 KiB.
2011: vocab_size = 34427

===== Training year 2012 =====


                                                                                

Docs in 2012 = 37041


                                                                                

2012: cached training corpus


                                                                                

2012: model trained
2012: saving model to w2v_models/pd_w2v_vs150_mc30_ep3_ws6_by_year/year=2012
25/11/13 17:02:05 WARN TaskSetManager: Stage 2067 contains a task of very large size (1344 KiB). The maximum recommended task size is 1000 KiB.
2012: vocab_size = 32990

===== Training year 2013 =====


                                                                                

Docs in 2013 = 40442


                                                                                

2013: cached training corpus


                                                                                

2013: model trained
2013: saving model to w2v_models/pd_w2v_vs150_mc30_ep3_ws6_by_year/year=2013


                                                                                

25/11/13 17:07:12 WARN TaskSetManager: Stage 2098 contains a task of very large size (1440 KiB). The maximum recommended task size is 1000 KiB.
2013: vocab_size = 35342

===== Training year 2014 =====


                                                                                

Docs in 2014 = 40244


                                                                                

2014: cached training corpus


                                                                                

2014: model trained
2014: saving model to w2v_models/pd_w2v_vs150_mc30_ep3_ws6_by_year/year=2014
25/11/13 17:11:32 WARN TaskSetManager: Stage 2129 contains a task of very large size (1446 KiB). The maximum recommended task size is 1000 KiB.
2014: vocab_size = 35515

===== Training year 2015 =====


                                                                                

Docs in 2015 = 38856


                                                                                

2015: cached training corpus


                                                                                

2015: model trained
2015: saving model to w2v_models/pd_w2v_vs150_mc30_ep3_ws6_by_year/year=2015


                                                                                

25/11/13 17:16:11 WARN TaskSetManager: Stage 2160 contains a task of very large size (1439 KiB). The maximum recommended task size is 1000 KiB.
2015: vocab_size = 35304

===== Training year 2016 =====


                                                                                

Docs in 2016 = 37060


                                                                                

2016: cached training corpus


                                                                                

2016: model trained
2016: saving model to w2v_models/pd_w2v_vs150_mc30_ep3_ws6_by_year/year=2016
25/11/13 17:21:11 WARN TaskSetManager: Stage 2191 contains a task of very large size (1409 KiB). The maximum recommended task size is 1000 KiB.
2016: vocab_size = 34561

===== Training year 2017 =====


                                                                                

Docs in 2017 = 35296


                                                                                

2017: cached training corpus


                                                                                

2017: model trained
2017: saving model to w2v_models/pd_w2v_vs150_mc30_ep3_ws6_by_year/year=2017
25/11/13 17:25:46 WARN TaskSetManager: Stage 2222 contains a task of very large size (1347 KiB). The maximum recommended task size is 1000 KiB.
2017: vocab_size = 33087

===== Training year 2018 =====


                                                                                

Docs in 2018 = 33780


                                                                                

2018: cached training corpus


                                                                                

2018: model trained
2018: saving model to w2v_models/pd_w2v_vs150_mc30_ep3_ws6_by_year/year=2018
25/11/13 17:30:34 WARN TaskSetManager: Stage 2253 contains a task of very large size (1341 KiB). The maximum recommended task size is 1000 KiB.
2018: vocab_size = 32903

===== Training year 2019 =====


                                                                                

Docs in 2019 = 24557


                                                                                

2019: cached training corpus


                                                                                

2019: model trained
2019: saving model to w2v_models/pd_w2v_vs150_mc30_ep3_ws6_by_year/year=2019
25/11/13 17:33:53 WARN TaskSetManager: Stage 2284 contains a task of very large size (1124 KiB). The maximum recommended task size is 1000 KiB.
2019: vocab_size = 27564

===== Training year 2020 =====


                                                                                

Docs in 2020 = 25675


                                                                                

2020: cached training corpus


                                                                                

2020: model trained
2020: saving model to w2v_models/pd_w2v_vs150_mc30_ep3_ws6_by_year/year=2020
25/11/13 17:37:18 WARN TaskSetManager: Stage 2315 contains a task of very large size (1104 KiB). The maximum recommended task size is 1000 KiB.
2020: vocab_size = 27086

===== Training year 2021 =====


                                                                                

Docs in 2021 = 23474


                                                                                

2021: cached training corpus


                                                                                

2021: model trained
2021: saving model to w2v_models/pd_w2v_vs150_mc30_ep3_ws6_by_year/year=2021
25/11/13 17:40:51 WARN TaskSetManager: Stage 2346 contains a task of very large size (1069 KiB). The maximum recommended task size is 1000 KiB.
2021: vocab_size = 26239

===== Training year 2022 =====


                                                                                

Docs in 2022 = 22248


                                                                                

2022: cached training corpus


                                                                                

2022: model trained
2022: saving model to w2v_models/pd_w2v_vs150_mc30_ep3_ws6_by_year/year=2022
2022: vocab_size = 24397

===== Training year 2023 =====


                                                                                

Docs in 2023 = 24397


                                                                                

2023: cached training corpus


                                                                                

2023: model trained
2023: saving model to w2v_models/pd_w2v_vs150_mc30_ep3_ws6_by_year/year=2023
25/11/13 17:47:42 WARN TaskSetManager: Stage 2408 contains a task of very large size (1061 KiB). The maximum recommended task size is 1000 KiB.
2023: vocab_size = 26018

===== Training year 2024 =====


                                                                                

Docs in 2024 = 17096


                                                                                

2024: cached training corpus


                                                                                

2024: model trained
2024: saving model to w2v_models/pd_w2v_vs150_mc30_ep3_ws6_by_year/year=2024
2024: vocab_size = 19852

All yearly trainings done.


In [12]:
from pyspark.ml.feature import Word2VecModel

# Root directory of the saved per-year Word2Vec models; each year's model is
# read from a `year=<YYYY>` subdirectory beneath it.
BASE_MODEL_DIR = "w2v_models/pd_w2v_vs150_mc30_ep3_ws6_by_year"

def show_neighbors_for_year(year: int, word: str, topn: int = 20):
    """
    Print the nearest neighbours of `word` according to one year's Word2Vec model.

    The model is loaded from ``{BASE_MODEL_DIR}/year={year}``. A header line is
    printed first; then either the `topn` closest words are shown as a table,
    or, when `word` is missing from that model's vocabulary, a short notice is
    printed instead. The function returns nothing.

    Parameters
    ----------
    year : int
        Year whose saved model should be loaded.
    word : str
        Query token to look up.
    topn : int, default 20
        Number of neighbours requested from ``findSynonyms``.
    """
    print(f"\n=== Year {year} | word = {word} ===")

    yearly_model = Word2VecModel.load(f"{BASE_MODEL_DIR}/year={year}")

    # Probe the vocabulary before querying, so an unknown word produces a
    # message rather than an error from the synonym lookup.
    vocab = yearly_model.getVectors()
    hits = vocab.where(vocab["word"] == word).limit(1).count()

    if hits > 0:
        yearly_model.findSynonyms(word, topn).show(truncate=False)
    else:
        print(f"'{word}' not in vocabulary for year {year}.")

# Example: print the neighbours of one word for a few sample years
for y in (1950, 1978, 1992, 2008, 2018):
    show_neighbors_for_year(year=y, word="人民", topn=15)



=== Year 1950 | word = 人民 ===


                                                                                

+--------+-------------------+
|word    |similarity         |
+--------+-------------------+
|万众一心|0.5745235085487366 |
|正义事业|0.5567136406898499 |
|自由民主|0.5509663820266724 |
|死敌    |0.5310417413711548 |
|人民意志|0.5274190306663513 |
|正义    |0.5226737260818481 |
|各族人民|0.5216115117073059 |
|英勇斗争|0.5147422552108765 |
|不可动摇|0.5076724290847778 |
|爱好和平|0.5045686960220337 |
|正义战争|0.5042296051979065 |
|解放事业|0.5029891133308411 |
|深信    |0.5010573863983154 |
|坚信    |0.49985629320144653|
|亿万人民|0.49916237592697144|
+--------+-------------------+


=== Year 1978 | word = 人民 ===
+--------+------------------+
|word    |similarity        |
+--------+------------------+
|正义事业|0.6518587470054626|
|同情    |0.5929450988769531|
|始终不渝|0.5833016633987427|
|解放事业|0.5722860097885132|
|珍视    |0.5614731907844543|
|声援    |0.5534377098083496|
|愿望    |0.5524501800537109|
|正义    |0.5487756133079529|
|正义斗争|0.5425110459327698|
|衷心祝愿|0.5412216186523438|
|昌盛    |0.5387455224990845|
|民族解放|0.5379835963249207|
|坚信    |0.5296506