### Install PySpark on Kaggle

In [3]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.4.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.4-py2.py3-none-any.whl size=317849765 sha256=c24f9898ad04125c7b50caa02f45c86e72412d623526a1b784f8aad25ab4aec1
  Stored in directory: /root/.cache/pip/wheels/d9/1c/98/31e395a42d1735d18d42124971ecbbade844b50bb9845b6f4a
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.4


# Import necessary libraries


In [40]:
from pyspark.sql import SparkSession
import time
from operator import add
from collections import Counter
from nltk.util import ngrams


# Part 1

In [16]:
start_time = time.time()

spark =  SparkSession\
        .builder\
        .appName("Shahname")\
        .getOrCreate()

# Load Shahname
input_path = "/kaggle/input/dmls-ca3/shahname.txt"

lines = spark.read.text(input_path).rdd.map(lambda r: r[0])

# Count lines
total_verses = lines.count()
print(f"Total Verses: {total_verses}")

# Lines to words
counts = lines.flatMap(lambda x: x.split()) \
              .map(lambda x: (x, 1)) \
              .reduceByKey(add)
# Count words
total_words = counts.map(lambda x: x[1]).sum()
unique_words = counts.count()

print(f"Total Words: {int(total_words)}")
print(f"Unique Words: {unique_words}")
spark.stop()

end_time = time.time()
print(f"Time: {end_time - start_time:.2f} seconds")

Total Verses: 51580
Total Words: 570849
Unique Words: 18103
Time: 2.87 seconds


# Without Spark

In [14]:
start_time = time.time()

with open("/kaggle/input/dmls-ca3/shahname.txt", "r", encoding="utf-8") as file:
    lines = file.readlines()

total_verses = len(lines)
print(f"Total Verses: {total_verses}")

words = [word for line in lines for word in line.split()]

total_words = len(words)
print(f"Total Words: {total_words}")

end_time = time.time()
print(f"Time: {end_time - start_time:.2f} seconds")

Total Verses: 51580
Total Words: 570849
Time: 0.15 seconds


# Part 2

In [17]:
def extract_rhyme(line):
    if line.strip():  
        words = line.split()
        return words[-1]  # last word
    return None  

In [22]:
start_time = time.time()

spark =  SparkSession\
        .builder\
        .appName("Ghafieh")\
        .getOrCreate()

# Load Shahname
input_path = "/kaggle/input/dmls-ca3/shahname.txt"

lines = spark.read.text(input_path).rdd.map(lambda row: row[0])

# Last words
rhymes = lines.map(extract_rhyme)

# Count 
counts = rhymes.filter(lambda rhyme: rhyme is not None) \
                     .map(lambda rhyme: (rhyme, 1)) \
                     .reduceByKey(add)

# Top 10
top_rhymes = counts.takeOrdered(10, key=lambda x: -x[1])

print("Top 10 Frequent Rhymes:")
for rhyme, count in top_rhymes:
    print(f"{rhyme}: {count}")

spark.stop()

end_time = time.time()
print(f"Time: {end_time - start_time:.2f} seconds")

Top 10 Frequent Rhymes:
بود: 881
سپاه: 632
راه: 518
شاه: 463
اوی: 423
کرد: 412
را: 385
روی: 381
شد: 363
زمین: 338
Time: 2.72 seconds


### Without Spark

In [24]:
start_time = time.time()

# Load the text file
with open(input_path, "r", encoding="utf-8") as file:
    lines = file.readlines()

rhymes = [extract_rhyme(line) for line in lines if extract_rhyme(line) is not None]

rhyme_counts = Counter(rhymes)

top_rhymes = rhyme_counts.most_common(10)

print("Top 10 Frequent Rhymes:")
for rhyme, count in top_rhymes:
    print(f"{rhyme}: {count}")

end_time = time.time()
print(f"Time: {end_time - start_time:.2f} seconds")

Top 10 Frequent Rhymes:
بود: 881
سپاه: 632
راه: 518
شاه: 463
اوی: 423
کرد: 412
را: 385
روی: 381
شد: 363
زمین: 338
Time: 0.17 seconds


# Part 3

### Function for 3-gram

In [38]:
def map_to_trigrams(line):
    words = line.split()  
    trigrams = []  
    if len(words) < 3:  
        return trigrams
    for i in range(len(words) - 2):  
        trigram = words[i] + " " + words[i+1] + " " + words[i+2]
        trigrams.append((trigram, 1))  
    return trigrams

In [48]:
start_time = time.time()

spark =  SparkSession\
        .builder\
        .appName("trigram")\
        .getOrCreate()

# Load Shahname
input_path = "/kaggle/input/dmls-ca3/shahname.txt"
lines = spark.read.text(input_path).rdd.map(lambda row: row[0])
lines = lines.filter(lambda line: line.strip() != "")

trigram_counts = lines.flatMap(map_to_trigrams) \
                      .reduceByKey(add)

total_trigrams = trigram_counts.count()
print(f"Total 3-grams: {total_trigrams}")

# Top 10 
top_trigrams = trigram_counts.takeOrdered(10, key=lambda x: -x[1])
print("Top 10 Frequent 3-grams:")
for trigram, count in top_trigrams:
    print(f"{trigram}: {count}")

spark.stop()

end_time = time.time()
print(f"Time: {end_time - start_time:.2f} seconds")

Total 3-grams: 363060
Top 10 Frequent 3-grams:
چنین داد پاسخ: 390
داد پاسخ که: 283
چنین گفت با: 188
چنین گفت کای: 185
مر او را: 165
بدو گفت کای: 155
تاج و تخت: 145
ز هر سو: 126
نشست از بر: 124
فرود آمد از: 114
Time: 4.68 seconds


### Using NLTK

In [50]:
# Start measuring time
start_time = time.time()

# Load Shahname text file
with open(input_path, "r", encoding="utf-8") as file:
    lines = file.readlines()

all_trigrams = []
for line in lines:
    words = line.split()
    if len(words) >= 3: 
        trigrams = ngrams(words, 3)  
        all_trigrams.extend(trigrams)

trigram_counts = Counter(all_trigrams)

total_trigrams = sum(trigram_counts.values())
print(f"Total 3-grams: {total_trigrams}")

top_trigrams = trigram_counts.most_common(10)
print("Top 10 Frequent 3-grams:")
for trigram, count in top_trigrams:
    print(f"{' '.join(trigram)}: {count}")

end_time = time.time()
print(f"Time: {end_time - start_time:.2f} seconds")

Total 3-grams: 470275
Top 10 Frequent 3-grams:
چنین داد پاسخ: 390
داد پاسخ که: 283
چنین گفت با: 188
چنین گفت کای: 185
مر او را: 165
بدو گفت کای: 155
تاج و تخت: 145
ز هر سو: 126
نشست از بر: 124
فرود آمد از: 114
Time: 0.61 seconds
