In interactive notebook, the `spark` object is already created.
Instructors tested with 1 driver, 6 executors of small e4 (24 cores, 192GB memory)

### Launch spark environment

In [1]:
import pandas as pd
#$get data 
file_path = 'test_dataset.csv'
df = pd.read_csv(file_path)
df = pd.DataFrame(df["review"])
df.columns = ['body']


In [2]:
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import pipeline
import torch


In [3]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, RegexTokenizer

spark = SparkSession.builder.appName("example").getOrCreate()
spark.conf.set("spark.sql.execution.arrow.enabled", "true")
df = spark.createDataFrame(df)

#  tokenlize
#df = spark.createDataFrame([("hagging face bert",)], ["body"])
#df = spark.createDataFrame(df)

# Tokenization
tokenizer = Tokenizer(inputCol="body", outputCol="words")
regexTokenizer = RegexTokenizer(inputCol="text", outputCol="words", pattern="\\W")

tokenized = tokenizer.transform(df)


In [4]:
pandas_df = tokenized.toPandas()


In [4]:
pandas_df.head(10)

Unnamed: 0,body,words
0,I really liked this Summerslam due to the look...,"[i, really, liked, this, summerslam, due, to, ..."
1,Not many television shows appeal to quite as m...,"[not, many, television, shows, appeal, to, qui..."
2,The film quickly gets to a major chase scene w...,"[the, film, quickly, gets, to, a, major, chase..."
3,Jane Austen would definitely approve of this o...,"[jane, austen, would, definitely, approve, of,..."
4,Expectations were somewhat high for me when I ...,"[expectations, were, somewhat, high, for, me, ..."
5,I've watched this movie on a fairly regular ba...,"[i've, watched, this, movie, on, a, fairly, re..."
6,For once a story of hope highlighted over the ...,"[for, once, a, story, of, hope, highlighted, o..."
7,"Okay, I didn't get the Purgatory thing the fir...","[okay,, i, didn't, get, the, purgatory, thing,..."
8,I was very disappointed with this series. It h...,"[i, was, very, disappointed, with, this, serie..."
9,The first 30 minutes of Tinseltown had my fing...,"[the, first, 30, minutes, of, tinseltown, had,..."


In [5]:
from transformers import DistilBertTokenizer

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
# text process
sentences = [' '.join(words) for words in pandas_df['words']]
inputs = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")


In [6]:
print(inputs['input_ids'].shape)


torch.Size([15000, 512])


In [10]:
ss=inputs

In [7]:
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
# get device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# get data
model_path = "db10.pt"
model = torch.load(model_path)
batch_size = 32  
#eval mode
model.eval()
all_predictions = []
total_items = len(inputs['input_ids'])
#get prediction 
for i in range(0, total_items, batch_size):
    print(f"Processing batch {i//batch_size + 1}/{total_items//batch_size}: {round((i / total_items) * 100, 2)}% complete")
    input_ids_batch = inputs['input_ids'][i:i+batch_size]
    attention_mask_batch = inputs['attention_mask'][i:i+batch_size]
    input_ids_batch = input_ids_batch.to(device)
    attention_mask_batch = attention_mask_batch.to(device)
    batch = {'input_ids': input_ids_batch, 'attention_mask': attention_mask_batch}
    with torch.no_grad():
        outputs = model(**batch)
        predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
        predicted_classes = torch.argmax(predictions, dim=1)
        all_predictions.extend(predicted_classes.cpu().numpy())

# show resul t
print(all_predictions)


Processing batch 1/468: 0.0% complete
Processing batch 2/468: 0.21% complete
Processing batch 3/468: 0.43% complete
Processing batch 4/468: 0.64% complete
Processing batch 5/468: 0.85% complete
Processing batch 6/468: 1.07% complete
Processing batch 7/468: 1.28% complete
Processing batch 8/468: 1.49% complete
Processing batch 9/468: 1.71% complete
Processing batch 10/468: 1.92% complete
Processing batch 11/468: 2.13% complete
Processing batch 12/468: 2.35% complete
Processing batch 13/468: 2.56% complete
Processing batch 14/468: 2.77% complete
Processing batch 15/468: 2.99% complete
Processing batch 16/468: 3.2% complete
Processing batch 17/468: 3.41% complete
Processing batch 18/468: 3.63% complete
Processing batch 19/468: 3.84% complete
Processing batch 20/468: 4.05% complete
Processing batch 21/468: 4.27% complete
Processing batch 22/468: 4.48% complete
Processing batch 23/468: 4.69% complete
Processing batch 24/468: 4.91% complete
Processing batch 25/468: 5.12% complete
Processing 

In [8]:
print(all_predictions)


[1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 

In [9]:
import pandas as pd

predictions_df = pd.DataFrame(all_predictions, columns=['predictions'])

predictions_df.to_csv('p10.csv', index=False)


In [1]:
import pandas as pd

df = pd.read_csv('minhash-urls.txt', header=None, names=['URL'])

row_count = len(df)

print(f"Number of rows: {row_count}")


Number of rows: 4199926
