In [1]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, StringIndexer, IndexToString, OneHotEncoder
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
# from pyspark.sql import functions as F

In [2]:
from pyspark.sql.functions import col, collect_list, concat_ws, size, array, lit, when
from pyspark.ml.feature import Word2Vec
from pyspark.sql.functions import split

In [3]:
!python3 --version

Python 3.6.8


In [None]:
team = "team3"
warehouse = "project/hive/warehouse"

spark = SparkSession.builder \
    .appName(f"{team} - Spark ML") \
    .master("yarn") \
    .config("hive.metastore.uris", "thrift://hadoop-02.uni.innopolis.ru:9883") \
    .config("spark.sql.warehouse.dir", warehouse) \
    .enableHiveSupport() \
    .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/05/20 00:24:32 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/05/20 00:24:34 WARN DomainSocketFactory: The short-circuit local reads feature cannot be used because libhadoop cannot be loaded.
25/05/20 00:24:34 WARN Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.


In [8]:
spark.sql("USE team3_projectdb;")

DataFrame[]

In [51]:
moscow = spark.read.format("avro").table("team3_projectdb.moscow")

[Stage 50:>                                                         (0 + 1) / 1]

In [52]:
moscow = moscow.filter(
    ~col("tags").contains("'traffic_light'") 
)

In [None]:
filtered_moscow = moscow.filter(col("h3_09_center").isNotNull() & col("tags").isNotNull())
grouped_moscow = filtered_moscow.groupBy("h3_09_center") \
    .agg(concat_ws(" ", collect_list(col("tags"))).alias("combined_tags"))
grouped_moscow.show(10)

[Stage 50:>                 (0 + 1) / 1][Stage 52:>                 (0 + 1) / 2]

+---------------+--------------------+
|   h3_09_center|       combined_tags|
+---------------+--------------------+
|89118180927ffff|[('crossing', 'tr...|
|891181820abffff|[('highway', 'tra...|
|891181844c3ffff|[('addr:country',...|
|89118184c93ffff|[('sign', 'yes')]...|
|89118186067ffff|[('barrier', 'gat...|
|89118186173ffff|[('highway', 'tra...|
|89118186233ffff|[('name', 'Давыдо...|
|8911818635bffff|[('name', 'Клёнов...|
|89118186493ffff|[('barrier', 'gat...|
|8911818654fffff|[('name', 'Акулов...|
+---------------+--------------------+
only showing top 10 rows



[Stage 50:>                                                         (0 + 1) / 1]

In [15]:
transactions = spark.read.format("avro").table("team3_projectdb.transactions")
cash_withdrawals = spark.read.format("avro").table("team3_projectdb.cash_withdrawals")
locations = spark.read.format("avro").table("team3_projectdb.locations")

In [54]:
data = transactions.join(cash_withdrawals, ["h3_09", "customer_id"], "inner")
data = data.join(locations, ["h3_09"], "inner").drop("lat", "lon")

In [19]:
data.show(2)

                                                                                

+---------------+-----------+--------------+-----+-------+--------+-----+-------+---------+--------------+-----------+--------+
|          h3_09|customer_id|transaction_pk|count|    sum|     avg|  min|    max|      std|count_distinct|datetime_id|mcc_code|
+---------------+-----------+--------------+-----+-------+--------+-----+-------+---------+--------------+-----------+--------+
|8911aa7a6d3ffff|        107|            61|    4|3630.75|907.6875|423.0|1825.92| 640.2593|             2|          3|      13|
|8911aa7abd3ffff|        196|           119|   11|4172.97|379.3609| 93.0|  927.0|266.54895|             4|          3|      13|
+---------------+-----------+--------------+-----+-------+--------+-----+-------+---------+--------------+-----------+--------+
only showing top 2 rows



In [20]:
data.printSchema()

root
 |-- h3_09: string (nullable = true)
 |-- customer_id: long (nullable = true)
 |-- transaction_pk: long (nullable = true)
 |-- count: short (nullable = true)
 |-- sum: float (nullable = true)
 |-- avg: float (nullable = true)
 |-- min: float (nullable = true)
 |-- max: float (nullable = true)
 |-- std: float (nullable = true)
 |-- count_distinct: short (nullable = true)
 |-- datetime_id: short (nullable = true)
 |-- mcc_code: short (nullable = true)



In [55]:
grouped_moscow = grouped_moscow.withColumnRenamed("h3_09_center", "h3_09")
grouped_moscow.printSchema()

root
 |-- h3_09: string (nullable = true)
 |-- combined_tags: string (nullable = false)



In [56]:
data = data.join(
    grouped_moscow,
    ["h3_09"],  
    "left"
)

In [57]:
prepared_data = data.withColumn(
    "tokens", 
    split(col("combined_tags"), " ")
)

In [58]:
prepared_data.printSchema()

root
 |-- h3_09: string (nullable = true)
 |-- customer_id: long (nullable = true)
 |-- transaction_pk: long (nullable = true)
 |-- count: short (nullable = true)
 |-- sum: float (nullable = true)
 |-- avg: float (nullable = true)
 |-- min: float (nullable = true)
 |-- max: float (nullable = true)
 |-- std: float (nullable = true)
 |-- count_distinct: short (nullable = true)
 |-- datetime_id: short (nullable = true)
 |-- mcc_code: short (nullable = true)
 |-- combined_tags: string (nullable = true)
 |-- tokens: array (nullable = true)
 |    |-- element: string (containsNull = false)



In [None]:
prepared_data.show(5)

                                                                                

+---------------+-----------+--------------+-----+-------+--------+-----+-------+---------+--------------+-----------+--------+--------------------+--------------------+
|          h3_09|customer_id|transaction_pk|count|    sum|     avg|  min|    max|      std|count_distinct|datetime_id|mcc_code|       combined_tags|              tokens|
+---------------+-----------+--------------+-----+-------+--------+-----+-------+---------+--------------+-----------+--------+--------------------+--------------------+
|8911aa7a6d3ffff|        107|            61|    4|3630.75|907.6875|423.0|1825.92| 640.2593|             2|          3|      13|[('amenity', 'pha...|[[('amenity',, 'p...|
|8911aa7abd3ffff|        196|           119|   11|4172.97|379.3609| 93.0|  927.0|266.54895|             4|          3|      13|[('alt_name:en', ...|[[('alt_name:en',...|
|8911aa7a363ffff|        269|           164|    5|  343.0|    68.6| 37.0|  148.0|47.125366|             1|          3|      13|[('colour', 'red'...|[[

[Stage 50:>                                                         (0 + 1) / 1]

In [73]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler, Tokenizer, HashingTF
tokenizer = Tokenizer(inputCol="token", outputCol="tokenized")
hashingTF = HashingTF(inputCol="tokenized", outputCol="tags_vectors", numFeatures=25)

In [71]:
prepared_data = prepared_data.withColumn(
    "token", 
    concat_ws(" ", col("tokens"))  # Объединяем элементы массива через пробел
)

In [72]:
prepared_data.show(10)

                                                                                

+---------------+-----------+--------------+-----+--------+---------+------+-------+---------+--------------+-----------+--------+--------------------+--------------------+--------------------+
|          h3_09|customer_id|transaction_pk|count|     sum|      avg|   min|    max|      std|count_distinct|datetime_id|mcc_code|       combined_tags|              tokens|               token|
+---------------+-----------+--------------+-----+--------+---------+------+-------+---------+--------------+-----------+--------+--------------------+--------------------+--------------------+
|8911aa7a6d3ffff|        107|            61|    4| 3630.75| 907.6875| 423.0|1825.92| 640.2593|             2|          3|      13|[('amenity', 'pha...|[[('amenity',, 'p...|[('amenity', 'pha...|
|8911aa7abd3ffff|        196|           119|   11| 4172.97| 379.3609|  93.0|  927.0|266.54895|             4|          3|      13|[('alt_name:en', ...|[[('alt_name:en',...|[('alt_name:en', ...|
|8911aa7a363ffff|        269| 

[Stage 87:>                                                         (0 + 1) / 1]

In [74]:
tokenized_df = tokenizer.transform(prepared_data)
result_df = hashingTF.transform(tokenized_df)

In [75]:
result_df.show(5)

[Stage 87:>                                                         (0 + 1) / 1]

+---------------+-----------+--------------+-----+-------+--------+-----+-------+---------+--------------+-----------+--------+--------------------+--------------------+--------------------+--------------------+--------------------+
|          h3_09|customer_id|transaction_pk|count|    sum|     avg|  min|    max|      std|count_distinct|datetime_id|mcc_code|       combined_tags|              tokens|               token|           tokenized|        tags_vectors|
+---------------+-----------+--------------+-----+-------+--------+-----+-------+---------+--------------+-----------+--------+--------------------+--------------------+--------------------+--------------------+--------------------+
|8911aa7a6d3ffff|        107|            61|    4|3630.75|907.6875|423.0|1825.92| 640.2593|             2|          3|      13|[('amenity', 'pha...|[[('amenity',, 'p...|[('amenity', 'pha...|[[('amenity',, 'p...|(25,[0,1,2,3,4,5,...|
|8911aa7abd3ffff|        196|           119|   11|4172.97|379.3609| 

[Stage 87:>                                                         (0 + 1) / 1]

In [None]:
result_df = result_df.drop("combined_tags", "tokens", "token", "tokenized")

In [65]:
# word2vec = Word2Vec(
#     vectorSize=50,
#     minCount=1,
#     inputCol="tokens",
#     outputCol="embedding"
# )

word2vec = Word2Vec(
    vectorSize=25,         
    minCount=3,             
    windowSize=3,           
    inputCol="tokens",
    outputCol="embedding",
    stepSize=0.01          
)

[Stage 50:>                 (0 + 1) / 1][Stage 77:>                 (0 + 1) / 1]

In [60]:
empty_tokens_count = prepared_data.filter(
    (size(col("tokens")) == 0) |  
    col("tokens").isNull()       
).count()

                                                                                

In [61]:
print(empty_tokens_count)

4


In [62]:
FIXED_VALUE = ["unknown"]
prepared_data = prepared_data.withColumn(
    "tokens",
    when(
        (size(col("tokens")) == 0) | col("tokens").isNull(),  # Исправлены скобки
        array([lit(e) for e in FIXED_VALUE])                   # Правильное создание массива
    ).otherwise(col("tokens"))
)

In [68]:
from pyspark.ml.feature import FeatureHasher

# Создание экземпляра FeatureHasher
hasher = FeatureHasher(
    inputCols=["tokens"],  # Укажите столбцы для хеширования
    outputCol="embedding",
    numFeatures=25  # Желаемая размерность эмбеддингов
)

# Прямое преобразование данных БЕЗ .fit()
result_df = hasher.transform(prepared_data)

# Проверка результата
result_df.select("embedding").show(3, truncate=False)

IllegalArgumentException: requirement failed: FeatureHasher requires columns to be of numeric, boolean or string. Column tokens was array<string>

[Stage 77:>                 (0 + 1) / 1][Stage 87:>                 (0 + 1) / 1]

In [None]:
result_data = model.transform(prepared_data)

In [6]:
original_features = [
    "datetime_id", "count", "sum", 
    "avg", "min", "max", "std",
    "count_distinct"
]

In [7]:
data = data.na.drop(subset=original_features)

In [8]:
string_columns = ["h3_09"]

In [9]:
label_indexer = StringIndexer(inputCol="h3_09", outputCol="label").fit(data)
data = label_indexer.transform(data)

                                                                                

In [10]:
encoders = [
    OneHotEncoder(inputCol=f"label", outputCol=f"{column}_encoded")
    for column in string_columns
]

In [11]:
encoding_pipeline = Pipeline(stages=encoders)
encoded_data = encoding_pipeline.fit(data).transform(data)

In [12]:
data.head()

Row(h3_09='8911aa7a6d3ffff', customer_id=107, transaction_pk=61, count=4, sum=3630.75, avg=907.6875, min=423.0, max=1825.9200439453125, std=640.25927734375, count_distinct=2, datetime_id=3, mcc_code=13, label=122.0)

In [13]:
data.drop("h3_09")

DataFrame[customer_id: bigint, transaction_pk: bigint, count: smallint, sum: float, avg: float, min: float, max: float, std: float, count_distinct: smallint, datetime_id: smallint, mcc_code: smallint, label: double]

In [14]:
mcc_indexer = StringIndexer(inputCol="mcc_code", outputCol="mcc_code_index")

In [15]:
feature_cols = original_features + ["mcc_code_index"]
assembler = VectorAssembler(
    inputCols=feature_cols,
    outputCol="features",
    handleInvalid="skip"
)

In [42]:
lr = LogisticRegression(
    featuresCol="features",
    labelCol="label",
    predictionCol="prediction",
    maxIter=10
)

In [43]:
rf = RandomForestClassifier(
    featuresCol="features",
    labelCol="label",
    numTrees=25,       
    maxDepth=7,        
    seed=42
)

In [44]:
pipeline_lr = Pipeline(stages=[mcc_indexer, assembler, lr])
pipeline_rf = Pipeline(stages=[mcc_indexer, assembler, rf])

In [16]:
train_data, test_data = data.randomSplit([0.8, 0.2], seed=42)

In [46]:
train_data.head()

                                                                                

Row(h3_09='8911818610bffff', customer_id=1924, transaction_pk=1143088, count=2, sum=6395.0, avg=3197.5, min=1198.0, max=5197.0, std=2827.719970703125, count_distinct=2, datetime_id=2, mcc_code=10, label=832.0)

In [34]:
model_rf = pipeline_rf.fit(train_data)
predictions_rf = model_rf.transform(test_data)

In [17]:
evaluator_accuracy = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy"
)

evaluator_f1 = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="f1"
)

In [36]:
print("RandomForest Results:")
print(f"Accuracy: {evaluator_accuracy.evaluate(predictions_rf)}")
print(f"F1-Score: {evaluator_f1.evaluate(predictions_rf)}")

RandomForest Results:
Accuracy: 0.1786409567071475
F1-Score: 0.0689351678368335


In [43]:
lr_model = pipeline_lr.fit(train_data)

                                                                                

In [44]:
lr_predictions = lr_model.transform(test_data)

In [47]:
print("Logistic Regression Results:")
print(f"Accuracy: {evaluator_accuracy.evaluate(lr_predictions)}")
print(f"F1-Score: {evaluator_f1.evaluate(lr_predictions)}\n")


Logistic Regression Results:


                                                                                

Accuracy: 0.16850529958901148




F1-Score: 0.055522637971441635



                                                                                

In [None]:
# label_converter = IndexToString(
#     inputCol="prediction",
#     outputCol="predicted_h3_09",
#     labels=label_indexer.labels
# )

# final_predictions = label_converter.transform(gbt_predictions)
# final_predictions.select("h3_09", "predicted_h3_09").show(5)

In [60]:
import sys
print(sys.version)

3.11.7 (main, Mar 20 2025, 00:23:21) [GCC 4.8.5 20150623 (Red Hat 4.8.5-44)]


In [1]:
!pip install --user torch torchvision pytorch_lightning



In [2]:
import torch, torchvision
print(torch.__version__)         # Должно быть 1.13.1
print(torchvision.__version__)   # Должно быть 0.14.1

1.10.1+cu102
0.11.2+cu102


In [18]:
import torch
from torch import nn
from pyspark.sql.functions import col, udf
from pyspark.sql.types import ArrayType, FloatType
import numpy as np
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline

In [5]:
import sys
print(sys.executable)  # Должен показывать путь к Python 3.11

/usr/bin/python3


In [19]:
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import models
import numpy as np


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/home/team3/BigData-project/.venv/lib/python3.11/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/home/team3/BigData-project/.venv/lib/python3.11/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/home/team3/BigData-project/.venv/lib/python3.11/site-packages/ipykernel/kernelapp.py", line 739, in start
    self.io

In [20]:
class SparkDataset(Dataset):
    def __init__(self, data):
        self.features = np.array(data.select("features").collect()).squeeze()
        self.labels = np.array(data.select("label").collect()).squeeze()
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        features = torch.FloatTensor(self.features[idx])
        label = torch.LongTensor([self.labels[idx]]).squeeze()
        return features, label

In [52]:
class ClassificationNN(nn.Module):
    def __init__(self, input_size, num_classes):
        super(ClassificationNN, self).__init__()
        self.fc1 = nn.Linear(input_size, 2100)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.01)
        self.fc2 = nn.Linear(2100, 512)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.2)
        self.fc3 = nn.Linear(512, num_classes)
        
    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        x = self.dropout(x)
        x = self.fc3(x)
        return x

In [21]:
preprocessing_pipeline = Pipeline(stages=[mcc_indexer, assembler])
preprocessed_data = preprocessing_pipeline.fit(data).transform(data)
train_data, test_data = preprocessed_data.randomSplit([0.8, 0.2], seed=42)

class SparkDataset(Dataset):
    def __init__(self, data):
        self.features = np.array(data.select("features").collect()).squeeze()
        self.labels = np.array(data.select("label").collect()).squeeze()
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        features = torch.FloatTensor(self.features[idx])
        label = torch.LongTensor([self.labels[idx]]).squeeze()
        return features, label

train_dataset = SparkDataset(train_data)
test_dataset = SparkDataset(test_data)

                                                                                

In [45]:
input_size = len(feature_cols)
num_classes = data.select("label").distinct().count()

In [60]:
model = ClassificationNN(input_size, num_classes)

In [57]:
batch_size = 128
num_epochs = 10
learning_rate = 0.001

In [48]:
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)


In [61]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [62]:
for epoch in range(num_epochs):
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
    
    
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            outputs = model(inputs)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    
    print(f'Epoch [{epoch+1}/{num_epochs}], Accuracy: {100 * correct / total:.2f}%')


Epoch [1/10], Accuracy: 16.45%
Epoch [2/10], Accuracy: 16.78%
Epoch [3/10], Accuracy: 16.80%
Epoch [4/10], Accuracy: 16.77%
Epoch [5/10], Accuracy: 16.82%
Epoch [6/10], Accuracy: 16.81%
Epoch [7/10], Accuracy: 16.87%
Epoch [8/10], Accuracy: 16.81%
Epoch [9/10], Accuracy: 16.67%
Epoch [10/10], Accuracy: 16.85%


In [67]:
from pyspark.sql.functions import pandas_udf, col, struct 
from pyspark.sql.types import DoubleType

def predict_batch(batch_iter):
    model.eval()
    predictions = []
    for features, _ in batch_iter:
        with torch.no_grad():
            outputs = model(features)
            _, preds = torch.max(outputs, 1)
            predictions.extend(preds.numpy())
    return pd.Series(predictions)


predict_udf = pandas_udf(predict_batch, returnType=DoubleType())

final_predictions = test_data.withColumn(
    "nn_prediction",
    predict_udf(struct(*feature_cols))  
    
)

In [70]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

final_predictions = final_predictions.withColumnRenamed("nn_prediction", "prediction")

In [71]:
evaluator_accuracy = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy"
)

evaluator_f1 = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="f1"
)

In [77]:
import pyarrow
print(pyarrow.__version__) 

6.0.1


In [None]:
accuracy = evaluator_accuracy.evaluate(final_predictions)
f1 = evaluator_f1.evaluate(final_predictions)

print("Neural Network Results:")
print(f"Accuracy: {accuracy}")
print(f"F1-Score: {f1}")

In [23]:
spark.stop()

In [22]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, TensorDataset
import numpy as np
from pyspark.sql.functions import col, pandas_udf
from pyspark.sql.types import DoubleType
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from sklearn.metrics import f1_score, accuracy_score

In [49]:

class TabularResNet(nn.Module):
    def __init__(self, input_size, num_classes):
        super().__init__()
        self.block1 = self._make_residual_block(input_size, 512)
        self.block2 = self._make_residual_block(512, 256)
        self.block3 = self._make_residual_block(256, 128)
        self.fc = nn.Linear(128, num_classes)
        self.dropout = nn.Dropout(0.3)

    def _make_residual_block(self, in_features, out_features):
        return nn.Sequential(
            nn.Linear(in_features, out_features),
            nn.BatchNorm1d(out_features),
            nn.ReLU(),
            nn.Linear(out_features, out_features),
            nn.BatchNorm1d(out_features),
            nn.ReLU()
        )

    def forward(self, x):
        x = self.block1(x)
        x = self.block2(x)
        x = self.dropout(x)
        x = self.block3(x)
        return self.fc(x)


class SparkDataset(Dataset):
    def __init__(self, features, labels):
        self.features = torch.tensor(features, dtype=torch.float32)
        self.labels = torch.tensor(labels, dtype=torch.long)
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        return self.features[idx], self.labels[idx]



In [50]:
input_size = len(feature_cols) 
num_classes = data.select("label").distinct().count()
model = TabularResNet(input_size, num_classes)
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-4)
criterion = nn.CrossEntropyLoss()

                                                                                

In [55]:
# def prepare_data(df):
#     features = np.stack(df.select("features").toPandas()['features'].apply(lambda x: x.toArray()))
#     labels = df.select("label").rdd.flatMap(lambda x: x).collect()
#     return SparkDataset(features, labels)

# train_dataset = prepare_data(train_data)
# test_dataset = prepare_data(test_data)

preprocessing_pipeline = Pipeline(stages=[mcc_indexer, assembler])
preprocessed_data = preprocessing_pipeline.fit(data).transform(data)
train_data, test_data = preprocessed_data.randomSplit([0.8, 0.2], seed=42)

class SparkDataset(Dataset):
    def __init__(self, data):
        self.features = np.array(data.select("features").collect()).squeeze()
        self.labels = np.array(data.select("label").collect()).squeeze()
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        features = torch.FloatTensor(self.features[idx])
        label = torch.LongTensor([self.labels[idx]]).squeeze()
        return features, label

train_dataset = SparkDataset(train_data)
test_dataset = SparkDataset(test_data)

                                                                                

In [42]:
train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=512)

In [54]:
def train_model(model, train_loader, epochs=20):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for batch_idx, (data, target) in enumerate(train_loader):
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        
        print(f'Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(train_loader):.4f}')

train_model(model, train_loader)

Epoch 1/20, Loss: 4.8850
Epoch 2/20, Loss: 4.8808
Epoch 3/20, Loss: 4.8779
Epoch 4/20, Loss: 4.8727
Epoch 5/20, Loss: 4.8666
Epoch 6/20, Loss: 4.8685
Epoch 7/20, Loss: 4.8588
Epoch 8/20, Loss: 4.8605
Epoch 9/20, Loss: 4.8528
Epoch 10/20, Loss: 4.8537
Epoch 11/20, Loss: 4.8497
Epoch 12/20, Loss: 4.8400
Epoch 13/20, Loss: 4.8422
Epoch 14/20, Loss: 4.8351
Epoch 15/20, Loss: 4.8367
Epoch 16/20, Loss: 4.8262
Epoch 17/20, Loss: 4.8266
Epoch 18/20, Loss: 4.8223
Epoch 19/20, Loss: 4.8158
Epoch 20/20, Loss: 4.8140


In [71]:
import numpy as np

In [72]:
print("NumPy version:", np.__version__)

NumPy version: 2.2.2


In [70]:
!!pip install --force-reinstall numpy

['Defaulting to user installation because normal site-packages is not writeable',
 'Collecting numpy',
 '  Using cached numpy-1.19.5-cp36-cp36m-manylinux2010_x86_64.whl (14.8 MB)',
 'Installing collected packages: numpy',
 'Successfully installed numpy-1.19.5']

In [56]:
def evaluate_model(model, test_loader):
    model.eval()
    all_preds = []
    all_labels = []
    
    with torch.no_grad():
        for data, target in test_loader:
            output = model(data)
            preds = output.argmax(dim=1)
            
            
            all_preds.extend(preds.cpu().tolist())
            all_labels.extend(target.cpu().tolist())
    
    
    correct = sum(1 for p, l in zip(all_preds, all_labels) if p == l)
    accuracy = correct / len(all_labels)
    
    
    unique_labels = set(all_labels)
    f1_scores = []
    for label in unique_labels:
        TP = sum((p == label) and (l == label) for p, l in zip(all_preds, all_labels))
        FP = sum((p == label) and (l != label) for p, l in zip(all_preds, all_labels))
        FN = sum((p != label) and (l == label) for p, l in zip(all_preds, all_labels))
        
        precision = TP / (TP + FP) if (TP + FP) > 0 else 0
        recall = TP / (TP + FN) if (TP + FN) > 0 else 0
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
        f1_scores.append(f1)
    
    avg_f1 = sum(f1_scores) / len(f1_scores)
    
    return accuracy, avg_f1


accuracy, f1 = evaluate_model(model, test_loader)
print(f"Test Accuracy: {accuracy:.4f}, F1-Score: {f1:.4f}")

Test Accuracy: 0.1656, F1-Score: 0.0002


In [57]:
spark.stop()