<a href="https://colab.research.google.com/github/dev0419/BDA_Lab/blob/main/Lab-3/Entity_Resolution_L3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

1. Develop a PySpark script to clean and preprocess data before performing entity resolution. Include steps like tokenization and normalization.

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import when
from pyspark.ml.feature import Tokenizer, Word2Vec

# Start (or reuse) the Spark session used by the rest of the notebook.
spark = SparkSession.builder.appName('res').getOrCreate()

# '?' is read as null so missing values can be handled explicitly below.
data = spark.read.csv('/content/data', inferSchema=True, header=True, nullValue='?')

# inferSchema may type the id columns as integers. Tokenizer only accepts
# string input, and a '' fill value cannot populate a numeric column, so make
# both ids strings before cleaning them. This is a no-op when they already are.
data = data.withColumn('id_1', data['id_1'].cast('string'))
data = data.withColumn('id_2', data['id_2'].cast('string'))

# Replace missing ids with an empty string so tokenization never sees null.
data = data.fillna({'id_1': '', 'id_2': ''})

# Tokenization/normalization: Tokenizer lowercases its input and splits it
# on whitespace, producing an array-of-strings column.
tokenizer_id_1 = Tokenizer(inputCol='id_1', outputCol='id_1_tokens')
data = tokenizer_id_1.transform(data)

tokenizer_id_2 = Tokenizer(inputCol='id_2', outputCol='id_2_tokens')
data = tokenizer_id_2.transform(data)

# Embed each token list as a 5-dimensional vector; minCount=0 keeps every
# token in the vocabulary, however rare.
word2vec_id_1 = Word2Vec(vectorSize=5, minCount=0, inputCol='id_1_tokens', outputCol='id_1_vec')
model_id_1 = word2vec_id_1.fit(data)
data = model_id_1.transform(data)

word2vec_id_2 = Word2Vec(vectorSize=5, minCount=0, inputCol='id_2_tokens', outputCol='id_2_vec')
model_id_2 = word2vec_id_2.fit(data)
data = model_id_2.transform(data)

data.show()

+-----+-----+------------+------------+------------+------------+-------+------+------+------+-------+--------+-----------+-----------+--------------------+--------------------+
| id_1| id_2|cmp_fname_c1|cmp_fname_c2|cmp_lname_c1|cmp_lname_c2|cmp_sex|cmp_bd|cmp_bm|cmp_by|cmp_plz|is_match|id_1_tokens|id_2_tokens|            id_1_vec|            id_2_vec|
+-----+-----+------------+------------+------------+------------+-------+------+------+------+-------+--------+-----------+-----------+--------------------+--------------------+
| 3148| 8326|           1|           0|           1|           0|      1|     1|     1|     1|      1|    true|     [3148]|     [8326]|[0.02116392925381...|[-0.0531290657818...|
|14055|94934|           1|           0|           1|           0|      1|     1|     1|     1|      1|    true|    [14055]|    [94934]|[-0.0012151598930...|[-0.0749845355749...|
|33948|34740|           1|           0|           1|           0|      1|     1|     1|     1|      1|    true

2. Implement a PySpark program that computes similarity scores between records using a chosen similarity metric.

In [None]:
from pyspark.sql.functions import expr
# Labelled subsets of the record pairs: known non-matches and known matches.
misses = data.filter(data['is_match'] == 'false')
matches = data.filter(data['is_match'] == 'true')

# Comparison features that are summed into the similarity score.
good_features = ["cmp_lname_c1", "cmp_plz", "cmp_by", "cmp_bd", "cmp_bm"]
sum_expression = " + ".join(good_features)

# fillna(0) silently skips columns that are not numeric, which would leave
# nulls in place and turn the whole sum into null. Cast every feature to
# double first so missing values are reliably replaced by 0.
numeric = data
for feature in good_features:
    numeric = numeric.withColumn(feature, numeric[feature].cast('double'))

scored = (
    numeric.fillna(0, subset=good_features)
    .withColumn('score', expr(sum_expression))
    .select('score', 'is_match')
)
scored.show()

+-----+--------+
|score|is_match|
+-----+--------+
|  5.0|    true|
|  5.0|    true|
|  5.0|    true|
|  5.0|    true|
|  5.0|    true|
|  5.0|    true|
|  4.0|    true|
|  5.0|    true|
|  5.0|    true|
|  5.0|    true|
|  5.0|    true|
|  5.0|    true|
|  5.0|    true|
|  5.0|    true|
|  5.0|    true|
|  5.0|    true|
|  4.0|    true|
|  5.0|    true|
|  5.0|    true|
|  5.0|    true|
+-----+--------+
only showing top 20 rows



3. Implement a PySpark program to evaluate the precision, recall, and F1-score of an entity resolution model.

In [None]:
from pyspark.sql.functions import col
def calculate_metrics(scored, threshold):
    """Compute precision, recall and F1 score for a given score threshold.

    A row is predicted to be a match when its ``score`` is at least
    ``threshold``; the ``is_match`` column is the ground-truth label.

    Args:
        scored: DataFrame with a ``score`` column and an ``is_match`` column.
        threshold: Minimum score for a row to be predicted a match.

    Returns:
        Tuple ``(precision, recall, f1_score)``. A metric whose denominator
        is zero is reported as 0.0.
    """
    # Function-scope import so the cell does not depend on earlier imports.
    from pyspark.sql.functions import count, when

    labelled_match = col('is_match') == 'true'
    labelled_miss = col('is_match') == 'false'
    predicted_match = col('score') >= threshold
    predicted_miss = col('score') < threshold

    # One aggregation instead of three filter().count() jobs. when() without
    # otherwise() is null unless its condition holds and count() skips nulls,
    # so each aggregate counts exactly the rows a filter on that condition
    # would keep. Rows with a null score or label fall into no bucket.
    counts = scored.agg(
        count(when(predicted_match & labelled_match, 1)).alias('tp'),
        count(when(predicted_match & labelled_miss, 1)).alias('fp'),
        count(when(predicted_miss & labelled_match, 1)).alias('fn'),
    ).first()
    tp, fp, fn = counts['tp'], counts['fp'], counts['fn']

    predicted_total = tp + fp
    actual_total = tp + fn
    precision = tp / predicted_total if predicted_total else 0.0
    recall = tp / actual_total if actual_total else 0.0

    denominator = precision + recall
    f1_score = 2 * (precision * recall) / denominator if denominator else 0.0

    return precision, recall, f1_score

# Minimum summed-feature score at which a pair is predicted to be a match.
threshold = 4.0

# Evaluate the scored pairs against their ground-truth labels.
precision, recall, f1_score = calculate_metrics(scored, threshold)

# One metric per line, emitted with a single print call.
print(
    f"Precision: {precision}",
    f"Recall: {recall}",
    f"F1 Score: {f1_score}",
    sep="\n",
)

Precision: 0.9703831132601822
Recall: 0.9974193548387097
F1 Score: 0.9837155044422973
