In [72]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import lower, trim, col, array_join, udf, when, lit

In [73]:
spark = SparkSession.builder.appName("Week3Practice").getOrCreate()

In [74]:
df1 = spark.read.csv('../../Week3/datasets/lab3_1_dataset.csv', header=True, inferSchema=True)
df2 = spark.read.csv('../../Week3/datasets/lab3_2_dataset.csv', header=True, inferSchema=True)

df = df1.union(df2)

In [75]:
df = df.na.drop("any")

In [76]:
df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- address: string (nullable = true)



In [77]:
df = df.withColumn('name', lower(trim(col('name'))))

In [78]:
from pyspark.ml.feature import Tokenizer

In [79]:
tokenizer = Tokenizer(inputCol='name', outputCol='tokenization_name')
df_tokens = tokenizer.transform(df)

In [80]:
df_tokens.show()

+----+-------------------+--------------------+--------------------+
|  id|               name|             address|   tokenization_name|
+----+-------------------+--------------------+--------------------+
|1859|         nkev koquo|1087 Uxkgk St, Zw...|       [nkev, koquo]|
|1402|      awdeltx flswj|1313 Bysqtepcyb S...|    [awdeltx, flswj]|
|3503|      zvgk qmnjjfub|4451 Uymmthnzp St...|    [zvgk, qmnjjfub]|
|1009|     suaxud ybukuxq|4793 Fodikswks St...|   [suaxud, ybukuxq]|
| 276|    bdiolpnl iduwpc|3640 Hrfulh St, C...|  [bdiolpnl, iduwpc]|
|4600|       mrada gsxapz|7676 Sngqs St, Ut...|     [mrada, gsxapz]|
| 580|       hiwz zvsxqmr|1884 Guaxszlv St,...|     [hiwz, zvsxqmr]|
|4861|       wpk gfduhflq|7724 Etqegoijrf S...|     [wpk, gfduhflq]|
|1433|       udfzl woqmam|5479 Modamxtaxi S...|     [udfzl, woqmam]|
|1641|    mjympmb zovjoib|2881 Lmulkkwj St,...|  [mjympmb, zovjoib]|
|1248|bqplwedo neydmdiwgg|5237 Zhnthymzpq S...|[bqplwedo, neydmd...|
|3973|    vlqthxhj drtueb|5392 Qir

In [81]:
df_tokens.withColumn('tokenization_name', array_join(col('tokenization_name'), ',')).write.csv('tokenized_data')

AnalysisException: [PATH_ALREADY_EXISTS] Path file:/home/karthikeya/Desktop/sem5/MIT_SEM5_BDA/Midesem Practice/Week3/tokenized_data already exists. Set mode as "overwrite" to overwrite the existing path. SQLSTATE: 42K04

In [82]:
def jaccard_similarity(tokens1, tokens2):
    set1 = set(str(tokens1))
    set2 = set(str(tokens2))

    union = set1.union(set2)
    intersection = set1.intersection(set2)

    if not union:
        return 0.0
    
    return float(len(intersection)) / len(union)

In [83]:
jaccard_udf = udf(jaccard_similarity)

In [84]:
df_pairs = df_tokens.alias('a').crossJoin(df_tokens.alias('b'))

In [85]:
df_sim = df_pairs.withColumn('jaccard_similarity_score', jaccard_udf(col('a.tokenization_name'), col('b.tokenization_name')))

In [86]:
df_sim.select(['a.id', 'b.id', 'jaccard_similarity_score']).show(10)

+----+----+------------------------+
|  id|  id|jaccard_similarity_score|
+----+----+------------------------+
|1859|1859|                     1.0|
|1859|1402|      0.2857142857142857|
|1859|3503|      0.5555555555555556|
|1859|1009|      0.4444444444444444|
|1859| 276|     0.42105263157894735|
|1859|4600|     0.23809523809523808|
|1859| 580|                    0.35|
|1859|4861|     0.42105263157894735|
|1859|1433|     0.42105263157894735|
|1859|1641|      0.3684210526315789|
+----+----+------------------------+
only showing top 10 rows


                                                                                

In [87]:
df_sim = df_sim.withColumn('label', when(col('a.id') == col('b.id'), lit(1)).otherwise(lit(0)))
df_sim = df_sim.withColumn('pred_label', when(col('jaccard_similarity_score') > 0.5, lit(1)).otherwise(lit(0)))

In [120]:
df_sim = df_sim.withColumn('TP', when((col('pred_label') == 1) & (col('label') == 1), lit(1)).otherwise(lit(0)))
df_sim = df_sim.withColumn('FP', when((col('pred_label') == 1) & (col('label') == 0), lit(1)).otherwise(lit(0)))
df_sim = df_sim.withColumn('FN', when((col('pred_label') == 0) & (col('label') == 1), lit(1)).otherwise(lit(0)))
df_sim = df_sim.withColumn('TN', when((col('pred_label') == 0) & (col('label') == 0), lit(1)).otherwise(lit(0)))

In [121]:
agg_result = df_sim.agg({"TP": "sum", "FP": "sum", "FN": "sum", "TN": "sum"}).collect()[0]

TP = agg_result["sum(TP)"]
FP = agg_result["sum(FP)"]
FN = agg_result["sum(FN)"]
TN = agg_result["sum(TN)"]


                                                                                

In [127]:
precision = TP/ (TP + FP)
recall = TP/ (TP + FN)
F1 = (2*precision*recall)/(precision + recall)

precision, recall, F1

(0.0016540165745975248, 1.0, 0.003302570642613388)