<a href="https://colab.research.google.com/github/Lonely52Hz/ID2223_Scalable_Machine_Learning_and_Deep_Learning/blob/main/ID2223_Lab2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# set environment in colab
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://dlcdn.apache.org/spark/spark-3.2.0/spark-3.2.0-bin-hadoop3.2.tgz
!tar xf spark-3.2.0-bin-hadoop3.2.tgz
!pip install -q findspark
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.2.0-bin-hadoop3.2"
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.2.0.tar.gz (281.3 MB)
[K     |████████████████████████████████| 281.3 MB 47 kB/s 
[?25hCollecting py4j==0.10.9.2
  Downloading py4j-0.10.9.2-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 49.8 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.2.0-py2.py3-none-any.whl size=281805912 sha256=50e584181db8b2535232175ccd8feedf1999162d96350c7b9757e56d7ad9246d
  Stored in directory: /root/.cache/pip/wheels/0b/de/d2/9be5d59d7331c6c2a7c1b6d1a4f463ce107332b1ecd4e80718
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.2 pyspark-3.2.0


In [2]:
import tensorflow as tf
from tensorflow import keras

In [3]:
from pyspark.sql import SparkSession

# Create (or reuse) a local Spark session named "Colab" with the UI on port 4050.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Colab")
    .config("spark.ui.port", "4050")
    .getOrCreate()
)
# Later cells use col / count / when from this wildcard import.
from pyspark.sql.functions import *
#from pyspark.sql.types import *

In [4]:
from google.colab import drive

# Mount Google Drive under /content/drive, forcing a remount on every run.
drive.mount(
    '/content/drive',
    force_remount=True,
)

Mounted at /content/drive


In [5]:
# read files
STS_DIR = 'drive/MyDrive/ID2223_File/stsbenchmark'
STS_COLUMNS = ['genre', 'file', 'year', 'index', 'score', 'sentenceA', 'sentenceB']


def _read_sts_split(split_name):
    """Load one STS benchmark split as a Spark DataFrame with named columns.

    Args:
        split_name: Suffix of the file to read, e.g. ``'train'`` for
            ``sts-train.csv`` inside ``STS_DIR``.

    Returns:
        The DataFrame read from the header-less, tab-separated file with
        schema inference enabled, and with the positional columns
        ``_c0`` .. ``_c6`` renamed to the names in ``STS_COLUMNS``.
    """
    # NOTE(review): Spark's CSV reader treats '"' as a quote character by
    # default; sentences containing unbalanced quotes may be what produces the
    # null sentenceB values counted further down -- confirm, and consider
    # passing quote='' here.
    df = spark.read.csv(f'{STS_DIR}/sts-{split_name}.csv', sep='\t', inferSchema=True)
    # Rename column by column (rather than toDF) so that a file with a
    # different number of columns does not raise.
    for position, column_name in enumerate(STS_COLUMNS):
        df = df.withColumnRenamed(f'_c{position}', column_name)
    return df


train = _read_sts_split('train')
test = _read_sts_split('test')
dev = _read_sts_split('dev')

train.show(5)
train.printSchema()

+-------------+------+--------+-----+-----+--------------------+--------------------+
|        genre|  file|    year|index|score|           sentenceA|           sentenceB|
+-------------+------+--------+-----+-----+--------------------+--------------------+
|main-captions|MSRvid|2012test|    1|  5.0|A plane is taking...|An air plane is t...|
|main-captions|MSRvid|2012test|    4|  3.8|A man is playing ...|A man is playing ...|
|main-captions|MSRvid|2012test|    5|  3.8|A man is spreadin...|A man is spreadin...|
|main-captions|MSRvid|2012test|    6|  2.6|Three men are pla...|Two men are playi...|
|main-captions|MSRvid|2012test|    9| 4.25|A man is playing ...|A man seated is p...|
+-------------+------+--------+-----+-----+--------------------+--------------------+
only showing top 5 rows

root
 |-- genre: string (nullable = true)
 |-- file: string (nullable = true)
 |-- year: string (nullable = true)
 |-- index: integer (nullable = true)
 |-- score: double (nullable = true)
 |-- sentenc

In [6]:
# Linearly rescale the similarity score from [0, 5] to [-1, 1] (score / 2.5 - 1)
# in all three splits.
train, test, dev = [
    frame.withColumn('score', col('score') / 2.5 - 1)
    for frame in (train, test, dev)
]

# Preview the rescaled rows and summary statistics of the new score column.
train.show(5)
train.describe(['score']).show()

+-------------+------+--------+-----+--------------------+--------------------+--------------------+
|        genre|  file|    year|index|               score|           sentenceA|           sentenceB|
+-------------+------+--------+-----+--------------------+--------------------+--------------------+
|main-captions|MSRvid|2012test|    1|                 1.0|A plane is taking...|An air plane is t...|
|main-captions|MSRvid|2012test|    4|                0.52|A man is playing ...|A man is playing ...|
|main-captions|MSRvid|2012test|    5|                0.52|A man is spreadin...|A man is spreadin...|
|main-captions|MSRvid|2012test|    6|0.040000000000000036|Three men are pla...|Two men are playi...|
|main-captions|MSRvid|2012test|    9|                 0.7|A man is playing ...|A man seated is p...|
+-------------+------+--------+-----+--------------------+--------------------+--------------------+
only showing top 5 rows

+-------+-------------------+
|summary|              score|
+-----

In [7]:
# Report, for each split (train, test, dev in that order), how many null
# entries every column contains.
for frame in (train, test, dev):
    null_counts = [
        count(when(col(column_name).isNull(), column_name)).alias(column_name)
        for column_name in frame.columns
    ]
    frame.select(null_counts).show()

+-----+----+----+-----+-----+---------+---------+
|genre|file|year|index|score|sentenceA|sentenceB|
+-----+----+----+-----+-----+---------+---------+
|    0|   0|   0|    0|    0|        0|        6|
+-----+----+----+-----+-----+---------+---------+

+-----+----+----+-----+-----+---------+---------+
|genre|file|year|index|score|sentenceA|sentenceB|
+-----+----+----+-----+-----+---------+---------+
|    0|   0|   0|    0|    0|        0|        3|
+-----+----+----+-----+-----+---------+---------+

+-----+----+----+-----+-----+---------+---------+
|genre|file|year|index|score|sentenceA|sentenceB|
+-----+----+----+-----+-----+---------+---------+
|    0|   0|   0|    0|    0|        0|        3|
+-----+----+----+-----+-----+---------+---------+



In [8]:
# Discard every row that has a null in any column, for all three splits.
train, test, dev = [frame.dropna() for frame in (train, test, dev)]

# Re-count nulls on the training split to confirm none remain, then preview it.
remaining_nulls = [
    count(when(col(column_name).isNull(), column_name)).alias(column_name)
    for column_name in train.columns
]
train.select(remaining_nulls).show()
train.show(5)

+-----+----+----+-----+-----+---------+---------+
|genre|file|year|index|score|sentenceA|sentenceB|
+-----+----+----+-----+-----+---------+---------+
|    0|   0|   0|    0|    0|        0|        0|
+-----+----+----+-----+-----+---------+---------+

+-------------+------+--------+-----+--------------------+--------------------+--------------------+
|        genre|  file|    year|index|               score|           sentenceA|           sentenceB|
+-------------+------+--------+-----+--------------------+--------------------+--------------------+
|main-captions|MSRvid|2012test|    1|                 1.0|A plane is taking...|An air plane is t...|
|main-captions|MSRvid|2012test|    4|                0.52|A man is playing ...|A man is playing ...|
|main-captions|MSRvid|2012test|    5|                0.52|A man is spreadin...|A man is spreadin...|
|main-captions|MSRvid|2012test|    6|0.040000000000000036|Three men are pla...|Two men are playi...|
|main-captions|MSRvid|2012test|    9|     

In [9]:
!pip install -U sentence-transformers
from sentence_transformers import SentenceTransformer, models, losses, InputExample

Collecting sentence-transformers
  Downloading sentence-transformers-2.1.0.tar.gz (78 kB)
[K     |████████████████████████████████| 78 kB 5.1 MB/s 
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.12.5-py3-none-any.whl (3.1 MB)
[K     |████████████████████████████████| 3.1 MB 28.4 MB/s 
[?25hCollecting tokenizers>=0.10.3
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 34.9 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 43.5 MB/s 
[?25hCollecting huggingface-hub
  Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 515 kB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |█████████████████████

In [10]:
# Token-level encoder loaded from the 'bert-base-uncased' checkpoint.
word_embedding_model = models.Transformer('bert-base-uncased')

# Pooling layer: mean-token pooling enabled, CLS-token and max-token pooling disabled.
pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True,
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False,
)

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

In [11]:
# Chain the transformer encoder and the pooling layer into one sentence-embedding model.
model = SentenceTransformer(
    modules=[word_embedding_model, pooling_model],
)

In [12]:
# Training objective: cosine-similarity loss between the two sentence embeddings of a pair.
loss = losses.CosineSimilarityLoss(model=model)

In [19]:
from torch.utils.data import DataLoader

# Collect the (cleaned, rescaled) training split into pandas on the driver.
df_train = train.toPandas()

# Address columns by name instead of by position, and pull each one out in a
# single step rather than iterating row by row through `.iloc`.
sa = df_train['sentenceA'].tolist()
sb = df_train['sentenceB'].tolist()
scor = df_train['score'].astype(float).tolist()

# One InputExample per sentence pair, labelled with its similarity score.
example = [
    InputExample(texts=[sentence_a, sentence_b], label=score)
    for sentence_a, sentence_b, score in zip(sa, sb, scor)
]

# Shuffled mini-batches of 16 pairs for training.
train_dataloader = DataLoader(example, shuffle=True, batch_size=16)

In [None]:
from sentence_transformers import evaluation

# Evaluate on the held-out dev split rather than on the training pairs:
# scoring the model on the data it is being fitted to says nothing about
# how well it generalises.
df_dev = dev.toPandas()
evaluator = evaluation.EmbeddingSimilarityEvaluator(
    df_dev['sentenceA'].tolist(),
    df_dev['sentenceB'].tolist(),
    df_dev['score'].astype(float).tolist(),
)

# Fine-tune for one epoch with 100 warm-up steps, passing the evaluator to fit().
model.fit(
    train_objectives=[(train_dataloader, loss)],
    epochs=1,
    warmup_steps=100,
    evaluator=evaluator,
    evaluation_steps=500,
)

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/359 [00:00<?, ?it/s]

In [25]:
# evaluation
from scipy import stats

In [32]:
from sentence_transformers import util

# Encode two example sentences (one encode call each) and print the cosine
# similarity of their embeddings.
a, b = [
    model.encode(sentence)
    for sentence in ('A girl is styling her hair.', 'A girl is brushing her hair.')
]
cos_sim = util.pytorch_cos_sim(a, b)
print("Cosine-Similarity:", cos_sim)

Cosine-Similarity: tensor([[0.3124]])
