# 0. Manual configuration during demo

In [None]:
from os import path

# Input and output locations for the demo; both files live in the same folder.
data_dir = 'data'
demo_input_file = path.join(data_dir, 'sample_demo.csv')
demo_output_file = path.join(data_dir, 'output-1c.csv')

# Header of the column that holds the predictions in the written output file
# (possibly 'score' instead of 'output' -- ask the TA).
OUTPUT_COLUMN_NAME = 'output'

# 1. Loading demo data

In [None]:
import pandas as pd

# The demo file is tab-separated.
df = pd.read_csv(demo_input_file, sep='\t')

# Correct misspelled headers such as 'setence1'. rename() silently skips
# labels that are not present, so no membership check is required.
df.rename(
    columns={'setence1': 'sentence1', 'setence2': 'sentence2'},
    inplace=True,
)

# Force both text columns to str so every cell is a string downstream.
for column in ('sentence1', 'sentence2'):
    df[column] = df[column].astype(str)
df

# 2. Load Model

In [None]:
from sentence_transformers import SentenceTransformer

# Model name or local path handed to SentenceTransformer.
model_name = "model-task-1c"
model = SentenceTransformer(model_name)

# 3. Perform inference

In [None]:
from scipy.spatial.distance import cosine
import numpy as np

# Work on a copy so the loaded input frame `df` is left unmodified.
df_results = df.copy()

# encode() is documented for a string or a list of strings. Pass plain lists
# rather than pandas Series so the call does not depend on how the library
# indexes its input internally.
embeddings_col_1 = model.encode(df_results.sentence1.tolist(), show_progress_bar=True)
embeddings_col_2 = model.encode(df_results.sentence2.tolist(), show_progress_bar=True)

# For each sentence pair: cosine similarity (1 - cosine distance), scaled by 5,
# clipped into [0, 5] so negative similarities become 0, and rounded to two
# decimals.
predictions = [
    round(np.clip((1 - cosine(embedding_1, embedding_2)) * 5, 0, 5), 2)
    for embedding_1, embedding_2 in zip(embeddings_col_1, embeddings_col_2)
]

df_results[OUTPUT_COLUMN_NAME] = predictions
df_results

In [None]:
# Write the predictions as tab-separated values; '\t' is used in all provided
# Google Classroom data. Drop the `sep` argument to fall back to pandas'
# default separator ','.
# NOTE(review): with pandas' default index=True the DataFrame index is written
# as an extra unnamed first column -- confirm that this is wanted.
df_results.to_csv(demo_output_file, sep='\t')