# FAISS for NMT Bitext Tutorial

## Mount Drive & Files

In [None]:
from google.colab import drive
drive.mount('/content/drive')

import os
import sys

# Expose the Drive folder of pre-installed packages under a short local path
# and put that path first on the import search path.
my_path = '/content/notebooks'

# os.symlink raises FileExistsError if the link already exists, which happens
# whenever this cell is re-run in the same runtime. lexists() is also true for
# a dangling link, so the guard covers that case as well.
if not os.path.lexists(my_path):
    os.symlink('/content/drive/MyDrive/AllforOne/package_collection', my_path)

# Avoid stacking duplicate entries on sys.path when the cell is re-run.
if my_path not in sys.path:
    sys.path.insert(0, my_path)

Mounted at /content/drive


In [None]:
# Show the GPU attached to this runtime (model, driver/CUDA version, memory usage).
!nvidia-smi

Tue Mar 28 03:08:09 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   67C    P8    11W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## 1. Install FAISS

In [None]:
!pip install faiss-gpu

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2


## 2. Install LASER

In [None]:
!pip install laserembeddings
!python -m laserembeddings download-models

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting laserembeddings
  Downloading laserembeddings-1.1.2-py3-none-any.whl (13 kB)
Collecting subword-nmt<0.4.0,>=0.3.6
  Downloading subword_nmt-0.3.8-py3-none-any.whl (27 kB)
Collecting sacremoses==0.0.35
  Downloading sacremoses-0.0.35.tar.gz (859 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m859.8/859.8 KB[0m [31m25.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting transliterate==1.10.2
  Downloading transliterate-1.10.2-py2.py3-none-any.whl (45 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.8/45.8 KB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
Collecting mock
  Downloading mock-5.0.1-py3-none-any.whl (30 kB)
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.35-py3-

## 3. Get Bitext

In [None]:
cd /content/drive/MyDrive/AllforOne/Lecture/FAISS

/content/drive/MyDrive/AllforOne/Lecture/FAISS


In [None]:
cat src.ko

집에 가고 싶다
하지만 이미 집에 있다

In [None]:
cat tgt.en

I want to go home
I love my home
But I am already in home

### 3-1. Query : Key = Source sentence : Target Document

In [None]:
import numpy as np
import faiss
from laserembeddings import Laser

# Load the English side of the bitext: one candidate ("key") sentence per line.
# readlines() would keep the trailing '\n' on every line, which then gets
# embedded and printed; strip it and skip blank lines instead.
with open('tgt.en', 'r', encoding='utf8') as f:
    bitext = [line.strip() for line in f if line.strip()]

# Load the LASER sentence encoder.
laser = Laser()

# Embed every key sentence. FAISS needs a contiguous float32 matrix.
embeddings = np.ascontiguousarray(
    laser.embed_sentences(bitext, lang='en'), dtype=np.float32
)

# IndexFlatIP ranks by raw inner product, which favours vectors with a large
# norm. L2-normalising first (in place) turns the inner product into cosine
# similarity.
faiss.normalize_L2(embeddings)

# Build an exact inner-product index over the key embeddings.
d = embeddings.shape[1]
index = faiss.IndexFlatIP(d)
index.add(embeddings)

# Query sentence to look up.
query = '집에 가고 싶다'

# Embed the query (Korean), keeping query_emb as the single 1-D vector.
query_emb = laser.embed_sentences([query], lang='ko')[0]

# FAISS searches a 2-D batch; normalise it the same way as the keys so the
# returned scores in D are cosine similarities.
query_vec = np.ascontiguousarray(query_emb[np.newaxis, :], dtype=np.float32)
faiss.normalize_L2(query_vec)

# Nearest-neighbour search: D holds the scores, I the row indices into bitext.
D, I = index.search(query_vec, k=1)

# Report the best match.
print("Query: ", query)
print("Key Sentence: ", bitext[I[0][0]])
print("Index: ", I[0][0])

Query:  집에 가고 싶다
Key Sentence:  I want to go home

Index:  0


### 3-2. Query : Key = Source Document : Target Document

In [None]:
import numpy as np
import faiss
from laserembeddings import Laser

# Load src.ko (queries) and tgt.en (keys), one sentence per line.
# Strip trailing newlines and skip blank lines so they are not embedded.
with open('src.ko', 'r', encoding='utf8') as f:
    src_lines = [line.strip() for line in f if line.strip()]

with open('tgt.en', 'r', encoding='utf8') as f:
    tgt_lines = [line.strip() for line in f if line.strip()]

# Load the LASER sentence encoder.
laser = Laser()

# Embed both sides once, each with its own language code. The original loop
# re-embedded every Korean source sentence with lang='en' and never used
# src_embeddings; the batch embeddings below are used directly instead.
src_embeddings = np.ascontiguousarray(
    laser.embed_sentences(src_lines, lang='ko'), dtype=np.float32
)
tgt_embeddings = np.ascontiguousarray(
    laser.embed_sentences(tgt_lines, lang='en'), dtype=np.float32
)

# L2-normalise in place so IndexFlatIP's inner product equals cosine similarity.
faiss.normalize_L2(src_embeddings)
faiss.normalize_L2(tgt_embeddings)

# Build an exact inner-product index over the target embeddings.
d = tgt_embeddings.shape[1]
index = faiss.IndexFlatIP(d)
index.add(tgt_embeddings)

# One batched search for all source sentences: row i of D / I holds the score
# and target index of the best match for src_lines[i].
D, I = index.search(src_embeddings, k=1)

# Report the best target sentence for each source sentence.
for i, src_line in enumerate(src_lines):
    print("Source sentence: ", src_line)
    print("Target sentence: ", tgt_lines[I[i][0]])
    print("Source index: ", i)
    print("Target index: ", I[i][0])
    print()


Source sentence:  집에 가고 싶다
Target sentence:  I want to go home
Source index:  0
Target index:  0

Source sentence:  하지만 이미 집에 있다
Target sentence:  But I am already in home
Source index:  1
Target index:  2

