In [None]:
!pip install faiss-cpu
!python -m spacy download en_core_web_lg

In [None]:
!pip install emoji

In [106]:
import pandas as pd
import numpy as np
import json
import html
import emoji
import re
import spacy
import datasets
from transformers import AutoModel, AutoTokenizer, AutoModelForSeq2SeqLM
from sentence_transformers import SentenceTransformer

In [4]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [5]:
!curl -L -o ./yelp-dataset.zip https://www.kaggle.com/api/v1/datasets/download/yelp-dataset/yelp-dataset
!unzip /content/yelp-dataset.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 4172M  100 4172M    0     0  94.2M      0  0:00:44  0:00:44 --:--:-- 56.4M
Archive:  /content/yelp-dataset.zip
  inflating: Dataset_User_Agreement.pdf  
  inflating: yelp_academic_dataset_business.json  
  inflating: yelp_academic_dataset_checkin.json  
  inflating: yelp_academic_dataset_review.json  
  inflating: yelp_academic_dataset_tip.json  
  inflating: yelp_academic_dataset_user.json  


In [9]:
data = []
with open('/content/yelp_academic_dataset_business.json', 'r') as file:
    for line in file:
        data.append(json.loads(line))

df = pd.DataFrame(data)
df.head()

In [None]:
# df = pd.read_json('/content/yelp_academic_dataset_review.json', lines = True)
df = datasets.load_dataset('json', data_files = '/content/yelp_academic_dataset_review.json')

In [6]:
df['train'].select(range(10))[:1]

{'review_id': ['KU_O5udG6zpxOg-VcAEodg'],
 'user_id': ['mh_-eMZ6K5RLWhZyISBhwA'],
 'business_id': ['XQfwVwDr-v0ZS3_CbbE5Xw'],
 'stars': [3.0],
 'useful': [0],
 'funny': [0],
 'cool': [0],
 'text': ["If you decide to eat here, just be aware it is going to take about 2 hours from beginning to end. We have tried it multiple times, because I want to like it! I have been to it's other locations in NJ and never had a bad experience. \n\nThe food is good, but it takes a very long time to come out. The waitstaff is very young, but usually pleasant. We have just had too many experiences where we spent way too long waiting. We usually opt for another diner or restaurant on the weekends, in order to be done quicker."],
 'date': [datetime.datetime(2018, 7, 7, 22, 9, 11)]}

In [19]:
from spacy.lang.en.stop_words import STOP_WORDS
stops = set(STOP_WORDS) - {'no', 'not', 'never', 'what', 'when', 'why', 'who', 'where', 'whould'}
nlp = spacy.load('en_core_web_lg', disable = ['parser', 'ner'])

def text_cleaning(text):
    text = html.unescape(text)
    text = emoji.replace_emoji(text, '')
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<[^>]+>', '', text)
    text = re.sub(r'\s+', ' ', text)

    text = ' '.join([t.lemma_.lower() for t in nlp(text) if t not in stops and not t.is_punct])

    return text

text_cleaning('Hello check www.google.com    <p>      hi')

'hello check hi'

In [167]:
df_small = df['train'].select(range(1000)) #.train_test_split(test_size = 0.2)
df_small

Dataset({
    features: ['review_id', 'user_id', 'business_id', 'stars', 'useful', 'funny', 'cool', 'text', 'date'],
    num_rows: 1000
})

In [38]:
# df_small.select(range(2))[1]['text']
df_small.info

DatasetInfo(description='', citation='', homepage='', license='', features={'review_id': Value('string'), 'user_id': Value('string'), 'business_id': Value('string'), 'stars': Value('float64'), 'useful': Value('int64'), 'funny': Value('int64'), 'cool': Value('int64'), 'text': Value('string'), 'date': Value('timestamp[s]')}, post_processed=None, supervised_keys=None, builder_name='json', dataset_name='json', config_name='default', version=0.0.0, splits={'train': SplitInfo(name='train', num_bytes=4822092117, num_examples=6990280, shard_lengths=[725716, 728696, 728794, 726301, 726344, 728691, 726713, 731302, 729548, 438175], dataset_name='json')}, download_checksums={'/content/yelp_academic_dataset_review.json': {'num_bytes': 5341868833, 'checksum': None}}, download_size=5341868833, post_processing_size=None, dataset_size=4822092117, size_in_bytes=10163960950)

In [168]:
df_small = df_small.filter(lambda row: len(row['text'].split()) > 30)

Filter:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [126]:
df_small = df_small.map(
    lambda row: {'cleaned': [text_cleaning(item) for item in row['text']]}, batched = True
)

Map:   0%|          | 0/7 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

In [127]:
checkpoint = 'sentence-transformers/all-MiniLM-L6-v2'
tokenizer = SentenceTransformer(checkpoint)

In [None]:
# tokenizer = AutoTokenizer.from_pretrained('model_name')
# def tokenize(row):
#     return tokenizer(
#         row['text'],
#         padding = True,
#         truncation = True,
#         max_length = 256,
#         return_tensors = 'pt',
#     )

In [129]:
df_small

DatasetDict({
    train: Dataset({
        features: ['review_id', 'user_id', 'business_id', 'stars', 'useful', 'funny', 'cool', 'text', 'date', 'cleaned'],
        num_rows: 7
    })
    test: Dataset({
        features: ['review_id', 'user_id', 'business_id', 'stars', 'useful', 'funny', 'cool', 'text', 'date', 'cleaned'],
        num_rows: 2
    })
})

In [169]:
df_small = df_small.map(
    lambda row: {'tokenized': tokenizer.encode(row['text'])},
    batched = True,
)

Map:   0%|          | 0/846 [00:00<?, ? examples/s]

In [131]:
def extract_feature(row):
    return np.concatenate([
        row['tokenized'],
        [row['cool']],
        [row['funny']],
        [row['useful']]
    ])

In [132]:
df_small = df_small.map(lambda row: {'features': extract_feature(row)})

Map:   0%|          | 0/7 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

In [170]:
texts = []
tokens = []
for item in df_small:
    texts.append(item['text'])
    tokens.append(item['tokenized'])

In [171]:
tokens = np.array(tokens)

In [181]:
import faiss

d = tokens.shape[1]
index = faiss.IndexFlatL2(d)
index.add(tokens)

sentence = ['it was a good place to stay for even more than 1 night. staffs were friendly and like the hotel']
querry = tokenizer.encode(sentence)
scores, idx = index.search(querry, 3)

In [182]:
for i in idx[0]:
    print(texts[i])
    print('-' * 50)

Its a hotel and you can stay there.

After a couple of stays here I find it is not bad. 

Its definitely more of a business traveler kind of place than a tourist kind of place.

Pros
The rooms have the essentials. The beds are quite comfortable.
The cafe isn't bad although it is a little foggy as to what comes with the room. Apparently its one entree and one beverage. Not a bad deal.

Cons
Hallways are a bit shaggy. NBD since I am not staying in the hallway.
Room cleaning dropped the ball on my last stay. The room was clean, but I found a bag from BK in my mini fridge ... eeeew!
--------------------------------------------------
Great place to stay if you want to be on Bourbon street.  The staff is really hospitable and didn't fail me on food choices.  The room is nice and big.  However, I stayed in room 263 and thought it was perfect because it was not a room that faced Conti.  I was wrong because it was closer to the vending and ice machines.  The door/wall was thin enough that I hea

## Translation

In [62]:
checkpoint = "facebook/m2m100_418M"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

In [73]:
df_small.select(range(2))[1]['text']

"I've taken a lot of spin classes over the years, and nothing compares to the classes at Body Cycle. From the nice, clean space and amazing bikes, to the welcoming and motivating instructors, every class is a top notch work out.\n\nFor anyone who struggles to fit workouts in, the online scheduling system makes it easy to plan ahead (and there's no need to line up way in advanced like many gyms make you do).\n\nThere is no way I can write this review without giving Russell, the owner of Body Cycle, a shout out. Russell's passion for fitness and cycling is so evident, as is his desire for all of his clients to succeed. He is always dropping in to classes to check in/provide encouragement, and is open to ideas and recommendations from anyone. Russell always wears a smile on his face, even when he's kicking your butt in class!"

In [74]:
tokenizer.src_lang = 'en'
encoded = tokenizer(df_small.select(range(2))[1]['cleaned'], return_tensors = 'pt')

decoded = model.generate(**encoded, forced_bos_token_id = tokenizer.lang_code_to_id['fa'])

In [75]:
tokenizer.decode(decoded[0])

'</s> __fa__ من در طول سال بسیاری از کلاس های چرخش و هیچ چیز در مقایسه با کلاس در چرخه بدن از فضای تمیز و دوچرخه شگفت انگیز به خوش آمدید و انگیزه مربی هر کلاس برای هر کسی که مبارزه برای تناسب تمرین در سیستم برنامه ریزی آنلاین آن را آسان برای برنامه ریزی به جلو و هیچ نیازی به ترتیب راه به جلو به عنوان بسیاری از ورزشگاه شما انجام هیچ راه من می توانم این بررسی بدون دادن russell صاحب چرخه بدن یک فریاد از russell شور و شوق خود را برای تناسب اندام و دوچرخه سواری به طوری آشکار است که آرزوی خود را برای همه مشتری خود را به موفقیت او همیشه در کلاس برای چک کردن در ارائه تشویق و باز به ایده و توصیه از هر کس russell همیشه لبخند بر روی صورت خود را حتی زمانی که او در کلاس</s>'

## Question answering

In [None]:
from transformers import pipeline

qa_pipeline = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")

In [89]:
context = "SpaceX was founded by Elon Musk in 2002 in California."
question = "Who founded SpaceX?"

result = qa_pipeline(question=question, context=context)
print(result)

{'score': 0.9980174140036979, 'start': 22, 'end': 31, 'answer': 'Elon Musk'}


In [80]:
!pip install pypdf



In [85]:
from pypdf import PdfReader

reader = PdfReader('/content/Dataset_User_Agreement.pdf')

text = ''
for page in reader.pages:
    text += page.extract_text()

In [88]:
context = text
question = 'What happen if we are not agree with term of use'

result = qa_pipeline(question=question, context=context)
print(result)

{'score': 0.7246695160865784, 'start': 877, 'end': 911, 'answer': 'you may not access or use the Data'}
