In [1]:
!pip install git+https://github.com/huggingface/transformers -q

In [2]:
!pip install sentencepiece



In [3]:
!pip freeze | grep transformers

transformers @ git+https://github.com/huggingface/transformers@9e87618f2be1401df55c36ad726629ae201e8e4d


In [4]:
!pip install langchain
!pip install unstructured[pdf]

Collecting langchain
  Downloading langchain-0.0.325-py3-none-any.whl (1.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m33.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting jsonpatch<2.0,>=1.33 (from langchain)
  Downloading jsonpatch-1.33-py2.py3-none-any.whl (12 kB)
Collecting langsmith<0.1.0,>=0.0.52 (from langchain)
  Downloading langsmith-0.0.53-py3-none-any.whl (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.3/43.3 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: jsonpatch, langsmith, langchain
  Attempting uninstall: jsonpatch
    Found existing installation: jsonpatch 1.32
    Uninstalling jsonpatch-1.32:
      Successfully uninstalled jsonpatch-1.32
Successfully installed jsonpatch-1.33 langchain-0.0.325 langsmith-0.0.53
Collecting unstructured[pdf]
  Downloading unstructured-0.10.27-py3-none-any.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [

In [5]:
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")



Downloading (…)lve/main/config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/529 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

In [6]:
from langchain.text_splitter import Language, RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
import logging
import os

In [7]:
from langchain.document_loaders import CSVLoader, PDFMinerLoader, TextLoader, UnstructuredExcelLoader, Docx2txtLoader
from langchain.document_loaders import UnstructuredFileLoader, UnstructuredMarkdownLoader


ROOT_DIRECTORY = "../input/abcdef"

SOURCE_DIRECTORY = f"{ROOT_DIRECTORY}/SOURCE_DOCUMENTS"

INGEST_THREADS = os.cpu_count() or 8

DOCUMENT_MAP = {
    ".txt": TextLoader,
    ".md": UnstructuredMarkdownLoader,
    ".py": TextLoader,
    ".pdf": UnstructuredFileLoader,
    ".csv": CSVLoader,
    ".xls": UnstructuredExcelLoader,
    ".xlsx": UnstructuredExcelLoader,
    ".docx": Docx2txtLoader,
    ".doc": Docx2txtLoader,
}


In [8]:
import nltk

In [9]:
def file_log(logentry):
   file1 = open("file_ingest.log","a")
   file1.write(logentry + "\n")
   file1.close()
   print(logentry + "\n")

def load_single_document(file_path: str) -> Document:
    # Loads a single document from a file path
    try:
       file_extension = os.path.splitext(file_path)[1]
       loader_class = DOCUMENT_MAP.get(file_extension)
       if loader_class:
           file_log(file_path + ' loaded.')
           loader = loader_class(file_path)
       else:
           file_log(file_path + ' document type is undefined.')
           raise ValueError("Document type is undefined")
       return loader.load()[0]
    except Exception as ex:
       file_log('%s loading error: \n%s' % (file_path, ex))
       return None


def load_document_batch(filepaths):
    logging.info("Loading document batch")
    # create a thread pool
    with ThreadPoolExecutor(len(filepaths)) as exe:
        # load files
        futures = [exe.submit(load_single_document, name) for name in filepaths]
        # collect data
        data_list = [future.result() for future in futures]
        # return data and file paths
        return (data_list, filepaths)


def load_documents(source_dir: str) -> list[Document]:
    # Loads all documents from the source documents directory, including nested folders
    paths = []
    for root, _, files in os.walk(source_dir):
        for file_name in files:
            print("file_name")
            print('Importing: ' + file_name)
            file_extension = os.path.splitext(file_name)[1]
            source_file_path = os.path.join(root, file_name)
            if file_extension in DOCUMENT_MAP.keys():
                paths.append(source_file_path)

    # Have at least one worker and at most INGEST_THREADS workers
    n_workers = min(INGEST_THREADS, max(len(paths), 1))
    chunksize = round(len(paths) / n_workers)
    docs = []
    with ProcessPoolExecutor(n_workers) as executor:
        futures = []
        # split the load operations into chunks
        for i in range(0, len(paths), chunksize):
            # select a chunk of filenames
            filepaths = paths[i : (i + chunksize)]
            # submit the task
            try:
               future = executor.submit(load_document_batch, filepaths)
            except Exception as ex:
               file_log('executor task failed: %s' % (ex))
               future = None
            if future is not None:
               futures.append(future)
        # process all results
        for future in as_completed(futures):
            # open the file and load the data
            try:
                contents, _ = future.result()
                docs.extend(contents)
            except Exception as ex:
                file_log('Exception: %s' % (ex))

    return docs

In [10]:
docs = load_documents(SOURCE_DIRECTORY)
import re

def split_text(text, max_words=500):
    sentences = re.split('(?<=[.!?]) +', text)
    chunk = ""
    chunks = []
    count = 0
    for sentence in sentences:
        words = sentence.split()
        if count + len(words) > max_words:
            chunks.append(chunk.strip())
            chunk = sentence
            count = len(words)
        else:
            chunk += " " + sentence
            count += len(words)
    chunks.append(chunk.strip())
    return chunks

file_name
Importing: AFFAIRE C.P. ET M.N. c. FRANCE.pdf
../input/abcdef/SOURCE_DOCUMENTS/AFFAIRE C.P. ET M.N. c. FRANCE.pdf loaded.



In [11]:
chunks = split_text(docs[0].page_content)
for i, chunk in enumerate(chunks):
    print(f"Chunk {i+1} has {len(chunk.split())} words.")

Chunk 1 has 500 words.
Chunk 2 has 493 words.
Chunk 3 has 353 words.
Chunk 4 has 312 words.
Chunk 5 has 411 words.
Chunk 6 has 492 words.
Chunk 7 has 481 words.
Chunk 8 has 443 words.
Chunk 9 has 420 words.
Chunk 10 has 472 words.
Chunk 11 has 499 words.
Chunk 12 has 500 words.
Chunk 13 has 498 words.
Chunk 14 has 499 words.
Chunk 15 has 479 words.
Chunk 16 has 498 words.
Chunk 17 has 469 words.
Chunk 18 has 472 words.


In [12]:
tokenizer.src_lang = "fr_XX"

In [13]:
model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt").to('cuda')

In [14]:
def translate(chunk):
  model_inputs = tokenizer(chunk, return_tensors="pt").to('cuda')
  generated_tokens = model.generate(
    **model_inputs,
    forced_bos_token_id=tokenizer.lang_code_to_id["en_XX"]
  )
  translation = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
  return translation

In [15]:
#translate(chunks[0])

In [16]:
#translated = ""

#for chunk in chunks:
 # translated += translate(chunk)[0]

In [17]:
#translated

In [18]:
!pip install reportlab 
from reportlab.pdfgen import canvas

def text_to_pdf(text, filename):
    # Create a new PDF with Reportlab
    c = canvas.Canvas(filename)

    # Insert the text into the PDF
    c.drawString(100, 750, text)

    # Save the PDF
    c.save()


Collecting reportlab
  Downloading reportlab-4.0.6-py3-none-any.whl (1.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m33.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: reportlab
Successfully installed reportlab-4.0.6


In [19]:
from reportlab.lib.pagesizes import letter
from reportlab.platypus import SimpleDocTemplate, Paragraph
from reportlab.lib.styles import getSampleStyleSheet

def text_to_pdf(text, filename):
    # Create a new PDF with Reportlab
    doc = SimpleDocTemplate(filename, pagesize=letter)

    # Prepare the text for the PDF
    styles = getSampleStyleSheet()
    text = Paragraph(text, styles['BodyText'])

    # Build the PDF
    doc.build([text])


In [20]:
def convert(src_ln, file_data):
    tokenizer.src_lang = src_ln
    # docs = load_documents(src)

    for key, value in file_data.items():
        chunks = split_text(value)
        translated = ""
        for chunk in chunks:
          translated += translate(chunk)[0]
        text_to_pdf(translated, key)
        print(f"Translated {key} pdf")
        
        


In [21]:
#convert('fr_XX', SOURCE_DIRECTORY)

In [22]:
!pip install -qqq python-multipart
!pip install -qqq fastapi
!pip install -qqq uvicorn
!pip install -qqq pydantic
!pip install -qqq pypi-json
!pip install -qqq pyngrok
!pip install -qqq nest-asyncio
!pip install -qqq httpx

In [23]:
from fastapi import FastAPI, UploadFile, File, Form
from pydantic import BaseModel
import json
import uvicorn
from pyngrok import ngrok
from fastapi.middleware.cors import CORSMiddleware
import nest_asyncio

In [24]:
app = FastAPI()
origins = ["*"]

app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

In [25]:
@app.get('/')
async def root():
    return {'hello': 'world'}


In [26]:
from typing import List

In [27]:
!pip install pdfplumber



In [28]:
import pdfplumber
import boto3


s3 = boto3.client('s3', aws_access_key_id='ACCESS-ID', aws_secret_access_key='ACCESS-KEY')

In [29]:
import random

In [30]:

@app.post("/uploadfiles")
async def create_upload_files(files: List[UploadFile] = File(...), email: str = Form(...), lang: str = Form(...)):
    for file in files:
        if file.filename.split(".")[-1] != "pdf":
            raise HTTPException(status_code=400, detail="Invalid file type. Only PDFs are accepted.")
    file_data = {}
    random_data = ""
    for file in files:
        text = ""
        with pdfplumber.open(file.file) as pdf:
            for page in pdf.pages:
                text += page.extract_text()
        file_data[email+file.filename] = text
        if random_data=="":
            start = random.randint(0, len(text))
            end = random.randint(start, len(text))
            random_data = text[start:end]
            
    convert(lang, file_data)
    for filename, text in file_data.items():
        print("uploaded file!")
        res=s3.upload_file(Filename="/kaggle/working/"+filename, Bucket='railrakshak', Key=filename)
        print(res)
    
    for filename in file_data.keys():
        if os.path.exists(filename):
            os.remove(filename)
        else:
            print(f"The file {filename} does not exist")
    return {"SUCCESS":"PDF CREATED"}
        
        

file_ingest.log

In [31]:
!ngrok authtoken 2U9ULE2B6hpuRTvOv1qFYffk0NB_2QkfbTK5QhKxFfB3oCXPv

Authtoken saved to configuration file: /root/.ngrok2/ngrok.yml                                      


In [None]:
ngrok_tunnel = ngrok.connect(8000)
print('Public URL:', ngrok_tunnel.public_url)
nest_asyncio.apply()
uvicorn.run(app, port=8000)

INFO:     Started server process [32]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://127.0.0.1:8000 (Press CTRL+C to quit)


Public URL: https://6952-34-83-51-35.ngrok-free.app
INFO:     136.232.1.174:0 - "GET / HTTP/1.1" 200 OK
Translated kenneth@mail.comAFFAIRE AVCIOgLU c. TÜRKiYE.pdf pdf
uploaded file!
None
INFO:     2409:4040:6e87:379c:25fc:951f:850e:287e:0 - "POST /uploadfiles HTTP/1.1" 200 OK
