In [2]:
import pandas as pd
from docx import Document
import tabula
import json
import openpyxl
import csv
import PyPDF2
from transformers import pipeline, AutoTokenizer
import nltk
nltk.download("punkt")




def convert_docx_to_csv(docx_file, csv_file):
    doc = Document(docx_file)
    text_content = []
    for paragraph in doc.paragraphs:
        text_content.append(paragraph.text)
    ink = pd.DataFrame({'Text': text_content})
    ink.to_csv(csv_file, index=False)


def convert_pdf_to_csv(pdf_file, csv_file):
    pdf_text = ""
    with open(pdf_file, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        num_pages = len(pdf_reader.pages)

        for page_num in range(num_pages):
            page = pdf_reader.pages[page_num]
            pdf_text += page.extract_text()


    lines = pdf_text.split('\n')


    with open(csv_file, 'w', newline='', encoding='utf-8') as file:
        csv_writer = csv.writer(file)


        for line in lines:
          csv_writer.writerow([line])



def convert_json_to_csv(json_file, csv_file):
    with open(json_file, 'r') as json_data:
        data = json.load(json_data)
    ink = pd.DataFrame(data)
    ink.to_csv(csv_file, index=False)


def convert_excel_to_csv(excel_file, csv_file):
    xls = openpyxl.load_workbook(excel_file)
    sheet = xls.active
    data = sheet.values
    ink = pd.DataFrame(data)
    ink.to_csv(csv_file, index=False)


def clean_text(text):
    cleaned_text = ' '.join(text.split())
    return cleaned_text



def summarize_text(text, model_checkpoint="t5-large"):
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    summarization_pipeline = pipeline("summarization", model=model_checkpoint, tokenizer=tokenizer)


    chunks = [text[i:i+1000] for i in range(0, len(text), 1000)]

    summary_parts = []

    for chunk in chunks:
        summary = summarization_pipeline(chunk, max_length=100, min_length=0, do_sample=False)
        summary_parts.append(summary[0]['summary_text'])


    final_summary = ' '.join(summary_parts)

    return final_summary

input_file = 'document_path'

output_csv = 'output_csv'

file_extension = input_file.split('.')[-1].lower()

if file_extension == 'docx':
    convert_docx_to_csv(input_file, output_csv)
elif file_extension in ('xlsx', 'xls'):
    convert_excel_to_csv(input_file, output_csv)
elif file_extension == 'pdf':
    convert_pdf_to_csv(input_file, output_csv)
elif file_extension == 'json':
    convert_json_to_csv(input_file, output_csv)
else:
    print(f"Unsupported file format: {file_extension}")

print(f"The document has been converted and saved as '{output_csv}'.")




with open(output_csv, 'r', encoding='utf-8') as file:
    csv_reader = csv.reader(file)
    text = " ".join(row[0] for row in csv_reader)


cleaned_text = clean_text(text)

final_summary = summarize_text(cleaned_text, model_checkpoint="t5-large")
print("Summary:")
print(final_summary)



import torch
from transformers import BertForQuestionAnswering, BertTokenizer, pipeline



model_checkpoint ="bert-large-uncased-whole-word-masking-finetuned-squad"
model = BertForQuestionAnswering.from_pretrained(model_checkpoint)
Tokenizer = BertTokenizer.from_pretrained(model_checkpoint)

question_answerer = pipeline("question-answering", model=model, tokenizer=Tokenizer)

question = input(" What is your question about the document? ")

context= cleaned_text

answer = question_answerer(question=question, context= cleaned_text,)



print("Answer:" , answer["answer"])



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


The document has been converted and saved as 'output_csv'.


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/2.95G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Your max_length is set to 100, but your input_length is only 74. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=37)


Summary:
AI's task definition aims to imitate the functionality of human intelligence . a significant amount of advantage is obtained in different fields such as automation & robots, the automotive industry, finance, healthcare, and daily basics . requested performances by AI require a high amount of energy and massive data storage . artificial neural networks (ANNs) simulate the brain’s structure and function . neuromorphic computing uses electrical devices to realize the human brain's functions . memristors are resistive switching devices capable of emulating synaptic functions e.g. memory retention, adaptability . memristors change resistance states in response to electrical stimuli . this enables them to store and process information similarly to biological synapses . they can replicate crucial processes of learning and adaptation . configuration of HfO2-x with kind of metallic layers like ta and mo led to more facilitated and favored switching while decreasing the band gap value .

config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

 What is your question about the document? What is a memristor?


  self.pid = os.fork()


Answer: resistive switching devices


# New section

In [1]:
!pip install datasets evaluate transformers["sentencepiece"]
!pip install PyPDF2
!pip install pandas
!pip install python-docx
!pip install tabula-py
!pip install pip install openpyxl
!pip install huggingface_hub
!pip install nltk
!pip install torch
!pip install transformers


Collecting datasets
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl.metadata (9.3 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-2.21.0-py3-none-any.whl (527 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.3/527.3 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
