In [None]:
!pip install datasets evaluate transformers["sentencepiece"]
!pip install PyPDF2
!pip install pandas
!pip install python-docx
!pip install tabula-py
!pip install openpyxl
!pip install huggingface_hub
!pip install nltk
!pip install torch
!pip install transformers

In [None]:
import pandas as pd
from docx import Document
import tabula
import json
import openpyxl
import csv
import PyPDF2
from transformers import pipeline, AutoTokenizer
import nltk
nltk.download("punkt")




def convert_docx_to_csv(docx_file, csv_file):
    """Write the paragraph text of a Word document to a one-column CSV.

    Args:
        docx_file: Word document handed to ``Document``.
        csv_file: Destination handed to ``DataFrame.to_csv``.

    The CSV has a single ``Text`` column holding one row per paragraph,
    in document order, and no index column.
    """
    paragraphs = [para.text for para in Document(docx_file).paragraphs]
    pd.DataFrame({'Text': paragraphs}).to_csv(csv_file, index=False)


def convert_pdf_to_csv(pdf_file, csv_file):
    """Extract the text of a PDF and write it to CSV, one text line per row.

    Args:
        pdf_file: Path of the PDF to read.
        csv_file: Path of the CSV to create (UTF-8, single unnamed column).

    Each page's extracted text is split on newlines and every resulting
    line becomes a one-cell CSV row. No header row is written.
    """
    # Use a distinct name for the handle so the path argument is not rebound.
    with open(pdf_file, 'rb') as pdf_stream:
        pdf_reader = PyPDF2.PdfReader(pdf_stream)
        # Extraction must happen while the stream is still open.
        # ``or ""`` guards against a page that yields no text at all.
        page_texts = [page.extract_text() or "" for page in pdf_reader.pages]

    # Join pages with a newline: plain concatenation glued the last line of
    # one page onto the first line of the next.
    lines = "\n".join(page_texts).split('\n')

    with open(csv_file, 'w', newline='', encoding='utf-8') as file:
        csv.writer(file).writerows([line] for line in lines)


def convert_json_to_csv(json_file, csv_file):
    """Load a JSON file into a DataFrame and save it as CSV.

    Args:
        json_file: Path of the JSON document to read (decoded as UTF-8).
        csv_file: Destination handed to ``DataFrame.to_csv``.

    The parsed JSON is passed to ``pd.DataFrame``. A single flat object
    such as ``{"a": 1, "b": 2}`` is written as one row, because pandas
    refuses to build a frame from scalar values alone.
    """
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(json_file, 'r', encoding='utf-8') as json_data:
        data = json.load(json_data)

    is_flat_record = (
        isinstance(data, dict)
        and bool(data)
        and not any(isinstance(value, (list, dict)) for value in data.values())
    )
    if is_flat_record:
        # pd.DataFrame({"a": 1}) raises ValueError; wrap it as a single row.
        data = [data]

    pd.DataFrame(data).to_csv(csv_file, index=False)


def convert_excel_to_csv(excel_file, csv_file):
    """Dump the active worksheet of an Excel workbook to CSV.

    Args:
        excel_file: Workbook handed to ``openpyxl.load_workbook``.
        csv_file: Destination handed to ``DataFrame.to_csv``.

    Only the workbook's active sheet is exported. Its cell values are fed
    to ``pd.DataFrame`` without column names, and no index column is
    written.
    """
    workbook = openpyxl.load_workbook(excel_file)
    rows = workbook.active.values
    frame = pd.DataFrame(rows)
    frame.to_csv(csv_file, index=False)


def clean_text(text):
    """Collapse every run of whitespace in *text* into a single space.

    Leading and trailing whitespace is dropped as well, so an empty or
    whitespace-only string comes back as ``''``.
    """
    words = text.split()
    return ' '.join(words)



def _split_into_chunks(text, chunk_size):
    """Split *text* into pieces of at most *chunk_size* characters, cutting at a space when one is available."""
    chunks = []
    start = 0
    length = len(text)
    while start < length:
        end = min(start + chunk_size, length)
        if end < length:
            # Back up to the last space inside the window so a word is not
            # cut in half; searching from start + 1 guarantees progress.
            cut = text.rfind(' ', start + 1, end + 1)
            if cut != -1:
                end = cut
        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)
        start = end
    return chunks


def summarize_text(text, model_checkpoint="t5-large", chunk_size=1000):
    """Summarize *text* chunk by chunk and join the partial summaries.

    Args:
        text: The text to summarize.
        model_checkpoint: Checkpoint name used for both the tokenizer and
            the summarization pipeline.
        chunk_size: Maximum number of characters per chunk sent to the
            model. Chunks end on a space where possible; a run with no
            space is cut at exactly ``chunk_size`` characters.

    Returns:
        The per-chunk summaries joined with single spaces, or ``''`` when
        *text* is empty or contains only whitespace.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    chunks = _split_into_chunks(text, chunk_size)
    if not chunks:
        # Nothing to summarize: skip loading the model altogether.
        return ""

    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    summarization_pipeline = pipeline("summarization", model=model_checkpoint, tokenizer=tokenizer)

    summary_parts = [
        summarization_pipeline(chunk, max_length=100, min_length=0, do_sample=False)[0]['summary_text']
        for chunk in chunks
    ]
    return ' '.join(summary_parts)

# --- Convert the input document to CSV, then summarize its text ---------
from pathlib import Path

input_file = 'document_path'

output_csv = 'output_csv'

# Path.suffix is '' for a name without a dot, whereas split('.')[-1] would
# return the whole file name as the "extension".
file_extension = Path(input_file).suffix.lstrip('.').lower()

# Extension -> converter. Each converter takes (input_path, csv_path).
# NOTE(review): 'xls' is routed to the openpyxl-based converter; openpyxl is
# documented for the newer .xlsx family, so legacy .xls files will likely be
# rejected by the loader. Confirm whether .xls needs a different reader.
converters = {
    'docx': convert_docx_to_csv,
    'xlsx': convert_excel_to_csv,
    'xls': convert_excel_to_csv,
    'pdf': convert_pdf_to_csv,
    'json': convert_json_to_csv,
}

converter = converters.get(file_extension)
if converter is None:
    # Stop here: continuing would report success and then try to read a CSV
    # that was never written.
    raise ValueError(f"Unsupported file format: {file_extension or '<no extension>'}")

converter(input_file, output_csv)
print(f"The document has been converted and saved as '{output_csv}'.")

# Gather every cell of every row. Indexing row[0] fails on blank rows and
# drops all but the first column of spreadsheet/JSON data.
with open(output_csv, 'r', newline='', encoding='utf-8') as file:
    text = " ".join(cell for row in csv.reader(file) for cell in row)

cleaned_text = clean_text(text)

final_summary = summarize_text(cleaned_text, model_checkpoint="t5-large")
print("Summary:")
print(final_summary)



import torch
from transformers import BertForQuestionAnswering, BertTokenizer, pipeline



# --- Extractive question answering over the cleaned document text --------
# Load the model and the tokenizer from the same checkpoint.
model_checkpoint = "bert-large-uncased-whole-word-masking-finetuned-squad"
Tokenizer = BertTokenizer.from_pretrained(model_checkpoint)
model = BertForQuestionAnswering.from_pretrained(model_checkpoint)

question_answerer = pipeline(
    "question-answering",
    model=model,
    tokenizer=Tokenizer,
)

# Ask the user what they want to know about the document.
question = input(" What is your question about the document? ")

# The answer is looked up in the whitespace-normalized document text.
context = cleaned_text
answer = question_answerer(question=question, context=context)

print(f"Answer: {answer['answer']}")

