In [14]:
import os
import re
import PyPDF2
from sklearn.feature_extraction.text import CountVectorizer

# Function to extract text from PDFs
def extract_text_from_pdf(pdf_path):
    reader = PyPDF2.PdfReader(pdf_path)
    num_pages = len(reader.pages)

    # Ignore PDFs with just one page
    if num_pages == 1:
        return None
    
    full_text = ""
    for page_num in range(num_pages):
        page = reader.pages[page_num]
        full_text += page.extract_text()
    return full_text

# Regex patterns to extract date, company name, and participants
date_pattern = r"Date[:|\s](.*)"  # Date pattern can be improved
company_name_pattern = r"Company[:|\s](.*)"
participant_pattern = r"Participants[:|\s]([\s\S]*?)\n"

# Function to preprocess text and extract needed data
def preprocess_text(text):
    date = re.search(date_pattern, text)
    company_name = re.search(company_name_pattern, text)
    participants = re.findall(participant_pattern, text)
    
    date = date.group(1) if date else None
    company_name = company_name.group(1) if company_name else None
    participants = participants if participants else []
    
    return date, company_name, participants

# Function to club question and answers into pairs
def get_qna_pairs(text):
    qna_pattern = r"Q[:|\s](.*?)\nA[:|\s](.*?)\n"
    qna_pairs = re.findall(qna_pattern, text)
    return qna_pairs

# Function to extract ngrams (unigram and bigram features)
def extract_ngram_features(texts, ngram_range=(1, 2)):
    vectorizer = CountVectorizer(ngram_range=ngram_range)
    ngrams = vectorizer.fit_transform(texts)
    return vectorizer, ngrams

# Main function to process a directory of PDFs
def process_pdfs(pdf_dir):
    for filename in os.listdir(pdf_dir):
        if filename.endswith(".pdf"):
            pdf_path = os.path.join(pdf_dir, filename)
            text = extract_text_from_pdf(pdf_path)
            
            if text is None:
                print(f"Ignored {filename} due to empty or single page.")
                continue
            
            # Preprocessing step
            date, company_name, participants = preprocess_text(text)
            print(f"Date: {date}, Company: {company_name}, Participants: {participants}")
            
            # Extract Q&A pairs
            qna_pairs = get_qna_pairs(text)
            print(f"Q&A pairs: {qna_pairs}")
            
            # Feature Engineering: Extract ngram features
            if qna_pairs:
                questions, answers = zip(*qna_pairs)
                vectorizer, ngrams = extract_ngram_features(list(questions) + list(answers))
                print(f"Ngram features extracted for {filename}")


print(extract_text_from_pdf('../crawler/test/4425823.pdf'))


sp~rt'iqCJINDIA LTD.
(Govt. Recognised FourStarExport House)
Regd.&CorporateOffice: Vill.Kanech,NearSahnewal, G.T.Road,Ludhiana-141120 Ph.(0161)2845456to60Fax:2845458
Admn.Office :178,Col.GurdialSinghRoad,CivilLines,Ludhiana-141001 Ph.(0161)2770954to55Fax:2770953
E-mail :sportking@sportking.co.inCINNo.L17122PB1989PLC053162
Website :www.sportking.co.in GSTNo.:03AAACS3037Q1 ZA
SIL/2023-24/SE Date: 29.01.2024
To To
BSELimited National Stock Exchange ofIndia Ltd,
PhirozeJeeheebhoy Towers, Exchange Plaza,Bandra KurlaComplex,
DalalStreet,Mumbai-40000 1 Bandra(East),Mumbai- 400051
Script Code: 539221 Symbol: SPORTKING
Subject: Transcript ofEarnings Call ofSportking India Limited forquarter/nine
month ended 31.12.2023
Dear Sir,
Pursuant toRegulation 30andotherapplicable provisions ofSEBI(ListingObligations and
Disclosure Requirements), Regulations 2015,pleasefindenclosed herewith transcript ofthe
earnings calloftheCompany heldonTuesday 23rdJanuary 2023todiscusstheCompany's
Financial Performanc

### EDA

In [24]:
import os

fileStore = '../crawler/files/'
NullCounter = 0 
fileStoreLen = len(os.listdir(fileStore))

for filename in os.listdir(fileStore):
    file_path = os.path.join(fileStore, filename)
    if os.path.getsize(file_path) == 0:
        os.remove(file_path)
        NullCounter += 1
        print(f"Removed empty file: {filename}")
print(f"Total empty files removed: {NullCounter}")
print(f"Total files in the directory: {fileStoreLen}")


Total empty files removed: 0
Total files in the directory: 4018


In [31]:
import re
import PyPDF2

# Function to extract text from a PDF file
def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as f:
        reader = PyPDF2.PdfReader(f)
        full_text = ''
        for page_num in range(len(reader.pages)):
            full_text += reader.pages[page_num].extract_text()
    return full_text

# Generalized Regex patterns for flexibility
date_patterns = [
    r"(January|February|March|April|May|June|July|August|September|October|November|December)\s\d{1,2},\s\d{4}",
    r"\d{1,2}[-/]\d{1,2}[-/]\d{2,4}"  # Flexible pattern for date in various formats like 01/23/2024 or 23-01-2024
]

company_patterns = [
    r"For\s+([A-Za-z&.,\s]+(?:Ltd|Limited|Inc|Corp|Corporation|Plc)?)"

]

participants_patterns = [
    r"(?:Mr\.|Ms\.|Dr\.|Mrs\.)\s[A-Z][a-z]+(?:\s[A-Z][a-z]+)*\s–\s([A-Za-z\s]+)",  # Matches participants with roles
    r"(?:Participants:\s)((?:.*?–.*?[\n,])+)"  # Captures list format like "Participants: John Doe – CEO, Jane Doe – CFO"
]

qa_patterns = [
    r"(Moderator:.*?)(?=Moderator|Question|Answer|$)",  # General pattern to capture moderator or any Q&A block
    r"(Question:.*?Answer:.*?)(?=Question|Answer|$)",   # Flexible pattern to capture pairs of Q&A blocks
    r"(Q:\s.*?A:\s.*?)(?=Q:|A:|$)"  # Alternative for documents using Q: and A: notation
]

# Function to preprocess the text and extract relevant fields using multiple regex patterns
def preprocess_text(text):
    # Extract Date
    date = None
    for pattern in date_patterns:
        match = re.search(pattern, text)
        if match:
            date = match.group(0)
            break
    
    # Extract Company Name
    company = None
    for pattern in company_patterns:
        match = re.search(pattern, text)
        if match:
            company = match.group(0)
            break
    
    # Extract Participants
    participants = []
    for pattern in participants_patterns:
        matches = re.findall(pattern, text)
        if matches:
            participants.extend(matches)
    
    # Extract Q&A
    qa_pairs = []
    for pattern in qa_patterns:
        matches = re.findall(pattern, text, re.DOTALL)
        if matches:
            qa_pairs.extend(matches)
    
    return date, company, participants, qa_pairs

# Function to display results
def display_results(date, company, participants, qa_pairs):
    print(f"Date: {date if date else 'Date not found'}")
    print(f"Company Name: {company if company and ('BSE' or 'NSE' not in company) else 'Company not found'}")
    
    print("\nParticipants:")
    if participants:
        for participant in participants:
            print(f"Participant: {participant.strip()}")
    else:
        print("No participants found.")
    
    print("\nQuestions and Answers:")
    if qa_pairs:
        for qa in qa_pairs:
            print(qa.strip())
    else:
        print("No Q&A found.")

# Main function to process the PDF
def process_pdf(pdf_path):
    text = extract_text_from_pdf(pdf_path)
    
    if text:
        date, company, participants, qa_pairs = preprocess_text(text)
        display_results(date, company, participants, qa_pairs)
    else:
        print("No text found in the PDF.")

# Example Usage
pdf_path = ".//test/4425823.pdf"  # Replace with your PDF path
process_pdf(pdf_path)


Date: January 23, 2024
Company Name: For Q

Participants:
No participants found.

Questions and Answers:
Moderator: Ladies and gentlemen, good day and welcome to Sportking India Limited Q3 and 9 Month 
FY24 Earnings Conference Call. As a reminder, all participant lines will be in the listen-only 
mode and there will be an opportunity for you to ask questions after the presentation 
concludes. Should you need assistance during the conference call, please signal an operator by 
pressing star then zero on your touchtone phone. 
Please note that this conference is being recorded. I now hand the conference over to Mr. 
Devansh Dedhia. Thank you and over to you Mr. Dedhia. 
Devansh Dedhia: Thank you Yusuf. Good evening everyone. On behalf of Sportking India Limited, I extend a 
very warm welcome to all participants on the Q3 and 9-Month FY24 Financial Results 
Discussion Call. Today on the call we have Mr. Munish Avasthi, Chairman and Managing 
Director, Mr. Sandeep Sachdeva, Chief Financial