### Importing the libraries

In [10]:
import threading
import urllib.request
import io
from tika import parser
import time
import pandas as pd
from functools import partial

### Reading the data

In [None]:
# Load the two sheets of the workbook, in order: training data first, then test data.
train, test = (
    pd.read_excel("mle-1.xlsx", sheet_name=sheet)
    for sheet in ("train_data", "test_data")
)
# Last expression of the cell: preview the first rows of the training frame.
train.head()

In [4]:
# Display the (rows, columns) dimensions of the training frame.
train.shape

(1895, 2)

### Distribution of target column

In [5]:
# Count how many training rows belong to each value of the target column.
train["target_col"].value_counts()

target_col
lighting    500
fuses       500
cable       500
others      395
Name: count, dtype: int64

### Processing the PDF URLs

In [11]:
class TimeoutError(Exception):
    """Raised when a monitored function call runs past its time limit.

    Note: this class shadows Python's built-in ``TimeoutError`` for every
    piece of code that runs after this definition. It derives directly from
    ``Exception``, not from ``OSError`` like the built-in does.
    """

def monitor_execution_time(func, args=(), kwargs=None, timeout=120):
    """Run ``func(*args, **kwargs)`` in a worker thread, bounded by a time limit.

    Parameters
    ----------
    func : callable
        The function to execute.
    args : tuple, optional
        Positional arguments forwarded to ``func``.
    kwargs : dict, optional
        Keyword arguments forwarded to ``func``. ``None`` (the default) means
        "no keyword arguments".
    timeout : float, optional
        Seconds to wait for ``func`` to finish. Defaults to 120.

    Returns
    -------
    object
        Whatever ``func`` returned.

    Raises
    ------
    TimeoutError
        If the worker is still running once ``timeout`` seconds have elapsed.
    Exception
        Any ``Exception`` raised by ``func`` is re-raised in the calling thread.

    Notes
    -----
    Python threads cannot be forcibly stopped, so a timed-out worker keeps
    running in the background; only the caller stops waiting for it.
    """
    # A None default avoids the shared-mutable-default-argument pitfall.
    call_kwargs = {} if kwargs is None else kwargs

    # Single-slot containers used to hand the outcome back from the worker.
    result = []
    error = []

    def target():
        try:
            result.append(func(*args, **call_kwargs))
        except Exception as e:
            error.append(e)

    # daemon=True: an abandoned (timed-out) worker must not keep the
    # interpreter alive at shutdown while it is stuck in a slow call.
    thread = threading.Thread(target=target, daemon=True)
    thread.start()
    thread.join(timeout)

    if thread.is_alive():
        thread.join(1)  # Give the thread a second to clean up
        raise TimeoutError(f"Error: Function execution exceeded {timeout} seconds")

    if error:
        raise error[0]

    return result[0]

def getData(URL, timeout=120):
    """Download the document at ``URL`` and return the text Tika extracts from it.

    Parameters
    ----------
    URL : str
        Address of the document to fetch.
    timeout : float, optional
        Socket timeout in seconds handed to ``urlopen`` so that a stalled
        connection eventually fails instead of blocking forever. Defaults
        to 120.

    Returns
    -------
    str
        The ``'content'`` entry of Tika's parse result. If anything goes
        wrong (download, parsing, ...), a string of the form
        ``"Error: <message>"`` is returned instead of raising.
        NOTE(review): ``'content'`` may be ``None`` when Tika extracts no
        text -- confirm against the Tika client's behavior.
    """
    try:
        # Set up headers to mimic a browser request
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}

        # Fetch the document; the `with` block guarantees the HTTP response
        # (and its underlying socket) is closed even if reading fails.
        req = urllib.request.Request(URL, headers=headers)
        with urllib.request.urlopen(req, timeout=timeout) as response:
            remote_file = response.read()

        # Wrap the downloaded bytes in a file-like object for Tika
        remote_file_bytes = io.BytesIO(remote_file)

        # Parse the content with Tika and keep only the extracted text
        parsed_pdf = parser.from_buffer(remote_file_bytes)
        return parsed_pdf['content']
    except Exception as e:
        # Best-effort by design: report the failure in-band as a string.
        return f"Error: {str(e)}"
    
def process_pdf_with_timeout(url, timeout=120):
    """Extract the text behind ``url``, never raising an ``Exception``.

    Delegates to ``monitor_execution_time(getData, ...)`` and converts every
    failure into an ``"Error: ..."`` string.

    Parameters
    ----------
    url : str
        Address of the document to process.
    timeout : float, optional
        Time limit in seconds forwarded to ``monitor_execution_time``.
        Defaults to 120.

    Returns
    -------
    object
        The value produced by ``getData`` on success; otherwise a string
        starting with ``"Error: "`` describing the timeout or the exception.
    """
    try:
        extracted = monitor_execution_time(getData, args=(url,), timeout=timeout)
    except TimeoutError:
        # The worker did not finish within the allotted time.
        outcome = f"Error: Processing exceeded {timeout} seconds"
    except Exception as exc:
        # Any other failure is reported with its message.
        outcome = f"Error: {str(exc)}"
    else:
        outcome = extracted
    return outcome

In [None]:
# Fetch and parse every datasheet link, training frame first, then test frame.
# Each frame gets its 'text' column as soon as its own links are processed.
for frame in (train, test):
    frame['text'] = frame['datasheet_link'].apply(process_pdf_with_timeout)

### Saving the results for further processing

In [None]:
# Persist both frames (including the extracted 'text' column) as Excel files.
# NOTE(review): Excel limits the number of characters per cell; confirm that
# long extracted texts survive the round trip through .xlsx.
for frame, destination in ((train, "train.xlsx"), (test, "test.xlsx")):
    frame.to_excel(destination)