In [1]:
import csv
from docx import Document
import os
import fitz
import re

# Path to the existing dataset
# NOTE: this is an absolute path on a local Windows drive; load_dataset()
# opens it as a UTF-8 CSV, and check_suitability() reads the 'Word',
# 'Category', 'Language' and 'Age_Suitability' columns from each row.
dataset_file_path = 'D:/wamp64/www/Final_Project/explicit_words_dataset_with_age_suitability.csv'

# Load the whole dataset into memory (the file is re-read on every call)
def load_dataset(path=None):
    """Read the explicit-words CSV into a list of row dictionaries.

    Args:
        path: Optional location of the CSV file. When omitted, the
            module-level ``dataset_file_path`` is used, so existing
            zero-argument callers keep working.

    Returns:
        list: One dictionary per data row, keyed by the CSV header names.

    Raises:
        OSError: If the file cannot be opened.
    """
    if path is None:
        path = dataset_file_path
    # newline='' hands raw line endings to the csv module, which is what it
    # expects; otherwise line breaks inside quoted fields get rewritten.
    with open(path, 'r', encoding='utf-8', newline='') as f:
        return list(csv.DictReader(f))

# Function to analyze the extracted text
def check_suitability(extracted_text, dataset=None):
    """Find dataset words that occur as whole words in ``extracted_text``.

    Matching is case-insensitive. A dataset entry matches only when it is not
    directly preceded or followed by a word character, so ``kick`` does not
    match inside ``kickstart``.

    Args:
        extracted_text: The text to scan.
        dataset: Optional iterable of row dictionaries with the keys
            ``'Word'``, ``'Category'``, ``'Language'`` and
            ``'Age_Suitability'``. When omitted, the rows are obtained from
            ``load_dataset()``.

    Returns:
        A list of dictionaries (keys ``word``, ``category``, ``language``,
        ``age_suitability``) for every matching entry, in dataset order, or
        the string ``"No explicit words detected."`` when nothing matches.

    Raises:
        KeyError: If a row lacks one of the expected columns.
    """
    if dataset is None:
        dataset = load_dataset()
    unsuitable_words = []

    for row in dataset:
        # csv.DictReader yields None for cells missing from a short row.
        word = (row['Word'] or '').strip()
        if not word:
            # An empty pattern between boundaries would match almost any
            # text and report a bogus empty "word".
            continue
        # Lookarounds instead of \b: \b needs a word character on one side,
        # so entries that start or end with punctuation (e.g. "a$$") could
        # never match at the end of a sentence or before a space.
        pattern = r'(?<!\w)' + re.escape(word) + r'(?!\w)'
        if re.search(pattern, extracted_text, re.IGNORECASE):
            unsuitable_words.append({
                'word': word,
                'category': row['Category'],
                'language': row['Language'],
                'age_suitability': row['Age_Suitability']
            })

    if unsuitable_words:
        return unsuitable_words
    return "No explicit words detected."

# Function to extract text from PDF
def extract_pdf_text(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file, passed to ``fitz.open``.

    Returns:
        str: The ``get_text()`` output of each page joined together; an empty
        string when the document has no pages.
    """
    document = fitz.open(pdf_path)
    try:
        # Build the result with a single join instead of repeated +=.
        return "".join(
            document.load_page(page_num).get_text()
            for page_num in range(document.page_count)
        )
    finally:
        # Release the file handle even if a page fails to load or extract.
        document.close()

# Function to extract text from DOCX
def extract_docx_text(docx_path):
    """Return the text of a DOCX file, one paragraph per line.

    Only the paragraphs exposed by ``Document(...).paragraphs`` are read.

    Args:
        docx_path: Path of the DOCX file, passed to ``Document``.

    Returns:
        str: Paragraph texts joined with newlines; an empty string when the
        document has no paragraphs.
    """
    doc = Document(docx_path)
    # Separate paragraphs with a newline: without a separator the last word
    # of one paragraph fuses with the first word of the next, which hides
    # both from whole-word matching.
    return "\n".join(paragraph.text for paragraph in doc.paragraphs)

# Simulate file upload and text extraction
def file_upload_and_analysis(file_path):
    """Extract text from a file and check it against the word dataset.

    The file type is chosen from the (case-insensitive) extension:
    ``csv`` files contribute the text of all their cells, ``pdf`` files go
    through ``extract_pdf_text`` and ``docx``/``doc`` files through
    ``extract_docx_text``. Any other extension yields no text.

    Args:
        file_path: Path of the file to analyze.

    Returns:
        The result of ``check_suitability`` for the extracted text, or the
        string ``"Failed to extract text from the uploaded file."`` when no
        text could be extracted.
    """
    # splitext only inspects the final path component, so a dot in a
    # directory name is not mistaken for an extension; lower() accepts
    # names such as "REPORT.PDF".
    file_extension = os.path.splitext(file_path)[1].lower().lstrip('.')

    extracted_text = ""

    if file_extension == 'csv':
        with open(file_path, 'r', encoding='utf-8', newline='') as csvfile:
            rows = list(csv.reader(csvfile))
        # Previously the parsed rows were discarded, so every CSV upload
        # ended in the "failed to extract" message. Flatten all cells
        # (header included) into text so they are analyzed like other files.
        extracted_text = "\n".join(" ".join(row) for row in rows)

    elif file_extension == 'pdf':
        extracted_text = extract_pdf_text(file_path)

    elif file_extension in ('docx', 'doc'):
        # NOTE(review): legacy binary .doc files are routed to the DOCX
        # reader as before; confirm that reader can actually open them.
        extracted_text = extract_docx_text(file_path)

    if not extracted_text:
        return "Failed to extract text from the uploaded file."

    return check_suitability(extracted_text)

# Test the functionality with a sample file
# NOTE: these statements sit at module level, so the analysis of the
# hard-coded sample path below runs whenever this cell/module is executed.
file_path = 'D:/wamp64/www/Final_Project/media/uploads/Test1.pdf' # Replace with your file path
# result is whatever file_upload_and_analysis returns: the check_suitability
# outcome, or the "Failed to extract text..." message when no text was found.
result = file_upload_and_analysis(file_path)
print(result)


[{'word': 'putik', 'category': 'Mild', 'language': 'Filipino', 'age_suitability': 'All ages'}, {'word': 'darn', 'category': 'Mild', 'language': 'English', 'age_suitability': 'All ages'}, {'word': 'gago', 'category': 'Mild', 'language': 'Filipino', 'age_suitability': 'All ages'}, {'word': 'push', 'category': 'Mild', 'language': 'English', 'age_suitability': 'All ages'}, {'word': 'kick', 'category': 'Mild', 'language': 'English', 'age_suitability': 'All ages'}, {'word': 'itulak', 'category': 'Mild', 'language': 'Filipino', 'age_suitability': 'All ages'}, {'word': 'flirt', 'category': 'Mild', 'language': 'English', 'age_suitability': 'All ages'}, {'word': 'landi', 'category': 'Mild', 'language': 'Filipino', 'age_suitability': 'All ages'}, {'word': 'kiss', 'category': 'Mild', 'language': 'English', 'age_suitability': 'All ages'}, {'word': 'halik', 'category': 'Mild', 'language': 'Filipino', 'age_suitability': 'All ages'}, {'word': 'crush', 'category': 'Mild', 'language': 'English', 'age_su