<a href="https://colab.research.google.com/github/MK316/Myapps/blob/main/TCE_textextraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Step 1: Mount Google Drive into the Colab runtime
from google.colab import drive

mount_point = '/content/drive'
drive.mount(mount_point)

In [None]:
# Step 2: Install required libraries
%%capture
!sudo apt-get install tesseract-ocr
!pip install pytesseract
!pip install pandas

In [None]:
import pytesseract
import pandas as pd
from PIL import Image
import os

# Step 3: Set the path to the Google Drive folder containing your images
image_folder_path = "/content/drive/My Drive/datafiles/TCE"  # Change this to your Google Drive folder

# Step 4: Make a list of image files (png, jpg, jpeg) in the folder.
# The extension check is case-insensitive, so '.PNG', '.Jpg', '.JPEG', ... are
# all accepted. The list is sorted because os.listdir() returns entries in
# arbitrary order, and the rows of the resulting table should be reproducible.
image_extensions = ('.png', '.jpg', '.jpeg')
image_file_list = sorted(
    f for f in os.listdir(image_folder_path)
    if f.lower().endswith(image_extensions)
)

# Step 5: Create an empty list to store extracted data
extracted_data = []

# Step 6: For each image file, extract text and collect [filename, text] rows
for image_file in image_file_list:
    image_path = os.path.join(image_folder_path, image_file)

    # Open the image in a context manager so its file handle is closed as soon
    # as OCR is done, instead of staying open for every image in the folder.
    with Image.open(image_path) as img:
        # Extract text from the image (English only)
        extracted_text = pytesseract.image_to_string(img, lang='eng')

    extracted_data.append([image_file, extracted_text])

# Step 7: Create a dataframe with 'Filename' and 'Text' columns
df = pd.DataFrame(extracted_data, columns=['Filename', 'Text'])

# Step 8: Save the dataframe to a CSV file in the Colab session storage (/content)
df.to_csv('/content/extracted_texts.csv', encoding='utf-8', index=False)

# Step 9: Display the dataframe. This has to be the last expression of the
# cell; a bare expression in the middle of a cell is not rendered.
df.head()


# Text data to process

In [None]:
import pandas as pd
import re

# Step 1: Read the CSV file from your Colab folder.
# dtype=str + keep_default_na=False keep every cell as text: an image with no
# recognised text is stored as an empty field, which pandas would otherwise
# load as a float NaN and which the regex-based cleaning cannot process.
df = pd.read_csv('/content/extracted_texts.csv', dtype=str, keep_default_na=False)

# Step 2: Define a function that strips bracketed/angle-bracketed spans, parenthesised
# spans containing numbers, non-ASCII characters and punctuation from a text
def clean_text(text):
    """Return a simplified, ASCII-only version of ``text``.

    The following are removed, in this order:

    1. anything inside square brackets, brackets included (single line only);
    2. anything inside angle brackets, brackets included (single line only);
    3. any parenthesised span that contains at least one digit, e.g.
       ``(Smith, 2020)``; parentheticals without digits are kept;
    4. all non-ASCII characters;
    5. every run of non-word characters, which is replaced by a single space.

    Leading and trailing whitespace is stripped from the result.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (for example the float NaN pandas uses
        for an empty CSV field, or None) is treated as "no text".

    Returns
    -------
    str
        The cleaned text; ``''`` when ``text`` is not a string.
    """
    # Empty CSV fields come back from pandas as float NaN; re.sub would raise
    # TypeError on them, so treat every non-string as empty text.
    if not isinstance(text, str):
        return ''

    # Remove text inside square brackets, including the brackets themselves
    text = re.sub(r'\[.*?\]', '', text)

    # Remove text inside angle brackets <>, including the brackets
    text = re.sub(r'<.*?>', '', text)

    # Remove parenthesised spans that contain a digit. The character class
    # excludes parentheses so a match can never run past a closing ')' into the
    # following text (a plain '.*?' would delete everything between
    # "(note) ... 5 ... (end)"). Newlines are excluded to stay within one line.
    text = re.sub(r'\([^()\n]*\d[^()\n]*\)', '', text)

    # Remove non-ASCII characters (typically broken OCR glyphs)
    text = re.sub(r'[^\x00-\x7F]+', '', text)

    # Replace every run of non-word characters (anything that is not a letter,
    # digit, or underscore) with a single space
    text = re.sub(r'\W+', ' ', text)

    return text.strip()

# Step 3: Run clean_text over every entry of 'Text' and keep the result in a new 'Cleantext' column
df = df.assign(Cleantext=df['Text'].apply(clean_text))

# Step 4: Write the table, now including 'Cleantext', to a new utf-8 CSV file
df.to_csv('/content/CleanTCE.csv', encoding='utf-8', index=False)

# Step 5: Preview the first rows of the cleaned table
df.head()


# Frequency list

In [None]:
import pandas as pd
import re
from collections import Counter

# Step 1: Load the cleaned CSV file.
# Use the same absolute /content path the rest of the notebook reads and writes,
# so this cell does not depend on the current working directory.
url = "/content/CleanTCE.csv"  # Replace with the correct path if your file lives elsewhere
df = pd.read_csv(url, encoding='utf-8-sig')

# Step 2: Combine all the text from the 'Cleantext' column into a single string
combined_text = " ".join(df['Cleantext'].dropna())  # Drop any NaN values if they exist, then combine

# Step 3: Save the combined text as 'tceall.txt'
with open('/content/tceall.txt', 'w', encoding='utf-8') as f:
    f.write(combined_text)

# Step 4: Clean the text and prepare it for frequency analysis
# Convert the text to lowercase, remove punctuation and split it into words
cleaned_text = re.sub(r'[^\w\s]', '', combined_text.lower())  # Remove punctuation and convert to lowercase
words = cleaned_text.split()  # Split text into individual words

# Step 5: Create a frequency list of the words
word_freq = Counter(words)  # Use Counter to count the frequency of each word

# Step 6: Convert the frequency list into a DataFrame
freq_df = pd.DataFrame(word_freq.items(), columns=['Word', 'Frequency'])

# Step 7: Sort the frequency list by the most frequent words
freq_df = freq_df.sort_values(by='Frequency', ascending=False)

# Step 8: Save the frequency list as 'TCEfreq.csv'
freq_df.to_csv('/content/TCEfreq.csv', index=False, encoding='utf-8')

# Display the top 10 most frequent words as a quick check
freq_df.head(10)


In [None]:
import pandas as pd

# Step 1: Load the existing frequency file 'TCEfreq.csv'.
# keep_default_na=False and dtype str for 'Word' keep every token as text:
# by default pandas parses tokens such as "null" or "nan" as missing values
# (float NaN), and taking the length of a float raises TypeError.
df = pd.read_csv(
    '/content/TCEfreq.csv',
    encoding='utf-8',
    keep_default_na=False,
    dtype={'Word': str},
)

# Step 2: Keep the rows whose 'Word' has more than 4 characters
# (vectorised string length instead of a per-row Python lambda)
df_long_words = df[df['Word'].str.len() > 4]

# Step 3: Save the filtered DataFrame as 'TCEfreq-long.csv'
df_long_words.to_csv('/content/TCEfreq-long.csv', index=False, encoding='utf-8')

# Step 4: Display the first few rows of the filtered DataFrame as a check
df_long_words.head()


In [None]:
# Number of entries in the 'Word' column of the long-word table
df_long_words['Word'].size

# Display a sentence with a keyword (Incomplete)

In [None]:
%%capture
# Install Gradio, which the sentence-search cell below imports.
# %%capture (kept on the first line of the cell, where cell magics must be)
# suppresses the installer output.
!pip install gradio

In [None]:
import pandas as pd
import re
import gradio as gr

# Step 1: Load the CSV file 'extracted_texts.csv'
df = pd.read_csv('/content/extracted_texts.csv', encoding='utf-8')

# Step 2: Combine all the text from the 'Text' column into a single string
# (NaN rows are dropped; astype(str) guards against cells parsed as numbers)
combined_text = " ".join(df['Text'].dropna().astype(str))

# The text comes from OCR and is hard-wrapped: collapse every run of whitespace
# (newlines, tabs, form feeds) into one space so that a sentence ending at the
# end of a line is still followed by a space, and results display on one line.
normalized_text = re.sub(r'\s+', ' ', combined_text).strip()

# Step 3: Split the text into sentences while keeping punctuation intact
# (split after '.', '!' or '?' followed by whitespace; drop empty pieces)
sentences = [s for s in re.split(r'(?<=[.!?])\s+', normalized_text) if s]

# Step 4: Define the function to search for sentences containing a specific word
def search_sentences(word):
    """Return every sentence in the module-level ``sentences`` list that contains ``word``.

    The match is case-insensitive and whole-word: ``word`` must not be directly
    preceded or followed by a word character. The input is treated as literal
    text, not as a regular expression, so entries such as ``C++`` or ``(`` are
    searched verbatim instead of raising ``re.error``.

    Parameters
    ----------
    word : str
        The word typed by the user. Surrounding whitespace is ignored.

    Returns
    -------
    str
        The matching sentences joined by blank lines, or an explanatory message
        when the input is empty or nothing matches.
    """
    query = (word or "").strip()
    if not query:
        # An empty pattern would match almost every sentence; ask for input instead.
        return "Please enter a word to search for."

    # Lookarounds rather than \b: \b only works next to word characters, so a
    # query ending in punctuation (e.g. "C++") could never match with it.
    pattern = re.compile(rf'(?<!\w){re.escape(query)}(?!\w)', re.IGNORECASE)
    matching_sentences = [sentence for sentence in sentences if pattern.search(sentence)]

    if matching_sentences:
        # Return matching sentences, joining them with a double newline for readability
        return '\n\n'.join(matching_sentences)

    # If no matches found, return this message
    return f"No sentences found containing the word: {query}"

# Step 5: Create the Gradio interface
# Components are created inside the Blocks context in the order written here:
# two Markdown blocks, the input box, the button, then the output box.
with gr.Blocks() as app:
    gr.Markdown("# Sentence Search Engine 🔍")
    gr.Markdown("Search for sentences containing a specific word from the text.")

    # Textbox for user to input the search word
    word_input = gr.Textbox(label="Enter a word to search for", placeholder="e.g., education")

    # Button to trigger the search
    search_button = gr.Button("Search Sentences")

    # Output area for displaying the results (read-only for the user)
    output = gr.Textbox(label="Sentences Containing the Word", interactive=False)

    # Connect the button to the search function: on click, the content of
    # word_input is passed to search_sentences and its return value is shown in output
    search_button.click(fn=search_sentences, inputs=[word_input], outputs=output)

# Step 6: Launch the Gradio app
app.launch()
