In [3]:
# Install required libraries
import json
import os

import nltk
import pytesseract
import requests
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.tokenize import word_tokenize, sent_tokenize
from PIL import Image

In [4]:
# Location of the Tesseract executable. The TESSERACT_CMD environment variable
# takes precedence; otherwise fall back to the usual Windows install location.
TESSERACT_CMD = os.environ.get(
    'TESSERACT_CMD', r'C:/Program Files/Tesseract-OCR/tesseract.exe'
)

# Only override pytesseract's command when that file really exists. Assigning a
# path that is not present would make every OCR call fail on other machines;
# skipping the assignment leaves pytesseract's own default command in place.
if os.path.isfile(TESSERACT_CMD):
    pytesseract.pytesseract.tesseract_cmd = TESSERACT_CMD

# Load the image from file
image_path = 'test2.jpg'  # Replace with your image file path
image = Image.open(image_path)

In [5]:
# Run OCR on the loaded image, then show the recognised text under a heading
extracted_text = pytesseract.image_to_string(image)
print("Extracted Text:", extracted_text, sep="\n")

Extracted Text:
Ww) NITTE | NMAM INSTITUTE

cnvesisteninn | OF TECHNOLOGY

Congratulations on your placement
in

ALLEGION

Mr. Saakshar Sunil Shetty Ms. Sushma S. Nayak

Department of Mechanical Engineering

apply.nitte.edu.in

www.nmamit.nitteeduin +91 95131 88844 Followuson @ ] OO



In [6]:
def preprocess_text(text):
    """
    Tokenize `text`, drop English stopwords and count the remaining tokens.

    Each stage is printed as it completes: the sentence list, the word list,
    the stopword-filtered word list, and the ten most common filtered tokens.

    Args:
        text: The raw string to analyse.

    Returns:
        dict with three keys:
            "sentences"      - result of sent_tokenize(text)
            "filtered_words" - word tokens whose lowercase form is not an
                               English stopword (original casing is kept)
            "freq_dist"      - FreqDist built from the filtered tokens
    """
    # Stage 1: sentence segmentation
    sentence_list = sent_tokenize(text)
    print("\nTokenized Sentences:", sentence_list, sep="\n")

    # Stage 2: word-level tokens
    token_list = word_tokenize(text)
    print("\nTokenized Words:", token_list, sep="\n")

    # Stage 3: stopword removal (membership is tested on the lowercased token)
    english_stops = set(stopwords.words('english'))
    kept_tokens = []
    for token in token_list:
        if token.lower() in english_stops:
            continue
        kept_tokens.append(token)
    print("\nFiltered Words (Stopwords Removed):", kept_tokens, sep="\n")

    # Stage 4: frequency table, reporting the top ten entries
    frequencies = FreqDist(kept_tokens)
    print("\nWord Frequencies:")
    for token, count in frequencies.most_common(10):
        print(f"{token}: {count}")

    result = {
        "sentences": sentence_list,
        "filtered_words": kept_tokens,
        "freq_dist": frequencies
    }
    return result



In [7]:
# Run the tokenize / stopword-filter / frequency pipeline on the OCR text produced
# by the earlier cell. preprocess_text prints every stage itself as it runs.
processed_data = preprocess_text(extracted_text)
# Also echo the returned dict (keys: "sentences", "filtered_words", "freq_dist").
print(processed_data)


Tokenized Sentences:
['Ww) NITTE | NMAM INSTITUTE\n\ncnvesisteninn | OF TECHNOLOGY\n\nCongratulations on your placement\nin\n\nALLEGION\n\nMr. Saakshar Sunil Shetty Ms. Sushma S. Nayak\n\nDepartment of Mechanical Engineering\n\napply.nitte.edu.in\n\nwww.nmamit.nitteeduin +91 95131 88844 Followuson @ ] OO']

Tokenized Words:
['Ww', ')', 'NITTE', '|', 'NMAM', 'INSTITUTE', 'cnvesisteninn', '|', 'OF', 'TECHNOLOGY', 'Congratulations', 'on', 'your', 'placement', 'in', 'ALLEGION', 'Mr.', 'Saakshar', 'Sunil', 'Shetty', 'Ms.', 'Sushma', 'S.', 'Nayak', 'Department', 'of', 'Mechanical', 'Engineering', 'apply.nitte.edu.in', 'www.nmamit.nitteeduin', '+91', '95131', '88844', 'Followuson', '@', ']', 'OO']

Filtered Words (Stopwords Removed):
['Ww', ')', 'NITTE', '|', 'NMAM', 'INSTITUTE', 'cnvesisteninn', '|', 'TECHNOLOGY', 'Congratulations', 'placement', 'ALLEGION', 'Mr.', 'Saakshar', 'Sunil', 'Shetty', 'Ms.', 'Sushma', 'S.', 'Nayak', 'Department', 'Mechanical', 'Engineering', 'apply.nitte.edu.in', 

In [9]:
import requests
import json
import os
from dotenv import load_dotenv

# Load environment configuration before GEMINI_API_KEY is read with os.getenv
# further down. NOTE(review): presumably this picks the key up from a local
# .env file kept out of version control - confirm where the file lives.
load_dotenv()

def summarize_with_gemini(api_key, extracted_text, custom_prompt,
                          model="gemini-pro", timeout=30):
    """
    Send text plus an instruction prompt to Google's Gemini API and return the reply.

    The function never raises for network or API problems; every outcome is
    reported through the returned dict.

    Args:
        api_key: Gemini API key. A missing/empty key is rejected up front
            without contacting the API.
        extracted_text: The text to analyse (e.g. OCR output).
        custom_prompt: Instructions telling the model what to do with the text.
        model: Model name inserted into the endpoint path. Defaults to the
            model the notebook originally targeted.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        dict: {"success": True, "summary": <generated text>} on success,
              {"success": False, "error": <message>} otherwise.
    """
    # Fail fast instead of sending a request with the literal key "None".
    if not api_key:
        return {
            "success": False,
            "error": "API key is missing"
        }

    # The key is deliberately NOT part of the URL: connection errors include
    # the URL in their message, which would leak the key into the returned
    # error string (and any logs). It is sent as a request header instead.
    endpoint = f"https://generativelanguage.googleapis.com/v1/models/{model}:generateContent"

    # Construct the payload according to Gemini API specifications
    payload = {
        "contents": [{
            "parts": [{
                "text": f"Instructions: {custom_prompt}\n\nText to analyze: {extracted_text}"
            }]
        }],
        "generationConfig": {
            "temperature": 0.7,
            "maxOutputTokens": 300,
            "topP": 0.8,
            "topK": 40
        }
    }

    headers = {
        "Content-Type": "application/json",
        "x-goog-api-key": api_key
    }

    try:
        # A timeout is required; without one requests can block indefinitely.
        response = requests.post(endpoint, headers=headers, json=payload, timeout=timeout)
    except requests.exceptions.RequestException as e:
        return {
            "success": False,
            "error": f"Request failed: {str(e)}"
        }

    if response.status_code != 200:
        # Error bodies are not guaranteed to be JSON objects (e.g. an HTML page
        # from a proxy), so fall back to a generic message when parsing fails.
        try:
            error_message = response.json().get('error', {}).get('message', 'Unknown error occurred')
        except (ValueError, AttributeError):
            error_message = 'Unknown error occurred'
        return {
            "success": False,
            "error": f"API request failed with status {response.status_code}: {error_message}"
        }

    # Walk straight to the generated text; any missing key, empty list, wrong
    # container type or undecodable body means the response is not usable.
    try:
        summary = response.json()['candidates'][0]['content']['parts'][0]['text']
    except (ValueError, KeyError, IndexError, TypeError):
        return {
            "success": False,
            "error": "Unexpected response structure"
        }

    return {
        "success": True,
        "summary": summary
    }

# Demo run: OCR the loaded image and ask Gemini whether the text is meaningful
if __name__ == "__main__":
    # The key is read from the GEMINI_API_KEY environment variable
    api_key = os.environ.get('GEMINI_API_KEY')

    # Text to analyse and the question to ask about it
    extracted_text = pytesseract.image_to_string(image)
    custom_prompt = "does this text mean anything?"

    # Query the model
    result = summarize_with_gemini(api_key, extracted_text, custom_prompt)

    # Report the failure case first, otherwise show the generated summary
    if not result.get("success", False):
        print("Error:", result.get("error", "An unknown error occurred"))
    else:
        print("Summary:", result["summary"])

Summary: Yes, this text means something. It is a placement announcement for two students, Mr. Saakshar Sunil Shetty and Ms. Sushma S. Nayak, from the Department of Mechanical Engineering at NITTE | NMAM INSTITUTE OF TECHNOLOGY. The announcement congratulates the students on their placement in ALLEGION.

The text also includes the following information:

* The website address for the institute: apply.nitte.edu.in
* The website address for the department: www.nmamit.nitteeduin
* The phone number for the institute: +91 95131 88844
* The social media handles for the institute: @ ] OO
