# Re-running All the Code Again

In [None]:
# Step 1 — Install packages

In [None]:
# Install the Python packages for OCR and image handling.
# NOTE(review): paddlepaddle, paddleocr, pandas and numpy are installed here but
# are not imported anywhere in the visible notebook — confirm they are still needed.
!pip install paddlepaddle paddleocr opencv-python-headless pandas numpy pytesseract
# Refresh the apt package index, then install the system-level Tesseract OCR engine.
!apt-get update
!apt-get install -y tesseract-ocr


In [None]:
# Step 2 — Upload ZIP file

In [None]:
from google.colab import files
# Ask the user for a file via Colab's upload helper; the result is kept in `uploaded`.
# NOTE(review): Step 3 opens a hard-coded ZIP name rather than reading it from
# `uploaded`, so the uploaded file must carry exactly that name.
uploaded = files.upload()

In [None]:
# Step 3 — Extract ZIP

In [None]:
import zipfile
import os

# Archive uploaded in Step 2 and the directory it is unpacked into.
zip_path = "JS Bank Stock Image.zip"
extract_folder = "jsbank_images"

# Unpack the whole archive; the context manager closes it afterwards.
with zipfile.ZipFile(zip_path, mode='r') as archive:
    archive.extractall(path=extract_folder)

# Show what landed at the top level of the extraction directory.
top_level_entries = os.listdir(extract_folder)
print("Top-level files/folders:")
print(top_level_entries)


In [None]:
# Step 4 — List all images inside the inner folder

In [None]:
# Folder inside the extracted archive that holds the images.
inner_folder = os.path.join(extract_folder, "JS Bank Stock Image Dataset YOLO Model")
# os.listdir() returns entries in arbitrary order; sort them so the batch OCR
# and the DOCX export process the images in a reproducible sequence.
image_files = sorted(os.listdir(inner_folder))
print("Images found:")
for f in image_files:
    print(f)


In [None]:
# Step 5 — Load a sample image

In [None]:
import cv2
from matplotlib import pyplot as plt

# Read one known image from the dataset folder to sanity-check the extraction.
image_path = os.path.join(inner_folder, "1763746093882.jpeg")
img = cv2.imread(image_path)

# A failed read leaves `img` as None, so branch on that.
if img is not None:
    print("✅ Image loaded successfully.")
    # Convert BGR to RGB before handing the array to matplotlib.
    img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    plt.figure(figsize=(12, 10))
    plt.imshow(img_rgb)
    plt.axis("off")
    plt.show()
else:
    print("❌ Error: Cannot load the image!")


In [None]:
# Step 6 — Run OCR using pytesseract

In [None]:
import pytesseract
# Fail early with a readable message: if Step 5 could not load the sample
# image, `img` is None and cv2.cvtColor would raise an opaque OpenCV error.
if img is None:
    raise FileNotFoundError(f"Cannot run OCR: image could not be loaded from {image_path}")
# OCR is run on a grayscale copy of the sample image.
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
text = pytesseract.image_to_string(gray)
print("---- Extracted Text ----")
print(text)


In [None]:
# Step 7 — OCR all images in the folder (Step 8 needs the resulting all_texts)

In [None]:
# Map each readable image file name to the text OCR finds in it.
all_texts = {}

for fname in image_files:
    path = os.path.join(inner_folder, fname)
    img = cv2.imread(path)
    # Only entries OpenCV can decode are OCR'd; anything else is skipped.
    if img is not None:
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        text = pytesseract.image_to_string(gray)
        all_texts[fname] = text

# Dump every result under a header naming its source file.
for image_name, ocr_text in all_texts.items():
    print(f"--- {image_name} ---")
    print(ocr_text)
    print()


In [None]:
# Step 8 — Clean OCR text and save it to a DOCX document

In [None]:
!pip install python-docx
from docx import Document
import re
# Patterns used to tidy each OCR result before it is written out.
non_ascii_run = re.compile(r'[^\x00-\x7F]+')
whitespace_run = re.compile(r'\s+')

# Build a Word document with one heading, one paragraph and one page break
# per OCR'd image.
doc = Document()
for fname, text in all_texts.items():
    # Non-ASCII runs become a space, then all whitespace runs are squeezed
    # to single spaces and the ends trimmed.
    cleaned = whitespace_run.sub(' ', non_ascii_run.sub(' ', text)).strip()
    doc.add_heading(f"Image: {fname}", level=1)
    doc.add_paragraph(cleaned)
    doc.add_page_break()
doc.save("Extracted_Texts.docx")
print("✅ DOCX document created: Extracted_Texts.docx")


# Textual Analysis (Natural Language Processing)

In [None]:
# Load Extracted Document (DOCX)

In [None]:
from docx import Document

# Re-open the DOCX written in Step 8 and flatten it back to plain text.
doc = Document("Extracted_Texts.docx")

# One entry per paragraph object in the document, joined with newlines.
full_text = [para.text for para in doc.paragraphs]
text = "\n".join(full_text)

print(text)


In [None]:
# Basic Cleaning

In [None]:
import re

def clean_text(text):
    """Normalise OCR output into a single tidy line of text.

    The substitutions run in a fixed order: drop "Image: <digits>.<ext>"
    headings (jpg/jpeg/png, case-insensitive), turn pipe-like characters into
    spaces, blank out every character outside the allowed set (ASCII
    letters/digits, Latin-1 accented letters, ``.,!?;:%-`` and whitespace),
    remove runs of two or more hyphens, reduce runs of punctuation to their
    first character, and finally squeeze whitespace and trim both ends.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned string.
    """
    # (pattern, replacement, flags) triples, applied strictly in sequence.
    passes = (
        (r'Image:\s*\d+\.(?:jpg|jpeg|png)', '', re.IGNORECASE),
        (r'[|¬¦]+', ' ', 0),
        (r"[^A-Za-z0-9À-ÖØ-öø-ÿ.,!?;:%\-\s]", " ", 0),
        (r'[-]{2,}', ' ', 0),
        (r'[.,!?;:]{2,}', lambda m: m.group(0)[0], 0),
        (r'\s+', ' ', 0),
    )
    result = text
    for pattern, replacement, flags in passes:
        result = re.sub(pattern, replacement, result, flags=flags)
    return result.strip()
# Clean the text read back from the DOCX; the analysis cells below work on `cleaned_text`.
cleaned_text = clean_text(text)

In [None]:
cleaned_text

In [None]:
# Tokenization + Stopword Removal

In [None]:
import nltk
# Fetch the tokenizer data and the stop-word list used below.
for resource in ("punkt", "punkt_tab", "stopwords"):
    nltk.download(resource)

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
# Lower-case before tokenising so tokens compare equal to the lower-case stop words.
tokens = word_tokenize(cleaned_text.lower())
stops = set(stopwords.words("english"))
# Keep purely alphabetic tokens that are not English stop words.
filtered_tokens = [tok for tok in tokens if tok.isalpha() and tok not in stops]

In [None]:
filtered_tokens


In [None]:
# Named Entity Recognition (NER)

In [None]:
import spacy
from collections import defaultdict
# Load the small English spaCy pipeline and parse the cleaned text once.
# (The cell previously contained this whole sequence twice, so the model was
# loaded, the text parsed and the report printed two times.)
nlp = spacy.load("en_core_web_sm")
doc = nlp(cleaned_text)

# Group the distinct entity strings under their entity label.
entities = defaultdict(set)
for ent in doc.ents:
    entities[ent.label_].add(ent.text)

print("\n\n==== NAMED ENTITIES GROUPED ====")
for label, vals in entities.items():
    print(f"\n{label}:")
    # Sorted so the listing under each label is stable between runs.
    for v in sorted(vals):
        print("  -", v)

In [None]:
# Sentiment Analysis

In [None]:
!pip install textblob
from textblob import TextBlob
# Whole-document sentiment: print its polarity and subjectivity scores.
sentiment = TextBlob(cleaned_text).sentiment
for caption, score in (
    ("Polarity:", sentiment.polarity),
    ("Subjectivity:", sentiment.subjectivity),
):
    print(caption, score)

In [None]:
# Keyword Extraction (TF-IDF)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Vocabulary is capped at 20 terms, with English stop words removed.
vectorizer = TfidfVectorizer(max_features=20, stop_words='english')
tfidf = vectorizer.fit_transform([cleaned_text])
# get_feature_names_out() lists the vocabulary alphabetically; reorder it by
# the document's TF-IDF weights so the printed "top" keywords are actually
# ranked, highest weight first.
scores = tfidf.toarray().ravel()
keywords = vectorizer.get_feature_names_out()[scores.argsort()[::-1]]
print("Top Keywords:", keywords)

In [None]:
# Automatic Text Summarization

In [None]:
# Recompute the cleaned text so this cell can be run without the earlier cleaning cell.
cleaned_text = clean_text(text)

!pip install transformers --quiet
from transformers import pipeline
import math  # NOTE(review): not used in the visible part of this notebook

# Initialize summarizer
# Builds a "summarization" pipeline backed by the named DistilBART checkpoint.
summarizer = pipeline(
    "summarization",
    model="sshleifer/distilbart-cnn-12-6"
)

# Sentence splitter used by chunk_text
def _split_sentences(text):
    """Return the sentences of *text*, preferring NLTK's tokenizer.

    The tokenizer data is downloaded only when NLTK reports it missing. If NLTK
    is not installed, or its data still cannot be loaded, a simple split after
    ``.``, ``!`` or ``?`` followed by whitespace is used instead.
    """
    try:
        from nltk.tokenize import sent_tokenize
        try:
            return sent_tokenize(text)
        except LookupError:
            # Tokenizer data missing: fetch it, then retry. Both resource
            # names are requested because NLTK versions differ in which one
            # they look up.
            import nltk
            nltk.download("punkt", quiet=True)
            nltk.download("punkt_tab", quiet=True)
            return sent_tokenize(text)
    except (ImportError, LookupError):
        import re
        return [s for s in re.split(r'(?<=[.!?])\s+', text.strip()) if s]


# Function to split text into chunks for summarization
def chunk_text(text, max_len=1000):
    """
    Split text into chunks of roughly max_len characters at sentence boundaries.

    Sentences are packed greedily: a sentence joins the current chunk while the
    joined length (including the separating space) stays within max_len,
    otherwise it starts a new chunk. A single sentence longer than max_len is
    kept whole as its own chunk, so that chunk may exceed max_len.

    Args:
        text: Text to split.
        max_len: Target maximum chunk length in characters.

    Returns:
        List of non-empty chunk strings; an empty list for empty text.
    """
    chunks = []
    current = []      # sentences of the chunk being built
    current_len = 0   # length of " ".join(current)

    for sent in _split_sentences(text):
        sent = sent.strip()
        if not sent:
            continue
        projected = current_len + 1 + len(sent) if current else len(sent)
        # Only flush when there is something to flush; this is what prevents
        # an empty chunk when the very first sentence is already too long.
        if current and projected > max_len:
            chunks.append(" ".join(current))
            current = [sent]
            current_len = len(sent)
        else:
            current.append(sent)
            current_len = projected
    if current:
        chunks.append(" ".join(current))
    return chunks

# Split cleaned_text into manageable chunks
chunks = chunk_text(cleaned_text, max_len=1000)

# Generate summary for each chunk
summaries = []
for chunk in chunks:
    # Guard against empty chunks, which give the model nothing to summarise.
    if not chunk.strip():
        continue
    summary_chunk = summarizer(
        chunk,
        max_length=150,
        min_length=50,
        do_sample=False,
        # Chunks are bounded in characters, not tokens, and a single long
        # sentence can exceed the character bound; truncate the input rather
        # than overflow the model's input limit.
        truncation=True,
    )
    summaries.append(summary_chunk[0]["summary_text"])

# Combine chunk summaries into final summary
final_summary = " ".join(summaries)

print("\n=== FINAL SUMMARY ===\n")
print(final_summary)


In [None]:
# Sentiment Analysis with TextBlob and Visualization

In [None]:
!pip install textblob matplotlib seaborn --quiet
from textblob import TextBlob
import matplotlib.pyplot as plt
import seaborn as sns
# Split on full stops and drop blank fragments (e.g. the empty string after the
# final period), which would otherwise each add a spurious 0.0 score and
# inflate the neutral bar of the histograms.
sentences = [s.strip() for s in cleaned_text.split('.') if s.strip()]
polarities = []
subjectivities = []
for sent in sentences:
    # Evaluate the sentiment once and read both components from it.
    sent_sentiment = TextBlob(sent).sentiment
    polarities.append(sent_sentiment.polarity)
    subjectivities.append(sent_sentiment.subjectivity)

In [None]:
# Histogram (with KDE overlay) of the per-sentence polarity scores.
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(polarities, bins=20, kde=True, color='skyblue', ax=ax)
ax.set_title("Sentiment Polarity Distribution")
ax.set_xlabel("Polarity (-1 negative → 1 positive)")
ax.set_ylabel("Frequency")
plt.show()

In [None]:
# Histogram (with KDE overlay) of the per-sentence subjectivity scores.
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(subjectivities, bins=20, kde=True, color='salmon', ax=ax)
ax.set_title("Subjectivity Distribution")
ax.set_xlabel("Subjectivity (0 objective → 1 subjective)")
ax.set_ylabel("Frequency")
plt.show()

In [None]:
# Word Frequency Visualization (Bar Chart / Word Cloud)

In [None]:
!pip install wordcloud

In [None]:

from collections import Counter
from wordcloud import WordCloud
# Number of words shown in the chart; it also drives the title so the two
# cannot drift apart (the title said "Top 20" while 100 words were plotted).
TOP_N = 20
word_freq = Counter(filtered_tokens)
top_words = word_freq.most_common(TOP_N)
if top_words:
    # Transpose [(word, count), ...] into parallel tuples for the two axes.
    words, counts = zip(*top_words)
    plt.figure(figsize=(18,10))
    sns.barplot(x=list(words), y=list(counts), palette="viridis")
    plt.title(f"Top {TOP_N} Words in Extracted Text")
    plt.ylabel("Frequency")
    plt.xlabel("Words")
    plt.xticks(rotation=45)
    plt.show()
else:
    # zip(*[]) cannot be unpacked into two names, so skip the chart entirely.
    print("No words to plot: filtered_tokens is empty.")



In [None]:
# Word Cloud
# Configure the cloud first, then fill it from the token frequency table.
cloud = WordCloud(width=800, height=400, background_color="white")
wc = cloud.generate_from_frequencies(word_freq)
fig, ax = plt.subplots(figsize=(20, 10))
ax.imshow(wc, interpolation='bilinear')
ax.axis("off")
plt.show()

In [None]:
# Named Entity Recognition (NER) Visualization

In [None]:
!pip install spacy matplotlib seaborn --quiet
import spacy
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
# Parse the cleaned text and tally how often each entity label occurs.
nlp = spacy.load("en_core_web_sm")
doc = nlp(cleaned_text)
# One label per detected entity, in document order.
entities = [span.label_ for span in doc.ents]
entity_counts = Counter()
entity_counts.update(entities)

In [None]:

# Bar chart of how many entities were found per entity label.
entity_labels = list(entity_counts.keys())
entity_freqs = list(entity_counts.values())
fig, ax = plt.subplots(figsize=(20, 8))
sns.barplot(x=entity_labels, y=entity_freqs, palette="magma", ax=ax)
ax.set_title("Named Entity Types Count")
ax.set_ylabel("Frequency")
ax.set_xlabel("Entity Type")
plt.show()

In [None]:
# Combined Sentiment over Text (Line Plot)

In [None]:
import matplotlib.pyplot as plt
# Line plot of polarity against sentence position in the text.
fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(polarities, color='blue', marker='o', linestyle='-')
ax.set_title("Sentence-wise Polarity Trend")
ax.set_xlabel("Sentence Index")
ax.set_ylabel("Polarity (-1 negative → 1 positive)")
ax.grid(True)
plt.show()
