In [10]:
# Generic Libraries
from PIL import Image
import os
import pandas as pd
import numpy as np
import re,string,unicodedata

#Tesseract Library
import pytesseract

#Warnings
import warnings
warnings.filterwarnings("ignore")

#Garbage Collection
import gc

#Gensim Library for Text Processing
import gensim.parsing.preprocessing as gsp
from gensim import utils

#TextBlob Library (Sentiment Analysis)
from textblob import TextBlob, Word

#Plotting Libraries
import matplotlib.pyplot as plt
import seaborn as sns

#WordCloud Generator
from wordcloud import WordCloud,STOPWORDS

In [11]:
#Define Directory Path
# The archive root can be overridden with the OCR_ARCHIVE_DIR environment
# variable so the notebook is not tied to one machine; when the variable is
# unset the original local location is used.
DATA_DIR = os.environ.get(
    'OCR_ARCHIVE_DIR',
    '/Users/zwang/Documents/GitHub/Structured_Document_-Extracting/archive',
)
sample_images = os.path.join(DATA_DIR, 'Sample Data Files')
test_images = os.path.join(DATA_DIR, 'Dataset')

In [12]:
#Custom Function to Traverse the folder
def traverse(directory):
    """
    Print how many files sit directly inside `directory`.

    Only the top level of `directory` is inspected (the first entry produced
    by os.walk); files in sub-folders are not counted.

    Parameters
    ----------
    directory : str
        Folder to inspect.

    Raises
    ------
    FileNotFoundError
        If os.walk yields nothing for `directory` (for example because the
        folder does not exist).
    """
    top_level = next(os.walk(directory), None)
    if top_level is None:
        # Fail with a clear message instead of a bare StopIteration.
        raise FileNotFoundError(f'Directory not found or not readable: {directory}')
    path, dirs, files = top_level
    # Report the folder's own name rather than its parent's. normpath drops a
    # trailing separator so basename does not come back empty.
    fol_nm = os.path.basename(os.path.normpath(path))
    print(f'Number of files found in "{fol_nm}" : ',len(files))

In [13]:
# Report the file count for each image folder in turn
for image_folder in (sample_images, test_images):
    traverse(image_folder)

Number of files found in "archive" :  3
Number of files found in "archive" :  239


In [14]:
ex_txt = []   #list to store the extracted text

#Function to Extract Text
def TxtExtract(directory):
    """
    Run Tesseract OCR on every file under `directory` and record the results.

    The folder is walked recursively. For each file one `[file_name, text]`
    pair is appended to the module-level list `ex_txt`. The placeholder
    "blank" is stored instead of the text when OCR yields nothing but
    whitespace, or when the OCR call raises RuntimeError for that file.

    Parameters
    ----------
    directory : str
        Root folder containing the images to process.

    Returns
    -------
    None
        Results are accumulated in `ex_txt` as a side effect.
    """

    for subdir, dirs, files in os.walk(directory):
        for file in files:
            filepath = os.path.join(subdir, file)
            try:
                # The context manager releases the image's file handle as soon
                # as OCR for this file is finished.
                with Image.open(filepath) as img:
                    text = pytesseract.image_to_string(img, timeout=5)
            except RuntimeError as err:
                # pytesseract reports an exceeded timeout as RuntimeError (per
                # its README). Record the file as blank and keep going so one
                # slow image does not abort the whole batch.
                print(f"OCR failed for '{file}' ({err}); recorded as blank")
                text = ""
            # Output made only of whitespace / form feeds carries no text either.
            if not text.strip():
                ex_txt.append([file, "blank"])
            else:
                ex_txt.append([file, text])

    # Name the folder that was requested. This does not depend on the loop
    # variables, so it also works when the walk produced nothing.
    fol_nm = os.path.basename(os.path.normpath(directory))

    print(f"Text Extracted from the files in '{fol_nm}' folder & saved to list..")

In [15]:
# Start from an empty list so that re-running this cell does not append the
# same files a second time.
ex_txt.clear()

#Extracting Text from JPG files in Sample Image Folder
TxtExtract(sample_images)

#Extracting Text from JPG files in Dataset Folder
TxtExtract(test_images)

Text Extracted from the files in 'archive' folder & saved to list..
Text Extracted from the files in 'archive' folder & saved to list..


In [16]:
# Turn the collected [file name, text] records into a dataframe for analysis
ext_df = pd.DataFrame(data=ex_txt, columns=['FileName', 'Text'])

In [17]:
# Preview the first five rows of the dataframe
ext_df.head(5)

Unnamed: 0,FileName,Text
0,Sample_Negative.jpg,Of course gay men\n\ndress well. They didn't\n...
1,Sample_Positive.jpg,blank
2,Sample_Random.jpg,blank
3,Test519.jpg,(6) dreamstime.com ID 145823917 © Barrirret\n
4,Test243.jpg,LGBTQ FREEDOM\n= Middle Ea\n\n


In [18]:
# Number of rows, i.e. one per processed file
print("Total Records: ", len(ext_df))

Total Records:  242


In [19]:
# Ordered gensim pre-processing filters; proc_txt applies them first to last.
# (See the gensim.parsing.preprocessing docs for each filter's exact rules.)
processes = [
    gsp.strip_tags,                  # markup tags
    gsp.strip_punctuation,           # punctuation characters
    gsp.strip_multiple_whitespaces,  # runs of whitespace
    gsp.strip_numeric,               # digits
    gsp.remove_stopwords,            # stop words
    gsp.strip_short,                 # short words (gensim's default minimum length)
    gsp.stem_text,                   # stemming
]

# Create func to pre-process text
def proc_txt(txt):
    """
    Lower-case `txt`, convert it to unicode and pass it through every filter
    in `processes`, in list order. Returns the resulting string.
    """
    cleaned = utils.to_unicode(txt.lower())
    for step in processes:
        cleaned = step(cleaned)
    return cleaned

In [20]:
# Add a column holding the pre-processed version of each OCR text
ext_df['Text_Pr'] = ext_df['Text'].apply(proc_txt)

In [21]:
#Creating a separate dataframe with non-blank Text
# .copy() makes this an independent frame, so columns can be added to it later
# without writing into a slice of ext_df (chained-assignment pitfall).
ext_df_txt = ext_df[(  ext_df['Text_Pr'] != 'blank'  )].copy()

print("Total Records in Text Only Dataframe: ", ext_df_txt.shape[0])

Total Records in Text Only Dataframe:  129


In [22]:
# Creating a function to analyse the sentiment of the extracted text

def sentiment_analyzer(text):
    """
    Label `text` by the sign of its TextBlob polarity score.

    Parameters
    ----------
    text : str
        Text to score.

    Returns
    -------
    str
        "Positive" when the polarity is above zero, "Negative" when it is
        below zero, and "Neutral" otherwise (a polarity of exactly zero).
    """
    TB_sentiment_polarity = TextBlob(text).sentiment.polarity

    # Strict comparisons: a score of exactly 0 must fall through to "Neutral"
    # instead of being counted as "Positive".
    if TB_sentiment_polarity > 0:
        return "Positive"

    if TB_sentiment_polarity < 0:
        return "Negative"

    return "Neutral"

In [23]:
#Analysing the sentiment
# .assign builds a new dataframe, so the label column is never written into a
# filtered slice of ext_df (avoids the chained-assignment pitfall).
ext_df_txt = ext_df_txt.assign(
    Sentiment=ext_df_txt['Text_Pr'].apply(sentiment_analyzer)
)