In [1]:
!pip install spacy tqdm
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m49.3 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import csv
import spacy
from tqdm import tqdm
from collections import defaultdict
import logging


In [4]:
# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

In [5]:
# Load the English NLP model
nlp = spacy.load("en_core_web_sm")

In [6]:
def extract_subject_action(description):
    doc = nlp(description)
    subject = next((token.text for token in doc if token.dep_ == "nsubj"), None)
    #action = next((token.text for token in doc if token.pos_ == "VERB"), None)
    #return subject, action
    return subject

In [7]:
def process_csv(input_file, output_file, batch_size=1000):
    total_processed = 0
    categories = defaultdict(lambda: defaultdict(int))

    with open(input_file, 'r', encoding='utf-8', errors='replace') as infile, \
         open(output_file, 'w', newline='', encoding='utf-8') as outfile:

        reader = csv.reader(infile)
        writer = csv.writer(outfile)
       # writer.writerow(['Image URL', 'Description', 'Subject', 'Action'])  # Write header
        writer.writerow(['Image URL', 'Description', 'Subject'])  # Write header

        batch = []
        for row in tqdm(reader, desc="Processing rows"):
            if len(row) >= 2:
                url, description = row[0], row[1]
                batch.append((url, description))

                if len(batch) >= batch_size:
                    process_batch(batch, writer, categories)
                    total_processed += len(batch)
                    batch = []
            else:
                logging.warning(f"Skipping malformed row: {row}")

        # Process any remaining rows
        if batch:
            process_batch(batch, writer, categories)
            total_processed += len(batch)

    logging.info(f"Total records processed: {total_processed}")
    return total_processed, categories

In [8]:
def process_batch(batch, writer, categories):
    for url, description in batch:
        #subject, action = extract_subject_action(description)
        subject = extract_subject_action(description)
        #writer.writerow([url, description, subject, action])
        writer.writerow([url, description, subject])
        #if subject and action:
        #    categories[subject][action] += 1
        if subject:
            #categories[subject] += 1
            categories[subject] = categories.get(subject, 0) + 1 # Increment the count for the subject. If the subject is not found, initialize it with 0 and then increment.

In [9]:
def generate_summary(categories, total_records):
    print(f"\nTotal records processed: {total_records}")
    #print("\nTop 10 subjects and their top 3 actions:")

    #for subject, actions in sorted(categories.items(), key=lambda x: sum(x[1].values()), reverse=True)[:10]:
    #    print(f"\nSubject: {subject}")
    #    for action, count in sorted(actions.items(), key=lambda x: x[1], reverse=True)[:3]:
    #        print(f"  - Action: {action}, Count: {count}")


In [10]:
def main(input_file, output_file):
    total_processed, categories = process_csv(input_file, output_file)
    print(f"Results have been written to {output_file}")
    generate_summary(categories, total_processed)

if __name__ == "__main__":
    input_file = "/content/drive/My Drive/tgif-v1.0.csv"
    #input_file = "/content/drive/My Drive/tgif-v10V2.csv"
    output_file = "/content/drive/My Drive/categorization_subject.csv"
    main(input_file, output_file)


Processing rows: 125782it [19:01, 110.18it/s]


Results have been written to /content/drive/My Drive/categorization_subject_action.csv

Total records processed: 125782
