# 1. Introduction 📝

# 2. EDA and Preprocessing 📊

In [None]:
# Mount Google Drive so that its files are reachable under /content/drive.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [39]:
# Github repository cloning
from google.colab import userdata # Import the userdata module
GIT_TOKEN = userdata.get('GITHUB_TOKEN') # Or use secrets manager in Colab
GIT_USERNAME = "M-Carre"
GIT_REPO = "OC-NeoBERT-POC"
REPO_NAME = "OC-NeoBERT-POC"

# --- Clean up ---
%cd /content
# Remove the existing repository directory if it exists
# The -rf flags mean recursive (delete subdirectories) and force (suppress prompts)
!rm -rf {REPO_NAME}
print(f"Removed existing directory ./{REPO_NAME} (if it existed).")

# Git clone
!git clone https://{GIT_TOKEN}@github.com/{GIT_USERNAME}/{GIT_REPO}.git

# It's good practice to navigate into your repository directory
%cd OC-NeoBERT-POC

/content
Removed existing directory ./OC-NeoBERT-POC (if it existed).
Cloning into 'OC-NeoBERT-POC'...
remote: Enumerating objects: 7, done.[K
remote: Counting objects: 100% (7/7), done.[K
remote: Compressing objects: 100% (5/5), done.[K
remote: Total 7 (delta 1), reused 4 (delta 1), pack-reused 0 (from 0)[K
Receiving objects: 100% (7/7), done.
Resolving deltas: 100% (1/1), done.
/content/OC-NeoBERT-POC


In [40]:
# Cell 2: (Optional but recommended) Configure Git user for this session
# This helps identify your commits correctly
!git config --global user.email "carre.mathis@proton.me"
!git config --global user.name "M-Carre"

In [41]:
# Core Libraries
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

# Text Processing with spaCy
import spacy
# You might need to download the English model if you haven't already
# Run in a new cell: !python -m spacy download en_core_web_sm
# Then load it:
# nlp = spacy.load('en_core_web_sm')

# Scikit-learn for traditional ML tasks and metrics
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder

# Hugging Face Transformers (will be used more heavily in later phases)
# import transformers # We can import specific modules later as needed

# Other utilities
import re # For regular expressions, if needed for cleaning
import collections # For Counter, if used for frequency distributions

# Plot aesthetics (optional): seaborn white-grid style and a 10x6 default figure size
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (10, 6)})

In [42]:
# Cell 4: Copy dataset from Drive to local (cloned) project directory and load it
#
# Steps performed by this cell:
#   1. Copy the CSV from Google Drive into <repo>/data/.
#   2. Load it into `df` (`df` is None whenever loading does not succeed).
#   3. Make sure the data folder is listed in .gitignore so it is never committed.

import pandas as pd
import shutil # For copying files
import os


def ensure_gitignore_entry(gitignore_path, pattern, entry_text):
    """Append ``entry_text`` to a .gitignore unless ``pattern`` is already listed.

    The check compares ``pattern`` against each whole (stripped) line of the
    file, so a different entry that merely contains it (e.g. ``metadata/``
    when looking for ``data/``) does not count as already present.

    Args:
        gitignore_path: Path of the .gitignore file; created if it is missing.
        pattern: The ignore pattern to look for, e.g. ``'data/'``.
        entry_text: The exact text appended when the pattern is not listed.

    Returns:
        True if ``entry_text`` was appended, False if the pattern was already there.
    """
    # 'a+' creates the file when absent and always writes at the end of the file.
    with open(gitignore_path, 'a+') as f:
        f.seek(0) # Append mode starts at the end; rewind to read existing content
        listed_patterns = {line.strip() for line in f.read().splitlines()}
        if pattern in listed_patterns:
            return False
        f.write(entry_text)
        return True


# --- Configuration ---
# Path to the dataset on your Google Drive
drive_dataset_path = '/content/drive/MyDrive/Colab Notebooks/OC/P9/Data/DBP_wiki_data.csv'

# Relative path where you want to store the dataset within your cloned project
# Example: Create a 'data' folder inside your 'OC-NeoBERT-POC' repository
local_project_data_folder = 'data' # This folder will be inside OC-NeoBERT-POC
local_dataset_filename = 'DBP_wiki_data.csv'
local_dataset_path = os.path.join(local_project_data_folder, local_dataset_filename)

# --- Ensure local data directory exists ---
# This assumes you are already in the root of your cloned repository (e.g., /content/OC-NeoBERT-POC)
if not os.path.exists(local_project_data_folder):
    os.makedirs(local_project_data_folder)
    print(f"Created local directory: {local_project_data_folder}")

# --- Copy the file from Drive to the local project directory ---
# Copy errors are reported but not re-raised: a copy left by a previous run can
# still be loaded below.
try:
    shutil.copy(drive_dataset_path, local_dataset_path)
    print(f"Dataset copied from Google Drive to: {local_dataset_path}")
except FileNotFoundError:
    print(f"ERROR: Source file not found on Google Drive at {drive_dataset_path}")
    print("Please ensure the file exists and the path is correct.")
except Exception as e:
    print(f"An error occurred while copying the file: {e}")

# --- Load the dataset using the local relative path ---
# Initialise first so `df` is None on EVERY failure path (missing file or a
# read error) instead of being undefined or stale from an earlier run.
df = None
if os.path.exists(local_dataset_path):
    try:
        df = pd.read_csv(local_dataset_path)
        print("Dataset loaded successfully from local project path!")
        print(f"Shape of the dataset: {df.shape}")
        print("First 5 rows of the dataset:")
        print(df.head())
    except Exception as e:
        print(f"An error occurred while loading the dataset from {local_dataset_path}: {e}")
else:
    print(f"ERROR: Dataset not found at local project path {local_dataset_path} after attempting copy.")

# --- (Important) Add the local_project_data_folder to .gitignore ---
# You typically don't want to commit large data files to your Git repository.
# This step creates/appends to a .gitignore file in your project's root.
# It is safe to re-run: the entry is only written when it is not listed yet.
gitignore_path = '.gitignore'
gitignore_pattern = local_project_data_folder + '/'
entry_to_add = f"\n# Ignore large data files\n{gitignore_pattern}\n"

try:
    if ensure_gitignore_entry(gitignore_path, gitignore_pattern, entry_to_add):
        print(f"Added '{local_project_data_folder}/' to .gitignore")
    else:
        print(f"'{local_project_data_folder}/' already in .gitignore")
except Exception as e:
    print(f"An error occurred while updating .gitignore: {e}")

Created local directory: data
Dataset copied from Google Drive to: data/DBP_wiki_data.csv
Dataset loaded successfully from local project path!
Shape of the dataset: (342781, 6)
First 5 rows of the dataset:
                                                text     l1            l2  \
0  The 1994 Mindoro earthquake occurred on Novemb...  Event  NaturalEvent   
1  The 1917 Bali earthquake occurred at 06:50 loc...  Event  NaturalEvent   
2  The 1941 Colima earthquake occurred on April 1...  Event  NaturalEvent   
3  The 1983 Coalinga earthquake occurred on May 2...  Event  NaturalEvent   
4  The 2013 Bushehr earthquake occurred with a mo...  Event  NaturalEvent   

           l3                 wiki_name  word_count  
0  Earthquake   1994_Mindoro_earthquake          59  
1  Earthquake      1917_Bali_earthquake          68  
2  Earthquake    1941_Colima_earthquake         194  
3  Earthquake  1983_Coalinga_earthquake          98  
4  Earthquake   2013_Bushehr_earthquake          61  
'data/'

# 3. Baseline Models 📉

# 4. NeoBERT Implementation (New Model) ✨

# 5. Results Comparison and Analysis 📈

# GitHub Push 🖥️

In [44]:
# Show the current branch and any uncommitted changes in the repository
!git status

On branch main
Your branch is up to date with 'origin/main'.

nothing to commit, working tree clean


In [None]:
# (End of Session): Check Git status
# This shows you what files have changed
!git status

# Add all new and modified files in the current directory and subdirectories
!git add .

# Replace "Your detailed commit message" with a meaningful description of your changes
COMMIT_MESSAGE = ""
!git commit -m "{COMMIT_MESSAGE}"

# Push changes to GitHub
!git push origin main

On branch main
Your branch is up to date with 'origin/main'.

Changes not staged for commit:
  (use "git add <file>..." to update what will be committed)
  (use "git restore <file>..." to discard changes in working directory)
	[31mmodified:   .gitignore[m

no changes added to commit (use "git add" and/or "git commit -a")
[main 7ce8158] GitHub Setup
 1 file changed, 3 insertions(+), 1 deletion(-)
Enumerating objects: 5, done.
Counting objects: 100% (5/5), done.
Delta compression using up to 2 threads
Compressing objects: 100% (3/3), done.
Writing objects: 100% (3/3), 342 bytes | 342.00 KiB/s, done.
Total 3 (delta 1), reused 0 (delta 0), pack-reused 0
remote: Resolving deltas: 100% (1/1), completed with 1 local object.[K
To https://github.com/M-Carre/OC-NeoBERT-POC.git
   157f80e..7ce8158  main -> main
