In [35]:
import pandas as pd
from langdetect import detect
import os
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import spacy

In [52]:
# NLTK resources: tokenizer models, the stop-word corpus, and WordNet
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Stop words listed here are exempt from stop-word removal, so they remain in
# the text after preprocessing.
sentiment_bearing_stopwords = ['not', 'no', 'nor', 'never', 'yes', 'should', 'could', 'would']
stop_words = set(stopwords.words('english'))
stop_words_without_sentiment = list(
    filter(lambda word: word not in sentiment_bearing_stopwords, stop_words)
)

# spaCy English pipeline, used for NER.
# Load the model if it is already installed and download it only when missing.
# A `!python -m spacy download ...` shell call re-downloads the model on every
# run and uses whichever `python` is first on PATH, which is not necessarily
# the interpreter behind this kernel; spacy.cli.download is plain Python and
# runs in the kernel's own process.
try:
    nlp = spacy.load('en_core_web_sm')
except OSError:
    # spacy.load raises OSError when the model package cannot be found.
    spacy.cli.download('en_core_web_sm')
    # NOTE(review): if loading right after a fresh install still fails,
    # restart the kernel and re-run this cell.
    nlp = spacy.load('en_core_web_sm')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/kevinbrundler/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kevinbrundler/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/kevinbrundler/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m65.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [6]:
# Load the OSDG community dataset; the file is tab-separated despite its .csv extension.
df_osdg = pd.read_csv(
    'osdg_cleaning/osdg-community-data-v2024-01-01.csv',
    sep='\t',
)

In [66]:
def synthetic_data_reading():
    """Load every synthetic ``.jsonl`` file into a single DataFrame.

    Scans ``<cwd>/synthetic_data/produced_data/gen_results``: each
    sub-directory is searched for files ending in ``.jsonl``, each file is
    read as JSON Lines, and all resulting frames are concatenated.

    Returns:
        pandas.DataFrame: the concatenation of all files, in sorted
        folder / file name order. Each file's own row index is kept as-is,
        so index values repeat across files.

    Raises:
        FileNotFoundError: if the ``gen_results`` directory does not exist.
        ValueError: if no ``.jsonl`` file is found (raised by ``pd.concat``
            on an empty list).
    """
    base_dir = os.path.join(os.getcwd(), "synthetic_data", "produced_data", "gen_results")

    frames = []
    # Sorted so the row order does not depend on the arbitrary order in which
    # os.listdir returns entries.
    for folder in sorted(os.listdir(base_dir)):
        folder_path = os.path.join(base_dir, folder)
        # Skip stray files (e.g. .DS_Store) sitting next to the result
        # folders; calling os.listdir on them would raise NotADirectoryError.
        if not os.path.isdir(folder_path):
            continue

        # NOTE(review): folder names look like "sdg_goal_<n>", but that goal
        # id is not attached to the rows here - confirm the files carry it.
        for filename in sorted(os.listdir(folder_path)):
            if filename.endswith(".jsonl"):
                frames.append(pd.read_json(os.path.join(folder_path, filename), lines=True))

    return pd.concat(frames)

In [67]:
def remove_urls_and_html_tags(text):
    """Strip HTML tags and URLs from ``text``.

    Tags are removed first (anything between ``<`` and the next ``>`` on the
    same line), then every run of non-whitespace characters that starts with
    ``http://``, ``https://`` or ``www.``. Surrounding whitespace is left
    untouched, so a removed URL can leave a double space behind.

    Args:
        text (str): the text to clean.

    Returns:
        str: ``text`` without HTML tags and URLs.
    """
    html_tags_pattern = r'<.*?>'
    text_without_html_tags = re.sub(html_tags_pattern, '', text)
    # Case-insensitive so that "HTTP://..." and "WWW...." are also removed
    # when the caller has not lowercased the text beforehand.
    url_pattern = re.compile(r'https?://\S+|www\.\S+', flags=re.IGNORECASE)
    return url_pattern.sub('', text_without_html_tags)

In [68]:
def named_entity_regocnition(text):
    """Return the text of every entity in ``nlp(text).ents`` as a list of strings.

    ``nlp`` is the module-level spaCy pipeline. The function name misspells
    "recognition"; it is kept unchanged so existing callers keep working.
    """
    entities = nlp(text).ents
    return [entity.text for entity in entities]

In [69]:
def preprocess_text(text):
    """Normalise raw text into a space-joined string of lemmatised tokens.

    Steps, in order: lowercase, strip URLs and HTML tags, drop digits,
    tokenize, turn the contraction token "n't" into "not", keep purely
    alphabetic tokens, remove the stop words in
    ``stop_words_without_sentiment``, lemmatize with WordNet.

    Args:
        text: the text to clean. Anything that is not a ``str`` (e.g. None
            or a NaN cell) is treated as empty.

    Returns:
        str: the surviving tokens joined by single spaces; ``''`` when
        nothing survives or the input is not a string.
    """
    # Non-string values have no .lower(); return an empty document instead
    # of raising AttributeError.
    if not isinstance(text, str):
        return ''

    # Lowercasing
    text = text.lower()

    # Removal of urls and html tags
    text = remove_urls_and_html_tags(text)

    # Removal of numeric values
    text = re.sub(r'\d+', '', text)

    # Tokenization
    tokens = word_tokenize(text)

    # word_tokenize splits contractions ("don't" -> "do", "n't"). "n't" is not
    # alphabetic and would be dropped by the next step, losing the negation
    # that the sentiment-bearing stop-word list is meant to keep.
    tokens = ['not' if token == "n't" else token for token in tokens]

    # Remove punctuation and non-alphabetic characters
    tokens = [token for token in tokens if token.isalpha()]

    # Selective removal of stopwords (set built once for O(1) lookups per token)
    removable = frozenset(stop_words_without_sentiment)
    tokens = [token for token in tokens if token not in removable]

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    return ' '.join(tokens)