# Modules

In [13]:
import pandas as pd
import numpy as np
import csv
import time
from tqdm import tqdm

# For Preprocessing
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from langdetect import detect
import emoji
import gensim.downloader as api
# Load pre-trained word vectors
word_vectors = api.load('glove-wiki-gigaword-100')
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import os

# Data Preprocessing

1. Lowercasing
2. Tokenization
3. Stopword removal
4. Stemming/Lemmatization
5. Language Detection
6. Removal of emojis if applicable
7. Negation Handling
8. Word Embeddings
9. Padding and Truncation
10. Data splitting: Training, Validation & Test Sets

## Lowercasing, Tokenization, Stopword Removal, Lemmatization, Negation Handling & Word Embeddings

In [8]:
# Load existing data from CSV
try:
    df = pd.read_csv('/workspaces/Project-Uchumi/data/raw/articles.csv')
except FileNotFoundError:
    df = pd.DataFrame(columns=['url', 'article', 'date'])

# Language detection for providence of Swahili data
def detect_language(text):
    try:
        return detect(text)
    except:
        return 'unknown'

df['language'] = df['article'].apply(detect_language)

# Define stopwords
english_stopwords = set(stopwords.words('english'))
swahili_stopwords = [
    "na", "za", "kwa", "ya", "ndani", "je", "ni", "hata", "pia", "au", "wakati",
    "hivyo", "nini", "kama", "bila", "kisha", "sasa", "yake", "yao", "hizo",
    "zao", "yao", "yenu", "yake", "zake", "lakini", "au", "nao", "wao",
    "yao", "yake", "kwenda", "kuwa", "kuwa", "wao", "naye", "ninyi",
    "huku", "yako", "basi", "kabla", "kutoka", "katika", "mimi",
    "yako", "kweli", "kabisa", "hasa", "hapo", "hata", "hivyo",
    "mbali", "mara", "zaidi", "karibu", "kila", "mmoja", "mwingine",
    "nyingine", "wengine", "yoyote", "wote", "huyo", "huo", "kwamba",
    "lakini", "mbali", "mimi", "mmoja", "muda", "mwenyewe", "naam",
    "pamoja", "sana", "sasa", "sisi", "vile", "wa", "wakati", "wake",
    "wakiwa", "wana", "wao", "watu", "wengine", "wote", "ya", "yake",
    "yangu", "yao", "yeye", "yule", "za", "zaidi", "zake"
]

# Example usage
# Assuming 'text' was the input text
# stopwords_removed_text = ' '.join([word for word in text.split() if word.lower() not in swahili_stopwords])


# Define stemmer and lemmatizer for nltk.stem
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Negation handling for sentiment analysis
negation_words = ["no", "not", "never", "none", "nobody", "nowhere", "nothing", "neither", "nor", "cannot", "can't", "won't", "isn't", "aren't", "wasn't", "weren't", "hasn't", "haven't", "doesn't", "don't", "didn't", "won't", "wouldn't", "shan't", "shouldn't", "mustn't"]

# Define the <UNK> token
UNK_TOKEN = '<UNK>'
UNK_VECTOR = np.zeros_like(word_vectors.get_vector(word_vectors.index_to_key[0]))

# Add the <UNK> token to your word vectors
word_vectors[UNK_TOKEN] = UNK_VECTOR

# Function for preprocessing
def process_tokens(text, language):
    # Remove special characters, numbers, and convert to lowercase
    text = re.sub(r'[^a-zA-Z\s]', '', text.lower())

    # Tokenization
    tokens = word_tokenize(text)

    # Remove stopwords and perform stemming or lemmatization
    if language == 'en':
        tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in english_stopwords]
    elif language == 'sw':
        tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in swahili_stopwords]

    negated = False
    processed_tokens = []
    for token in tokens:
        if token in negation_words:
            negated = not negated
        elif negated:
            processed_tokens.append('NOT_' + token)
        else:
            processed_tokens.append(token)


    return processed_tokens

def get_word_embeddings(processed_tokens):
    # Get word embeddings for each token
    embeddings = []
    for token in processed_tokens:
        if token in word_vectors:
            embeddings.append(word_vectors[token])
        else:
            # Handle out-of-vocabulary words
            embeddings.append(word_vectors['<UNK>'])  # Use a special token for unknown words

    return embeddings

    # Reconstruct the text

def preprocess_text(text, language):
    processed_tokens = process_tokens(text, language)
    cleaned_article = ' '.join(processed_tokens)
    word_vectors = get_word_embeddings(processed_tokens)

    return cleaned_article, word_vectors


# Apply preprocessing to each article
df['cleaned_article'], df['word_vectors'] = zip(*df.apply(lambda row: preprocess_text(row['article'], row['language']), axis=1))

# Print the preprocessed data
print(df[['article', 'cleaned_article', 'word_vectors']])


                                              article  \
0   Hopes by Kakamega Deputy Governor Ayub Savula ...   
1   What you need to know:\n- Only a few individua...   
2   What you need to know:\n- According to Makueni...   
3   One of the demonstrators who allegedly entered...   
4   What you need to know:\n- Kasaine was reported...   
5   The National Police Service (NPS) has refuted ...   
6   What you need to know:\n- The groups said if t...   
7   What you need to know:\n- The scheme covers na...   
8   Complexities of Finance Bills have primarily b...   
9   The Finance Bill, 2024 has elicited anger amon...   
10  When the history of the Duruma community is fi...   
11  What you need to know:\n- Widows in Africa cou...   
12  Mr Gachagua, however, says the issues between ...   
13  What you need to know:\n- The Deputy President...   
14  What you need to know:\n- Omtatah argues that ...   
15  What you need to know:\n- Khat is also cultiva...   
16  What you need to know:\n- T

## Emoji removal (Ignore for now)

In [9]:
# Emoji Removal for future if applicable in say tweets or comments or Linkedin Posts

def remove_emojis(text):
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

# Example usage
text_with_emojis = "Hello! 😀🚀🌟"
text_without_emojis = remove_emojis(text_with_emojis)
print(text_without_emojis)

Hello! 


## Padding & Truncation

In [10]:
# Set the maximum sequence length
max_length = 1000
# Pad and truncate the sequences
padded_sequences = pad_sequences(df['word_vectors'], maxlen=max_length, padding='post', truncating='post')
padded_sequences

array([[[ 0,  0,  0, ...,  0,  0,  0],
        [ 0,  0,  0, ...,  0,  0,  0],
        [ 0, -1,  0, ...,  0,  0,  0],
        ...,
        [ 0,  0,  0, ...,  0,  0,  0],
        [ 0,  0,  0, ...,  0,  0,  0],
        [ 0,  0,  0, ...,  0,  0,  0]],

       [[ 0,  0,  0, ...,  0,  0,  0],
        [ 0,  0,  0, ...,  0,  0,  0],
        [ 0,  0,  0, ...,  0,  0,  0],
        ...,
        [ 0,  0,  0, ...,  0,  0,  0],
        [ 0,  0,  0, ...,  0,  0,  0],
        [ 0,  0,  0, ...,  0,  0,  0]],

       [[ 0,  0,  0, ...,  0,  0,  0],
        [ 0,  0,  0, ...,  0,  0,  0],
        [ 0,  0,  0, ...,  0,  0,  0],
        ...,
        [ 0,  0,  0, ...,  0,  0,  0],
        [ 0,  0,  0, ...,  0,  0,  0],
        [ 0,  0,  0, ...,  0,  0,  0]],

       ...,

       [[ 0,  0,  0, ...,  0,  1,  0],
        [ 0,  0,  0, ...,  0,  0,  0],
        [ 0,  0,  0, ...,  0,  0,  0],
        ...,
        [ 0,  0,  0, ...,  0,  0,  0],
        [ 0,  0,  0, ...,  0,  0,  0],
        [ 0,  0,  0, ...,  0,  0

## Data Splitting

In [11]:
# Split the data into training/validation and test sets
X_train_val, X_test = train_test_split(padded_sequences, test_size=0.2, random_state=42)

# Split the training/validation set into training and validation sets
X_train, X_val = train_test_split(X_train_val, test_size=0.5, random_state=42)


## Export Pre-processed Data to CSV

In [14]:
df.to_csv("/workspaces/Project-Uchumi/data/processed/preprocessed_articles.csv", index=False, mode='a', header=not os.path.exists("/workspaces/Project-Uchumi/data/processed/preprocessed_articles.csv"))