# Modules

In [2]:
import pandas as pd
import numpy as np
import csv
import time
from tqdm import tqdm

# For Preprocessing
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from langdetect import detect
import emoji
import gensim.downloader as api
# Load pre-trained word vectors
word_vectors = api.load('glove-wiki-gigaword-100')
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import os

2024-07-03 18:49:05.261658: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-07-03 18:49:06.301013: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-07-03 18:49:06.709099: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-03 18:49:07.356128: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-03 18:49:07.360157: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1442] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-03 18:49:08.457085: I tensorflow/core/platform/cpu_feature_guard.cc:

# Data Preprocessing

1. Lowercasing
2. Tokenization
3. Stopword removal
4. Stemming/Lemmatization
5. Language Detection
6. Removal of emojis if applicable
7. Negation Handling
8. Word Embeddings
9. Padding and Truncation
10. Data splitting: Training, Validation & Test Sets

## Lowercasing, Tokenization, Stopword Removal, Lemmatization, Negation Handling & Word Embeddings

In [2]:
# Load existing data from CSV
try:
    df = pd.read_csv('/workspaces/Project-Uchumi/data/raw/articles.csv')
except FileNotFoundError:
    df = pd.DataFrame(columns=['url', 'article', 'date'])

# Language detection for providence of Swahili data
def detect_language(text):
    try:
        return detect(text)
    except:
        return 'unknown'

df['language'] = df['article'].apply(detect_language)

# Define stopwords
english_stopwords = set(stopwords.words('english'))
swahili_stopwords = [
    "na", "za", "kwa", "ya", "ndani", "je", "ni", "hata", "pia", "au", "wakati",
    "hivyo", "nini", "kama", "bila", "kisha", "sasa", "yake", "yao", "hizo",
    "zao", "yao", "yenu", "yake", "zake", "lakini", "au", "nao", "wao",
    "yao", "yake", "kwenda", "kuwa", "kuwa", "wao", "naye", "ninyi",
    "huku", "yako", "basi", "kabla", "kutoka", "katika", "mimi",
    "yako", "kweli", "kabisa", "hasa", "hapo", "hata", "hivyo",
    "mbali", "mara", "zaidi", "karibu", "kila", "mmoja", "mwingine",
    "nyingine", "wengine", "yoyote", "wote", "huyo", "huo", "kwamba",
    "lakini", "mbali", "mimi", "mmoja", "muda", "mwenyewe", "naam",
    "pamoja", "sana", "sasa", "sisi", "vile", "wa", "wakati", "wake",
    "wakiwa", "wana", "wao", "watu", "wengine", "wote", "ya", "yake",
    "yangu", "yao", "yeye", "yule", "za", "zaidi", "zake"
]

# Example usage
# Assuming 'text' was the input text
# stopwords_removed_text = ' '.join([word for word in text.split() if word.lower() not in swahili_stopwords])


# Define stemmer and lemmatizer for nltk.stem
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Negation handling for sentiment analysis
negation_words = ["no", "not", "never", "none", "nobody", "nowhere", "nothing", "neither", "nor", "cannot", "can't", "won't", "isn't", "aren't", "wasn't", "weren't", "hasn't", "haven't", "doesn't", "don't", "didn't", "won't", "wouldn't", "shan't", "shouldn't", "mustn't"]

# Define the <UNK> token
UNK_TOKEN = '<UNK>'
UNK_VECTOR = np.zeros_like(word_vectors.get_vector(word_vectors.index_to_key[0]))

# Add the <UNK> token to your word vectors
word_vectors[UNK_TOKEN] = UNK_VECTOR

# Function for preprocessing
def process_tokens(text, language):
    # Remove special characters, numbers, and convert to lowercase
    text = re.sub(r'[^a-zA-Z\s]', '', text.lower())

    # Tokenization
    tokens = word_tokenize(text)

    # Remove stopwords and perform stemming or lemmatization
    if language == 'en':
        tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in english_stopwords]
    elif language == 'sw':
        tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in swahili_stopwords]

    negated = False
    processed_tokens = []
    for token in tokens:
        if token in negation_words:
            negated = not negated
        elif negated:
            processed_tokens.append('NOT_' + token)
        else:
            processed_tokens.append(token)


    return processed_tokens

def get_word_embeddings(processed_tokens):
    # Get word embeddings for each token
    embeddings = []
    for token in processed_tokens:
        if token in word_vectors:
            embeddings.append(word_vectors[token])
        else:
            # Handle out-of-vocabulary words
            embeddings.append(word_vectors['<UNK>'])  # Use a special token for unknown words

    return embeddings

    # Reconstruct the text

def preprocess_text(text, language):
    processed_tokens = process_tokens(text, language)
    cleaned_article = ' '.join(processed_tokens)
    word_vectors = get_word_embeddings(processed_tokens)

    return cleaned_article, word_vectors


# Apply preprocessing to each article
df['cleaned_article'], df['word_vectors'] = zip(*df.apply(lambda row: preprocess_text(row['article'], row['language']), axis=1))

# Print the preprocessed data
print(df[['article', 'cleaned_article', 'word_vectors']])


                                              article  \
0   A disagreement between two widows in Kakamega ...   
1   Meru County government is yet to pass its 2024...   
2   What you need to know:\n- At least 23 youths w...   
3   What you need to know:\n- Residents feel autho...   
4   What you need to know:\n- Dish out money and g...   
5   Hi Pastor,\nMy step dad married my biological ...   
6   What you need to know:\n- Gen Zs' gamophobia —...   
7   What you need to know:\n- Irrespective of octa...   
8   What you need to know:\n- Self-care is wholeso...   
9   Somali President Hassan Sheikh Mohamud came to...   
11  Trapped in a Catholic mission sheltering dozen...   
12  What you need to know:\n- In Nairobi on Tuesda...   
13  What you need to know:\n- Learning resumed on ...   
14  What you need to know:\n- Despite the normalis...   
15  What you need to know:\n- I had my first ever ...   
16  What you need to know:\n- Boda boda riders, on...   
17  President William Ruto's es

## Emoji removal (Ignore for now)

In [3]:
# Emoji Removal for future if applicable in say tweets or comments or Linkedin Posts

def remove_emojis(text):
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

# Example usage
text_with_emojis = "Hello! 😀🚀🌟"
text_without_emojis = remove_emojis(text_with_emojis)
print(text_without_emojis)

Hello! 


## Padding & Truncation

In [4]:
# Set the maximum sequence length
max_length = 1000
# Pad and truncate the sequences
padded_sequences = pad_sequences(df['word_vectors'], maxlen=max_length, padding='post', truncating='post')
padded_sequences

array([[[ 0,  0,  0, ...,  0,  0,  0],
        [ 0,  0,  0, ...,  0,  0,  0],
        [ 0,  0,  0, ...,  0,  0,  0],
        ...,
        [ 0,  0,  0, ...,  0,  0,  0],
        [ 0,  0,  0, ...,  0,  0,  0],
        [ 0,  0,  0, ...,  0,  0,  0]],

       [[ 0,  0,  0, ...,  0,  0,  0],
        [ 0,  0,  0, ...,  0,  1, -1],
        [ 0,  0,  0, ...,  0,  0,  0],
        ...,
        [ 0,  0,  0, ...,  0,  0,  0],
        [ 0,  0,  0, ...,  0,  0,  0],
        [ 0,  0,  0, ...,  0,  0,  0]],

       [[ 0,  0,  0, ...,  0,  0,  0],
        [ 0,  0,  0, ...,  0,  0,  0],
        [ 0,  0,  0, ...,  0,  0,  0],
        ...,
        [ 0,  0,  0, ...,  0,  0,  0],
        [ 0,  0,  0, ...,  0,  0,  0],
        [ 0,  0,  0, ...,  0,  0,  0]],

       ...,

       [[ 0,  0,  0, ...,  0,  0,  0],
        [ 0,  0,  0, ...,  0,  0,  0],
        [ 0,  0,  0, ...,  0,  0,  0],
        ...,
        [ 0,  0,  0, ...,  0,  0,  0],
        [ 0,  0,  0, ...,  0,  0,  0],
        [ 0,  0,  0, ...,  0,  0

## Data Splitting

In [5]:
# Split the data into training/validation and test sets
X_train_val, X_test = train_test_split(padded_sequences, test_size=0.2, random_state=42)

# Split the training/validation set into training and validation sets
X_train, X_val = train_test_split(X_train_val, test_size=0.5, random_state=42)


## Export Pre-processed Data to CSV

In [6]:
df.to_csv("/workspaces/Project-Uchumi/data/processed/preprocessed_articles.csv", index=False, mode='a', header=not os.path.exists("/workspaces/Project-Uchumi/data/processed/preprocessed_articles.csv"))

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Sample DataFrame (assuming `df` is your DataFrame)
df1 = pd.DataFrame({
    'cleaned_article': ["disagreement two widow kakamega bury remains husband fuelled tension family resulting death injury destruction property burial late joseph otipa osundwa died april scheduled june mumias senior resident magistrate marcella onyango issued order june body collected mortuary st mary mission hospital mumias friday june taken first wife home burial saturday per court order family first wife selpha maende buried mr otipas body home accordance luhya custom court also ordered burial cost borne deceased two wife child relative mortuary bill would shared equally widow family dispute direct party bear cost mortuary expense shared equally directed m onyango however family second wife margaret otipa furious court decision grant burial right first family claimed deceased died home spent time life father living mother year unfair court give body stepmother bury citing luhya custom tradition said one son day burial hell broke loose first family went burial site bury remains deceased margaret child interrupted procession sparking violent confrontation coffin containing mr otipas remains suddenly thrown ground pallbearer ran safety ensuing clash resulted death yearold mohammed anyanga brother deceased mr anyanga attacked one nephew trying separate warring party rushed kakamega county general hospital ct scan revealed suffered fractured skull blood clot brain later died undergoing treatment speaking father death son patrick anyanga said life would NOT_two NOT_family NOT_father NOT_brother NOT_deceased NOT_every NOT_right NOT_intervene NOT_bring NOT_peace NOT_warring NOT_family NOT_wrong NOT_young NOT_man NOT_knew NOT_uncle NOT_raise NOT_panga NOT_hack NOT_death NOT_lamented NOT_mumias NOT_east NOT_subcounty NOT_police NOT_commander NOT_doris NOT_chemosi NOT_confirmed NOT_incident NOT_said NOT_investigation NOT_begun NOT_detective NOT_already NOT_questioning NOT_recording NOT_statement NOT_attended NOT_burial NOT_ceremony NOT_witnessed NOT_deadly NOT_clash NOT_expect NOT_make NOT_arrest NOT_charge NOT_people NOT_killing NOT_old NOT_man NOT_soon NOT_complete NOT_investigation NOT_m NOT_chemosi NOT_said NOT_body NOT_deceased NOT_taken NOT_st NOT_mary NOT_mission NOT_hospital NOT_mortuary NOT_expected NOT_remain NOT_warring NOT_wife NOT_agree NOT_burial NOT_site"]
})

# Vectorize the text data
vectorizer = CountVectorizer(max_df=1.0, min_df=1, stop_words='english')
X = vectorizer.fit_transform(df1['cleaned_article'])

# Fit LDA model
lda = LatentDirichletAllocation(n_components=5, random_state=42)
lda.fit(X)

# Assign topics to articles
topic_results = lda.transform(X)
df1['dominant_topic'] = topic_results.argmax(axis=1)

# Topic probabilities
df1['topic_probability'] = topic_results.max(axis=1)

print(df1)

                                     cleaned_article  dominant_topic  \
0  disagreement two widow kakamega bury remains h...               4   

   topic_probability  
0           0.997012  
