<a href="https://colab.research.google.com/github/MwangiAlma/Kenyan-Coffee-Farmer-Chatbot/blob/main/Notebooks/CoffeeTextprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Coffee Chatbot Text Pre-Processing

 The objective of this notebook is to build a functional multilingual chatbot using the preprocessed English and Swahili question data and their TF-IDF representations. This will involve:

1. User Input Preprocessing: Developing a function to clean and prepare new user queries, consistent with how the dataset was processed.
2. Language Detection: Implementing a method to automatically determine the language of an incoming user query (English or Swahili).
3. Similarity Search: Utilizing TF-IDF vectors and cosine similarity to identify the most relevant question within our dataset that matches the preprocessed user query.
4. Response Retrieval: Extracting and returning the appropriate English or Swahili response corresponding to the most similar question found.

In [None]:
!pip install python-docx transformers sentence-transformers scikit-learn pandas nltk langdetect


Collecting python-docx
  Downloading python_docx-1.2.0-py3-none-any.whl.metadata (2.0 kB)
Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/981.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m35.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.me

In [None]:
# Import the necessary libraries
import pandas as pd
import docx as Document
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
from langdetect import detect, DetectorFactory
import requests

# langdetect is non-deterministic by default; fixing the seed makes repeated
# detections of the same text return the same language.
DetectorFactory.seed = 0

# NLTK resource downloads. These ensure the needed linguistic resources are available
# before the tokenizer, stopword list and lemmatizer imported above are used.
nltk.download('punkt')      # tokenizer models (presumably what nltk.word_tokenize relies on)
nltk.download('stopwords')  # stopword lists read through nltk.corpus.stopwords
nltk.download('wordnet')    # WordNet data (presumably required by WordNetLemmatizer)
nltk.download('omw-1.4')    # Open Multilingual Wordnet data
nltk.download('punkt_tab')  # tabular punkt data (presumably needed by newer NLTK tokenizers)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
# Cell 2 - Data loading and standardization
#
# Loads the chatbot question/response dataset into `df`, normalizes the column
# names, drops stray "Unnamed:" columns, guarantees a 'Variations_EN' column and
# reports on missing or empty required columns. If loading fails for any reason,
# `df` falls back to an empty DataFrame that still has the required columns.

# Columns the notebook expects to find in `df`. Defined once, before the try
# block, so the validation loop and every failure fallback share the same list.
required_cols = ['Question_EN', 'Question_SW', 'Response_EN', 'Response_SW', 'Variations_EN']

try:
    # NOTE(review): this path assumes Google Drive is already mounted at
    # /content/drive; no mount call is visible in this notebook - confirm.
    df = pd.read_csv('/content/drive/MyDrive/Chatbot_dataset.csv', encoding='latin1') # Loads the dataset from the CSV file.
    print("Dataset loaded successfully from Chatbot_dataset.csv")
    print("Original Columns in the DataFrame:")
    print(df.columns.tolist())

    # Renames "Question (EN)"-style headers to identifier-friendly names.
    # Headers that are already in the target form are left untouched by rename().
    column_rename_map = {
        'Question (EN)': 'Question_EN',
        'Question (SW)': 'Question_SW',
        'Response (EN)': 'Response_EN',
        'Response (SW)': 'Response_SW'
    }
    df = df.rename(columns=column_rename_map)

    # Columns whose name contains "Unnamed:" are treated as unnecessary and dropped.
    unnamed_cols = [col for col in df.columns if 'Unnamed:' in col]

    if unnamed_cols:
        df = df.drop(columns=unnamed_cols)
        print(f"\nDropped unnecessary columns: {unnamed_cols}")

    # Guarantee the variations column exists; each row gets its own empty list.
    if 'Variations_EN' not in df.columns:
        df['Variations_EN'] = [[] for _ in range(len(df))]
        print("\nAdded 'Variations_EN' column (initialized as empty lists).")

    print("\nStandardized Columns in DataFrame after renaming and dropping:")
    print(df.columns.tolist())
    print("\nFirst 5 rows of the DataFrame (with standardized columns):")
    print(df.head())

    # Report required columns that are missing, entirely empty, or partially empty.
    # Emptiness is judged on the stripped string form of each value, so an empty
    # list (rendered as "[]") is not counted as empty here.
    for col in required_cols:
        if col not in df.columns:
            print(f"WARNING (After Standardization): Required column '{col}' is still missing!")
        elif df[col].isnull().all() or (df[col].astype(str).str.strip() == '').all():
            print(f"WARNING (After Standardization): Column '{col}' appears to be entirely empty or contains only whitespace/NaNs!")
        elif df[col].isnull().any() or (df[col].astype(str).str.strip() == '').any():
            print(f"Note (After Standardization): Column '{col}' contains some empty/NaN values.")

# Error handling: every failure path leaves `df` as an empty frame with the
# required columns so that later cells can still reference them.
except FileNotFoundError: # the csv file doesn't exist
    print("Error: Chatbot_dataset.csv not found. Please upload it or ensure the path is correct.")
    df = pd.DataFrame(columns=required_cols)
except UnicodeDecodeError: # handles issues with decoding the csv file
    # NOTE(review): latin1 maps every byte to a character, so this branch is
    # presumably only reachable if the encoding above is changed.
    print("Error: Could not decode the CSV file. It might be saved with a different encoding than 'latin1'.")
    print("Try changing 'encoding='latin1'' to 'encoding='cp1252'' or 'encoding='iso-8859-1''.")
    df = pd.DataFrame(columns=required_cols)
except Exception as e: # handles any unexpected errors
    print(f"An unexpected error occurred during CSV loading or column standardization: {e}")
    df = pd.DataFrame(columns=required_cols)

Dataset loaded successfully from Chatbot_dataset.csv
Original Columns in the DataFrame:
['Topic', 'Question_EN', 'Question_SW', 'Response_EN', 'Response_SW']

Added 'Variations_EN' column (initialized as empty lists).

Standardized Columns in DataFrame after renaming and dropping:
['Topic', 'Question_EN', 'Question_SW', 'Response_EN', 'Response_SW', 'Variations_EN']

First 5 rows of the DataFrame (with standardized columns):
                        Topic  \
0  Planting Coffee Seedlings    
1  Planting Coffee Seedlings    
2  Planting Coffee Seedlings    
3  Planting Coffee Seedlings    
4  Planting Coffee Seedlings    

                                         Question_EN  \
0             When is the best time to plant coffee?   
1  What is the right season to plant coffee seedl...   
2                     When should I plant my coffee?   
3         Which months are best for planting coffee?   
4          Can I plant coffee during the dry season?   

                                   

In [None]:
# Cell 3 : Removing stopwords and Lemmatization

stop_words_en = set(stopwords.words('english')) # English stopwords from the NLTK corpus, as a set for fast lookups
lemmatizer_en = WordNetLemmatizer() # Reduces English words to their lemma (base) form

# Swahili stopwords are not taken from NLTK here; they are downloaded from a
# GitHub-hosted list. Any failure degrades to an empty set instead of stopping
# the notebook.
try:
    swahili_stopwords_url = "https://raw.githubusercontent.com/dohliam/more-stoplists/master/sw/sw.txt" # Fetching swahili stopwords from this github link
    # Without a timeout, requests waits indefinitely on an unresponsive server.
    # A timeout raises a RequestException subclass, handled below.
    response = requests.get(swahili_stopwords_url, timeout=10)
    response.raise_for_status()

    # One entry per line (presumably one stopword each); strip each line so that
    # carriage returns, padding and blank lines never end up inside the set.
    stop_words_sw = {line.strip() for line in response.text.splitlines() if line.strip()}
    print("Swahili stopwords loaded successfully from GitHub (new URL).")

# Error handling
except requests.exceptions.RequestException as e:
    print(f"Error fetching Swahili stopwords from URL: {e}")
    print("Proceeding with an empty set for Swahili stopwords. Swahili preprocessing might be less effective.")
    stop_words_sw = set()
except Exception as e:
    print(f"An unexpected error occurred while loading Swahili stopwords: {e}")
    print("Proceeding with an empty set for Swahili stopwords. Swahili preprocessing might be less effective.")
    stop_words_sw = set()

print("NLTK Stopwords (English) and WordNetLemmatizer (English) initialized.")

Swahili stopwords loaded successfully from GitHub (new URL).
NLTK Stopwords (English) and WordNetLemmatizer (English) initialized.


In [None]:
# Cell 4 : Text preprocessing

def preprocess_english_text(text):
  """Normalize an English sentence for vectorization.

  The text is lowercased and tokenized with NLTK; tokens that are not purely
  alphabetic or that appear in `stop_words_en` are dropped, and the remaining
  tokens are lemmatized with `lemmatizer_en`.

  Args:
    text: The raw question. Missing values (NaN/None) are accepted.

  Returns:
    The surviving lemmas joined by single spaces, or "" for a missing value.
  """
  if pd.isna(text): # Missing values (NaN/None) become an empty string
    return ""
  text = str(text).lower()
  words = nltk.word_tokenize(text)
  words = [lemmatizer_en.lemmatize(word) for word in words if word.isalpha() and word not in stop_words_en]
  # Join with a space: joining on "" glues every token into one long word,
  # which a downstream vectorizer would see as a single term.
  return " ".join(words)

def preprocess_swahili_text(text):
  """Normalize a Swahili sentence for vectorization.

  The text is lowercased, every character other than a-z and whitespace is
  removed, and words found in `stop_words_sw` are dropped.

  Args:
    text: The raw question. Missing values (NaN/None) are accepted.

  Returns:
    The remaining words joined by single spaces, or "" for a missing value.
  """
  if pd.isna(text):
    return ""
  # Lowercase (and coerce to str) first: the pattern below keeps only a-z, so
  # uppercase letters would otherwise be deleted ("Ni" -> "i") and a non-string
  # value would make re.sub raise.
  text = str(text).lower()
  text = re.sub(r'[^a-z\s]', '', text)

  # Filters out Swahili stop_words
  words = [word for word in text.split() if word not in stop_words_sw]
  return " ".join(words)

# English questions: report the problem first if the column is missing,
# otherwise add the processed column and show a short preview.
if 'Question_EN' not in df.columns:
  print("Error: 'Question_EN' column not found. Cannot apply English preprocessing")
else:
  df['Processed_Question_EN'] = df['Question_EN'].apply(preprocess_english_text)
  print("English text preprocessing applied to 'Question_EN' column")
  print("\nFirst 5 rows with 'Questions_EN' and 'Processed_Question_EN':")
  # Raw English question next to its processed form, first five rows
  print(df[['Question_EN', 'Processed_Question_EN']].head())

print("-" * 50)

# Swahili questions: same flow as the English column above.
if 'Question_SW' not in df.columns:
  print("Error: 'Question_SW' column not found. Cannot apply Swahili preprocessing.")
else:
  df['Processed_Question_SW'] = df['Question_SW'].apply(preprocess_swahili_text)
  print("Swahili text preprocessing applied to 'Question_SW' column.")
  print("\nFirst 5 rows with 'Questions_SW' and 'Processed_Question_SW':")
  # Raw Swahili question next to its processed form, first five rows
  print(df[['Question_SW', 'Processed_Question_SW']].head())


English text preprocessing applied to 'Question_EN' column

First 5 rows with 'Questions_EN' and 'Processed_Question_EN':
                                         Question_EN  \
0             When is the best time to plant coffee?   
1  What is the right season to plant coffee seedl...   
2                     When should I plant my coffee?   
3         Which months are best for planting coffee?   
4          Can I plant coffee during the dry season?   

            Processed_Question_EN  
0             besttimeplantcoffee  
1  rightseasonplantcoffeeseedling  
2                     plantcoffee  
3         monthbestplantingcoffee  
4            plantcoffeedryseason  
--------------------------------------------------
Swahili text preprocessing applied to 'Question_SW' column.

First 5 rows with 'Questions_SW' and 'Processed_Question_SW':
                                     Question_SW  \
0  Ni wakati gani mzuri zaidi wa kupanda kahawa?   
1     Ni msimu gani wa kupanda mchele wa kahawa

In [None]:
def preprocess_english_text(text):
    """
    Light-weight English normalizer: lowercases the text and deletes every
    character that is not a-z, 0-9 or whitespace. Missing values yield "".

    NOTE: this definition replaces the earlier function of the same name in
    this notebook; no stopword removal or lemmatization happens here.
    """
    if pd.isna(text):
        return ""
    lowered = str(text).lower()
    return re.sub(r'[^a-z0-9\s]', '', lowered)

def preprocess_swahili_text(text):
    """
    Light-weight Swahili normalizer: lowercases the text and deletes every
    character that is not a-z, 0-9 or whitespace. Missing values yield "".

    NOTE: this definition replaces the earlier function of the same name in
    this notebook; no stopword filtering happens here.
    """
    if pd.isna(text):
        return ""
    lowered = str(text).lower()
    return re.sub(r'[^a-z0-9\s]', '', lowered)

# Build the processed text columns that the TF-IDF steps read from.
df['Processed_Question_EN'] = df['Question_EN'].apply(preprocess_english_text)

if 'Question_SW' not in df.columns:
    print("Warning: 'Question_SW' column not found, skipping Swahili preprocessing.")
    # Placeholder column so that later references to it do not fail
    df['Processed_Question_SW'] = ""
else:
    df['Processed_Question_SW'] = df['Question_SW'].apply(preprocess_swahili_text)

# Variations are stored as one list per row; anything that is not a list is
# replaced by an empty list, and missing entries inside a list are skipped.
if 'Variations_EN' not in df.columns or df['Variations_EN'].empty:
    print("Warning: 'Variations_EN' column not found or is empty. Skipping preprocessing for variations.")
else:
    df['Processed_Variations_EN'] = df['Variations_EN'].apply(
        lambda variations: [] if not isinstance(variations, list)
        else [preprocess_english_text(item) for item in variations if pd.notna(item)]
    )

# English TF-IDF vectorizer; max_features caps the vocabulary at the 5000 most frequent terms
tfidf_vectorizer_en = TfidfVectorizer(max_features=5000)

# The fitting corpus starts with every processed question ...
corpus_en = df['Processed_Question_EN'].tolist()

# ... and is extended with each non-blank processed variation, when that column exists.
if 'Processed_Variations_EN' in df.columns:
    corpus_en.extend(
        variation
        for variations_list in df['Processed_Variations_EN']
        if variations_list is not None
        for variation in variations_list
        if pd.notna(variation) and variation.strip()
    )

# Fit on the full corpus (questions + variations) to learn the vocabulary and IDF weights,
# then vectorize the questions alone so the question matrix has one row per DataFrame row.
tfidf_matrix_en_full = tfidf_vectorizer_en.fit_transform(corpus_en)
tfidf_matrix_questions_en = tfidf_vectorizer_en.transform(df['Processed_Question_EN'])

print("English TF-IDF Vectorization complete.")
print(f"English TF-IDF matrix shape (for all processed text, including variations): {tfidf_matrix_en_full.shape}")
print(f"English TF-IDF matrix shape (for questions): {tfidf_matrix_questions_en.shape}")

print("-" * 50)

# Initializes TF-IDF vectorizer for Swahili, max_features considers the top 5000 most frequent terms
tfidf_vectorizer_sw = TfidfVectorizer(max_features=5000)

# Checking only that the column exists is not enough: a column made entirely of
# empty strings gives the vectorizer nothing to build a vocabulary from, and
# fitting on it raises ValueError. Require at least one non-blank document.
has_swahili_text = (
    'Processed_Question_SW' in df.columns
    and df['Processed_Question_SW'].astype(str).str.strip().ne('').any()
)

if has_swahili_text:
    corpus_sw = df['Processed_Question_SW'].tolist()
    # Fit on the Swahili questions, then vectorize the same questions for lookups.
    tfidf_matrix_sw_full = tfidf_vectorizer_sw.fit_transform(corpus_sw)
    tfidf_matrix_questions_sw = tfidf_vectorizer_sw.transform(df['Processed_Question_SW'])

    print("Swahili TF-IDF Vectorization complete.")
    print(f"Swahili TF-IDF matrix shape (for all processed text): {tfidf_matrix_sw_full.shape}")
    print(f"Swahili TF-IDF matrix shape (for processed questions only): {tfidf_matrix_questions_sw.shape}")

else:
    print("Warning: 'Processed_Question_SW' column not found or empty. Swahili TF-IDF not created")
    # Set every Swahili TF-IDF name to None so later cells can test for
    # availability instead of hitting a NameError.
    tfidf_vectorizer_sw = None
    tfidf_matrix_sw_full = None
    tfidf_matrix_questions_sw = None

English TF-IDF Vectorization complete.
English TF-IDF matrix shape (for all processed text, including variations): (448, 607)
English TF-IDF matrix shape (for questions): (448, 607)
--------------------------------------------------
Swahili TF-IDF Vectorization complete.
Swahili TF-IDF matrix shape (for all processed text): (448, 846)
Swahili TF-IDF matrix shape (for processed questions only): (448, 846)
