## Section 1.1: Import necessary libraries

In [116]:
import pandas as pd
import re
import nltk

#### Section 1.1.0: Creating the list of text files to combine

In [117]:
filenames = ['whatsappchat-Bungoma.txt', 'whatsappchat-Msa.txt', 'whatsappchat-Nairobi.txt', 'whatsappchat-Wajiri.txt', 'whatsappchat-institution.txt']

#### Section 1.1.1: Creating a function that logs bad lines (custom error handling)

In [118]:
# defining a function to handle bad lines
def handle_bad_line(line):
    """Append a row that could not be parsed to ``bad_lines.log``.

    Used as the ``on_bad_lines`` callback of ``pd.read_csv`` in the loading
    loop, so a malformed row is recorded instead of aborting the read.

    Parameters
    ----------
    line
        The rejected row as handed over by the parser; it is written using
        its ``str()`` form, one entry per log line.

    Returns
    -------
    None
        Nothing is returned, so the parser gets no replacement row.
    """
    # Chat exports contain emojis and other non-ASCII text. The platform
    # default encoding (e.g. cp1252 on Windows) cannot represent all of it
    # and would raise UnicodeEncodeError, so the log is always UTF-8.
    with open('bad_lines.log', 'a', encoding='utf-8') as f:
        f.write(str(line) + '\n')


#### Section 1.1.2: Looping through the filenames and passing the handle_bad_line function to the reader


In [140]:
# Collect one data frame per chat export; a file that cannot be loaded is
# reported and skipped so the remaining files are still processed.
dfs = []

for filename in filenames:
    try:
        # Rows the parser rejects are handed to handle_bad_line instead of
        # stopping the read.
        df = pd.read_csv(
            filename,
            delimiter='\t',
            on_bad_lines=handle_bad_line,
            engine='python',
        )
    except Exception as e:
        print(f"Error reading {filename}: {str(e)}")
    else:
        dfs.append(df)

In [120]:
#### Section 1.1.3: Concatenating the dataframes into one dataframe

In [121]:
# concatenate the data frames into a single data frame
# The loading loop only prints read errors and moves on, so `dfs` can be
# empty here. Fail with an actionable message instead of pandas' generic
# "No objects to concatenate" (same exception type, ValueError).
if not dfs:
    raise ValueError(
        "No chat files could be read; check the paths in `filenames` "
        "and the errors printed while loading."
    )
combined_data = pd.concat(dfs, ignore_index=True)

#### Section 1.1.4: Filling the missing values

In [122]:
# fill in any missing values
# Every NaN cell is replaced by an empty string; no rows or columns are dropped.
# NOTE(review): the NaNs presumably come from concatenating frames whose
# columns differ — confirm against the loaded frames.
combined_data = combined_data.fillna('')

#### Section 1.1.5: Saving the combined data into one file

In [123]:
# Write the combined frame to disk as a tab-separated file, without the
# index column. The next section reads this file back line by line.
combined_data.to_csv("combined-cpims-dataset.csv", index=False, sep="\t")

## Section 1.2: Loading our datasets from the combined whatsapp text file

In [124]:
# Read the combined file back as one row per line in a single 'text' column.
# sep='\r\n' is a separator that is not expected to occur inside a line, so
# the python engine leaves each line unsplit.
# keep_default_na=False / dtype=str keep every line a string: by default a
# line consisting only of e.g. "NA", "None" or "null" is turned into float
# NaN, and the regex cleaning below would then raise TypeError.
whatsapp_data = pd.read_csv(
    'combined-cpims-dataset.csv',
    sep='\r\n',
    header=None,
    names=['text'],
    engine='python',
    keep_default_na=False,
    dtype=str,
)


## Section 1.3: Visualizing our datasets

In [125]:
# Display the loaded frame (the rendered output below is a truncated preview).
whatsapp_data

Unnamed: 0,text
0,"21/09/2017, 9:20 pm - Messages and calls are e..."
1,"""21/09/2017, 9:08 pm - +254 708 778478 created..."
2,"21/09/2017, 9:20 pm - +254 708 778478 added you"
3,"21/09/2017, 9:23 pm - Joshua Mbai: Congratulat..."
4,"21/09/2017, 9:23 pm - Joshua Mbai: Thanks for ..."
...,...
6460,"24/01/2023, 2:15 pm - Margaret Kagwiria turned..."
6461,"24/01/2023, 5:00 pm - Margaret Kagwiria: This ..."
6462,"24/01/2023, 5:02 pm - Margaret Kagwiria: One o..."
6463,"24/01/2023, 5:29 pm - Gertrude Nyangweso: Plea..."


## Section 1.4: Data Cleansing
 - Removing urls from the text
 - Removing the phone numbers
 - Removing the emojis
 
#### We should cleanse our dataset before building our bot

In [126]:
## Removing the urls from the datasets
def _strip_urls(message):
    """Return the message with everything from "http" up to the next whitespace removed."""
    return re.sub(r'http\S+','', message)


whatsapp_data['text'] = whatsapp_data['text'].map(_strip_urls)

In [127]:
## Removing the phone numbers from the datasets
# The exports show numbers in international, space-separated form
# (e.g. "+254 708 778478"), which a bare run of ten digits never matches.
#   1st alternative: optional "+", a 1-4 digit prefix, then three groups of
#                    three digits, each optionally preceded by a space/hyphen
#                    (covers "+254 708 778478", "0722 123 456", "0708778478").
#   2nd alternative: any run of ten digits, the rule this cell applied before,
#                    kept so nothing that used to be removed is left behind.
PHONE_NUMBER_PATTERN = re.compile(
    r'(?<!\d)\+?\d{1,4}[\s-]?\d{3}[\s-]?\d{3}[\s-]?\d{3}|\d{10}'
)
whatsapp_data['text'] = whatsapp_data['text'].apply(lambda x: PHONE_NUMBER_PATTERN.sub('', x))

In [128]:
## Removing the emojis from the text
# Emoji are spread over several Unicode blocks; matching only the Emoticons
# block (U+1F600-U+1F64F) leaves common ones such as thumbs-up (U+1F44D),
# hearts and flags in the text.
EMOJI_PATTERN = re.compile(
    '['
    '\U0001F1E6-\U0001F1FF'  # regional indicator symbols (flags)
    '\U0001F300-\U0001F5FF'  # miscellaneous symbols and pictographs
    '\U0001F600-\U0001F64F'  # emoticons
    '\U0001F680-\U0001F6FF'  # transport and map symbols
    '\U0001F900-\U0001F9FF'  # supplemental symbols and pictographs
    '\U0001FA70-\U0001FAFF'  # symbols and pictographs extended-A
    '\u2600-\u26FF'          # miscellaneous symbols
    '\u2700-\u27BF'          # dingbats
    '\u200D\uFE0F'           # zero-width joiner and emoji variation selector
    ']'
)
whatsapp_data['text'] = whatsapp_data['text'].apply(lambda x: EMOJI_PATTERN.sub('', x))

## Section 1.5: Downloading the wordnet and punkt

In [129]:
# Download the WordNet corpus, the NLTK resource used to look up word
# meanings and synonyms. The call's return value is shown as the cell output.
# NOTE(review): no cell visible in this notebook uses WordNet yet — confirm it is needed.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\gideo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [130]:
# Download the Punkt tokenizer data, the NLTK resource used to divide text
# into sentences; it is fetched before the nltk.word_tokenize call in the
# tokenization section.
# NOTE(review): newer NLTK releases may look for 'punkt_tab' instead — confirm
# against the installed version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\gideo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Section 1.6: Tokenization of the text

In [131]:
## Tokenization converts each message into a list of word tokens
## Here, every message is lower-cased first and then split with NLTK's word tokenizer
def _tokenize_message(message):
    """Lower-case a message and return its NLTK word tokens."""
    lowered = message.lower()
    return nltk.word_tokenize(lowered)


whatsapp_data['text'] = whatsapp_data['text'].map(_tokenize_message)

## Section 1.7: Downloading stopwords

In [132]:
## Downloading Stopwords
## Stopwords are very common words (e.g. "a", "and") that you do not want to
## use to describe the topic of your content; they are filtered out of the
## tokens in a later section.
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gideo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

##  Section 1.8: Importing stopwords

In [133]:
from nltk.corpus import stopwords

## Build the English stop-word collection; a set is used so that the
## membership test applied to every token is cheap.
english_stopword_list = stopwords.words('english')
stop_words = set(english_stopword_list)

## Section 1.9: Removing the stopwords

In [134]:
def _drop_stopwords(tokens):
    """Return the tokens of one message that are not in `stop_words`."""
    return list(filter(lambda token: token not in stop_words, tokens))


whatsapp_data['text'] = whatsapp_data['text'].map(_drop_stopwords)

## Section 2.0: Removing the special characters from whatsapp dataset

In [135]:
def _strip_special_characters(tokens):
    """Remove every character that is not an ASCII letter or digit from each token."""
    stripped_tokens = []
    for token in tokens:
        stripped_tokens.append(re.sub(r'[^a-zA-Z0-9]', '', token))
    return stripped_tokens


whatsapp_data['text'] = whatsapp_data['text'].map(_strip_special_characters)

## Section 2.1: Removing the empty strings from whatsapp datasets

In [136]:
def _drop_empty_tokens(tokens):
    """Return the tokens of one message with empty strings left out."""
    return [token for token in tokens if token != '']


whatsapp_data['text'] = whatsapp_data['text'].map(_drop_empty_tokens)

## Section 2.2: Saving the cleaned whatsapp_data as a text file

In [143]:
# Save the cleaned frame as text: no index column and no header row.
# Each 'text' cell holds a list of tokens at this point, so every row is
# written as that list's string form.
# NOTE(review): sep='\n' is unusual for a single-column export — confirm it
# is intentional.
whatsapp_data.to_csv('cleaned_whatsapp_data.txt', index=False, header=None, sep='\n')

In [154]:
# Open input and output files
# (output_file.txt is overwritten on every run; UTF-8 is stated explicitly so
# the read does not depend on the platform's default encoding.)
with open("cleaned_whatsapp_data.txt", "r", encoding="utf-8") as input_file, open("output_file.txt", "w", encoding="utf-8") as output_file:
    # Read input file as a single string
    document = input_file.read()

    # Remove the "pm" time marker, but only where it is not part of a longer
    # word: a plain str.replace("pm", "") also mangles ordinary words
    # (e.g. "shipment" -> "shient"). Digits may still touch it ("920pm").
    cleaned_document = re.sub(r'(?<![A-Za-z])pm(?![A-Za-z])', '', document)

    # Write modified document to output file
    output_file.write(cleaned_document)


In [155]:
import re

# The source is opened first, then the destination (which is truncated).
with open("cleaned_whatsapp_data.txt", "r") as input_file:
    with open("output_file.txt", "w") as output_file:
        # Whole file is held in memory as one string
        document = input_file.read()

        # Strip every run of eight consecutive digits (for example "21092017")
        cleaned_document = re.sub(r'\d{8}', '', document)

        # Persist the stripped text
        output_file.write(cleaned_document)


In [156]:
# Display the text currently held in `cleaned_document` (the whole document as one string).
cleaned_document



In [152]:
import re

# The source is opened first, then the destination (which is truncated).
with open("cleaned_whatsapp_data.txt", "r") as input_file:
    with open("output_file.txt", "w") as output_file:
        # Whole file is held in memory as one string
        document = input_file.read()

        # Strip every run of one or more digits
        cleaned_document = re.sub(r'\d+', '', document)

        # Persist the digit-free text
        output_file.write(cleaned_document)



In [153]:
# Display the text currently held in `cleaned_document` (the whole document as one string).
cleaned_document




## Section 2.3: Loading the cleaned whatsapp_data text file to convert it to nlu.md

In [None]:
# Read the saved cleaned file back as one row per line in a single 'text'
# column (same line-by-line read used for the combined file). Each row is the
# string form of a token list, not a list object.
# NOTE(review): this loads cleaned_whatsapp_data.txt, not output_file.txt
# written by the "pm"/digit-removal cells — confirm which file is intended.
cleaned_whatsapp_data = pd.read_csv('cleaned_whatsapp_data.txt', sep='\r\n', header=None, names=['text'], engine='python')

#viewing the cleaned dataset
cleaned_whatsapp_data