 **Unzip the file**

In [2]:
import zipfile
import os

# Archive to unpack and the directory that receives its contents.
zip_file_path = 'easy_ham.zip'
extracted_dir = 'data/'

# exist_ok=True keeps the cell re-runnable once the directory already exists.
os.makedirs(extracted_dir, exist_ok=True)

with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extracted_dir)

# Smoke test: read back every regular file that sits directly inside the
# extraction directory (sub-directories are skipped by the isfile check).
# Only the text of the last file read is left in `content`.
for filename in os.listdir(extracted_dir):
    file_path = os.path.join(extracted_dir, filename)
    if os.path.isfile(file_path):
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                content = file.read()
        except UnicodeDecodeError:
            # A file that is not valid UTF-8 would otherwise abort the cell.
            # latin-1 maps every byte to a character, so this read cannot
            # fail on decoding.
            with open(file_path, 'r', encoding='latin-1') as file:
                content = file.read()

**Clean the data**

In [3]:
import re
import os

def clean_text(text):
    """Normalize raw text before tokenization.

    Removes punctuation (underscores included) and digits, then lowercases
    the result. Whitespace is left untouched, so removed characters can leave
    consecutive or trailing spaces behind.

    Args:
        text: The string to normalize.

    Returns:
        The normalized, lowercased string.
    """
    # `\w` also matches "_", so the underscore must be listed explicitly to be
    # removed together with the rest of the punctuation.
    text = re.sub(r'[^\w\s]|_', '', text)
    # Remove every run of digits.
    text = re.sub(r'\d+', '', text)
    return text.lower()

def _read_text_with_fallback(file_path):
    """Return the text of ``file_path``, decoded as UTF-8 or, failing that, latin-1."""
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            return file.read()
    except UnicodeDecodeError:
        # latin-1 maps every byte to a character, so this second read cannot
        # fail on decoding.
        with open(file_path, 'r', encoding='latin-1') as file:
            return file.read()


def clean_files_in_directory(directory):
    """Read and clean every file found under ``directory``, recursively.

    Args:
        directory: Root directory to walk.

    Returns:
        A list of ``(file_path, cleaned_text)`` tuples, where ``cleaned_text``
        is the result of ``clean_text`` on the file's content. Directories and
        file names are visited in sorted order, so the list is the same on
        every run. The list is empty when ``directory`` does not exist.
    """
    import warnings  # local import: only needed for the missing-directory notice

    if not os.path.isdir(directory):
        # os.walk silently yields nothing for a missing path; make that visible
        # instead of returning an unexplained empty list.
        warnings.warn(
            f"{directory!r} is not an existing directory; no files were cleaned.",
            stacklevel=2,
        )

    cleaned_files = []
    for root, dirs, files in os.walk(directory):
        # Sorting `dirs` in place fixes the order in which os.walk descends.
        dirs.sort()
        for file_name in sorted(files):
            file_path = os.path.join(root, file_name)
            text = _read_text_with_fallback(file_path)
            cleaned_files.append((file_path, clean_text(text)))
    return cleaned_files


# Walk the directory the archive is extracted into ('data/'). No cell visible
# in this notebook creates an 'easy_ham' directory in the working directory,
# so walking that path finds nothing on a fresh run.
root_directory = 'data/'

cleaned_files = clean_files_in_directory(root_directory)

# Show a short preview instead of dumping the full text of every file into the
# cell output.
MAX_PREVIEW_FILES = 3
MAX_PREVIEW_CHARS = 300

print(f"Cleaned {len(cleaned_files)} files from {root_directory!r}")
for file_path, cleaned_text in cleaned_files[:MAX_PREVIEW_FILES]:
    print(f"File: {file_path}")
    print(f"Cleaned text: {cleaned_text[:MAX_PREVIEW_CHARS]}")
    print("---------------------------------------------")

**Tokenization**

In [4]:
import nltk
from nltk.tokenize import word_tokenize


# Fetch NLTK's "punkt" tokenizer package (the recorded output shows
# tokenizers/punkt.zip being unpacked). Presumably this is the model data that
# word_tokenize relies on.
# NOTE(review): newer NLTK releases may expect 'punkt_tab' instead; confirm
# against the installed version.
nltk.download('punkt')


def tokenize_text(cleaned_text):
    """Split text into lowercase tokens.

    Args:
        cleaned_text: The text to tokenize.

    Returns:
        A list holding the lowercased form of every token produced by
        ``word_tokenize``.
    """
    return [word.lower() for word in word_tokenize(cleaned_text)]


# Keep the tokens of every file so they can be reused rather than recomputed,
# and print only a short preview instead of every token of every file.
MAX_PREVIEW_FILES = 3
MAX_PREVIEW_TOKENS = 30

tokenized_files = []
for index, (file_path, cleaned_text) in enumerate(cleaned_files):
    tokens = tokenize_text(cleaned_text)
    tokenized_files.append((file_path, tokens))

    if index < MAX_PREVIEW_FILES:
        print(f"Tokens for {file_path}:")
        print(tokens[:MAX_PREVIEW_TOKENS])
        print("---------------------------------------------")

print(f"Tokenized {len(tokenized_files)} files")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


**Remove stop words from the tokens**

In [5]:
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')

english_stop_words = stopwords.words('english')
stop_words = set(english_stop_words)

# clean_text strips punctuation before tokenization, so contractions arrive
# without their apostrophe ("dont"), while NLTK's list spells them with one
# ("don't"). Add the apostrophe-free spellings so those tokens are filtered too.
# NOTE(review): depending on the stop-word list version, a stripped spelling
# may coincide with an ordinary word. Confirm that is acceptable here.
stop_words.update(word.replace("'", "") for word in english_stop_words)

# For every cleaned file, tokenize with the notebook's tokenize_text helper
# (which already lowercases) and drop the tokens listed in `stop_words`.
all_tokens_without_stopwords = []

for file_path, cleaned_text in cleaned_files:
    tokens = tokenize_text(cleaned_text)

    # Tokens are already lowercase, so they can be compared to the stop words directly.
    tokens_without_stopwords = [token for token in tokens if token not in stop_words]

    all_tokens_without_stopwords.append((file_path, tokens_without_stopwords))

# Print a short preview rather than every token of every file. Separate loop
# names keep `file_path` and `tokens_without_stopwords` from the filtering
# loop untouched.
MAX_PREVIEW_FILES = 3
MAX_PREVIEW_TOKENS = 30

print(f"Filtered tokens for {len(all_tokens_without_stopwords)} files")
for preview_path, preview_tokens in all_tokens_without_stopwords[:MAX_PREVIEW_FILES]:
    print(f"File: {preview_path}")
    print("Tokens without stopwords:")
    print(preview_tokens[:MAX_PREVIEW_TOKENS])
    print("---------------------------------------------")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
