In [1]:
# Import necessary libraries
import os
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import stopwords

In [2]:
# Download NLTK data
# Legacy resource names, used by older NLTK releases.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
# Newer NLTK releases look the tokenizer and tagger models up under these
# names instead; without them word_tokenize() / pos_tag() raise LookupError
# on a fresh environment. Downloading both sets keeps the notebook runnable
# regardless of the installed NLTK version.
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [3]:
# Define a function to process texts
def preprocess_text(text):
    """Tokenize, filter, POS-tag and lemmatize an English text.

    Args:
        text: The raw text to process.

    Returns:
        A tuple ``(tokens, lemmas, pos_tags)`` where

        * ``tokens`` is the list of lowercased, purely alphabetic tokens that
          are not in NLTK's English stopword list,
        * ``lemmas`` is the WordNet lemma of each token, in the same order,
        * ``pos_tags`` is the list of ``(token, tag)`` pairs returned by
          ``nltk.pos_tag`` for ``tokens``.
    """
    # Get a collection of English stopwords
    stop_words = set(stopwords.words('english'))
    # Tokenization: keep alphabetic, non-stopword tokens, lowercased
    tokens = [
        word.lower()
        for word in word_tokenize(text)
        if word.isalpha() and word.lower() not in stop_words
    ]
    # POS tagging, done before lemmatization so the tags can guide it.
    # NOTE(review): tagging runs on the filtered, lowercased tokens, so the
    # tagger sees less context than the original sentences would give it.
    pos_tags = pos_tag(tokens)
    # Lemmatization.
    # WordNetLemmatizer treats every word as a noun unless told otherwise,
    # which leaves verbs/adjectives/adverbs unchanged (e.g. "running").
    # Map the first letter of each tag to the WordNet POS code and fall back
    # to noun ('n') for every other tag.
    wordnet_pos = {'J': 'a', 'V': 'v', 'N': 'n', 'R': 'r'}
    lemmatizer = WordNetLemmatizer()
    lemmas = [
        lemmatizer.lemmatize(word, pos=wordnet_pos.get(tag[:1], 'n'))
        for word, tag in pos_tags
    ]
    return tokens, lemmas, pos_tags

In [4]:
# Process each file
def process_file(file_path):
    """Read one UTF-8 text file and run it through ``preprocess_text``.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A tuple ``(text, cleaned_text, tokens, lemmas, pos_tags)``: the raw
        file content, the tokens joined by single spaces, and the three
        values returned by ``preprocess_text`` for that content.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        raw_text = handle.read()
    # The file is already closed here; the remaining work is in memory only.
    tokens, lemmas, pos_tags = preprocess_text(raw_text)
    return raw_text, ' '.join(tokens), tokens, lemmas, pos_tags

In [5]:
# Folder that holds the input documents
# NOTE(review): machine-specific absolute path; adjust before running elsewhere.
folder_path = r'C:\Users\ASUS\Desktop\CD_Individual Project\Data'

# Accumulator for the per-file records (one dict per processed file)
data = list()

In [6]:
# Loop through all the files in the folder
# Start from an empty list so that re-running this cell does not append
# duplicate rows for files that were already processed.
data = []
# os.listdir() returns entries in arbitrary order; sorting keeps the row
# order of the resulting table reproducible between runs and machines.
for filename in sorted(os.listdir(folder_path)):
    # Only plain-text documents are processed; everything else is skipped.
    if not filename.endswith(".txt"):
        continue
    file_path = os.path.join(folder_path, filename)
    # Call the function for each file
    text, cleaned_text, tokens, lemmas, pos_tags = process_file(file_path)
    # Add data: one record per file, keyed by the output column names
    data.append({
        'Filename': filename,
        'Document': text,
        'Text': cleaned_text,
        'Tokens': tokens,
        'Lemmas': lemmas,
        'Parts-of-speech': pos_tags
    })

In [7]:
# Build the table: one row per record in `data`, one column per record key
df = pd.DataFrame(data=data)

In [8]:
# Write the table to a CSV file in the current working directory.
# index=False leaves the DataFrame's row index out of the file.
csv_filename = 'output.csv'
df.to_csv(path_or_buf=csv_filename, index=False)