In [None]:
# Re-import libraries and redefine file paths
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re

# Training split: CSV location and the dataframe read from it
train_file_path = '/mnt/data/train.csv'
train_data = pd.read_csv(train_file_path)

# Test split: CSV location and the dataframe read from it
test_file_path = '/mnt/data/test.csv'
test_data = pd.read_csv(test_file_path)

# Preview the leading rows and the column index of each dataframe
# (left as the final expression so the notebook renders it)
(
    train_data.head(),
    train_data.columns,
    test_data.head(),
    test_data.columns,
)

In [None]:
# Setting up NLTK with a custom resource directory
# Single source of truth for where NLTK resources are stored and looked up
NLTK_DATA_DIR = '/mnt/data/nltk_data/'

# Register the directory only once, so re-running this cell does not keep
# appending duplicate entries to NLTK's search path
if NLTK_DATA_DIR not in nltk.data.path:
    nltk.data.path.append(NLTK_DATA_DIR)

# Fetch the NLTK resources needed below into that directory:
#   punkt / punkt_tab -> tokenizer models used by word_tokenize (newer NLTK
#                        releases look up 'punkt_tab' instead of 'punkt')
#   stopwords         -> stopword lists
#   wordnet           -> data used by WordNetLemmatizer
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource, download_dir=NLTK_DATA_DIR)

# Initialize the WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# Patterns are compiled once; raw strings keep `\s` from being treated as an
# (invalid) Python string escape.
_NON_ALPHA_RE = re.compile(r'[^a-z\s]')
_MULTI_SPACE_RE = re.compile(r' +')

# Lazily filled cache so the stopword corpus is read once, not once per row
_STOP_WORDS_CACHE = None


def _english_stop_words():
    """Return the English stopword set, loading it from NLTK on first use only."""
    global _STOP_WORDS_CACHE
    if _STOP_WORDS_CACHE is None:
        _STOP_WORDS_CACHE = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE


# Function to preprocess text
def preprocess_text(text):
    """Normalize raw text into a space-joined string of lemmatized tokens.

    Steps, in order: lowercase the text, replace every character other than
    a-z or whitespace with a space, collapse runs of spaces and strip the
    ends, tokenize with ``word_tokenize``, drop English stopwords, and
    lemmatize each remaining token with the module-level ``lemmatizer``.

    Args:
        text: The raw text. Any non-string value (for example ``None`` or a
            NaN standing in for a missing cell) is treated as empty text
            instead of raising.

    Returns:
        The cleaned text as a single string, or ``''`` when the input is not
        a string or no token survives the filtering.
    """
    # Missing / non-text values have no .lower(); treat them as empty text
    if not isinstance(text, str):
        return ''

    # Convert text to lower case
    text = text.lower()
    # Remove non-alphabetic characters and extra spaces
    text = _NON_ALPHA_RE.sub(' ', text)
    text = _MULTI_SPACE_RE.sub(' ', text).strip()
    # Tokenize text
    tokens = word_tokenize(text)
    # Remove stopwords, then lemmatize the words that remain
    stop_words = _english_stop_words()
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    # Join tokens back to string
    return ' '.join(tokens)

# Derive a 'clean_text' column from 'full_text' on each dataframe,
# handling the training set first and the test set second
for frame in (train_data, test_data):
    frame['clean_text'] = frame['full_text'].apply(preprocess_text)

# Show raw vs. cleaned text side by side for the leading rows of each set
# (left as the final expression so the notebook renders it)
(
    train_data[['full_text', 'clean_text']].head(),
    test_data[['full_text', 'clean_text']].head(),
)
