In [None]:
import pandas as pd
import re
import nltk
import os
import shutil
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Custom NLTK data directory. Replace 'user_name' with your own username (or
# point this at any writable folder); per the original note, this avoids
# permission problems with the default NLTK data directory on Windows.
nltk_path = "C:/Users/user_name/nltk_data"
os.makedirs(nltk_path, exist_ok=True)
nltk.data.path.append(nltk_path)

# Remove any existing punkt tokenizer folder (e.g. a broken one) before the
# downloads below run.
punkt_dir = os.path.join(nltk_path, 'tokenizers', 'punkt')
if os.path.exists(punkt_dir):
    shutil.rmtree(punkt_dir)

# Download the required NLTK resources into the custom directory.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource, download_dir=nltk_path)

# Read the input CSV into a DataFrame.
df = pd.read_csv('train.csv')

# Step 1: Quick cleaning using pandas
# Patterns used by fast_pandas_clean, compiled once and reused for every row.
# The "www" branch is anchored at a word boundary and requires the dot, so
# ordinary words that merely contain "www" (e.g. "awwww") are left intact.
_URL_RE = re.compile(r"http\S+|\bwww\.\S+")
_MENTION_HASHTAG_RE = re.compile(r"@\w+|#\w+")
_PUNCTUATION_RE = re.compile(r"[^\w\s]")
_WHITESPACE_RE = re.compile(r"\s+")


def fast_pandas_clean(text: str) -> str:
    """Return a lower-cased copy of *text* with common tweet noise removed.

    The steps run in this order:

    1. lower-case the text;
    2. drop URLs (anything starting with ``http`` or ``www.``);
    3. drop ``@mentions`` and ``#hashtags`` (the whole token);
    4. drop every character that is neither a word character nor whitespace;
    5. collapse whitespace runs to a single space and strip both ends.

    Args:
        text: The raw string to clean. Must be a ``str``.

    Returns:
        The cleaned string; it may be empty.
    """
    text = text.lower()
    text = _URL_RE.sub("", text)               # Remove URLs
    text = _MENTION_HASHTAG_RE.sub("", text)   # Remove mentions & hashtags
    text = _PUNCTUATION_RE.sub("", text)       # Remove punctuation
    return _WHITESPACE_RE.sub(" ", text).strip()  # Remove extra spaces

# Run the quick regex cleaner over every tweet (values are cast to str first).
df['text_clean'] = df['text'].astype(str).map(fast_pandas_clean)

# Step 2: Deeper NLP processing
# Shared helpers for preprocess_text: the English stop-word set and a
# WordNet lemmatizer instance.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Strip noise from *text*, drop English stop words and lemmatize the rest.

    URLs, @mentions, #hashtags, digits and any remaining non-letter,
    non-whitespace characters are removed. The text is then tokenized, tokens
    whose lower-cased form is in ``stop_words`` are discarded, and the
    remaining tokens are lemmatized and joined with single spaces.

    If NLTK raises ``LookupError`` (a required resource is missing), a message
    is printed and the regex-stripped text is returned without tokenization.
    """
    try:
        # Apply the removals one after another, in this fixed order.
        for pattern in (r'http\S+', r'@\w+', r'#\w+', r'\d+', r'[^a-zA-Z\s]'):
            text = re.sub(pattern, '', text)
        kept = [
            token
            for token in word_tokenize(text)
            if token.lower() not in stop_words
        ]
        return ' '.join(lemmatizer.lemmatize(token) for token in kept)
    except LookupError as e:
        print("NLTK resource missing:", e)
        return text

# Run the NLP preprocessing over the step-1 output.
df['text_final'] = df['text_clean'].map(preprocess_text)

# To preview the first 5 results, uncomment:
# print(df[['text', 'text_clean', 'text_final']].head())

# Write the DataFrame, including the new columns, to a new CSV file
# (the index is not written).
df.to_csv("cleaned_disaster_tweets.csv", index=False)


[nltk_data] Downloading package punkt to C:/Users/Devil's
[nltk_data]     Playground/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to C:/Users/Devil's
[nltk_data]     Playground/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to C:/Users/Devil's
[nltk_data]     Playground/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:/Users/Devil's
[nltk_data]     Playground/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to C:/Users/Devil's
[nltk_data]     Playground/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
