In [1]:
# Step 1: Import Libraries
import pandas as pd
import nltk
from nltk.corpus import stopwords
import re
from nltk.util import ngrams

# Download necessary NLTK resources (only needed once)
# Uncomment these lines if running for the first time
# nltk.download('punkt')
# nltk.download('stopwords')

# Step 2: Load the Dataset
file_path = "./processed_data.csv"  # Change this if needed
df = pd.read_csv(file_path, low_memory=False)

# Step 3: Define Stopwords and Precompile Regex Patterns
stop_words = set(stopwords.words('english'))
url_pattern = re.compile(r'http\S+|www\S+')
mention_pattern = re.compile(r'@\w+')
hashtag_pattern = re.compile(r'#\w+')
non_alpha_pattern = re.compile(r'[^a-zA-Z\s]')

# Step 4: Clean 'Text' Column
def clean_text(text):
    if isinstance(text, str):
        # Lowercase the text
        text = text.lower()
        # Remove URLs, mentions, hashtags
        text = url_pattern.sub('', text)
        text = mention_pattern.sub('', text)
        text = hashtag_pattern.sub('', text)
        # Remove non-alphabetic characters
        text = non_alpha_pattern.sub(' ', text)
        # Replace multiple spaces with a single space
        text = re.sub(r'\s+', ' ', text)
        # Strip leading and trailing spaces
        text = text.strip()
        return text
    else:
        return ''

# Apply the cleaning function to 'Text' column
df['cleaned_text'] = df['Text'].astype(str).apply(clean_text)

# Step 5: Tokenize and Remove Stopwords
def tokenize_and_remove_stopwords(text):
    words = text.split()
    filtered_words = [word for word in words if word not in stop_words]
    return filtered_words

# Apply function to get 'cleaned_words' column
df['cleaned_words'] = df['cleaned_text'].apply(tokenize_and_remove_stopwords)

# Step 6: Generate N-grams from 'cleaned_words' Column
def generate_ngrams(words_list, n=2):
    # Generate n-grams only if there are enough words
    if len(words_list) >= n:
        return [' '.join(gram) for gram in ngrams(words_list, n)]
    else:
        return []

# Apply function to get 'cleaned_grams' column
df['cleaned_grams'] = df['cleaned_words'].apply(generate_ngrams)

# Step 7: Save Cleaned Data to a New CSV File
output_path = "./cleaned_data.csv"
df.to_csv(output_path, index=False)

# Step 8: Display Success Message and First Few Rows
print("✅ Tokenization and stopword removal complete!")
print("Cleaned data saved to:", output_path)
print(df[['Text', 'cleaned_text', 'cleaned_words', 'cleaned_grams']].head())

✅ Tokenization and stopword removal complete!
Cleaned data saved to: ./1st_cleaned_data.csv
                                                Text  \
0  @switchfoot http://twitpic.com/2y1zl - Awww, t...   
1  is upset that he can't update his Facebook by ...   
2  @Kenichan I dived many times for the ball. Man...   
3    my whole body feels itchy and like its on fire    
4  @nationwideclass no, it's not behaving at all....   

                                        cleaned_text  \
0  a that s a bummer you shoulda got david carr o...   
1  is upset that he can t update his facebook by ...   
2  i dived many times for the ball managed to sav...   
3     my whole body feels itchy and like its on fire   
4  no it s not behaving at all i m mad why am i h...   

                                       cleaned_words  \
0    [bummer, shoulda, got, david, carr, third, day]   
1  [upset, update, facebook, texting, might, cry,...   
2  [dived, many, times, ball, managed, save, rest...   
3         