In [31]:
import functools
import re
import string

import nltk
import pandas as pd
from deep_translator import GoogleTranslator
from emoji import demojize
from nltk.corpus import stopwords
from spellchecker import SpellChecker

# Fetch the NLTK resources used below: the stop-word lists and the tokenizer
# models behind nltk.word_tokenize. NLTK 3.9 and later load the tokenizer from
# 'punkt_tab', while older releases use 'punkt', so both are requested to keep
# the script working on either.
for _nltk_resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(_nltk_resource)

# Patterns and tables are built once at import time instead of once per tweet.
_URL_RE = re.compile(r'http\S+')
_MENTION_RE = re.compile(r'@\w+')
# Deletes every ASCII punctuation character in a single pass.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
# demojize() renders each emoji as ":name:". Punctuation has already been
# stripped when this pattern runs, so any remaining colon pair comes from an
# emoji. Names may contain uppercase letters, digits or other symbols
# (for example ":India:" for the flag), so match anything except colons and
# whitespace rather than only [a-z_].
_EMOJI_CODE_RE = re.compile(r':[^:\s]+:')


@functools.lru_cache(maxsize=1)
def _english_stop_words():
    """Return the NLTK English stop words as a set, loading them only once."""
    return frozenset(stopwords.words('english'))


@functools.lru_cache(maxsize=1)
def _spell_checker():
    """Return a shared SpellChecker so its dictionary is loaded only once."""
    return SpellChecker()


@functools.lru_cache(maxsize=50_000)
def _correct_spelling(word):
    """Return the spell checker's correction for *word*, or *word* if it has none."""
    corrected = _spell_checker().correction(word)
    return word if corrected is None else corrected


def preprocess_and_spellcheck(tweet):
    """Clean a single tweet and spell-correct its words.

    Steps, in order: strip URLs and @mentions, lowercase, delete ASCII
    punctuation, remove emojis, drop English stop words, tokenize, keep only
    alphanumeric tokens, spell-correct each token, and discard tokens of two
    characters or fewer.

    Args:
        tweet: The raw tweet text. Any non-string value (for example the NaN
            pandas uses for a missing cell) is treated as an empty tweet.

    Returns:
        The remaining tokens joined by single spaces; an empty string when
        nothing is left.
    """
    if not isinstance(tweet, str):
        return ''

    tweet = _URL_RE.sub('', tweet)
    tweet = _MENTION_RE.sub('', tweet)
    tweet = tweet.lower().translate(_PUNCTUATION_TABLE)

    # Nothing but whitespace left: skip the emoji/tokenizer/spell-check work.
    if not tweet.strip():
        return ''

    # Turn emojis into ":name:" codes, then delete those codes.
    tweet = _EMOJI_CODE_RE.sub('', demojize(tweet))

    # NOTE(review): apostrophes are already gone at this point, so contractions
    # such as "dont" presumably no longer match the stop-word list -- confirm
    # whether that is intended.
    stop_words = _english_stop_words()
    tweet = ' '.join(word for word in tweet.split() if word not in stop_words)

    tokens = [token for token in nltk.word_tokenize(tweet) if token.isalnum()]

    # One cached lookup per distinct word instead of two uncached ones per token.
    tokens = [_correct_spelling(token) for token in tokens]

    return ' '.join(token for token in tokens if len(token) > 2)

# Input and output locations for this cleaning run
csv_file_name = 'west_bengal390.csv'  # Replace with your actual file name
processed_csv_file_path = 'west_bengal_clean.csv'

# Read the raw tweets and keep only the first row for each distinct 'text' value
df = pd.read_csv(csv_file_name).drop_duplicates(subset=['text'])

# Swap every tweet for its cleaned, spell-checked, emoji-free form
df = df.assign(text=df['text'].apply(preprocess_and_spellcheck))

# Write the result without the index column, then report where it went
df.to_csv(processed_csv_file_path, index=False)
print(f"Processed data (with duplicates removed) saved to {processed_csv_file_path}")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\naman\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\naman\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Processed data (with duplicates removed) saved to west_bengal_clean.csv
