In [None]:
import os
import re
import string

import nltk
import pandas as pd
from deep_translator import GoogleTranslator
from emoji import demojize
from nltk.corpus import stopwords
from spellchecker import SpellChecker

# Fetch the NLTK data the preprocessing function relies on:
#   - 'stopwords' backs stopwords.words('english')
#   - 'punkt' / 'punkt_tab' back nltk.word_tokenize
# Recent NLTK releases (3.9 and later) load the tokenizer models from
# 'punkt_tab' rather than the pickled 'punkt' package, so both are requested
# to keep the notebook runnable on old and new NLTK versions.
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

# Lazily-built resources shared by every preprocess_and_spellcheck call.
# Building them once (instead of once per tweet) avoids reloading the
# stopword list and the spell-checker dictionary for every row.
_PREPROCESS_RESOURCES = {}


def _get_stop_words():
    """Return the English stopword set, loading it from NLTK only once."""
    if 'stop_words' not in _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES['stop_words'] = set(stopwords.words('english'))
    return _PREPROCESS_RESOURCES['stop_words']


def _get_spell_checker():
    """Return a shared SpellChecker instance, constructing it only once."""
    if 'spell' not in _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES['spell'] = SpellChecker()
    return _PREPROCESS_RESOURCES['spell']


def preprocess_and_spellcheck(tweet):
    """Clean a single tweet and spell-correct its remaining words.

    The cleaning steps run in this order: strip URLs, strip @mentions,
    lowercase, strip ASCII punctuation, strip emojis, drop English stopwords,
    tokenize, drop non-alphanumeric tokens, spell-correct each token, and
    finally drop tokens of two characters or fewer.

    Parameters
    ----------
    tweet : object
        The raw tweet. Only ``str`` values are processed.

    Returns
    -------
    object
        For a ``str`` input, the cleaned tokens joined by single spaces
        (possibly an empty string). Any non-string input (e.g. NaN from a
        missing CSV cell) is returned unchanged.
    """
    if not isinstance(tweet, str):
        return tweet

    # Remove URLs
    tweet = re.sub(r'http\S+', '', tweet)

    # Remove user mentions
    tweet = re.sub(r'@\w+', '', tweet)

    # Lowercasing
    tweet = tweet.lower()

    # Remove special characters and punctuation (ASCII set from string.punctuation)
    tweet = re.sub(r'[' + re.escape(string.punctuation) + ']', '', tweet)

    # Remove emojis: demojize rewrites each emoji as a ':name:' code, which is
    # then deleted. This runs after punctuation stripping, so the ASCII colons
    # and underscores present here are the ones demojize just inserted.
    tweet = demojize(tweet)
    tweet = re.sub(r':[a-z_]+:', '', tweet)

    # Remove stopwords
    stop_words = _get_stop_words()
    tweet = ' '.join(
        word for word in tweet.split() if word.lower() not in stop_words
    )

    # Tokenization (split the text into words)
    tokens = nltk.word_tokenize(tweet)

    # Remove non-alphanumeric tokens
    tokens = [word for word in tokens if word.isalnum()]

    # Spell checking: look each word up once and keep the original word when
    # the checker has no suggestion (correction() returns None).
    spell = _get_spell_checker()
    corrected_tokens = []
    for word in tokens:
        suggestion = spell.correction(word)
        corrected_tokens.append(word if suggestion is None else suggestion)

    # Remove short words
    corrected_tokens = [word for word in corrected_tokens if len(word) > 2]

    # Rejoin the tokens into a single string
    return ' '.join(corrected_tokens)

# Raw per-state tweet exports to clean. Each file is expected to provide a
# 'text' column, which is the only column rewritten below.
csv_files = [
    'tripura.csv', 'tel.csv', 'uttarakhand.csv', 'bengal.csv', 'tamil.csv',
    'sikkim.csv', 'raj.csv', 'punjab.csv', 'odisha.csv', 'nagaland.csv',
    'mizoram.csv', 'meg.csv', 'manipur.csv', 'maharastra.csv', 'madhya.csv',
    'kerala.csv', 'karnataka.csv', 'jharkhand.csv', 'himachal.csv',
    'haryana.csv', 'gujarat.csv', 'goa.csv', 'chat.csv', 'bihar.csv',
    'assam.csv', 'arunanchal.csv', 'andhra.csv', 'uttarpradesh.csv',
    'delhi.csv',
]  # Add your file names

for csv_file_name in csv_files:
    # low_memory=False makes pandas parse the file in one pass instead of in
    # chunks, which avoids mixed-type inference within a column.
    df = pd.read_csv(csv_file_name, low_memory=False)

    # Remove duplicate rows based on the raw 'text' column (first occurrence kept)
    df = df.drop_duplicates(subset=['text'])

    # Apply preprocessing, spell checking, and emoji removal to each tweet
    df['text'] = df['text'].apply(preprocess_and_spellcheck)

    # Build '<name>_clean.csv' next to the input. splitext strips only the
    # final extension, so names containing other dots (e.g. './data/x.csv' or
    # 'west.bengal.csv') keep their full stem.
    processed_csv_file_path = f"{os.path.splitext(csv_file_name)[0]}_clean.csv"
    df.to_csv(processed_csv_file_path, index=False)

    print(f"Processed data (with duplicates removed) saved to {processed_csv_file_path}")


In [5]:
import pandas as pd
import os

# Cleaned per-state CSV files to merge into one dataset
csv_files = [
    'tripura_clean.csv', 'tel_clean.csv', 'uttarakhand_clean.csv',
    'bengal_clean.csv', 'tamil_clean.csv', 'sikkim_clean.csv',
    'raj_clean.csv', 'punjab_clean.csv', 'odisha_clean.csv',
    'nagaland_clean.csv', 'mizoram_clean.csv', 'meg_clean.csv',
    'manipur_clean.csv', 'maharastra_clean.csv', 'madhya_clean.csv',
    'kerala_clean.csv', 'karnataka_clean.csv', 'jharkhand_clean.csv',
    'himachal_clean.csv', 'haryana_clean.csv', 'gujarat_clean.csv',
    'goa_clean.csv', 'chat_clean.csv', 'bihar_clean.csv', 'assam_clean.csv',
    'arunanchal_clean.csv', 'andhra_clean.csv', 'uttarpradesh_clean.csv',
    'delhi_clean.csv',
]  # Add your file names

# Every file name is resolved against the directory the kernel is running in
current_directory = os.getcwd()

# Load the files in list order, keeping one DataFrame per file
data_frames = []
for file in csv_files:
    file_path = os.path.join(current_directory, file)
    df = pd.read_csv(file_path)
    data_frames.append(df)

# Stack all frames row-wise; ignore_index=True renumbers the rows from 0
combined_data = pd.concat(data_frames, axis=0, ignore_index=True)

# Write the merged dataset without the index column
combined_data.to_csv('combined_clean.csv', index=False)

print("CSV files have been successfully combined.")


CSV files have been successfully combined.
