In [None]:
import os
import re
import string
import pickle
import pandas as pd
import numpy as np
import nltk
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from nltk.corpus import stopwords

# Fetch the NLTK stop-word corpus (nltk reports it as up-to-date when it is
# already installed) and keep the English list as a set for fast membership tests.
nltk.download('stopwords')
stop_words = {*stopwords.words('english')}

def clean_text(text, custom_stopwords=None):
    """
    Normalise a single piece of text for tokenisation.

    Steps, in order:
      - Remove URLs: any run of non-space characters beginning at ``http``
        or ``www``.
      - Remove ASCII punctuation (``string.punctuation``).
      - Convert text to lowercase.
      - Remove stopwords.
      - Collapse runs of whitespace into single spaces.

    Args:
        text: The raw text. Missing values (``None`` or a float NaN, which is
            what pandas produces for empty CSV cells) are treated as an empty
            string; any other non-string value is converted with ``str()``.
        custom_stopwords: Optional container of lowercase words to drop.
            When omitted, the module-level ``stop_words`` set is used.

    Returns:
        str: The cleaned text. May be empty.

    Note:
        URL matching is case-sensitive and runs before lowercasing. This is
        kept unchanged so that string inputs are cleaned exactly as before.
    """
    if not isinstance(text, str):
        # NaN is the only float that does not compare equal to itself.
        is_missing = text is None or (isinstance(text, float) and text != text)
        text = "" if is_missing else str(text)

    # Only touch the module-level set when the caller did not supply one.
    active_stopwords = stop_words if custom_stopwords is None else custom_stopwords

    # Remove URLs
    text = re.sub(r'http\S+|www\S+', '', text)
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Convert to lowercase
    text = text.lower()
    # Remove stopwords; split() with no argument also discards extra whitespace
    tokens = [word for word in text.split() if word not in active_stopwords]
    # Join tokens back to a single string
    return " ".join(tokens)


# --- Load and clean the test set ---------------------------------------------
# The CSV location can be overridden with the TEST_CSV_PATH environment
# variable; the original machine-specific path remains the fallback so the
# notebook still runs unchanged where that file exists.
test_csv_path = os.environ.get(
    'TEST_CSV_PATH', r'C:\Users\hp\Downloads\nlp ass2\test (2).csv'
)
test_df = pd.read_csv(test_csv_path)

# Empty CSV cells are read as NaN; normalise to strings before cleaning so a
# missing tweet becomes an empty cleaned tweet instead of raising.
test_df['clean_tweet'] = test_df['tweet'].fillna('').astype(str).apply(clean_text)

# Keep only the raw and cleaned text, and persist them for later inspection.
test_df_prepro = test_df[['tweet', 'clean_tweet']].copy()

test_df_prepro.to_csv('test_prepro.csv', index=False)
print("Preprocessed test data saved as 'test_prepro.csv'")


# --- Tokenizer: reuse the saved one, otherwise fit on the training text -----
tokenizer_filename = 'tokenizer.pickle'
max_words = 10000  # vocabulary cap passed to Tokenizer(num_words=...) when fitting
max_len = 100      # target sequence length passed to pad_sequences(maxlen=...)

if os.path.exists(tokenizer_filename):
    # SECURITY: pickle.load can execute arbitrary code embedded in the file.
    # Only load a tokenizer.pickle that you created yourself.
    with open(tokenizer_filename, 'rb') as handle:
        tokenizer = pickle.load(handle)
    print(f"Loaded tokenizer from '{tokenizer_filename}'")
else:
    print("Tokenizer file not found. Fitting a new tokenizer on 'train_prepro.csv'.")
    # NOTE: train_df_prepro is only bound on this branch; code that needs the
    # training frame must not assume this branch has run.
    train_df_prepro = pd.read_csv('train_prepro.csv')
    tokenizer = Tokenizer(num_words=max_words)
    # Tweets that cleaned down to '' come back from CSV as NaN, which the
    # tokenizer cannot lowercase; feed it strings only.
    tokenizer.fit_on_texts(train_df_prepro['clean_tweet'].fillna('').astype(str))
    with open(tokenizer_filename, 'wb') as handle:
        pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
    print(f"New tokenizer fitted and saved as '{tokenizer_filename}'")

# --- Convert cleaned test tweets to fixed-length integer sequences ----------
sequences_test = tokenizer.texts_to_sequences(test_df_prepro['clean_tweet'])

X_test = pad_sequences(sequences_test, maxlen=max_len)

print("Shape of tokenized and padded test data:", X_test.shape)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Preprocessed test data saved as 'test_prepro.csv'
Tokenizer file not found. Fitting a new tokenizer on 'train_prepro.csv'.
New tokenizer fitted and saved as 'tokenizer.pickle'
Shape of tokenized and padded test data: (4957, 100)


In [None]:
# The training frame is only created earlier when the tokenizer had to be
# re-fitted. Load it here if it is not already defined so this cell also
# works on a fresh kernel where a saved tokenizer was found instead.
if 'train_df_prepro' not in globals():
    train_df_prepro = pd.read_csv('train_prepro.csv')

# Relative frequency of each label in the 'class' column.
print("Class distribution in training data:")
print(train_df_prepro['class'].value_counts(normalize=True))

Class distribution in training data:
class
1    0.774639
2    0.167860
0    0.057500
Name: proportion, dtype: float64
