Install Dependencies

In [None]:
!pip install pandas nltk scikit-learn gensim


Import Libraries

In [None]:
import re

import gensim.downloader as api
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split

# Download required NLTK data.
# 'punkt_tab' is the tokenizer resource word_tokenize loads on recent NLTK
# releases; 'punkt' is kept so older releases keep working as well.
# 'stopwords' and 'wordnet' back the stop-word list and the lemmatizer.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)


 Load the Dataset

In [None]:
# Path of the Excel workbook holding the raw data (replace with your own file)
DATASET_PATH = 'your_dataset.xlsx'

# Read the workbook into a DataFrame
df = pd.read_excel(DATASET_PATH)

# Show the first few rows as a quick sanity check
df.head()


 Data Cleaning

In [None]:
# Function to clean text
def clean_text(text):
    """Return `text` with links and special characters removed.

    The substitutions run in order: anything starting with 'http' up to the
    next whitespace is deleted, then every character that is not an ASCII
    letter, digit or whitespace is deleted, then each whitespace run is
    collapsed to a single space. Leading/trailing whitespace is stripped
    from the result.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    substitutions = (
        (r'http\S+', ''),          # Remove links
        (r'[^A-Za-z0-9\s]', ''),   # Remove special characters
        (r'\s+', ' '),             # Remove extra spaces
    )
    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

# Handle missing values first (remove rows with missing text).
# This must happen BEFORE cleaning: str(NaN) is the literal string 'nan', so
# converting to str first would turn missing cells into the word "nan" and
# dropna would no longer find anything to remove.
df = df.dropna(subset=['text'])

# Apply cleaning to the 'text' column; str() guards against non-string cells
# (e.g. numbers). assign() builds a new frame instead of mutating in place.
df = df.assign(text=df['text'].map(lambda value: clean_text(str(value))))

# Preview cleaned data
df.head()


Text Normalization (Lowercase, Tokenization, Stop Words, Lemmatization)

In [None]:
# Shared resources for normalize_text: built once here so they are not
# re-created for every row.
lemmatizer = WordNetLemmatizer()
stop_words = {*stopwords.words('english')}  # set => fast membership tests

# Function to normalize text
def normalize_text(text):
    """Lowercase, tokenize, drop stop words and lemmatize `text`.

    Args:
        text: The string to normalize.

    Returns:
        The surviving lemmatized tokens joined by single spaces.
    """
    kept = []
    # Lowercase first so the stop-word comparison sees lowercase tokens
    for token in word_tokenize(text.lower()):
        if token in stop_words:
            continue  # stop words are discarded before lemmatization
        kept.append(lemmatizer.lemmatize(token))
    return ' '.join(kept)

# Normalize every entry of the 'text' column (function passed directly)
df['text'] = df['text'].apply(normalize_text)

# Show the first rows of the normalized data
df.head()


Split the Dataset

In [None]:
# Split dataset into training, validation, and test sets.
# Stage 1 holds out 20% of all rows for testing; stage 2 takes 20% of the
# remaining rows for validation (roughly 64% / 16% / 20% overall).
train_val_df, test_df = train_test_split(df, random_state=42, test_size=0.2)
train_df, val_df = train_test_split(train_val_df, random_state=42, test_size=0.2)

# Report the size of each partition
print(f'Training Set: {len(train_df)} samples')
print(f'Validation Set: {len(val_df)} samples')
print(f'Test Set: {len(test_df)} samples')


GloVe Word Embeddings

In [None]:
# Name of the pretrained model fetched through gensim's downloader
GLOVE_MODEL_NAME = 'glove-wiki-gigaword-100'  # 100-dimensional embeddings

# Load GloVe embeddings
glove = api.load(GLOVE_MODEL_NAME)

# Function to get the averaged GloVe vector for a text
def get_glove_embeddings(text, model=None):
    """Return the mean embedding of the whitespace-separated words in `text`.

    Words that are not in the embedding model are skipped. If no word is
    found (including empty text), a zero vector of the model's dimensionality
    is returned, so every call yields a NumPy array of the same shape.

    Args:
        text: Whitespace-separated words to embed.
        model: Embedding lookup supporting `word in model`, `model[word]` and
            a `vector_size` attribute. Defaults to the notebook-level `glove`
            model.

    Returns:
        A 1-D numpy.ndarray of length `model.vector_size`.
    """
    if model is None:
        model = glove  # embeddings loaded earlier in the notebook
    vectors = [model[word] for word in text.split() if word in model]
    if not vectors:
        # Same type and length as the averaged case, rather than a Python
        # list with a hard-coded dimension.
        return np.zeros(model.vector_size, dtype=np.float32)
    return np.mean(vectors, axis=0)

# Apply GloVe embeddings to 'text' column.
# The split frames are selections taken from df, so adding a column to them
# directly can raise pandas' SettingWithCopyWarning; assign() returns a new
# frame with the extra column and leaves the inputs untouched.
train_df = train_df.assign(glove_vectors=train_df['text'].apply(get_glove_embeddings))
val_df = val_df.assign(glove_vectors=val_df['text'].apply(get_glove_embeddings))
test_df = test_df.assign(glove_vectors=test_df['text'].apply(get_glove_embeddings))

# Check the new glove_vectors column
train_df.head()


Save the Processed Data

In [None]:
# Save the processed data to CSV.
# to_csv writes each cell via str(); for a NumPy array that yields
# line-wrapped, space-separated text at print precision, which cannot be
# parsed back reliably. Each vector is therefore converted to a plain list of
# Python floats first, so the cell is written as "[0.1, 0.2, ...]" and can be
# restored with json.loads or ast.literal_eval.
def _with_serializable_vectors(frame):
    """Return `frame` with 'glove_vectors' cells converted to lists of floats."""
    if 'glove_vectors' not in frame.columns:
        return frame
    return frame.assign(
        glove_vectors=frame['glove_vectors'].map(lambda vec: [float(x) for x in vec])
    )

for frame, path in (
    (train_df, 'train_data.csv'),
    (val_df, 'val_data.csv'),
    (test_df, 'test_data.csv'),
):
    _with_serializable_vectors(frame).to_csv(path, index=False)
