# TEXT DATA PREPROCESSING

## 1. Imports and settings

Import required libraries and configure display options.


In [None]:
# Imports
import pandas as pd
import matplotlib.pyplot as plt

# Display settings
# Let pandas print up to 200 columns before it truncates wide DataFrames.
pd.set_option('display.max_columns', 200)
# IPython magic: draw matplotlib figures inline in the notebook output.
%matplotlib inline

## 2. Load dataset

Load a CSV file into a DataFrame. Change DATA_PATH to your file path.


In [None]:
# Path to the input CSV; change DATA_PATH to point at your own file.
DATA_PATH = "text_data.csv"
# Later cells read the raw text from a 'text' column and the target from a 'label' column.
df = pd.read_csv(DATA_PATH)

## 3. Initial inspection

In [None]:
# Preview the first rows of the DataFrame.
df.head()

In [None]:
# Print column names, dtypes and non-null counts.
df.info()

In [None]:
# Count missing values in each column.
df.isna().sum()

In [None]:
# Count rows that exactly repeat an earlier row.
df.duplicated().sum()

## 4. Clean text
Lowercase the text; remove URLs, HTML tags, and @mentions; optionally remove punctuation and numbers; then normalize whitespace.

In [None]:
import re
import string

def clean_text_basic(text, remove_punct=True, remove_numbers=True):
    """Normalize a raw text value into a lowercase, whitespace-collapsed string.

    Processing order: lowercase, strip URLs (``http...`` and ``www...``),
    strip ``@mentions``, strip HTML tags, optionally strip digits,
    optionally strip ASCII punctuation, then collapse whitespace.

    Args:
        text: Value to clean. Non-string values are converted with ``str()``;
            ``None`` and float NaN (how pandas represents a missing cell in
            an object column) are treated as empty text.
        remove_punct: If True, delete every character listed in
            ``string.punctuation``.
        remove_numbers: If True, delete every run of digits.

    Returns:
        The cleaned string; ``''`` for missing input.
    """
    # str(None) / str(float('nan')) would otherwise inject the fake tokens
    # 'none' / 'nan' into the corpus. NaN is the only float that is != itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text).lower()
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'www\S+', '', text)
    text = re.sub(r'@\w+', '', text)  # mentions
    text = re.sub(r'<.*?>', '', text)  # html tags (non-greedy, single line)
    if remove_numbers:
        text = re.sub(r'\d+', '', text)
    if remove_punct:
        # One C-level pass that drops all ASCII punctuation characters.
        text = text.translate(str.maketrans('', '', string.punctuation))
    # Collapse the gaps left behind by the removals above.
    text = re.sub(r'\s+', ' ', text).strip()
    return text


In [None]:
# Store the cleaned text on the DataFrame: later cells read df['clean_text'].
df['clean_text'] = df['text'].apply(clean_text_basic)
# Keep the standalone Series name available for cells that use it directly.
texts_cleaned = df['clean_text']

## 5. Tokenization and Stopword Removal
Splits text into tokens and removes common stopwords to focus on informative words.

In [None]:
# %pip install nltk

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources used by the tokenization cells.
nltk.download('stopwords')
nltk.download('punkt')
# Recent NLTK releases (3.9+) load the word_tokenize models from 'punkt_tab'
# rather than 'punkt', so request both to work across versions.
nltk.download('punkt_tab')
# A set makes the per-token membership test in stopword filtering O(1).
stop_words = set(stopwords.words('english'))

In [None]:
def tokenize_and_remove_stopwords(text):
    """Tokenize *text* with NLTK and drop every token found in ``stop_words``."""
    return [tok for tok in nltk.word_tokenize(text) if tok not in stop_words]

In [None]:
# Tokenize the cleaned text; texts_cleaned is the Series produced by the cleaning step.
texts_tokenized = texts_cleaned.apply(tokenize_and_remove_stopwords)

## 6. Lemmatization / Stemming
Reduces words to their dictionary form (lemma) to improve generalization and reduce redundancy. Only lemmatization (WordNet) is applied below; no stemmer is used.


In [None]:
# Download the WordNet corpus used by WordNetLemmatizer.
nltk.download('wordnet')
# Single lemmatizer instance shared by lemmatize_tokens.
lemmatizer = WordNetLemmatizer()

def lemmatize_tokens(tokens):
    """Return a list holding the lemma of each token.

    Each token is passed to ``lemmatizer.lemmatize`` without a
    part-of-speech argument.
    """
    lemmas = []
    for tok in tokens:
        lemmas.append(lemmatizer.lemmatize(tok))
    return lemmas

In [None]:
# Lemmatize every token list, then glue each list back into one
# space-separated string per document.
texts_lemmatized = texts_tokenized.apply(lemmatize_tokens)
texts_prepared = texts_lemmatized.apply(" ".join)

## 6. Exploratory Data Analysis (EDA)

### 6.1 Text Length distribution

In [None]:
# Word count per document, computed from the cleaned text
# (texts_cleaned is the Series produced by the cleaning step).
df['text_length'] = texts_cleaned.apply(lambda x: len(x.split()))
plt.hist(df['text_length'], bins=50)
plt.title('Text Length Distribution')
plt.xlabel('Number of words')
plt.ylabel('Number of documents')
plt.show()

### 6.2 WordCloud visualization

In [None]:
# %pip install wordcloud

In [None]:
from wordcloud import WordCloud

# Build one corpus string from the cleaned documents
# (texts_cleaned is the Series produced by the cleaning step).
all_text = " ".join(texts_cleaned)
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(all_text)
plt.figure(figsize=(15,7))
plt.imshow(wordcloud, interpolation='bilinear')
# The axes carry no information for an image, so hide them.
plt.axis('off')
plt.show()

## 7. Classical Vectorization
Convert text into numerical vectors using Bag-of-Words or TF-IDF.

### 7.1 Bag-of-Words (CountVectorizer)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

def get_bow(df, max_features=5000, text_column='clean_text', return_vectorizer=False):
    """Build a Bag-of-Words (term count) matrix from a text column.

    Args:
        df: DataFrame holding the documents.
        max_features: Vocabulary size cap passed to ``CountVectorizer``.
        text_column: Name of the column to vectorize.
        return_vectorizer: If True, also return the fitted vectorizer so the
            same vocabulary can be inspected or applied to new data.

    Returns:
        The document-term matrix produced by ``fit_transform``, or the tuple
        ``(matrix, vectorizer)`` when ``return_vectorizer`` is True.
    """
    vec = CountVectorizer(max_features=max_features)
    # astype(str) guards against non-string cells in the column.
    X = vec.fit_transform(df[text_column].astype(str))
    if return_vectorizer:
        return X, vec
    return X

In [None]:
# get_bow reads a 'clean_text' column, so attach the cleaned text to a copy of df for the call.
X_bow = get_bow(df.assign(clean_text=texts_cleaned), max_features=4000)

### 7.2 TF-IDF (TfidfVectorizer)

In [None]:
def get_tfidf(df, max_features=5000, ngram_range=(1,1), text_column='clean_text', return_vectorizer=False):
    """Build a TF-IDF matrix from a text column.

    Args:
        df: DataFrame holding the documents.
        max_features: Vocabulary size cap passed to ``TfidfVectorizer``.
        ngram_range: ``(min_n, max_n)`` n-gram range passed to ``TfidfVectorizer``.
        text_column: Name of the column to vectorize.
        return_vectorizer: If True, also return the fitted vectorizer so the
            same vocabulary and IDF weights can be inspected or applied to
            new data.

    Returns:
        The TF-IDF matrix produced by ``fit_transform``, or the tuple
        ``(matrix, vectorizer)`` when ``return_vectorizer`` is True.
    """
    vec = TfidfVectorizer(max_features=max_features, ngram_range=ngram_range)
    # astype(str) guards against non-string cells in the column.
    X = vec.fit_transform(df[text_column].astype(str))
    if return_vectorizer:
        return X, vec
    return X

In [None]:
# get_tfidf reads a 'clean_text' column, so attach the cleaned text to a copy of df for the call.
X_tfidf = get_tfidf(df.assign(clean_text=texts_cleaned), max_features=4000)

# Text Embedding Methods — Short Description and Usage

Required imports for all methods:

```python
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from gensim.models import Word2Vec
from transformers import AutoTokenizer, AutoModel
import torch
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt', quiet=True)
```

## TF-IDF Embedding
Description: Generates TF-IDF features for text data, capturing term importance; applies PCA to reduce dimensionality. Best for sparse, high-dimensional text data.  
Usage:
```python
def tfidf_embedding(df, column_name, max_features=50, n_components=10, prefix=None):
    """Generate TF-IDF embeddings for a text column with PCA reduction.

    Args:
        df: Input DataFrame; it is copied, not modified.
        column_name: Name of the text column to embed.
        max_features: Vocabulary size cap passed to ``TfidfVectorizer``.
        n_components: Number of PCA components, i.e. new columns added.
        prefix: Prefix for the new column names; defaults to ``column_name``.

    Returns:
        Tuple ``(df, tfidf, pca)``: the copy of ``df`` with the added
        ``{prefix}_tfidf_{i}`` columns, the fitted vectorizer, and the fitted PCA.
    """
    if prefix is None:
        prefix = column_name
    df = df.copy()
    # Missing cells become the literal 'Missing'; astype(str) protects the
    # vectorizer from any remaining non-string values.
    df[column_name] = df[column_name].fillna('Missing').astype(str)
    tfidf = TfidfVectorizer(max_features=max_features)
    tfidf_matrix = tfidf.fit_transform(df[column_name]).toarray()
    pca = PCA(n_components=n_components)
    reduced_matrix = pca.fit_transform(tfidf_matrix)
    tfidf_cols = [f'{prefix}_tfidf_{i}' for i in range(n_components)]
    # Reuse df's index: concat(axis=1) aligns on index labels, so a default
    # RangeIndex here would misalign rows (and add NaN rows) whenever df has
    # been filtered, shuffled or otherwise re-indexed.
    df_tfidf = pd.DataFrame(reduced_matrix, columns=tfidf_cols, index=df.index)
    df = pd.concat([df, df_tfidf], axis=1)
    return df, tfidf, pca

# Example: assumes df has a 'text_column' column. Returns the augmented
# DataFrame together with the fitted TfidfVectorizer and PCA objects.
df, tfidf_model, pca_model = tfidf_embedding(df, 'text_column', max_features=50, n_components=10)
```

## Word2Vec Embedding
Description: Creates word embeddings using Word2Vec, averaging word vectors per text; applies PCA for dimensionality reduction. Suitable for capturing semantic relationships.  
Usage:
```python
def word2vec_embedding(df, column_name, vector_size=50, n_components=10, prefix=None):
    """Generate Word2Vec embeddings for a text column with PCA reduction.

    A Word2Vec model is trained on the column itself; each document is
    represented by the mean of its word vectors (a zero vector when the
    document yields no tokens), then reduced with PCA.

    Args:
        df: Input DataFrame; it is copied, not modified.
        column_name: Name of the text column to embed.
        vector_size: Dimensionality of the Word2Vec vectors.
        n_components: Number of PCA components, i.e. new columns added.
        prefix: Prefix for the new column names; defaults to ``column_name``.

    Returns:
        Tuple ``(df, w2v_model, pca)``: the copy of ``df`` with the added
        ``{prefix}_w2v_{i}`` columns, the trained Word2Vec model, and the fitted PCA.
    """
    if prefix is None:
        prefix = column_name
    df = df.copy()
    df[column_name] = df[column_name].fillna('Missing')
    # Tokenize once and reuse the tokens for both training and averaging.
    tokenized = [word_tokenize(str(text).lower()) for text in df[column_name]]
    w2v_model = Word2Vec(sentences=tokenized, vector_size=vector_size, window=5, min_count=1, workers=4)
    word_vectors = w2v_model.wv

    def get_w2v_vector(tokens):
        vectors = [word_vectors[token] for token in tokens if token in word_vectors]
        return np.mean(vectors, axis=0) if vectors else np.zeros(vector_size)

    w2v_matrix = np.array([get_w2v_vector(tokens) for tokens in tokenized])
    pca = PCA(n_components=n_components)
    reduced_matrix = pca.fit_transform(w2v_matrix)
    w2v_cols = [f'{prefix}_w2v_{i}' for i in range(n_components)]
    # Reuse df's index: concat(axis=1) aligns on index labels, so a default
    # RangeIndex here would misalign rows whenever df is not 0..n-1 indexed.
    df_w2v = pd.DataFrame(reduced_matrix, columns=w2v_cols, index=df.index)
    df = pd.concat([df, df_w2v], axis=1)
    return df, w2v_model, pca

# Example: assumes df has a 'text_column' column. Returns the augmented
# DataFrame together with the trained Word2Vec model and the fitted PCA.
df, w2v_model, pca_model = word2vec_embedding(df, 'text_column', vector_size=50, n_components=10)
```

## BERT Embedding
Description: Generates contextual embeddings using BERT (CLS token); applies PCA to reduce dimensionality. Ideal for tasks requiring deep semantic understanding.  
Usage:
```python
def bert_embedding(df, column_name, max_length=128, n_components=20, prefix=None, batch_size=32):
    """Generate BERT embeddings for a text column with PCA reduction.

    Each document is encoded with ``bert-base-uncased``; the hidden state at
    position 0 (the CLS token) is taken as the document vector and the
    resulting matrix is reduced with PCA.

    Args:
        df: Input DataFrame; it is copied, not modified.
        column_name: Name of the text column to embed.
        max_length: Maximum number of tokens per document; longer inputs are truncated.
        n_components: Number of PCA components, i.e. new columns added.
        prefix: Prefix for the new column names; defaults to ``column_name``.
        batch_size: Number of documents encoded per forward pass.

    Returns:
        Tuple ``(df, tokenizer, bert_model, pca)``: the copy of ``df`` with
        the added ``{prefix}_bert_{i}`` columns, the tokenizer, the model,
        and the fitted PCA.
    """
    if prefix is None:
        prefix = column_name
    df = df.copy()
    df[column_name] = df[column_name].fillna('Missing')
    tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
    bert_model = AutoModel.from_pretrained('bert-base-uncased')
    # Inference only: make sure dropout is disabled.
    bert_model.eval()

    def get_bert_vector(texts):
        embeddings = []
        # Encode in batches instead of one forward pass per document;
        # padding=True pads each batch to its longest member.
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]
            inputs = tokenizer(batch, return_tensors='pt', max_length=max_length, truncation=True, padding=True)
            with torch.no_grad():
                outputs = bert_model(**inputs)
            embeddings.append(outputs.last_hidden_state[:, 0, :].numpy())  # CLS token
        return np.vstack(embeddings)

    # The tokenizer only accepts strings, so cast any non-string cells first.
    bert_matrix = get_bert_vector(df[column_name].astype(str).tolist())
    pca = PCA(n_components=n_components)
    reduced_matrix = pca.fit_transform(bert_matrix)
    bert_cols = [f'{prefix}_bert_{i}' for i in range(n_components)]
    # Reuse df's index: concat(axis=1) aligns on index labels, so a default
    # RangeIndex here would misalign rows whenever df is not 0..n-1 indexed.
    df_bert = pd.DataFrame(reduced_matrix, columns=bert_cols, index=df.index)
    df = pd.concat([df, df_bert], axis=1)
    return df, tokenizer, bert_model, pca

# Example: assumes df has a 'text_column' column. Returns the augmented
# DataFrame together with the tokenizer, the BERT model and the fitted PCA.
df, tokenizer, bert_model, pca_model = bert_embedding(df, 'text_column', max_length=128, n_components=20)
```

### 7.3 Tokenization + Padding (for deep learning)

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

def get_tokenized_padded(df, num_words=20000, maxlen=200, oov_token='<OOV>', text_column='clean_text', return_tokenizer=False):
    """Convert a text column into fixed-length integer sequences.

    Args:
        df: DataFrame holding the documents.
        num_words: Vocabulary size cap passed to the Keras ``Tokenizer``.
        maxlen: Length every sequence is padded or truncated to.
        oov_token: Token substituted for out-of-vocabulary words.
        text_column: Name of the column to tokenize.
        return_tokenizer: If True, also return the fitted tokenizer, which is
            needed to encode new text or to build an embedding matrix from
            its word index.

    Returns:
        The padded sequence array, or the tuple ``(padded, tokenizer)`` when
        ``return_tokenizer`` is True.
    """
    # NOTE(review): the Keras preprocessing Tokenizer is marked deprecated in
    # recent TensorFlow releases — confirm against the installed version.
    texts = df[text_column].astype(str)
    tokenizer = Tokenizer(num_words=num_words, oov_token=oov_token)
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    # 'post' keeps the start of each document and pads/truncates at the end.
    padded = pad_sequences(sequences, maxlen=maxlen, padding='post', truncating='post')
    if return_tokenizer:
        return padded, tokenizer
    return padded

In [None]:
# get_tokenized_padded reads a 'clean_text' column, so attach the cleaned text to a copy of df for the call.
X_padded = get_tokenized_padded(df.assign(clean_text=texts_cleaned), num_words=10000, maxlen=150)

## 8. Feature Selection
Select the top-k features most strongly associated with the target variable, scored with a univariate statistical test (e.g. chi-squared or the ANOVA F-test).

In [None]:
from sklearn.feature_selection import SelectKBest, chi2, f_classif

def select_features_statistical(X, y, method, k):
    """Pick the k highest-scoring features of X with a univariate test.

    Args:
        X: Feature matrix: a DataFrame, or an ndarray / sparse matrix such as
            the output of a text vectorizer.
        y: Target values, one per row of X.
        method: Scoring function passed to ``SelectKBest`` (e.g. ``chi2``,
            ``f_classif``).
        k: Number of features to keep.

    Returns:
        The selected feature labels: entries of ``X.columns`` when X is a
        DataFrame, otherwise the integer column positions.
    """
    selector = SelectKBest(score_func=method, k=k)
    # Only the fitted support mask is needed, so fit without transforming.
    selector.fit(X, y)

    if hasattr(X, 'columns'):
        selected_features = X.columns[selector.get_support()]
    else:
        # ndarrays and sparse matrices carry no column labels; fall back to positions.
        selected_features = selector.get_support(indices=True)
    print(f"Selected Top {k} Features:")
    print(selected_features)
    return selected_features

In [None]:
# X_tfidf is a sparse matrix without column labels, and the result is used as a
# feature matrix downstream, so wrap it in a sparse DataFrame, select the
# columns, and keep the selected columns (not just their names).
X_tfidf_df = pd.DataFrame.sparse.from_spmatrix(X_tfidf, index=df.index)
selected_columns = select_features_statistical(X_tfidf_df, df['label'], method=chi2, k=10)
X_tfidf_selected = X_tfidf_df[selected_columns]

## 9. Modern Embeddings (Transformer-based)
Generates sentence embeddings using a pretrained transformer model (here the `all-MiniLM-L6-v2` Sentence-Transformers model, a compact BERT-style encoder).

In [None]:
# %pip install sentence-transformers

In [None]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')
# encode() expects a list of strings; passing a plain list avoids
# label-based indexing surprises when the Series index is not 0..n-1.
X_bert = model.encode(texts_prepared.tolist(), show_progress_bar=True)


In [None]:
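## Example (needs a `build_embedding_matrix` helper and a fitted Keras `tokenizer`; neither is defined in the cells shown here):
# glove_matrix = build_embedding_matrix('glove.6B.100d.txt', tokenizer, embedding_dim=100)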

## 10. Split Dataset

In [None]:
# Feature matrix to split; swap in any of the other representations built above.
X = X_tfidf_selected # or X_padded/X_tfidf/X_bow/X_bert
# Target vector taken from the 'label' column.
y = df['label']

In [None]:
from sklearn.model_selection import  train_test_split

# First hold out 20% of the data, stratified on the target.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
# Split the held-out part in half: 80% train / 10% validation / 10% test.
# (test_size=0.2 here would leave only 4% of the data for the test set.)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

In [None]:
# Report the shape of every split: feature matrices first, then targets.
for caption, part in (
    ("X Train set size:", X_train),
    ("X Validation set size:", X_val),
    ("X Test set size:", X_test),
    ("y Train set size:", y_train),
    ("y Validation set size:", y_val),
    ("y Test set size:", y_test),
):
    print(caption, part.shape)


## 11. Model Training (Classical + Modern)
Compare classical TF-IDF + Logistic Regression with BERT embeddings + Logistic Regression.

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [None]:
# The target is the 'label' column of df (no `labels` variable is defined in the cells shown).
# NOTE: this re-split overwrites the X_train/X_test/y_train/y_test names from the dataset-split section.
X_train, X_test, y_train, y_test = train_test_split(X_tfidf_selected, df['label'], test_size=0.2, random_state=42)
# max_iter is raised above the default so the solver has room to converge.
clf_tfidf = LogisticRegression(max_iter=1000)
clf_tfidf.fit(X_train, y_train)
y_pred_tfidf = clf_tfidf.predict(X_test)
print("TF-IDF Results:\n", classification_report(y_test, y_pred_tfidf))


In [None]:
# The target is the 'label' column of df (no `labels` variable is defined in the cells shown).
# Same test_size and random_state as the TF-IDF split, so both models are scored on the same rows.
X_train_b, X_test_b, y_train_b, y_test_b = train_test_split(X_bert, df['label'], test_size=0.2, random_state=42)
# max_iter is raised above the default so the solver has room to converge.
clf_bert = LogisticRegression(max_iter=1000)
clf_bert.fit(X_train_b, y_train_b)
y_pred_bert = clf_bert.predict(X_test_b)
print("BERT Embeddings Results:\n", classification_report(y_test_b, y_pred_bert))
