In [None]:
# Use "base" Kernel for this Notebook

### Step 1: CNN Classification with K-Fold and SMOTE Balancing

This notebook implements a CNN architecture for text classification and compares three experimental setups:
1. **Baseline CNN**: Standard train-test split (Without K-Fold).
2. **K-Fold CNN**: 5-Fold Cross-Validation for a more robust evaluation.
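3. **SMOTE + K-Fold CNN**: An advanced (deeper) CNN evaluated with 5-Fold Cross-Validation, where SMOTE oversamples the minority classes of each fold's training split only (the held-out fold is left untouched) to handle class imbalance.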

Architecture:
* **Tokenizer**: Converts text into sequences.
* **Embedding Layer**: Dense vector representations.
* **Conv1D**: Local pattern detection.
* **GlobalMaxPooling1D**: Feature extraction.
* **Dense + Dropout + Softmax**: Classification head that outputs probabilities for the 3 classes.

In [3]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from imblearn.over_sampling import SMOTE
from IPython.display import display
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from tensorflow.keras.layers import Dense, Embedding, Conv1D, GlobalMaxPooling1D, Dropout, LeakyReLU
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# Load Data
# Only the cleaned text column and the label column are read.
df = pd.read_csv('(A) Data/(A) PreProcessed_News Content Title_3000 Data.csv', usecols=['Detokenized', 'Labelling'], engine='python')
df = df.dropna()

# Map labels to 0, 1, 2
# (sparse_categorical_crossentropy expects class ids starting at 0).
label_mapping = {-1: 0, 0: 1, 1: 2}
df['label_encoded'] = df['Labelling'].map(label_mapping)

# .map() yields NaN for any label missing from label_mapping. Because the
# dropna() above runs before the mapping, such rows would otherwise slip
# into y unnoticed, so fail loudly instead.
unmapped_rows = df['label_encoded'].isna()
if unmapped_rows.any():
    unexpected_labels = sorted(df.loc[unmapped_rows, 'Labelling'].astype(str).unique())
    raise ValueError(
        f"Unexpected values in 'Labelling' (expected {sorted(label_mapping)}): {unexpected_labels}"
    )
df['label_encoded'] = df['label_encoded'].astype(int)

X = df['Detokenized'].values
y = df['label_encoded'].values

print(f"Data Shape: {df.shape}")
print("Class Distribution:\n", df['label_encoded'].value_counts())

Data Shape: (2791, 3)
Class Distribution:
 label_encoded
1    1331
0    1026
2     434
Name: count, dtype: int64


### 1. Sequence Preprocessing

In [None]:
# Hyperparameters
vocab_size = 5000   # Upper bound on token ids kept by the Tokenizer (also the Embedding input_dim)
embedding_dim = 100
max_length = 100
oov_tok = "<OOV>"   # Placeholder token for words outside the kept vocabulary

# Initialize Tokenizer
# NOTE(review): the tokenizer is fit on the full corpus X before any
# train/test or fold split below, so held-out text also contributes to the
# vocabulary. Confirm this is acceptable for the reported metrics.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(X)

# Convert to Sequences and Pad
# Sequences are padded/truncated at the end to exactly max_length ids.
sequences = tokenizer.texts_to_sequences(X)
padded_X = pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post')

print(f"Found {len(tokenizer.word_index)} unique tokens.")
# word_index lists every token seen during fitting, not only the kept ones.
# Per the Keras Tokenizer docs only the most common (num_words - 1) entries
# are kept when converting texts; rarer words are replaced by the OOV token.
# TODO confirm the exact cut-off against the installed Keras version.

Found 6142 unique tokens.


### 2. Model Definition
We define architectures for both standard and advanced CNN models.

In [None]:
def create_model():
    """Build and compile the baseline 1D-CNN text classifier.

    The network reads the notebook-level hyperparameters ``max_length``,
    ``vocab_size`` and ``embedding_dim`` and stacks:
    Embedding -> Conv1D(128, kernel 5) -> GlobalMaxPooling1D ->
    Dense(24, relu) -> Dropout(0.5) -> Dense(3, softmax).

    Returns:
        A freshly initialised ``Sequential`` model compiled with the Adam
        optimizer, sparse categorical cross-entropy loss and accuracy metric.
    """
    model = Sequential([
        # Explicit input spec instead of Embedding(input_length=...), which is
        # deprecated in Keras 3; this form works on both Keras 2 and Keras 3.
        tf.keras.Input(shape=(max_length,), dtype='int32'),
        Embedding(input_dim=vocab_size, output_dim=embedding_dim),
        Conv1D(filters=128, kernel_size=5, activation='relu'),
        GlobalMaxPooling1D(),
        Dense(24, activation='relu'),
        Dropout(0.5),
        Dense(3, activation='softmax')
    ])
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [None]:
def create_advanced_model():
    """Build and compile the deeper 1D-CNN text classifier.

    Uses the same Embedding -> Conv1D(128, kernel 5) -> GlobalMaxPooling1D
    front end as ``create_model`` followed by three Dense blocks
    (128, 64, 32 units) with LeakyReLU activations, Dropout(0.4) after the
    first two, and a 3-way softmax output. Reads the notebook-level
    hyperparameters ``max_length``, ``vocab_size`` and ``embedding_dim``.

    Returns:
        A freshly initialised ``Sequential`` model compiled with
        Adam(learning_rate=0.001), sparse categorical cross-entropy loss and
        accuracy metric.
    """
    model = Sequential([
        # Explicit input spec instead of Embedding(input_length=...), which is
        # deprecated in Keras 3; this form works on both Keras 2 and Keras 3.
        tf.keras.Input(shape=(max_length,), dtype='int32'),
        Embedding(input_dim=vocab_size, output_dim=embedding_dim),
        Conv1D(filters=128, kernel_size=5, activation='relu'),
        GlobalMaxPooling1D(),

        # Deeper Dense Network
        Dense(128),
        LeakyReLU(alpha=0.1),
        Dropout(0.4),

        Dense(64),
        LeakyReLU(alpha=0.1),
        Dropout(0.4),

        Dense(32),
        LeakyReLU(alpha=0.1),

        Dense(3, activation='softmax')
    ])

    optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
    model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model

### 3. Experiment 1: CNN Without K-Fold Validation
Using a standard 80/20 train-test split.

In [None]:
# Seed Python, NumPy and TensorFlow so weight initialisation, dropout and
# batch shuffling are repeatable across Restart & Run All.
tf.keras.utils.set_random_seed(42)

# Stratify the 80/20 split: the classes are imbalanced, so an unstratified
# split can leave the minority class under-represented in the test set.
X_train, X_test, y_train, y_test = train_test_split(
    padded_X, y, test_size=0.2, random_state=42, stratify=y
)

print("Training Baseline Model...")
baseline_model = create_model()
# The test split is passed as validation_data only to monitor training;
# no early stopping or model selection is driven by it.
baseline_model.fit(X_train, y_train, epochs=15, batch_size=32, validation_data=(X_test, y_test), verbose=1)

y_pred = np.argmax(baseline_model.predict(X_test), axis=1)
# labels=[0, 1, 2] guarantees the '0'/'1'/'2' keys exist in the report dict
# (the comparison cell indexes them) and matches Experiments 2 and 3.
baseline_report = classification_report(y_test, y_pred, output_dict=True, labels=[0, 1, 2], zero_division=0)
print("\nBaseline Classification Report (Without K-Fold):")
print(classification_report(y_test, y_pred, labels=[0, 1, 2], zero_division=0))
baseline_acc = accuracy_score(y_test, y_pred)

### 4. Experiment 2: CNN With K-Fold Validation
Using 5-Fold Cross-Validation on the original imbalanced dataset.

In [None]:
# Seed Python, NumPy and TensorFlow so the per-fold models are repeatable.
tf.keras.utils.set_random_seed(42)

# Stratified folds keep the class proportions of the imbalanced dataset in
# every train/test partition, which plain KFold does not guarantee.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
results_list = []

for fold_no, (train, test) in enumerate(kfold.split(padded_X, y), start=1):
    print(f'Training fold {fold_no}...')
    # A fresh model per fold so no weights carry over between folds.
    model = create_model()
    model.fit(padded_X[train], y[train], epochs=15, batch_size=32, verbose=0)

    y_pred = np.argmax(model.predict(padded_X[test]), axis=1)
    report = classification_report(y[test], y_pred, output_dict=True, labels=[0, 1, 2], zero_division=0)

    # One row per fold; insertion order fixes the column order of the table
    # (Accuracy, then precision / recall / F1 for classes 0, 1, 2).
    fold_row = {'Fold': f'Group {fold_no}', 'Accuracy': accuracy_score(y[test], y_pred)}
    for column_prefix, metric_key in [('Prec', 'precision'), ('Recall', 'recall'), ('F1', 'f1-score')]:
        for class_id in (0, 1, 2):
            fold_row[f'{column_prefix} Class {class_id}'] = report[str(class_id)][metric_key]
    results_list.append(fold_row)

results_df = pd.DataFrame(results_list).set_index('Fold')
# Append the column-wise mean over folds as a final 'Average' row.
final_kfold_table = pd.concat([results_df, pd.DataFrame({'Average': results_df.mean()}).T])

print("\nExperiment 2: K-Fold Results (Imbalanced Data):")
display(final_kfold_table.style.format("{:.4f}"))
exp2_avg_acc = results_df['Accuracy'].mean()

### 5. Experiment 3: Advanced CNN + SMOTE + K-Fold Validation
Applying SMOTE to balance data within each fold using the **Advanced CNN Architecture**.

In [None]:
# Seed Python, NumPy and TensorFlow so the per-fold models are repeatable.
tf.keras.utils.set_random_seed(42)

smote = SMOTE(random_state=42)
# Stratified folds keep the class proportions of the imbalanced dataset in
# every train/test partition, which plain KFold does not guarantee.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
smote_results_list = []

for fold_no, (train, test) in enumerate(kfold.split(padded_X, y), start=1):
    print(f'Training fold {fold_no} (Advanced Model + SMOTE)...')
    X_train_fold, y_train_fold = padded_X[train], y[train]
    # Oversample the training split only; the held-out fold stays untouched
    # so the evaluation reflects the original class distribution.
    # NOTE(review): SMOTE synthesises samples by interpolating between
    # neighbouring feature vectors, but padded_X holds integer token ids,
    # so synthetic rows are arbitrary id sequences rather than real text.
    # Consider RandomOverSampler or class weights if this matters.
    X_train_res, y_train_res = smote.fit_resample(X_train_fold, y_train_fold)

    # A fresh model per fold so no weights carry over between folds.
    model = create_advanced_model()
    model.fit(X_train_res, y_train_res, epochs=20, batch_size=32, verbose=0)

    y_pred = np.argmax(model.predict(padded_X[test]), axis=1)
    report = classification_report(y[test], y_pred, output_dict=True, labels=[0, 1, 2], zero_division=0)

    # One row per fold; insertion order fixes the column order of the table
    # (Accuracy, then precision / recall / F1 for classes 0, 1, 2).
    fold_row = {'Fold': f'Group {fold_no}', 'Accuracy': accuracy_score(y[test], y_pred)}
    for column_prefix, metric_key in [('Prec', 'precision'), ('Recall', 'recall'), ('F1', 'f1-score')]:
        for class_id in (0, 1, 2):
            fold_row[f'{column_prefix} Class {class_id}'] = report[str(class_id)][metric_key]
    smote_results_list.append(fold_row)

smote_results_df = pd.DataFrame(smote_results_list).set_index('Fold')
# Append the column-wise mean over folds as a final 'Average' row.
final_smote_table = pd.concat([smote_results_df, pd.DataFrame({'Average': smote_results_df.mean()}).T])

print("\nExperiment 3: Advanced CNN + SMOTE + K-Fold Results:")
display(final_smote_table.style.format("{:.4f}"))

### 6. Results Visualization and Comparison

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# --- 1. Overall Accuracy Comparison ---
# One bar per experiment: the single-split accuracy for Exp 1 and the
# mean accuracy over folds for Exp 2 and Exp 3.
exp_names = ['Exp 1: Baseline', 'Exp 2: K-Fold Imbalanced', 'Exp 3: SMOTE + K-Fold (Adv)']
palette = ['#3498db', '#e67e22', '#2ecc71']
accuracies = [baseline_acc]
accuracies.append(results_df['Accuracy'].mean())
accuracies.append(smote_results_df['Accuracy'].mean())

plt.figure(figsize=(10, 6))
bars = plt.bar(exp_names, accuracies, color=palette, alpha=0.8)
plt.title('Overall Accuracy Comparison Across Experiments', fontsize=14, fontweight='bold')
plt.ylabel('Overall Accuracy', fontsize=12, fontweight='bold')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.ylim(0, 1.0)

# Write each accuracy just above its bar.
for patch in bars:
    bar_top = patch.get_height()
    label_x = patch.get_x() + patch.get_width()/2
    plt.text(label_x, bar_top + 0.02, f'{bar_top:.4f}', ha='center', va='bottom', fontweight='bold')

plt.show()

# --- 2. Class-wise Recall Comparison ---
# Grouped bars: one group per class, one bar per experiment.
classes = ['Class 0', 'Class 1', 'Class 2']

exp1_recalls = [baseline_report[str(class_id)]['recall'] for class_id in range(3)]
exp2_recalls = [results_df[f'Recall Class {class_id}'].mean() for class_id in range(3)]
exp3_recalls = [smote_results_df[f'Recall Class {class_id}'].mean() for class_id in range(3)]

x = np.arange(len(classes))
width = 0.25

fig, ax = plt.subplots(figsize=(12, 7))
# Shift each experiment's bars by one bar width so the groups sit side by side.
rects1, rects2, rects3 = (
    ax.bar(x + shift, recalls, width, label=series_label, color=series_color, alpha=0.8)
    for shift, recalls, series_label, series_color in zip(
        (-width, 0, width),
        (exp1_recalls, exp2_recalls, exp3_recalls),
        exp_names,
        palette,
    )
)

ax.set_title('Class-wise Recall Comparison', fontsize=14, fontweight='bold', pad=20)
ax.set_ylabel('Recall (Accuracy per Class)', fontsize=12, fontweight='bold')
ax.set_ylim(0, 1.0)
ax.set_xticks(x)
ax.set_xticklabels(classes, fontsize=11)
ax.grid(axis='y', linestyle='--', alpha=0.6)
ax.legend(loc='upper right', frameon=True, shadow=True)

def autolabel_recall(rects, ax=None):
    """Write each bar's height (two decimals) just above the bar.

    Args:
        rects: Iterable of bar patches exposing ``get_height()``, ``get_x()``
            and ``get_width()`` (e.g. the container returned by ``ax.bar``).
        ax: Axes to annotate. Defaults to the current matplotlib axes, so the
            function no longer depends on a notebook-level ``ax`` variable.
    """
    if ax is None:
        ax = plt.gca()
    for rect in rects:
        height = rect.get_height()
        ax.annotate(f'{height:.2f}',
                    xy=(rect.get_x() + rect.get_width() / 2, height),
                    # Offset the text 3 points above the top of the bar.
                    xytext=(0, 3), textcoords="offset points",
                    ha='center', va='bottom', fontsize=10, fontweight='bold')

# Label the bars of every experiment with their recall, then render the figure.
for bar_group in (rects1, rects2, rects3):
    autolabel_recall(bar_group)

plt.tight_layout()
plt.show()