<a href="https://colab.research.google.com/github/KSaiNihal/Text-Classification-20_News_Groups-/blob/main/20_News_Groups.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import os
import requests
import tarfile
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from gensim.parsing.preprocessing import preprocess_string
from gensim.parsing.preprocessing import strip_tags, strip_numeric, strip_punctuation, strip_multiple_whitespaces, stem_text
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# URL of the 20 Newsgroups "bydate" archive
url = "http://qwone.com/~jason/20Newsgroups/20news-bydate.tar.gz"

# Local file the archive is saved to before extraction
archive_path = "20news-19997.tar.gz"

# Download the dataset.
# - timeout: do not hang forever if the server stops responding
# - raise_for_status: fail here on an HTTP error instead of saving an error
#   page and crashing later inside tarfile with a confusing message
# - stream + iter_content: write the archive chunk by chunk rather than
#   holding the whole download in memory
with requests.get(url, stream=True, timeout=60) as response:
    response.raise_for_status()
    with open(archive_path, "wb") as f:
        for chunk in response.iter_content(chunk_size=1 << 20):
            f.write(chunk)

# Extract the dataset into the current working directory
with tarfile.open(archive_path, "r:gz") as tar:
    if hasattr(tarfile, "data_filter"):
        # The "data" filter refuses members that would be written outside the
        # destination directory (absolute paths, "..", unsafe links).
        tar.extractall(filter="data")
    else:
        # Older Python without extraction filters: fall back to plain extraction
        tar.extractall()

In [None]:
data = []
target = []

# Adjust the directory path according to where your dataset is located
dataset_dir = "/content/20news-bydate-train"  # Update this path

# os.listdir returns entries in arbitrary order, so both levels are sorted to
# make the row order of the resulting DataFrame the same on every run.
for category in sorted(os.listdir(dataset_dir)):
    category_path = os.path.join(dataset_dir, category)
    if not os.path.isdir(category_path):
        # Ignore stray files sitting next to the category folders
        continue
    for document in sorted(os.listdir(category_path)):
        document_path = os.path.join(category_path, document)
        # Explicit encoding so decoding does not depend on the platform default;
        # bytes that are not valid UTF-8 are dropped (errors="ignore").
        with open(document_path, "r", encoding="utf-8", errors="ignore") as f:
            data.append(f.read())
        # The folder name is the document's label
        target.append(category)

# One row per document: raw text plus the name of the folder it came from
df = pd.DataFrame({'text': data, 'target': target})

# Print the first few rows to verify
print(df.head())


                                                text              target
0   egsner!ernest!m2.dseg.ti.com!tilde.csc.ti.com...  rec.sport.baseball
1  From: rachford@en.ecn.purdue.edu (Jeffery M Ra...  rec.sport.baseball
2  From: jtchern@ocf.berkeley.edu (Joseph Hernand...  rec.sport.baseball
3  From: gspira@nyx.cs.du.edu (Greg Spira)\nSubje...  rec.sport.baseball
4  From: klopfens@andy.bgsu.edu (Bruce Klopfenste...  rec.sport.baseball


In [None]:
# Show only the row at position 1 (the second document)
df.iloc[1:2]

Unnamed: 0,text,target
1,From: henry@zoo.toronto.edu (Henry Spencer)\nS...,sci.space


Checking missing values

In [None]:
# Per-column tally of null entries
missing_values = df.isna().sum()

# Report the tally
print(f"Missing values:\n{missing_values}")

# Only modify the frame when at least one column has a non-zero tally
has_missing = bool(missing_values.any())
if has_missing:
    # Replace every null with an empty string, in place
    df.fillna("", inplace=True)
    print("Missing values handled.")

# Count again so the final state is visible in the output
remaining_missing = df.isna().sum()
print(f"After handling missing values:\n{remaining_missing}")


Missing values:
text      0
target    0
dtype: int64
After handling missing values:
text      0
target    0
dtype: int64


Checking Duplicates

In [None]:
# Boolean mask: True for each row whose 'text' repeats an earlier row
duplicates = df.duplicated(subset=['text'])

# Summing the mask counts the True entries
num_duplicates = duplicates.sum()
print(f"Number of duplicate rows: {num_duplicates}")

# Select the flagged rows and print them under a heading
duplicate_rows = df.loc[duplicates]
print("Duplicate rows:", duplicate_rows, sep="\n")


Number of duplicate rows: 0
Duplicate rows:
Empty DataFrame
Columns: [text, target]
Index: []


Lowercasing and removing numbers, punctuation, tags, etc.

In [None]:
# Download NLTK data
nltk.download('punkt')
# Newer NLTK releases read the tokenizer model from the 'punkt_tab' resource;
# fetching both keeps word_tokenize working across NLTK versions.
nltk.download('punkt_tab')
nltk.download('stopwords')

# Load the dataset
# df = pd.read_csv('20_newsgroups_combined.csv')

# Built once, after the download above, instead of once per document.
STOP_WORDS = frozenset(stopwords.words('english'))

# Gensim filter chain, applied in this order by preprocess_string
TEXT_FILTERS = [
    str.lower,                     # Convert to lowercase
    strip_tags,                    # Remove HTML tags
    strip_numeric,                 # Remove numbers
    strip_punctuation,             # Remove punctuation
    strip_multiple_whitespaces     # Remove extra whitespaces
]


def preprocess_text(text):
    """Clean one document and drop English stop words.

    The text is run through the gensim filters in TEXT_FILTERS, re-joined
    with single spaces, tokenized with NLTK's word_tokenize, and every token
    found in STOP_WORDS is removed.

    Args:
        text: Raw document text (str).

    Returns:
        The remaining tokens joined by single spaces (str).
    """
    text = ' '.join(preprocess_string(text, filters=TEXT_FILTERS))
    words = word_tokenize(text)
    return ' '.join(word for word in words if word not in STOP_WORDS)


# Apply preprocessing steps to the 'text' column (overwrites the raw text)
df['text'] = df['text'].apply(preprocess_text)

# Save the preprocessed dataset
# df.to_csv('preprocessed_20_newsgroups_gensim.csv', index=False)

# Print the result
print(df['text'].iloc[0])


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


nsmca aurora alaska edu subject eco freaks forcing space mining article aurora apr organization university alaska fairbanks lines nntp posting host acad alaska edu article prb access digex com pat writes article nsmca aurora alaska edu writes article prb access digex com pat writes besides line horse puckey mining companies claimed told pay restoring land strip mining aint talking large even mining companies talking small miners people employees people go every year set thier sluice box mining semi old fashion way okay use modern methods toa point lot small miners longer miners people living rent free federal land claim miner facts many people sustaint heir income mining often even live full time fotentimes fair bit environmental damage minign statutes created inthe west uninhabited designed bring people frontier times change people change deal constitutional right live industry forever anyone claims right job particular spouting nonsense long term federal welfare program outlived usef

Stop Words removal

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import pandas as pd

# Download NLTK resources if not already downloaded
nltk.download('punkt')
# Newer NLTK releases read the tokenizer model from the 'punkt_tab' resource;
# fetching both keeps word_tokenize working across NLTK versions.
nltk.download('punkt_tab')
nltk.download('stopwords')

# Built once, after the download above, instead of once per document.
STOP_WORDS = frozenset(stopwords.words('english'))


def preprocess_text(text):
    """Lowercase, tokenize and remove English stop words from one document.

    NOTE: this rebinds the name `preprocess_text`, replacing the function of
    the same name defined in the earlier cleaning cell. That earlier function
    already lowercases and removes stop words, so on text that has passed
    through it this step is expected to change little.

    Args:
        text: Document text (str).

    Returns:
        The non-stop-word tokens joined by single spaces (str).
    """
    tokens = word_tokenize(text.lower())
    return ' '.join(token for token in tokens if token not in STOP_WORDS)


# Apply preprocessing to the 'text' column
df['text'] = df['text'].apply(preprocess_text)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Number of documents per newsgroup label
df.loc[:, 'target'].value_counts()

target
rec.sport.hockey            600
soc.religion.christian      599
rec.motorcycles             598
rec.sport.baseball          597
sci.crypt                   595
rec.autos                   594
sci.med                     594
sci.space                   593
comp.windows.x              593
comp.os.ms-windows.misc     591
sci.electronics             591
comp.sys.ibm.pc.hardware    590
misc.forsale                585
comp.graphics               584
comp.sys.mac.hardware       578
talk.politics.mideast       564
talk.politics.guns          546
alt.atheism                 480
talk.politics.misc          465
talk.religion.misc          377
Name: count, dtype: int64

Stemming

In [None]:
# Initialize the stemmer
stemmer = PorterStemmer()

# Built once here instead of once per document inside the function.
STOP_WORDS = frozenset(stopwords.words('english'))


def preprocess_and_stem(text):
    """Tokenize a document, drop English stop words and Porter-stem the rest.

    Stop-word matching is done on the lowercased token; the token handed to
    the stemmer is the original one.

    Args:
        text: Document text (str).

    Returns:
        The stemmed tokens joined by single spaces (str).
    """
    words = word_tokenize(text)
    return ' '.join(
        stemmer.stem(word) for word in words if word.lower() not in STOP_WORDS
    )


# Load the dataset
# df = pd.read_csv('20_newsgroups_combined.csv')

# Apply preprocessing and stemming to the 'text' column
df['text'] = df['text'].apply(preprocess_and_stem)

# Display the first few rows of the dataframe to see the changes
print(df.head())


                                                text     target
0  nsmca aurora alaska edu subject eco freak forc...  sci.space
1  henri zoo toronto edu henri spencer subject bi...  sci.space
2  baalk kelvin jpl nasa gov ron baalk subject ma...  sci.space
3  subject quotat lowest bidder bioccnt otago ac ...  sci.space
4  jmcocker eo ncsu edu mitch subject wrench work...  sci.space


Label Encoding

In [None]:
# Encoder that turns each distinct value of df['target'] into an integer code
label_encoder = LabelEncoder()

# Learn the codes from the current 'target' values, then overwrite the column
encoded_target = label_encoder.fit_transform(df['target'])
df['target'] = encoded_target

# Pair every class known to the fitted encoder with the code it assigns
known_classes = label_encoder.classes_
class_mapping = dict(zip(known_classes, label_encoder.transform(known_classes)))

print("Class Mapping:")
for class_name, encoded_label in class_mapping.items():
    print(f"{class_name}: {encoded_label}")

# Show the frame now that 'target' holds the integer codes
print(df.head())


Class Mapping:
alt.atheism: 0
comp.graphics: 1
comp.os.ms-windows.misc: 2
comp.sys.ibm.pc.hardware: 3
comp.sys.mac.hardware: 4
comp.windows.x: 5
misc.forsale: 6
rec.autos: 7
rec.motorcycles: 8
rec.sport.baseball: 9
rec.sport.hockey: 10
sci.crypt: 11
sci.electronics: 12
sci.med: 13
sci.space: 14
soc.religion.christian: 15
talk.politics.guns: 16
talk.politics.mideast: 17
talk.politics.misc: 18
talk.religion.misc: 19
                                                text  target
0  nsmca aurora alaska edu subject eco freak forc...      14
1  henri zoo toronto edu henri spencer subject bi...      14
2  baalk kelvin jpl nasa gov ron baalk subject ma...      14
3  subject quotat lowest bidder bioccnt otago ac ...      14
4  jmcocker eo ncsu edu mitch subject wrench work...      14


### Model Creation

In [None]:
# # Example of TF-IDF Vectorization
# vectorizer = TfidfVectorizer(max_features=1000)  # Adjust max_features as needed
# X = vectorizer.fit_transform(df['text'])

# # Optional: Print the vocabulary size
# print(f"Vocabulary size: {len(vectorizer.vocabulary_)}")

# # Example: Print the shape of the vectorized data
# print(f"Shape of X: {X.shape}")


In [None]:
# pip install tensorflow tensorflow-hub

Sentence Embedding

In [None]:
import tensorflow_hub as hub
import pandas as pd

# TF-Hub handle of the Universal Sentence Encoder, version 4
use_model_url = "https://tfhub.dev/google/universal-sentence-encoder/4"

# Load the encoder; `embed` is then called with a list of strings
embed = hub.load(use_model_url)

In [None]:
# df['text'] holds the preprocessed documents that are embedded below.

def embed_text(text):
    """Embed a single string.

    Args:
        text: The string to embed.

    Returns:
        The first (only) row of the encoder output, converted with .numpy().
    """
    return embed([text])[0].numpy()


def embed_texts(texts, batch_size=128):
    """Embed many strings, sending them to the encoder in batches.

    Calling the encoder once per document makes one model invocation per row;
    batching yields the same one-vector-per-document result with far fewer
    invocations.

    Args:
        texts: Iterable of strings.
        batch_size: Number of strings passed to the encoder per call.

    Returns:
        A list with one embedding array per input string, in input order.
    """
    texts = list(texts)
    vectors = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        # Iterating the batch output yields one row (one document) at a time
        vectors.extend(embed(batch).numpy())
    return vectors


# One embedding per document, kept as a Series aligned with df's index so the
# result has the same layout as a per-row df['text'].apply(embed_text).
embeddings = pd.Series(embed_texts(df['text']), index=df.index, name='text')

# Check the shape of the embeddings
print(f"Shape of embeddings: {embeddings.shape}")


Shape of embeddings: (11314,)


In [None]:
# Model inputs are the embeddings; labels are the 'target' column of df
X, y = embeddings, df['target']

Train-Test-Split

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples for testing; the fixed random_state makes the
# partition repeatable from run to run.
split_parts = train_test_split(X, y, random_state=42, test_size=0.2)
X_train, X_test, y_train, y_test = split_parts

ANN (Artificial Neural Network)

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import RandomNormal

# Stack the per-sample vectors into float32 arrays.
# (assumes every element of X_train / X_test is an equal-length numeric
# vector, so the result is a 2-D matrix)
X_train = np.array([np.array(x) for x in X_train]).astype('float32')
X_test = np.array([np.array(x) for x in X_test]).astype('float32')

# Tensor copies of the full train / test matrices, used for the final evaluation
X_train_tf = tf.convert_to_tensor(X_train, dtype=tf.float32)
X_test_tf = tf.convert_to_tensor(X_test, dtype=tf.float32)

# Carve a validation set out of the TRAINING data for early stopping.
# Monitoring the test set would let it choose the stopping epoch and the
# restored weights, so the test accuracy printed below would no longer be an
# unbiased estimate.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

# Output width = number of distinct labels in the training labels
num_classes = len(set(y_train))

# Two hidden layers with dropout, softmax output over the classes
model = Sequential()
model.add(Dense(256, activation='relu', kernel_initializer=RandomNormal(mean=0.0, stddev=0.1)))
model.add(Dropout(0.5))  # Dropout for regularization
model.add(Dense(128, activation='relu', kernel_initializer=RandomNormal(mean=0.0, stddev=0.1)))
model.add(Dropout(0.5))  # Dropout for regularization
model.add(Dense(num_classes, activation='softmax', kernel_initializer=RandomNormal(mean=0.0, stddev=0.1)))

# Labels are integer class ids, hence the sparse categorical loss
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Stop after 3 epochs without val_loss improvement and roll back to the best weights
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train on the fitting portion, validate on the held-out portion of the training data
history = model.fit(X_fit, y_fit, epochs=50, batch_size=64,
                    validation_data=(X_val, y_val), verbose=1, callbacks=[early_stop])

# Evaluating the model on the full training data (fitting + validation portions)
train_loss, train_accuracy = model.evaluate(X_train_tf, y_train, verbose=0)
print(f"Training Accuracy: {train_accuracy:.4f}")

# Evaluating the model on testing data, which played no part in training or early stopping
test_loss, test_accuracy = model.evaluate(X_test_tf, y_test, verbose=0)
print(f"Testing Accuracy: {test_accuracy:.4f}")


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Training Accuracy: 0.8885
Testing Accuracy: 0.7499


Ensemble Learning

In [None]:
#final model
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.initializers import RandomNormal
from tensorflow.keras.regularizers import l2
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.callbacks import EarlyStopping

# Stack the per-sample vectors and cast to float32 in one step
# (assumes every element is an equal-length numeric vector, giving a 2-D array)
X_train = np.array([np.array(row) for row in X_train]).astype('float32')
X_test = np.array([np.array(row) for row in X_test]).astype('float32')

# Reserve 20% of the training data as a validation set (fixed seed for repeatability)
split_result = train_test_split(X_train, y_train, random_state=42, test_size=0.2)
X_train_split, X_val_split, y_train_split, y_val_split = split_result

# Different architectures with reduced complexity.
# The three public builders differ only in their hidden-layer widths, so the
# shared construction lives in one private helper.
def _build_regularized_mlp(first_units, second_units):
    """Build and compile a two-hidden-layer softmax classifier.

    Layout: Dense(first_units, relu) -> Dropout(0.5) -> Dense(second_units, relu)
    -> Dense(number of classes, softmax). Every Dense layer uses a
    RandomNormal(mean=0.0, stddev=0.1) kernel initializer and an l2(0.001)
    kernel regularizer.

    Reads the notebook globals `X_train` (for the input width) and `y_train`
    (for the number of distinct labels) at call time.

    Args:
        first_units: Width of the first hidden layer.
        second_units: Width of the second hidden layer.

    Returns:
        A Sequential model compiled with the Adam optimizer, sparse
        categorical cross-entropy loss and the accuracy metric.
    """
    def regularized_dense(units, activation, **kwargs):
        # All Dense layers share the same initializer and L2 penalty
        return Dense(units, activation=activation,
                     kernel_initializer=RandomNormal(mean=0.0, stddev=0.1),
                     kernel_regularizer=l2(0.001),
                     **kwargs)

    num_classes = len(set(y_train))

    model = Sequential()
    model.add(regularized_dense(first_units, 'relu', input_shape=(X_train.shape[1],)))
    model.add(Dropout(0.5))
    model.add(regularized_dense(second_units, 'relu'))
    model.add(regularized_dense(num_classes, 'softmax'))
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model


def create_model_1():
    """Smallest network: hidden layers of 64 and 32 units."""
    return _build_regularized_mlp(64, 32)


def create_model_2():
    """Medium network: hidden layers of 128 and 64 units."""
    return _build_regularized_mlp(128, 64)


def create_model_3():
    """Largest network: hidden layers of 256 and 128 units."""
    return _build_regularized_mlp(256, 128)

# Stop a fit once val_loss has not improved for 5 epochs and restore the best weights.
# The same callback object is handed to every fit below.
early_stopping = EarlyStopping(restore_best_weights=True, patience=5, monitor='val_loss')

# Build each architecture in turn, fit it on the training split while
# validating on the validation split, and keep the fitted network.
models = []
model_builders = (create_model_1, create_model_2, create_model_3)
for create_model in model_builders:
    model = create_model()
    model.fit(
        X_train_split, y_train_split,
        validation_data=(X_val_split, y_val_split),
        epochs=50,
        batch_size=64,
        verbose=1,
        callbacks=[early_stopping],
    )
    models.append(model)

def _stack_probabilities(features):
    """Concatenate every base model's predict() output column-wise.

    Args:
        features: Input matrix passed unchanged to each model in `models`.

    Returns:
        Array with one row per input row and the models' outputs side by
        side, in the order the models appear in `models`.
    """
    return np.hstack([base.predict(features, verbose=0) for base in models])


# Collect predictions.
# The meta-learner is trained on predictions for the validation split, i.e. on
# rows the base networks were NOT fitted on. Predictions for the rows a network
# was fitted on are over-confident, and a meta-learner trained on them learns
# to trust the base models more than it should on unseen data.
# NOTE: the validation split also drove early stopping, so it is not a
# completely untouched hold-out; k-fold out-of-fold predictions would be
# stricter still.
train_meta_features = _stack_probabilities(X_val_split)
test_meta_features = _stack_probabilities(X_test)

# Standardize meta-features (statistics come from the meta-training rows only)
scaler = StandardScaler()
train_meta_features = scaler.fit_transform(train_meta_features)
test_meta_features = scaler.transform(test_meta_features)

# Use a meta-learner for stacking, fitted on the held-out rows and their labels
meta_learner = LogisticRegression(max_iter=1000)
meta_learner.fit(train_meta_features, y_val_split)

# Make final predictions
final_predictions = meta_learner.predict(test_meta_features)

# Calculate and print the accuracy
ensemble_accuracy = accuracy_score(y_test, final_predictions)
print(f"Stacking Ensemble Testing Accuracy: {ensemble_accuracy:.4f}")

# Accuracy of every base network on the full training matrix
for model_number, model in enumerate(models, start=1):
    train_loss, train_accuracy = model.evaluate(X_train, y_train, verbose=0)
    print(f"Model {model_number} Training Accuracy: {train_accuracy:.4f}")

# Accuracy of every base network on the test matrix
for model_number, model in enumerate(models, start=1):
    test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=0)
    print(f"Model {model_number} Testing Accuracy: {test_accuracy:.4f}")


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/5