TASK 1 - DATA STRUCTURING

In [6]:
import os
import pandas as pd

# Folder holding the raw articles; each file name is expected to look like
# "<article_id>_<category>.txt".
directory = 'BBC_articles'

# One dict per article; the DataFrame is built from this list in one step.
data = []

# os.listdir() returns entries in arbitrary order, so sort the listing to keep
# the row order of the generated CSV stable between runs.
for filename in sorted(os.listdir(directory)):
    if not filename.endswith('.txt'):
        continue

    # Strip the extension before splitting so it does not leak into the
    # category label (e.g. "business.txt"), and split on the first underscore
    # only so a category that itself contains "_" is kept whole.
    stem, _extension = os.path.splitext(filename)
    article_id, separator, category = stem.partition('_')
    if not separator:
        raise ValueError(
            f"Unexpected file name {filename!r}: expected '<article_id>_<category>.txt'"
        )

    # Read the text content of the file
    with open(os.path.join(directory, filename), 'r', encoding='utf-8') as f:
        text = f.read()

    data.append({'article_id': article_id, 'text': text, 'category': category})

# Naming the columns explicitly keeps the schema intact even when no article
# was found (an empty list would otherwise give a frame with no columns).
df = pd.DataFrame(data, columns=['article_id', 'text', 'category'])

# Save the DataFrame to a CSV file
df.to_csv('bbc_articles.csv', index=False)

In [7]:
import pandas as pd

# Read every column back as text. Without an explicit dtype, pandas infers
# types from the CSV, so an ID such as "001" would come back as the integer 1;
# with the default NA handling an empty article body would come back as NaN
# rather than '', which the tokenizer cannot process.
df = pd.read_csv(
    'bbc_articles.csv',
    dtype={'article_id': str, 'text': str, 'category': str},
    keep_default_na=False,
)

TASK 2 - DATA PREPROCESSING FOR MODEL TRAINING

In [9]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
import string

# Build the WordNet lemmatizer once; preprocess_tokens() looks it up by name.
lemmatizer = WordNetLemmatizer()

# Split every article into word tokens (one token list per row).
df['tokenized_text'] = df['text'].map(word_tokenize)

# Define a function to preprocess the tokens
def preprocess_tokens(tokens, lemmatize=None):
    """Normalize a list of word tokens for vectorization.

    Each token has its digits removed, its ASCII punctuation stripped, is
    lowercased and is then lemmatized. Tokens that are empty after cleaning
    (pure numbers or pure punctuation such as "2023" or ",") are dropped
    instead of being emitted as empty strings.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of a single document.
    lemmatize : callable, optional
        Function mapping one cleaned token to its lemma. Defaults to the
        ``lemmatize`` method of the notebook-level ``lemmatizer``.

    Returns
    -------
    list of str
        Cleaned, lemmatized tokens in their original order.
    """
    if lemmatize is None:
        lemmatize = lemmatizer.lemmatize

    # Build the punctuation-deletion table once instead of testing every
    # character of every token against string.punctuation.
    strip_punctuation = str.maketrans('', '', string.punctuation)

    preprocessed_tokens = []
    for token in tokens:
        # Remove numerals, then punctuation, then convert to lowercase
        cleaned = re.sub(r'\d+', '', token).translate(strip_punctuation).lower()

        # Nothing left to lemmatize: the token was only digits/punctuation.
        if not cleaned:
            continue

        preprocessed_tokens.append(lemmatize(cleaned))

    return preprocessed_tokens

# Apply the preprocessing function to the tokenized text
df['preprocessed_text'] = df['tokenized_text'].apply(preprocess_tokens)

# Imported here because the import lines at the top of this cell do not bring
# `stopwords` into scope; without it the next statement raises NameError on a
# fresh kernel.
# NOTE(review): this presumably needs the NLTK "stopwords" corpus to be
# downloaded beforehand - confirm for the target environment.
from nltk.corpus import stopwords

# Remove stopwords
stop_words = set(stopwords.words('english'))
df['preprocessed_text'] = df['preprocessed_text'].apply(
    lambda tokens: [token for token in tokens if token not in stop_words]
)

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create a TF-IDF vectorizer with a maximum of 10,000 features
vectorizer = TfidfVectorizer(max_features=10000)

# The vectorizer takes one string per document, so re-join each token list
# with spaces before fitting.
X = vectorizer.fit_transform(df['preprocessed_text'].apply(' '.join))

# A vocabulary term spelled "category" would share its column name with the
# label column and have its TF-IDF values silently overwritten by the labels
# assigned below, so that one feature gets a distinct name.
feature_names = [
    'category_tfidf' if term == 'category' else term
    for term in vectorizer.get_feature_names_out()
]

# Reuse df's index so the label assignment below lines up row for row.
vectorized_data = pd.DataFrame(X.toarray(), columns=feature_names, index=df.index)
vectorized_data['category'] = df['category']

# Save the vectorized dataset as a new CSV file with limited floating-point precision
vectorized_data.to_csv('vectorized_dataset.csv', index=False, float_format='%.4f')

In [11]:
import pandas as pd

# Write the same vectorized table again, this time gzip-compressed and with
# floats limited to four decimal places.
vectorized_data.to_csv(
    'vectorized_dataset.csv.gz',
    compression='gzip',
    float_format='%.4f',
    index=False,
)