In [None]:
# Standard libraries
import json
import re
import warnings
# NOTE(review): with no category/module filter this silences every warning
# raised after this point in the notebook.
warnings.filterwarnings("ignore")

# Third-party libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from wordcloud import WordCloud, STOPWORDS
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import numpy as np
from transformers import BertTokenizer

# Configuration
# Apply seaborn's default theme to the figures drawn in later cells.
sns.set_theme()

## **Exploratory Data Analysis (EDA)**

In [None]:
# Data ingestion: read the projects CSV from its remote location and preview
# the first rows.
data = "https://raw.githubusercontent.com/GokuMohandas/Made-With-ML/main/datasets/dataset.csv"
df = pd.read_csv(filepath_or_buffer=data)
df.head()

In [None]:
# Cardinality of the main columns plus the per-tag row counts.
summary_parts = (
    f"{df.description.nunique() = }",
    f"\n{df.tag.nunique() = }",
    f"\n{df.tag.value_counts()}",
    f"\n{df.created_on.nunique() = }",
)
print(*summary_parts)

In [None]:
# Split the data into train / validation / test sets, stratified by tag.

# First hold out 20% of the rows (rem_size); that remainder is then divided
# in half (test_size) between the validation and test sets.
rem_size = 0.2
test_size = 0.5

train_df, rem_df = train_test_split(
    df,
    test_size=rem_size,
    random_state=0,
    stratify=df.tag,
)

# Divide the held-out rows into the validation and test sets
val_df, test_df = train_test_split(
    rem_df,
    test_size=test_size,
    random_state=0,
    stratify=rem_df.tag,
)

In [None]:
# Report the (rows, columns) shape of each split.
split_shapes = (
    f"Shape of train_df {train_df.shape}",
    f"\nShape of val_df {val_df.shape}",
    f"\nShape of test_df {test_df.shape}",
)
print("".join(split_shapes))

In [None]:
# Number of rows per tag across the full dataset.
tag_count_report = "Count of 'Tags':" f"\n{df.tag.value_counts()}"
print(tag_count_report)

In [None]:
# Rows per tag
tag_counts = df["tag"].value_counts()

# Unpack into parallel sequences: tag names and their counts
tags, tag_values = tag_counts.index, tag_counts.values

In [None]:
# Bar chart of how many projects carry each tag
fig, ax = plt.subplots(figsize=(10, 3))
sns.barplot(x=list(tags), y=list(tag_values), ax=ax)
ax.set_xticklabels(tags, rotation=0, fontsize=8)
ax.set_title("Tag distribution", fontsize=14)
ax.set_ylabel("# of projects", fontsize=12)
plt.show()

In [None]:
# Word cloud of the most frequent title tokens for one selected tag
tag = "natural-language-processing"
plt.figure(figsize=(10, 3))
subset = df.loc[df["tag"] == tag]
text = subset.title.values
cloud = WordCloud(
    stopwords=STOPWORDS,
    background_color="black",
    collocations=False,
    width=500,
    height=300,
).generate(" ".join(text))
plt.axis("off")
plt.imshow(cloud)

## **Data Preprocessing**

In [None]:
# Input feature: the title and description joined by a single space
df["text"] = df["title"] + " " + df["description"]

df

In [None]:
def clean_text(text, stopwords=STOPWORDS):
    """Clean a raw text string.

    Steps, in order: lowercase, drop URLs, drop stopwords, replace every run
    of non-alphanumeric characters with a single space, trim the ends.

    Args:
        text: Raw input string.
        stopwords: Iterable of lowercase words to remove. An empty collection
            disables stopword removal.

    Returns:
        The cleaned string.
    """
    # Lower
    text = text.lower()

    # Remove links first: once punctuation is stripped below, a URL falls
    # apart into ordinary words and can no longer be recognised as a link.
    text = re.sub(r"https?://\S+", " ", text)

    # Remove stopwords. Each word is escaped so regex metacharacters match
    # literally, and words are tried longest-first so that e.g. "can't" wins
    # over "can" independent of the collection's iteration order.
    ordered_stopwords = sorted(stopwords, key=lambda word: (-len(word), word))
    # An empty alternation would match at every word boundary and swallow the
    # whitespace that follows, gluing words together, so skip it entirely.
    if ordered_stopwords:
        alternation = "|".join(re.escape(word) for word in ordered_stopwords)
        pattern = re.compile(r"\b(" + alternation + r")\b\s*")
        text = pattern.sub("", text)

    # Spacing and filters: every run of non-alphanumeric characters (which
    # includes punctuation and whitespace) collapses to one space.
    text = re.sub("[^A-Za-z0-9]+", " ", text)
    text = text.strip()  # strip white space at the ends

    return text

In [None]:
# Keep an untouched snapshot for comparison, then clean the text column
original_df = df.copy()
df["text"] = df["text"].apply(clean_text)

In [None]:
# Same row before and after cleaning
before_after = f"{original_df.text.values[2]}\n{df.text.values[2]}"
print(before_after)

In [None]:
# DataFrame cleanup: keep only the model input and the label.
# An explicit copy is taken because later cells assign into cleaned_df;
# without it those writes target a selection of df, which pandas treats as
# chained assignment.
cleaned_df = df[["text", "tag"]].copy()
cleaned_df.shape

In [None]:
# Discard rows that have no tag, then report the remaining shape
cleaned_df = cleaned_df.dropna(axis="index", subset=["tag"])
cleaned_df.shape

In [None]:
# Assign a LabelEncoder to `le`; it is reused by later cells
le = LabelEncoder()

# Fit the encoder on the tags and overwrite the column with the integer codes
encoded_tags = le.fit_transform(cleaned_df["tag"])
cleaned_df["tag"] = encoded_tags

cleaned_df.head()

In [None]:
# Map every original tag to the number the encoder assigned to it
label_encoder_mapping = {
    label: code
    for label, code in zip(le.classes_, le.transform(le.classes_))
}

# Show the tag -> encoded number pairs, one per line
print("Label Encoding Mapping:")
for tag, encoded_number in label_encoder_mapping.items():
    print(f"{tag}: {encoded_number}")

In [None]:
# Reverts label-encoded values back to their original text so that the output
# stays readable for users.
def decode_labels(df: pd.DataFrame, column_name: str, label_encoder: "LabelEncoder") -> pd.DataFrame:
    """Return a copy of ``df`` with a decoded version of an encoded column.

    Args:
        df: Frame containing the label-encoded column. It is not modified.
        column_name: Name of the encoded column to decode.
        label_encoder: Fitted encoder; only its ``inverse_transform`` method
            is used.

    Returns:
        A new DataFrame with every column of ``df`` plus
        ``<column_name>_decoded`` holding the decoded values.
    """
    decoded_values = label_encoder.inverse_transform(df[column_name])
    # assign() builds a new frame, so the caller's DataFrame is left untouched
    # instead of silently gaining a column as a side effect.
    return df.assign(**{column_name + '_decoded': decoded_values})

# Turn the encoded tag numbers back into their text labels
df_decoded = decode_labels(df=cleaned_df, column_name="tag", label_encoder=le)

In [None]:
# Bert tokenizer: load the SciBERT vocabulary through the Hugging Face library
tokenizer = BertTokenizer.from_pretrained(
    "allenai/scibert_scivocab_uncased",
    return_dict=False,
)

# Sample sentence to run through the tokenizer
text = "Transfer learning with transformers for text classification."

# The sentence is wrapped in a list so it is handled as a batch of one.
# return_tensors="np" asks for numpy arrays, and padding="longest" pads every
# sequence to the length of the longest one in the batch.
encoded_inputs = tokenizer(
    [text],
    return_tensors="np",
    padding="longest",
)

# Token ids, attention mask, and the ids decoded back into text
print("input_ids:", encoded_inputs["input_ids"])
print("attention_mask:", encoded_inputs["attention_mask"])
print(tokenizer.decode(encoded_inputs["input_ids"][0]))

In [None]:
def tokenize(batch, tokenizer=None):
    """Tokenize a batch of rows into padded arrays.

    Args:
        batch: Table-like object (e.g. a pandas DataFrame) whose "text"
            column supports ``.tolist()`` and whose "tag" column holds the
            targets.
        tokenizer: Optional tokenizer callable to reuse across calls. When
            omitted, the SciBERT tokenizer is loaded with
            ``BertTokenizer.from_pretrained`` on each call, as before.

    Returns:
        dict with keys "ids" (input ids), "masks" (attention mask) and
        "targets" (the "tag" column as a numpy array).
    """
    # Loading the pretrained tokenizer is the expensive part; callers that
    # tokenize many batches can pass one in instead of reloading it each time.
    if tokenizer is None:
        tokenizer = BertTokenizer.from_pretrained("allenai/scibert_scivocab_uncased", return_dict=False)
    encoded_inputs = tokenizer(batch["text"].tolist(), return_tensors="np", padding="longest")
    return dict(ids=encoded_inputs["input_ids"], masks=encoded_inputs["attention_mask"], targets=np.array(batch["tag"]))

In [None]:
# Tokenization: run the first cleaned row through tokenize as a batch of one
tokenize(batch=cleaned_df.head(n=1))

In [None]:
# @article{madewithml,
#     author       = {Goku Mohandas},
#     title        = { Preprocessing - Made With ML },
#     howpublished = {\url{https://madewithml.com/}},
#     year         = {2023}
# }