<a href="https://colab.research.google.com/github/MuhammadIrzam447/visionCodes/blob/master/text_ferramenta.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
path = "/content/drive/MyDrive/Colab Notebooks/ferramenta/text-ferramenta-train.csv"
df = pd.read_csv(path)

In [None]:
df.head()

# EDA

In [4]:
# Rename the head of the column
df.columns = ['text']

In [5]:
# Create the Label Column
# Each raw row holds the text and its label joined by the literal marker "|_@".
# Split once on the first occurrence (raw string so "\|" is a regex escape, not
# an invalid Python string escape) and reuse the result for both columns.
# Rows without the marker get a missing label.
parts = df['text'].str.split(r"\|_@", n=1)
df['label'] = parts.str[1]
df['text'] = parts.str[0]

In [6]:
# Remove duplicated rows and rows with missing values, then renumber the index from 0
df = df.drop_duplicates().dropna().reset_index(drop=True)

In [None]:
df2 = df.copy()

In [None]:
df2.head(30)

In [None]:
df2.shape

In [None]:
df2.describe()

In [None]:
df2['label'].value_counts()

# Cleaning Data

In [None]:
# 32 g/gr/ (32 grams)
# 15 mm (15 millimeters)
# kg (kilograms)
# kwb ?
# lb (pounds)
# cm (centimeter)
# sw ?
# oz 
# tm
# p 
# pesso

In [None]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
from nltk.tokenize import word_tokenize

In [None]:
df2['text'] = df2['text'].str.lower()

In [None]:
# import re
# df2['text'] = df2['text'].apply(lambda x: re.sub('[^a-zA-Z]', ' ', str(x)))
# Removing everything except ASCII letters is too drastic: it also strips digits, punctuation and accented characters, which changes the input data substantially. TODO: find a gentler cleaning rule.

In [None]:
df2['text'] = df2['text'].apply(word_tokenize, language = "italian")

In [None]:
stop_words = set(stopwords.words('italian'))
df2['text'] = df2['text'].apply(lambda x: [word for word in x if word not in stop_words])

In [None]:
from nltk.stem import SnowballStemmer
stemmer = SnowballStemmer('italian')
df2['text'] = df2['text'].apply(lambda x: [stemmer.stem(word) for word in x])

In [None]:
# Unit abbreviations (as they appear as standalone tokens) mapped to full unit names.
replacements = {'kg': 'kilogram',
                'mm': 'millimeter',
                'cm': 'centimeter',
                'gr': 'gram',
                'g': 'gram',
                'lb': 'pounds',
                'oz': 'ounce',
                'mv': 'millivolt',  # was misspelled 'milivott'
                'm': 'meter',
                'ft': 'feet'}

In [None]:
def replace_words(tokens):
    """Return a new list where every token found in ``replacements`` is swapped
    for its mapped value; all other tokens are kept unchanged."""
    expanded = []
    for word in tokens:
        expanded.append(replacements[word] if word in replacements else word)
    return expanded

In [None]:
df2['text'] = df2['text'].apply(replace_words)

In [None]:
df2.head(10)

# Top 10 Words in Text

In [None]:
# Joining the words back into a single text
df2['final_text'] = df2['text'].apply(lambda x: ' '.join(x))

In [None]:
import matplotlib.pyplot as plt
import collections

# Number of words to chart. It drives both the ranking and the figure title,
# so the title can no longer disagree with what is plotted.
top_n = 10

# Count every whitespace-separated token across all cleaned texts
words = []
for text in df2['final_text']:
    words.extend(text.split())
word_count = collections.Counter(words)
top_words = dict(word_count.most_common(top_n))

# Dark Background
plt.style.use('dark_background')
plt.figure(figsize = (12, 8))

# Create the Barplot
plt.bar(range(len(top_words)), list(top_words.values()), align='center')

plt.xticks(range(len(top_words)), list(top_words.keys()))
# Grid Opacity
plt.grid(alpha = 0.5)
# Title
plt.title(f'Top {top_n} most used words', fontsize = 18)
# Labels
plt.xlabel('Words')
plt.ylabel('Frequency')

plt.show()

# Word Embedding using Word2Vec

In [None]:
import gensim
from gensim import models
from gensim.models import Word2Vec

In [None]:
import gensim.downloader as api
info = api.info()  # show info about available models/datasets
model = api.load("word2vec-google-news-300") 

In [None]:
tokenized_data = df['text'].apply(lambda x: x.split())

In [None]:
tokenized_data

In [None]:
import numpy as np
embeddings = np.zeros((len(tokenized_data), model.vector_size))

In [None]:
# Try this one instead
for i, tokens in enumerate(tokenized_data):
    for token in tokens:
        if token in model:
            embeddings[i] += model[token]

In [None]:
embeddings[10]

In [None]:
# Scale every row to unit L2 norm. A text with no in-vocabulary token keeps its
# all-zero vector: its norm is 0, and dividing by it would turn the row into NaN.
row_norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
row_norms[row_norms == 0] = 1.0
embeddings /= row_norms

In [None]:
embeddings[10]

# Transform Label using Label Encoder 

In [None]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
label_encoder.fit(df2['label'])

In [None]:
# Transform the labels into numerical values
df2['label'] = label_encoder.transform(df2['label'])

In [None]:
df2['label'].value_counts()

# Transform Label using one hot encoding


In [None]:
# using one hot encoding 
df3 = df.copy()

In [None]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

onehot_encoder = OneHotEncoder()

In [None]:
labels_reshaped = df3['label'].values.reshape(-1, 1)
onehot_labels = onehot_encoder.fit_transform(labels_reshaped)
onehot_labels = onehot_labels.toarray()
onehot_df = pd.DataFrame(onehot_labels, columns=onehot_encoder.categories_[0])
df3 = pd.concat([df3, onehot_df], axis=1)


# Fine-Tuning BERT Base

In [7]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m62.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m25.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m60.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.15.1 tokenizers-0.13.3 transformers-4.29.2


In [8]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

nltk.download('stopwords')

# Unit abbreviations (matched as whole tokens, after stemming) and their expansions.
_UNIT_REPLACEMENTS = {'kg': 'kilogram',
                      'mm': 'millimeter',
                      'cm': 'centimeter',
                      'gr': 'gram',
                      'g': 'gram',
                      'lb': 'pounds',
                      'oz': 'ounce',
                      'mv': 'millivolt',  # was misspelled 'milivott'
                      'm': 'meter',
                      'ft': 'feet'}

# Filled on first use so the stop-word list is read and the stemmer is built
# once, not on every call (this function runs per sample, per epoch).
_ITALIAN_RESOURCES = {}


def _italian_resources():
    """Return the cached ``(stop_words, stemmer)`` pair, building it on first use."""
    if not _ITALIAN_RESOURCES:
        _ITALIAN_RESOURCES['stop_words'] = frozenset(stopwords.words('italian'))
        _ITALIAN_RESOURCES['stemmer'] = SnowballStemmer('italian')
    return _ITALIAN_RESOURCES['stop_words'], _ITALIAN_RESOURCES['stemmer']


def preprocess_text(text):
    """Normalise one text string for the classifier.

    Steps, in order: lowercase, split on whitespace, drop Italian stop words,
    stem each remaining token with the Italian Snowball stemmer, and expand
    unit abbreviations that appear as whole tokens.

    Args:
        text: the raw string to clean.

    Returns:
        The cleaned tokens joined by single spaces ('' when nothing is left).
    """
    stop_words, stemmer = _italian_resources()

    tokens = [token for token in text.lower().split() if token not in stop_words]
    # Drop empty stems so the result never contains doubled spaces.
    stems = [stem for stem in (stemmer.stem(token) for token in tokens) if stem]

    return ' '.join(_UNIT_REPLACEMENTS.get(stem, stem) for stem in stems)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# df['text'] = df['text'].apply(preprocess_text)

In [10]:
num_labels = df['label'].nunique()
print(num_labels)

52


In [11]:
df = pd.get_dummies(df, columns=['label'])

In [12]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [None]:
df.columns

In [13]:
import torch
from torch.utils.data import Dataset, DataLoader

class FragmentaDataset(Dataset):
    """Torch dataset that yields ``(input_ids, attention_mask, label_vector)`` per row.

    Args:
        data: DataFrame with a 'text' column. Every other column is treated as
            one numeric entry of the label vector (e.g. one-hot label columns).
        tokenizer: object exposing ``encode_plus`` that returns a mapping with
            'input_ids' and 'attention_mask' tensors carrying a leading batch
            axis of size 1.
        max_length: length every encoded sequence is padded/truncated to.
    """

    def __init__(self, data, tokenizer, max_length):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

        # Resolve the label columns once and by name, not per item and by
        # position, and store them as float32 to match the dtype of the logits.
        label_columns = [column for column in data.columns if column != 'text']
        self.labels = torch.tensor(
            data[label_columns].to_numpy(dtype='float32'), dtype=torch.float32
        )

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        text = self.data['text'].iloc[index]

        # Preprocess text
        processed_text = preprocess_text(text)

        # Tokenize and convert text to input tensors
        inputs = self.tokenizer.encode_plus(
            processed_text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        # Remove only the leading batch axis; a bare squeeze() would also
        # collapse the sequence axis when max_length == 1.
        input_ids = inputs['input_ids'].squeeze(0)
        attention_mask = inputs['attention_mask'].squeeze(0)

        return input_ids, attention_mask, self.labels[index]

In [15]:
# Define the maximum sequence length for padding/truncation
max_length = 128

# Create an instance of the custom dataset
dataset = FragmentaDataset(df, tokenizer, max_length)

# Create a data loader.
# shuffle=True draws a new sample order every epoch; without it every epoch
# replays the rows in file order, so batches are never mixed differently.
batch_size = 32
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [16]:
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
loss_fn = torch.nn.CrossEntropyLoss()

In [17]:
num_epochs = 15
training_loss = []

In [None]:
# num_labels = df.shape[1]
# print(num_labels)

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
training_loss = []

# Fine-tuning: one full pass over data_loader per epoch, recording the mean batch loss
for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for batch in data_loader:
        # A batch is (input_ids, attention_mask, labels); move all three to the model's device
        input_ids, attention_mask, labels = (
            batch[position].to(device) for position in range(3)
        )

        optimizer.zero_grad()

        # Forward pass, loss, backward pass, parameter update
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        loss = loss_fn(logits, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean loss over the batches of this epoch
    avg_loss = total_loss / len(data_loader)
    training_loss.append(avg_loss)
    print(f"Epoch {epoch+1}/{num_epochs}, Average Loss: {avg_loss:.4f}")

Epoch 1/15, Average Loss: 1.4184
Epoch 2/15, Average Loss: 0.7771
Epoch 3/15, Average Loss: 1.0466
Epoch 4/15, Average Loss: 0.8920
Epoch 5/15, Average Loss: 0.8550
Epoch 6/15, Average Loss: 0.9351
Epoch 7/15, Average Loss: 0.9061
Epoch 8/15, Average Loss: 0.8481
Epoch 9/15, Average Loss: 0.8604
Epoch 10/15, Average Loss: 0.9363
Epoch 11/15, Average Loss: 1.1740


In [None]:
# Save the fine-tuned model
model.save_pretrained("BertForClassification")

# Validation

In [None]:
val_path = "/content/drive/MyDrive/Colab Notebooks/ferramenta/text-ferramenta-val.csv"

In [None]:
valdf = pd.read_csv(val_path)

In [None]:
# Rename the head of the column
valdf.columns = ['text']

In [None]:
# Create the Label Column
# Each raw row holds the text and its label joined by the literal marker "|_@".
# Split once on the first occurrence (raw string so "\|" is a regex escape, not
# an invalid Python string escape) and reuse the result for both columns.
val_parts = valdf['text'].str.split(r"\|_@", n=1)
valdf['label'] = val_parts.str[1]
valdf['text'] = val_parts.str[0]

In [None]:
# Remove duplicated rows and rows with missing values, then renumber the index from 0
valdf = valdf.drop_duplicates().dropna().reset_index(drop=True)

In [None]:
val_num_labels = valdf['label'].nunique()
print(val_num_labels)

In [None]:
# One-hot encode the validation labels, then force the exact column set and order
# of the one-hot training frame `df`. get_dummies only creates columns for labels
# present in this file, so without the reindex a label missing from validation
# would shift every later column against the model's output positions.
# NOTE: a validation label never seen in training is dropped here and its row
# becomes all zeros.
valdf = pd.get_dummies(valdf, columns=['label']).reindex(columns=df.columns, fill_value=0)

In [None]:
# Define the maximum sequence length for padding/truncation
max_length = 128

# Create an instance of the custom dataset
val_dataset = FragmentaDataset(valdf, tokenizer, max_length)

# Create a data loader
batch_size = 32
val_data_loader = DataLoader(val_dataset, batch_size=batch_size)

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.eval()

predictions = []
targets = []
val_loss = 0

with torch.no_grad():
    for batch in val_data_loader:
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to(device)

        # The dataset yields one-hot rows; the metrics and the loss below need
        # the class index, i.e. the position of the 1 in each row.
        true_classes = labels.argmax(dim=1)

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        # Compute cross-entropy here, matching the training loss. Passing the
        # one-hot float labels to the model would make it fall back to a
        # multi-label BCE loss, which is not comparable to the training loss.
        loss = torch.nn.functional.cross_entropy(logits, true_classes)
        val_loss += loss.item()

        predicted_labels = logits.argmax(dim=1)

        # Both lists hold plain class indices so sklearn sees the same target type.
        predictions.extend(predicted_labels.cpu().tolist())
        targets.extend(true_classes.cpu().tolist())

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Mean validation loss per batch, plus accuracy and support-weighted precision/recall/F1
avg_val_loss = val_loss / len(val_data_loader)
accuracy = accuracy_score(targets, predictions)
precision, recall, f1 = (
    scorer(targets, predictions, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)

print(f"Validation Loss: {avg_val_loss:.4f}")
for metric_name, metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
):
    print(f"{metric_name}: {metric_value:.4f}")

In [None]:
from sklearn.metrics import classification_report

# Calculate evaluation metrics
accuracy = accuracy_score(targets, predictions)
report = classification_report(targets, predictions)

print("Accuracy:", accuracy)
print("Classification Report:")
print(report)

In [None]:
# Imported here because the only other import of pyplot lives in an optional EDA cell
import matplotlib.pyplot as plt

# Define the labels for the metrics
metrics = ['Accuracy', 'Precision', 'Recall', 'F1 Score']

# Define the values for each metric.
# `f1` is the computed score; `f1_score` is the sklearn function and cannot be plotted.
values = [accuracy, precision, recall, f1]

fig, ax = plt.subplots()
bars = ax.bar(metrics, values)
ax.set_ylim([0, 1])  # Set the y-axis limit to range from 0 to 1
ax.set_ylabel('Metric Value')
ax.set_title('Evaluation Metrics')

# Add the metric scores on top of each bar
for bar in bars:
    yval = bar.get_height()
    ax.text(bar.get_x() + bar.get_width() / 2, yval, round(yval, 4), ha='center', va='bottom')

plt.show()