In [60]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
data = pd.read_csv('twitter_sentiment.csv')
data.head()

Unnamed: 0,sentiment,text
0,Positive,im getting on borderlands and i will murder yo...
1,Positive,I am coming to the borders and I will kill you...
2,Positive,im getting on borderlands and i will kill you ...
3,Positive,im coming on borderlands and i will murder you...
4,Positive,im getting on borderlands 2 and i will murder ...


In [61]:
data.sentiment.value_counts()

sentiment
Negative    22624
Positive    20932
Name: count, dtype: int64

In [62]:
data['sentiment'] = data['sentiment'].apply(lambda x: 1 if x == 'Positive' else 0)

In [63]:
# remove stopwords, punctuation, convert to lower case, lemmatize, remove numbers, remove urls, remove mentions, remove hashtags, remove emojis, remove extra spaces

import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
from nltk.corpus import wordnet

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
tokenizer = TweetTokenizer()

def remove_stopwords(text):
    return ' '.join([word for word in text.split() if word not in stop_words])

def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

def to_lower(text):
    return text.lower()

def lemmatize(text):
    return ' '.join([lemmatizer.lemmatize(word) for word in text.split()])

def stem(text):
    return ' '.join([stemmer.stem(word) for word in text.split()])

def remove_numbers(text):
    return re.sub(r'\d+', '', text)

def remove_urls(text):
    return re.sub(r'http\S+', '', text)

def remove_mentions(text):
    return re.sub(r'@\w+', '', text)

def remove_hashtags(text):
    return re.sub(r'#\w+', '', text)

def remove_emojis(text):
    return text.encode('ascii', 'ignore').decode('ascii')

def remove_extra_spaces(text):
    return ' '.join(text.split())

def preprocess_minimal(text):
    text = remove_urls(text)
    text = remove_mentions(text)
    text = remove_hashtags(text)
    text = remove_extra_spaces(text)
    return text

data['text'] = data['text'].apply(preprocess_minimal)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\K7alid\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\K7alid\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\K7alid\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [64]:
# replace im with i am, dont with do not
data['text'] = data['text'].apply(lambda x: x.replace("im", "i am"))
data['text'] = data['text'].apply(lambda x: x.replace("dont", "do not"))

data.head()

Unnamed: 0,sentiment,text
0,1,i am getting on borderlands and i will murder ...
1,1,I am coming to the borders and I will kill you...
2,1,i am getting on borderlands and i will kill yo...
3,1,i am coming on borderlands and i will murder y...
4,1,i am getting on borderlands 2 and i will murde...


In [65]:
data.shape

(43556, 2)

In [66]:
data = data[data['text'].apply(lambda x: len(x.split())) <= 15]

In [67]:
data.shape

(22787, 2)

In [68]:
data.sentiment.value_counts()

sentiment
1    11849
0    10938
Name: count, dtype: int64

In [69]:
data.to_csv('twitter_sentiment_cleaned.csv', index=False)

In [70]:
from datasets import load_dataset

raw_dataset = load_dataset('csv', data_files='twitter_sentiment_cleaned.csv')

Generating train split: 0 examples [00:00, ? examples/s]

In [71]:
raw_dataset

DatasetDict({
    train: Dataset({
        features: ['sentiment', 'text'],
        num_rows: 22787
    })
})

In [72]:
split = raw_dataset['train'].train_test_split(test_size=0.25, seed=42)
split

DatasetDict({
    train: Dataset({
        features: ['sentiment', 'text'],
        num_rows: 17090
    })
    test: Dataset({
        features: ['sentiment', 'text'],
        num_rows: 5697
    })
})

In [73]:
check_point = 'distilbert-base-uncased'

In [74]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(check_point)


def tokenize_function(examples):
    return tokenizer([str(text) for text in examples['text']], truncation=True)

tokenized_datasets = split.map(tokenize_function, batched=True)

tokenized_datasets = tokenized_datasets.rename_column("sentiment", "labels")

Map:   0%|          | 0/17090 [00:00<?, ? examples/s]

Map:   0%|          | 0/5697 [00:00<?, ? examples/s]

In [75]:
tokenizer.vocab_size

30522

In [76]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['labels', 'text', 'input_ids', 'attention_mask'],
        num_rows: 17090
    })
    test: Dataset({
        features: ['labels', 'text', 'input_ids', 'attention_mask'],
        num_rows: 5697
    })
})

In [77]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, AutoConfig

config = AutoConfig.from_pretrained(check_point, num_labels=2)

In [78]:
config

DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.40.1",
  "vocab_size": 30522
}

In [79]:
config.label2id

{'LABEL_0': 0, 'LABEL_1': 1}

In [80]:
config.id2label

{0: 'LABEL_0', 1: 'LABEL_1'}

In [81]:
config.id2label = {v: k for k, v in {'Negative': 0, 'Positive': 1}.items()}
config.label2id = {'Negative': 0, 'Positive': 1}

In [82]:
config.id2label

{0: 'Negative', 1: 'Positive'}

In [83]:
config.label2id

{'Negative': 0, 'Positive': 1}

In [84]:
model = AutoModelForSequenceClassification.from_pretrained(check_point, config=config)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [85]:
from torchinfo import summary

# summary(model, input_size=(16,512), dtypes=[ 'torch.IntTensor'], device='cpu')
summary(model)

Layer (type:depth-idx)                                  Param #
DistilBertForSequenceClassification                     --
├─DistilBertModel: 1-1                                  --
│    └─Embeddings: 2-1                                  --
│    │    └─Embedding: 3-1                              23,440,896
│    │    └─Embedding: 3-2                              393,216
│    │    └─LayerNorm: 3-3                              1,536
│    │    └─Dropout: 3-4                                --
│    └─Transformer: 2-2                                 --
│    │    └─ModuleList: 3-5                             42,527,232
├─Linear: 1-2                                           590,592
├─Linear: 1-3                                           1,538
├─Dropout: 1-4                                          --
Total params: 66,955,010
Trainable params: 66,955,010
Non-trainable params: 0

In [86]:
training_args = TrainingArguments(
    output_dir="training_dir",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_dir="./logs",
)

In [87]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    acc = np.mean(predictions == labels)
    f1 = f1_score(labels, predictions, average='macro')
    return {'accuracy': acc, 'f1': f1}

In [88]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [89]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.3309,0.301016,0.901527,0.901154
2,0.1693,0.286506,0.924522,0.924278
3,0.0897,0.29535,0.929437,0.929254


TrainOutput(global_step=6411, training_loss=0.2142057197176895, metrics={'train_runtime': 586.4224, 'train_samples_per_second': 87.428, 'train_steps_per_second': 10.932, 'total_flos': 381675831983808.0, 'train_loss': 0.2142057197176895, 'epoch': 3.0})

In [90]:
from transformers import pipeline

In [94]:
saved_model = pipeline('text-classification', model='training_dir/checkpoint-2137', device=0)

In [95]:
split['test']

Dataset({
    features: ['sentiment', 'text'],
    num_rows: 5697
})

In [99]:
sentence = "I love this product!"
print(split['test']['text'][2])
prediction = saved_model(split['test']['text'][2])

print(prediction)


The menu committee this year is atrocious.
[{'label': 'Negative', 'score': 0.9556489586830139}]


In [100]:
!git init

Initialized empty Git repository in K:/Mine/Â /Artificial Intelligence/NLP/Projects/sentiment analyser/.git/


In [None]:
!git add .