<a href="https://colab.research.google.com/github/GifilGeorge/BERT/blob/master/Prediction_Farsi_TextPreProcessing_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the Hugging Face `transformers` package imported in the next cell.
pip install transformers

### Libraries

In [None]:
import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
import torch
import numpy as np
import pandas as pd
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
from textwrap import wrap
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader

# Render matplotlib figures inline in the notebook, using the 'retina' figure format.
%matplotlib inline
%config InlineBackend.figure_format='retina'

# Global plot styling: seaborn whitegrid theme, a custom colour palette and a default figure size.
sns.set(style='whitegrid', palette='muted', font_scale=1.2)
HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#ADFF02", "#8F00FF"]
sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))
rcParams['figure.figsize'] = 12, 8

# Single seed shared by the NumPy and PyTorch random number generators.
RANDOM_SEED = 42

np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# NOTE: this silences every warning for the rest of the session, deprecation warnings included.
import warnings
warnings.filterwarnings("ignore")

### Loading the dataset

In [None]:
# Load the tweet data from a CSV file (path is relative to the working directory).
df = pd.read_csv("sample_data/twitterdata.csv")

In [None]:
# Keep only the tweet text and its engagement counts. `.copy()` makes this an
# independent frame, so the later in-place column assignment on `df_tweets`
# does not write into a slice of `df` (pandas chained-assignment pitfall).
df_tweets = df[['tweets','likes','retweets']].copy()

In [None]:
# Display the (rows, columns) shape of the selected frame.
df_tweets.shape

(23530, 3)

In [None]:
# Cast every value of the `tweets` column to str.
df_tweets['tweets'] = df_tweets['tweets'].astype(str)

## Data Preprocessing

Both cased and uncased versions of BERT and its tokenizer are available, but the cased version works better.

In [None]:
# Hugging Face model identifier used for both the tokenizer and the BERT encoder below.
PRE_TRAINED_MODEL_NAME = 'bert-base-multilingual-cased'
# Alternative checkpoint, currently not used:
#'HooshvareLab/bert-base-parsbert-uncased'

###  Normal Tokens

In [None]:
## Load the pretrained tokenizer identified by PRE_TRAINED_MODEL_NAME
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=995526.0, style=ProgressStyle(descripti…




### Create a PyTorch dataset

In [None]:
class TextDataset(Dataset):
    """Dataset that tokenizes raw texts on access.

    Args:
        reviews: Collection of texts supporting ``len()`` and integer indexing.
        tokenizer: Object exposing a Hugging Face style ``encode_plus`` method.
        max_len: Token length passed to the tokenizer as ``max_length``; every
            sample is padded or truncated to it.
    """

    def __init__(self, reviews, tokenizer, max_len):
        self.reviews = reviews
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of texts."""
        return len(self.reviews)

    def __getitem__(self, item):
        """Return the text at ``item`` with its flattened ``input_ids`` and ``attention_mask``."""
        review = str(self.reviews[item])
        encoding = self.tokenizer.encode_plus(
            review,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            # Explicit padding/truncation instead of the deprecated
            # `pad_to_max_length=True` and implicit truncation: every sample
            # must come out with the same length so batches can be collated.
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'review_text': review,
            # The tokenizer output is flattened to 1-D per sample.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
        }

##### Choosing Sequence Length

In [None]:
# Number of tokens the tokenizer produces for each tweet (called with max_length=1024);
# the distribution is inspected below to choose a sequence length.
token_lens = [
    len(tokenizer.encode(str(tweet), max_length=1024))
    for tweet in df_tweets.tweets.values
]

In [None]:
# Plot the distribution of token counts. `sns.distplot` is deprecated; this is
# its documented `histplot` equivalent (density histogram with a KDE overlay).
sns.histplot(token_lens, kde=True, stat="density", kde_kws=dict(cut=3))
# Limit the x-axis to the 0-256 token range.
plt.xlim([0, 256]);
plt.xlabel('Token count');

In [None]:
# Sequence length passed to the tokenizer as `max_length`
# (presumably picked from the token-count distribution plotted above).
MAX_LEN = 75

### Create data loaders

In [None]:
def create_data_loader(df, tokenizer, max_len, batch_size):
    """Build a ``DataLoader`` over the ``tweets`` column of ``df``.

    Args:
        df: Frame exposing a ``tweets`` column.
        tokenizer: Tokenizer handed to ``TextDataset``.
        max_len: Sequence length handed to ``TextDataset``.
        batch_size: Number of samples per batch.

    Returns:
        A ``DataLoader`` wrapping a ``TextDataset`` of the tweets.
    """
    tweets = df.tweets.to_numpy()
    dataset = TextDataset(reviews=tweets, tokenizer=tokenizer, max_len=max_len)
    # Four worker processes are used for loading the data.
    loader = DataLoader(dataset, batch_size=batch_size, num_workers=4)
    return loader

In [None]:
# Number of samples per batch produced by the data loader.
BATCH_SIZE = 16

Creating the data loader for the tweets to be predicted

In [None]:
# Data loader over the tweets in `df`, tokenized to MAX_LEN and batched by BATCH_SIZE.
new_data_loader = create_data_loader(df, tokenizer, MAX_LEN, BATCH_SIZE)

### Building Sentiment Classifier

In [None]:
class SentimentClassifier(nn.Module):
    """BERT encoder followed by dropout and a linear layer, returning class probabilities.

    Args:
        n_classes: Number of output classes of the linear layer.
    """

    def __init__(self, n_classes):
        super().__init__()
        self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
        self.drop = nn.Dropout(p=0.3)
        self.out = nn.Linear(self.bert.config.hidden_size, n_classes)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, input_ids, attention_mask):
        """Return softmax probabilities over the classes for each input row."""
        bert_outputs = self.bert(input_ids=input_ids,
                                 attention_mask=attention_mask
                                )
        # Take the pooled output by position rather than by tuple unpacking:
        # integer indexing works both for a plain tuple and for the dict-like
        # ModelOutput, whereas unpacking a ModelOutput yields its key names.
        # Index 0 (the last hidden state) is not used.
        pooled_output = bert_outputs[1]
        output = self.drop(pooled_output)  ## apply the dropout layer
        output = self.out(output)          ## apply the output classifier layer
        return self.softmax(output)        ## softmax over the class dimension


#### Loading the Saved model

In [None]:
Num_class = 3 ## Number of classes to be predicted
model = SentimentClassifier(Num_class)
# NOTE(review): loading the saved checkpoint and moving the model to `device` are
# both commented out, so as written no saved weights are loaded into the model
# and it is not moved to `device`.
# model.load_state_dict(torch.load('best_model_state.bin'))
# model = model.to(device)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=625.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=714314041.0, style=ProgressStyle(descri…




In [None]:
def get_predictions(model, data_loader):
    """Run ``model`` over every batch of ``data_loader`` and collect its outputs.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)`` and
            returning one row of class scores per sample.
        data_loader: Iterable of batches, each a mapping with ``"review_text"``,
            ``"input_ids"`` and ``"attention_mask"`` entries.

    Returns:
        Tuple ``(review_texts, predictions, prediction_probs)``: the list of
        texts, a CPU tensor with the index of the highest score per sample, and
        a CPU tensor with the model outputs. Both tensors are empty when
        ``data_loader`` yields no batches.
    """
    model.eval()
    # Send inputs to wherever the model's weights live, so the function works
    # whether or not the caller moved the model to a GPU.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    review_texts = []
    pred_batches = []
    prob_batches = []
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch["input_ids"].to(model_device)
            attention_mask = batch["attention_mask"].to(model_device)
            outputs = model(input_ids=input_ids,
                            attention_mask=attention_mask
                           )
            _, preds = torch.max(outputs, dim=1)
            review_texts.extend(batch["review_text"])
            # Collect whole batches on the CPU; they are concatenated once at the end.
            pred_batches.append(preds.cpu())
            prob_batches.append(outputs.cpu())

    if not pred_batches:
        # Nothing to concatenate: return empty tensors instead of raising.
        return review_texts, torch.empty(0, dtype=torch.long), torch.empty(0)

    predictions = torch.cat(pred_batches)
    prediction_probs = torch.cat(prob_batches)
    return review_texts, predictions, prediction_probs

In [None]:
# Predict every tweet in the loader: texts, predicted class indices and model outputs.
y_review_texts, y_pred, y_pred_probs = get_predictions(model,new_data_loader)

## Predicting the Raw Text

In [None]:
# Single raw text to classify.
review_text = "'همراه اول کارش مثل پشس خون مردمو داره میخوره با بسته هاش :)))'"


In [None]:
# Tokenize the single text: pad or truncate to MAX_LEN and return PyTorch tensors.
encoded_review = tokenizer.encode_plus(review_text,
                                       max_length=MAX_LEN,
                                       add_special_tokens=True,
                                       return_token_type_ids=False,
                                       # Explicit form of the deprecated
                                       # `pad_to_max_length=True`, with truncation
                                       # requested instead of left implicit.
                                       padding='max_length',
                                       truncation=True,
                                       return_attention_mask=True,
                                       return_tensors='pt',
                                      )

In [None]:
# Move the encoded inputs to the device holding the model's weights, so this cell
# works whether or not the model has been moved to a GPU.
model_device = next(model.parameters()).device
input_ids = encoded_review['input_ids'].to(model_device)
attention_mask = encoded_review['attention_mask'].to(model_device)

# Inference only: switch off dropout and gradient tracking.
model.eval()
with torch.no_grad():
    output = model(input_ids, attention_mask)

# Index of the highest score for the single input row.
_, prediction = torch.max(output, dim=1)

# NOTE(review): `class_names` is not defined in the visible cells; it must hold the
# label names, in the order used at training time, before this cell runs.
print(f'Review text: {review_text}')
print(f'Sentiment  : {class_names[prediction.item()]}')

Review text: 'همراه اول کارش مثل پشس خون مردمو داره میخوره با بسته هاش :)))'
Sentiment  : negative


In [None]:
# Display the tweet stored under index label 10.
df.tweets[10]

'همراه اول کارش مثل پشس خون مردمو داره میخوره با بسته هاش :)))'