<a href="https://colab.research.google.com/github/Mohammadhsiavash/Sentiment-analysis-with-NLP/blob/main/notebooks/inferentia.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%capture
!pip install bertviz
!pip install datasets
!pip install transformers[torch]
!pip install accelerate
!pip install huggingface_hub
!pip install sentencepiece
!pip install shap
!pip install lime

In [None]:
from huggingface_hub  import login
# Authenticate this session with the Hugging Face Hub.
# NOTE(review): presumably this prompts for an access token — confirm whether
# the dataset and model used below actually require authentication.
login()

In [None]:
from matplotlib import pyplot as plt
import spacy
from wordcloud import WordCloud

from transformers import AlbertForSequenceClassification, AlbertTokenizer, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.preprocessing import LabelEncoder
from datasets import load_dataset
import numpy as np
import torch


def create_word_cloud(series):
    """Display a word cloud built from every text entry in ``series``.

    The entries are joined with single spaces, spaCy's English stop words
    are excluded from the cloud, and the image is shown in an 8x8 inch
    matplotlib figure with the axes hidden.

    Args:
        series: Object exposing ``tolist()`` that yields strings (for
            example a pandas Series of post bodies).
    """
    corpus = ' '.join(series.tolist())

    # The spaCy pipeline is loaded only to reach its stop-word defaults.
    excluded_words = set(spacy.load('en_core_web_sm').Defaults.stop_words)

    cloud = WordCloud(stopwords=excluded_words).generate(corpus)

    plt.figure(figsize=(8, 8))
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()

import re

def remove_mental_health_references(df):
    """Strip every subreddit name from the ``body`` column of ``df``.

    Each distinct, non-missing value of ``df['subreddit']`` is removed from
    the post bodies in three spellings: as written, lower-cased and
    upper-cased. Labels are matched literally, so names containing regex
    metacharacters (``+``, ``.``, ``(`` ...) are handled safely.

    Args:
        df: DataFrame with string columns ``subreddit`` and ``body``.

    Returns:
        The same ``df`` object; its ``body`` column is rewritten in place.
    """
    variants = set()
    for label in df['subreddit'].dropna().unique():
        variants.update((label, label.lower(), label.upper()))
    # An empty alternative would match everywhere and remove nothing.
    variants.discard('')
    if not variants:
        return df

    # Regex alternation takes the first alternative that matches, so put the
    # longest names first (ties broken alphabetically) to get a deterministic
    # result even when one label is a prefix of another.
    ordered = sorted(variants, key=lambda text: (-len(text), text))
    pattern = '|'.join(re.escape(text) for text in ordered)

    df['body'] = df['body'].str.replace(pattern, '', regex=True)
    return df


def not_none(example):
    """Report whether ``example`` carries a ``body`` value other than ``None``.

    Args:
        example: Mapping with a ``'body'`` key.

    Returns:
        ``False`` when ``example['body']`` is ``None``, ``True`` otherwise.
    """
    body = example['body']
    if body is None:
        return False
    return True

def prepare_dataframe(df):
    """Build the cleaned ``body``/``subreddit`` table from raw posts.

    The post body and title are joined with a single space (body first),
    then rows are dropped when the author is ``[deleted]`` or
    ``AutoModerator``, or when the combined text contains ``[removed]``,
    ``[deleted]`` or ``[deleted by user]``. The input frame is left
    untouched and the original index labels are preserved.

    Args:
        df: DataFrame with columns ``author``, ``title``, ``body`` and
            ``subreddit``. Missing ``body``/``title`` values are treated
            as empty strings; rows with a missing ``author`` are kept.

    Returns:
        A new DataFrame with exactly the columns ``body`` and
        ``subreddit`` for the surviving rows.
    """
    # Fill both sides first: str.cat yields NaN as soon as one operand is
    # missing, which would break the substring filters below.
    body = df['body'].fillna('').str.cat(df['title'].fillna(''), sep=' ')
    author = df['author']

    # The markers contain brackets, so match them as plain substrings
    # (regex=False); na=False makes missing values count as "no match"
    # instead of producing NaN in the boolean mask.
    keep = ~author.str.contains('[deleted]', regex=False, na=False)
    for marker in ('[removed]', '[deleted]', '[deleted by user]'):
        keep &= ~body.str.contains(marker, regex=False, na=False)

    # Drop moderator-bot posts.
    keep &= (author != 'AutoModerator')

    return df.loc[keep, ['body', 'subreddit']].assign(body=body[keep])


In [None]:
# Load the Reddit mental-health posts and reduce them to cleaned
# body/subreddit rows with a fresh 0..n-1 index.
dataset = load_dataset("solomonk/reddit_mental_health_posts")
df = dataset["train"].to_pandas()
df = prepare_dataframe(df).reset_index(drop=True)
# Strip the subreddit names out of the text (presumably so the label does
# not leak into the model input — confirm).
df = df.pipe(remove_mental_health_references)
dataset = Dataset.from_pandas(df)
# Subsample: with test_size=0.7 only the 'train' side of the split is kept,
# i.e. the remaining ~30% of the rows.
dataset_sampled = dataset.train_test_split(test_size=0.7, seed=42)['train']

# Hold out 20% of the subsample for validation + test.
train_val_test = dataset_sampled.train_test_split(test_size=0.2, seed=42)
train_dataset = train_val_test['train']
test_val_dataset = train_val_test['test']

# Halve the hold-out into a validation set and a test set.
test_val_split = test_val_dataset.train_test_split(test_size=0.5, seed=42)
validation_dataset = test_val_split['train']
test_dataset = test_val_split['test']

In [None]:
# Fit a label encoder on the subreddit names of the sampled dataset; its
# classes_ are reused further down as display labels for the explainer.
le = LabelEncoder()
le.fit(dataset_sampled['subreddit'])

In [None]:
# Shuffle all rows and rebuild the index. The fixed seed (the same value the
# dataset splits use) makes the row order reproducible across runs.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Inference and explanation with the fine-tuned model

In [None]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from transformers import AutoModelForSequenceClassification

# The tokenizer comes from the base 'roberta-base' checkpoint, the
# classification weights from 'fzetter/roberta-mental-health'.
# NOTE(review): presumably the fine-tuned checkpoint shares the roberta-base
# vocabulary — confirm, or load the tokenizer from the same checkpoint.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = AutoModelForSequenceClassification.from_pretrained('fzetter/roberta-mental-health')

In [None]:
!pip install transformers-interpret

In [None]:
from transformers_interpret import SequenceClassificationExplainer
# Wrap the model and tokenizer in an explainer, replacing the model's own
# class names with the label encoder's classes.
# NOTE(review): le.classes_ is in sorted order; this assumes the checkpoint's
# label ids follow that same order — confirm against model.config.id2label.
cls_explainer = SequenceClassificationExplainer(
    model,
    tokenizer,
    custom_labels =  le.classes_)
# Compute word attributions for test example 1017 with respect to "ADHD".
word_attributions = cls_explainer(test_dataset["body"][1017], class_name="ADHD")

In [None]:
# Render the attributions from the latest explainer call and save them as HTML.
# NOTE(review): the file name says "distilbert" although the model loaded in
# this notebook comes from a RoBERTa checkpoint — confirm the name is intended.
cls_explainer.visualize("distilbert_viz.html")

In [None]:
# Explain test example 921 with respect to the "ADHD" class, then render it.
# Every cell writes to the same HTML path, so the previous file is replaced.
word_attributions = cls_explainer(test_dataset["body"][921], class_name="ADHD")
cls_explainer.visualize("distilbert_viz.html")

In [None]:
# Explain test example 1250 with respect to the "ADHD" class, then render it.
# Every cell writes to the same HTML path, so the previous file is replaced.
word_attributions = cls_explainer(test_dataset["body"][1250], class_name="ADHD")
cls_explainer.visualize("distilbert_viz.html")

In [None]:
# Explain test example 762 with respect to the "aspergers" class, then render it.
# Every cell writes to the same HTML path, so the previous file is replaced.
word_attributions = cls_explainer(test_dataset["body"][762], class_name="aspergers")
cls_explainer.visualize("distilbert_viz.html")

In [None]:
# Explain test example 925 with respect to the "ADHD" class, then render it.
# Every cell writes to the same HTML path, so the previous file is replaced.
word_attributions = cls_explainer(test_dataset["body"][925], class_name="ADHD")
cls_explainer.visualize("distilbert_viz.html")

In [None]:
# Explain test example 1652 with respect to the "ptsd" class, then render it.
# Every cell writes to the same HTML path, so the previous file is replaced.
word_attributions = cls_explainer(test_dataset["body"][1652], class_name="ptsd")
cls_explainer.visualize("distilbert_viz.html")