## Read the dataset csv file

In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import re

ModuleNotFoundError: No module named 'pandas'

In [None]:
!pip install pandas
!pip install sklearn


In [None]:
# Load Dataset

In [None]:
le = LabelEncoder()
df = pd.read_excel('./deptweet_dataset.xlsx', header=0, index_col=None)
df = df[df['confidence_score'] >= 0.65]
df = df.loc[(df['label'] != 'non-depressed') | (df['confidence_score'] >= 0.8)]
df['label'] = df['label'].replace({'mild': 'depressed', 'moderate': 'depressed', 'severe': 'depressed'})
df = df.loc[:, ['tweet', 'label']]
df = df.dropna()
df['label_origin'] = df['label']

In [None]:
df['label'] = le.fit_transform(df['label'])

In [None]:
df.head()

In [None]:
### Balance Dataset

In [None]:
df['label_origin'].value_counts()

In [None]:
df = df.drop(df[df['label_origin'] == 'non-depressed'].sample(n=18000).index)

In [None]:
df['label_origin'].value_counts()

In [None]:
## Preprocessing Steps

In [None]:
### 1. Remove URLs , Hashtags and Mentions

In [None]:
def remove_urls_hashtags_mentions(tweet):
    tweet = re.sub(r'https?://\S+|www\.\S+', '', tweet)
    tweet = re.sub(r'@\S+|#\S+', '', tweet)
    return tweet

# Apply remove_urls() function to tweet column
df["tweet"] = df["tweet"].astype(str).apply(lambda x: remove_urls_hashtags_mentions(x))

In [None]:
### 2. Replace emojis and emoticons with their meaning

In [None]:
import emot 
import emoji

#Replace emojis with their meaning
def replace_emojis_with_desc(tweet):
    tweet = emoji.demojize(tweet)
    return tweet

In [None]:
df["tweet"] = df["tweet"].astype(str).apply(lambda x: replace_emojis_with_desc(x))

In [None]:
import emot 
#create emot object
emot_obj = emot.core.emot()

# function to convert emoticons to meanings
def replace_emoticons_with_desc(tweet):
    result = emot_obj.emoticons(tweet)
    if result['flag']:
        for i, emoticon in enumerate(result['value']):
            # replace emoticon with its meaning
            emot_mean = re.split(', | or ', result['mean'][i])[0]
            tweet = tweet.replace(emoticon, emot_mean)
    return tweet

# apply function to dataframe
df['tweet'] = df['tweet'].apply(replace_emoticons_with_desc)

## Process the data

In [None]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

In [None]:
def process_data(row):

    text = row['tweet']
    text = str(text)
    text = ' '.join(text.split())

    encodings = tokenizer(text, padding="max_length", truncation=True, max_length=128)

    label = 0
    if row['label_origin'] == 'non-depressed':
        label = 1

    encodings['label'] = label
    encodings['text'] = text

    return encodings

In [None]:
print(process_data({
    'tweet': 'this is a sample review of a movie.',
    'label_origin': 'non-depressed'
}))

In [None]:
processed_data = []

for i in range(len(df[:1000])):
    processed_data.append(process_data(df.iloc[i]))

## Generate the dataset

In [None]:
from sklearn.model_selection import train_test_split

new_df = pd.DataFrame(processed_data)

train_df, valid_df = train_test_split(
    new_df,
    test_size=0.2,
    random_state=2022
)

In [None]:
import pyarrow as pa
from datasets import Dataset

train_hg = Dataset(pa.Table.from_pandas(train_df))
valid_hg = Dataset(pa.Table.from_pandas(valid_df))

## Create a model

In [None]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2
)

In [None]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(output_dir="./result", evaluation_strategy="epoch")

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_hg,
    eval_dataset=valid_hg,
    tokenizer=tokenizer
)

## Train and Evaluate the model

In [None]:
trainer.train()

In [32]:
trainer.evaluate()

{'eval_loss': 0.6652752161026001,
 'eval_runtime': 16.6267,
 'eval_samples_per_second': 12.029,
 'eval_steps_per_second': 1.504,
 'epoch': 3.0}

## Save the model

In [33]:
model.save_pretrained('./model/')

## Load the model

In [34]:
from transformers import AutoModelForSequenceClassification

new_model = AutoModelForSequenceClassification.from_pretrained('./model/')

In [35]:
from transformers import AutoTokenizer

new_tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

## Get predictions

In [36]:
import torch
import numpy as np

def get_prediction(text):
    encoding = new_tokenizer(text, return_tensors="pt", padding="max_length", truncation=True, max_length=128)
    encoding = {k: v.to(trainer.model.device) for k,v in encoding.items()}

    outputs = new_model(**encoding)

    logits = outputs.logits

    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(logits.squeeze().cpu())
    probs = probs.detach().numpy()
    label = np.argmax(probs, axis=-1)
    
    if label == 1:
        return {
            'label': 'non-depressed',
            'probability': probs[1]
        }
    else:
        return {
            'label': 'depressed',
            'probability': probs[0]
        }

In [54]:
get_prediction('I am not depressed at all, In fact i am quite happy and delighted.')

{'label': 'depressed', 'probability': 0.6707266}