<a href="https://colab.research.google.com/github/MarkZuck10/economic-recession-prediction/blob/main/DistilBERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Importing Dataset

In [11]:
import pandas as pd

# Raw tweet export. The relative path presumably points into a mounted Google Drive
# folder — make sure Drive is mounted before running this cell.
raw_tweets_path = 'drive/MyDrive/2023_04_01_rawtext.csv'
df = pd.read_csv(raw_tweets_path)

# (rows, columns) of the freshly loaded frame.
df.shape

(85303, 2)

In [12]:
# Missing values per column, largest count first.
missing_per_column = df.isna().sum()
missing_per_column.sort_values(ascending=False)

Unnamed: 0,0
text,0
tweetid,0


In [13]:
# Count rows that are exact repeats of an earlier row.
duplicate_row_mask = df.duplicated()
duplicate_row_mask.sum()

0

# 2. Applying DistilBERT

In [14]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import torch

In [15]:
# Tokenizer and classifier are loaded from the same base checkpoint.
# NOTE: this checkpoint carries no trained classification head — the warning printed
# by this cell lists 'classifier.*' and 'pre_classifier.*' as newly initialised. Until
# the model is fine-tuned on labelled sentiment data, its 3-class predictions come
# from random weights and should not be interpreted as real sentiment.
checkpoint_name = "distilbert-base-uncased"
tokenizer = DistilBertTokenizer.from_pretrained(checkpoint_name)
model = DistilBertForSequenceClassification.from_pretrained(checkpoint_name, num_labels=3)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
def preprocess_tweets(tweets):
    """Tokenize a batch of tweets into PyTorch tensors for the model.

    Args:
        tweets: A tweet string or a list of tweet strings.

    Returns:
        The tokenizer output (PyTorch tensors), padded within the batch and
        truncated to at most 128 tokens per tweet.
    """
    encoding = tokenizer(
        tweets,
        max_length=128,
        truncation=True,
        padding=True,
        return_tensors="pt",
    )
    return encoding

In [17]:
def predict_sentiments(data, batch_size=200):
    """Classify tweets in mini-batches and return one label string per tweet.

    Args:
        data: Sliceable sequence of tweet strings (e.g. a list or a pandas Series).
        batch_size: Number of tweets per forward pass; must be positive.

    Returns:
        A list with one of "Negative", "Neutral" or "Positive" per input tweet,
        in input order. Empty input yields an empty list.

    Raises:
        ValueError: If ``batch_size`` is not positive. (A negative value used to
            produce an empty result silently because the batching range was empty.)
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # Nothing to classify: return early without touching the model.
    if len(data) == 0:
        return []

    # Class index -> label name.
    # NOTE(review): this ordering is an assumption about how the classification head
    # was (or will be) trained — confirm it matches the fine-tuning label encoding.
    sentiment_labels = ["Negative", "Neutral", "Positive"]

    # Inputs must live on the same device as the model weights (CPU or GPU).
    device = next(model.parameters()).device
    predicted_sentiments = []

    for start in range(0, len(data), batch_size):
        # Coerce to a plain list so the tokenizer also accepts sliced Series/arrays.
        batch = list(data[start:start + batch_size])
        encoded_tweets = preprocess_tweets(batch).to(device)

        # Inference only: skip gradient tracking to save memory and time.
        with torch.no_grad():
            logits = model(**encoded_tweets).logits

        # One transfer to Python ints per batch instead of one .item() per tweet.
        class_ids = torch.argmax(logits, dim=1).tolist()
        predicted_sentiments.extend(sentiment_labels[class_id] for class_id in class_ids)

    return predicted_sentiments

In [None]:
# Hand the model a plain Python list of tweet texts rather than a pandas Series.
text_column = df['text']
tweets = text_column.tolist()

# Label every tweet, 200 tweets per forward pass.
predicted_sentiments = predict_sentiments(data=tweets, batch_size=200)

In [None]:
# Attach the predicted label to each tweet (one label per row, same order as `tweets`).
df = df.assign(distilbert_sentiment=predicted_sentiments)

# 3. Validating and Saving the Dataset

In [None]:
# Sanity check before saving: (rows, columns) of the frame with the sentiment column added.
df.shape

In [None]:
# Re-check for missing values now that the sentiment column exists, largest count first.
missing_per_column = df.isna().sum()
missing_per_column.sort_values(ascending=False)

In [None]:
# Number of rows that exactly repeat an earlier row, after labelling.
n_duplicate_rows = df.duplicated().sum()
print(n_duplicate_rows)

In [None]:
# Eyeball a few labelled rows. A fixed random_state keeps the preview identical
# across Restart & Run All instead of drawing a different sample each time.
df.sample(5, random_state=42)

In [None]:
# How many tweets received each predicted label.
sentiment_column = df['distilbert_sentiment']
sentiment_counts = sentiment_column.value_counts()
print(sentiment_counts)

In [None]:
# Write the labelled tweets to CSV; index=False keeps the row index out of the file.
# NOTE(review): the input was read via the relative 'drive/MyDrive/...' path while this
# writes to the absolute '/content/drive/My Drive/...' spelling — confirm both resolve
# to the same Drive folder in the runtime being used.
output_csv_path = '/content/drive/My Drive/01-04-23_distilBERT-sentiment.csv'
df.to_csv(output_csv_path, index=False)