In [None]:
pip install emot

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import FunctionTransformer
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.svm import SVR
from scipy.stats import pearsonr
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import nltk
# Tokenizer models used by word_tokenize. Recent NLTK releases load the
# 'punkt_tab' resource instead of the pickled 'punkt' one, so fetch both to
# work across versions (an unknown resource is reported, not raised).
nltk.download('punkt')
nltk.download('punkt_tab')
# English stop-word list used by the text preprocessing step.
nltk.download('stopwords')
from emot.emo_unicode import UNICODE_EMOJI


def _load_ratings(path):
    """Read one tab-separated, header-less ratings file into a DataFrame.

    The four columns are named id, tweet, emotion and score.
    """
    return pd.read_csv(path, sep="\t", names=['id', 'tweet', 'emotion', 'score'])


# Labelled files (train / dev / test gold) and the file to be predicted.
dataset = _load_ratings("fear-ratings-0to1.train.txt")
dataset1 = _load_ratings("fear-ratings-0to1.dev.gold.txt")
dataset2 = _load_ratings("fear-ratings-0to1.test.gold.txt")
test = _load_ratings("fear-pred.txt")

# All labelled rows stacked into one frame with a fresh 0..n-1 index.
train = pd.concat([dataset, dataset1, dataset2], ignore_index=True)

# Features are the raw tweet text; the target is the intensity score.
X, y = train.tweet, train.score
X_test = test.tweet

# Hold out 20% of the labelled rows (fixed seed) for validation.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

def convert_emojis(text, emoji_map=None):
    """Replace every emoji in ``text`` with a space-delimited textual name.

    Each name is taken from ``emoji_map``; commas and colons are removed and
    the remaining words are joined with underscores, e.g. a value of
    ``":grinning face:"`` becomes ``"grinning_face"``.

    Parameters
    ----------
    text : str
        The text to convert.
    emoji_map : mapping of str to str, optional
        Emoji sequence -> description. Defaults to ``UNICODE_EMOJI``.

    Returns
    -------
    str
        ``text`` with each known emoji replaced by ``" <name> "``.
    """
    if emoji_map is None:
        emoji_map = UNICODE_EMOJI

    # Longest sequences first, so a multi-codepoint emoji is replaced as a
    # whole before any shorter emoji it contains can break it apart.
    for emoji in sorted(emoji_map, key=len, reverse=True):
        if emoji not in text:
            continue  # skip building the name for emoji that do not occur
        name = "_".join(emoji_map[emoji].replace(",", "").replace(":", "").split())
        # Pad with spaces so the name cannot fuse with adjacent words or with
        # the name of a neighbouring emoji.
        text = text.replace(emoji, " " + name + " ")
    return text
# Define the preprocessing function

# Emoticon building blocks: one "eyes" character and an optional "nose".
_EYES = r"[8:=;]"
_NOSE = r"['`\-]?"

# Ordered (compiled pattern, replacement) rules applied BEFORE punctuation is
# stripped; once ':', '/', '(' or '<' are gone none of them could match.
# Replacements are plain space-padded words, so they survive the later
# punctuation strip and never fuse with neighbouring text.
# The trailing (?!\w) guards keep strings such as "8pm" or "8/10" from being
# mistaken for emoticons.
_MARKUP_RULES = [
    # Hearts first, so the tag rule below cannot swallow "<3 ... >".
    (re.compile(r"<3"), " heart "),
    # HTML tags.
    (re.compile(r"<[^>]+>"), ""),
    # URLs.
    (re.compile(r"https?:\/\/\S+\b|www\.(\w+\.)+\S*"), " url "),
    # Emoticons.
    (re.compile(r"{}{}p+(?!\w)".format(_EYES, _NOSE)), " lolface "),
    (re.compile(r"{}{}\(+(?!\w)|\)+{}{}(?!\w)".format(_EYES, _NOSE, _NOSE, _EYES)), " sadface "),
    (re.compile(r"{}{}[\/|l*](?!\w)".format(_EYES, _NOSE)), " neutralface "),
]

_HASHTAG = re.compile(r"#(\S+)")
_CAMEL_BOUNDARY = re.compile(r"([a-z0-9])([A-Z])")

# Filled on first use; reading the stop-word list once instead of per tweet.
_STOP_WORDS = None


def _split_hashtag(match):
    """Return the hashtag body without '#', split at lower->upper case boundaries.

    No word-segmentation helper is available in this notebook, so CamelCase
    boundaries are used as a dependency-free approximation
    ("#ScaredToDeath" -> " Scared To Death ").
    """
    return " " + _CAMEL_BOUNDARY.sub(r"\1 \2", match.group(1)) + " "


def _english_stop_words():
    """Return the NLTK English stop-word set, loading it on first call."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


def preprocess_text(text):
    """Normalise one tweet into a space-joined string of stemmed tokens.

    Steps, in order:
      1. emoji are replaced by their textual names (``convert_emojis``);
      2. hearts, HTML tags, URLs and emoticons are replaced by placeholder
         words (``heart``, ``url``, ``lolface``, ``sadface``, ``neutralface``)
         or removed (tags);
      3. hashtags lose their '#' and are split at CamelCase boundaries;
      4. every remaining non-alphanumeric character becomes a space;
      5. the text is lower-cased, tokenised, stripped of English stop words
         and Porter-stemmed.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The processed tokens joined by single spaces.
    """
    text = convert_emojis(text)

    # Markup-aware rules must run while the punctuation is still present.
    for pattern, replacement in _MARKUP_RULES:
        text = pattern.sub(replacement, text)
    text = _HASHTAG.sub(_split_hashtag, text)

    # Remove special characters
    text = re.sub(r'[^a-zA-Z0-9]', ' ', text)

    # Convert to lowercase
    text = text.lower()

    # Tokenization, stop-word removal and stemming
    stop_words = _english_stop_words()
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in word_tokenize(text) if word not in stop_words]

    # Join the tokens back into a string
    return ' '.join(tokens)

# Define the pipeline
def build_pipeline():
    """Return a fresh, unfitted preprocess -> TF-IDF -> random-forest pipeline."""
    return Pipeline([
        ('preprocess', FunctionTransformer(lambda x: [preprocess_text(text) for text in x], validate=False)),
        ('tfidf', TfidfVectorizer(max_features=5000)),
        ('model', RandomForestRegressor(n_estimators=100, random_state=42))
    ])


# Evaluate on the held-out split first: fit on X_train only and score X_val,
# so the reported numbers come from rows the model has not seen.
validation_pipeline = build_pipeline()
validation_pipeline.fit(X_train, y_train)
val_predictions = validation_pipeline.predict(X_val)
val_pearson, _ = pearsonr(y_val, val_predictions)
print(f'Validation Pearson: {val_pearson:.4f}, '
      f'MSE: {mean_squared_error(y_val, val_predictions):.4f}, '
      f'MAE: {mean_absolute_error(y_val, val_predictions):.4f}')

# Train the final model on all labelled data
pipeline = build_pipeline()
pipeline.fit(X, y)

# Make predictions on the unlabelled test tweets
predictions = pipeline.predict(X_test)
test['score'] = predictions

# NOTE(review): this writes a comma-separated file WITH a header row, while
# the input files are tab-separated without one - confirm which layout the
# consumer of 'fear-prediction.txt' expects.
test.to_csv('fear-prediction.txt', index = False)

## Model using Deep Learning (BERT [CLS] embeddings + feed-forward regressor)

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from scipy.stats import pearsonr, spearmanr
from transformers import BertTokenizer, TFBertModel

import tensorflow as tf

# Seed the global NumPy and TensorFlow generators so repeated runs start from
# the same random state (the train/test split below already uses a fixed seed).
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

data = pd.read_csv("fear-ratings-0to1.train.txt", sep="\t", names=['id', 'tweet', 'emotion', 'score'])

# Preprocess the data
X = data['tweet'].values
y = data['score'].values

# Tokenize and pad sequences
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokens = tokenizer(X.tolist(), padding=True, truncation=True, return_tensors='tf', max_length=128)

# Load BERT model
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

# Extract BERT embeddings (the hidden state at position 0 of every sequence).
# - The attention mask is passed so that padding positions are ignored;
#   without it the padded tokens take part in self-attention.
# - Tweets are encoded in small batches instead of one forward pass over the
#   whole dataset, which keeps peak memory bounded.
EMBED_BATCH_SIZE = 32
cls_chunks = []
for start in range(0, len(X), EMBED_BATCH_SIZE):
    stop = start + EMBED_BATCH_SIZE
    outputs = bert_model(
        tokens.input_ids[start:stop],
        attention_mask=tokens.attention_mask[start:stop],
    )
    cls_chunks.append(outputs.last_hidden_state[:, 0, :])
embeddings = tf.concat(cls_chunks, axis=0)

# Split the data into training and testing sets
X_train_embeddings, X_test_embeddings, y_train, y_test = train_test_split(embeddings.numpy(), y, test_size=0.2, random_state=42)

# Build a simple feedforward neural network with regression head
model = Sequential([
    Dense(64, activation='relu', input_shape=(768,)),
    Dense(1, activation='linear')
])

# Compile the model
model.compile(optimizer=Adam(), loss='mean_squared_error')

# Train the model
model.fit(X_train_embeddings, y_train, epochs=100, batch_size=32, validation_data=(X_test_embeddings, y_test))

# Correlation on the held-out rows only: this is the figure that reflects
# performance on tweets the regression head was not trained on.
y_pred_test = np.squeeze(model.predict(X_test_embeddings))
test_pearson_corr, _ = pearsonr(y_test, y_pred_test)
test_spearman_corr, _ = spearmanr(y_test, y_pred_test)

print(f'Held-out Pearson Correlation: {test_pearson_corr:.4f}, Held-out Spearman Correlation: {test_spearman_corr:.4f}')

# Calculate overall correlation on the entire dataset
# (includes the training rows, so it is optimistic compared with the held-out figure above)
y_pred_all = np.squeeze(model.predict(embeddings.numpy()))
overall_pearson_corr, _ = pearsonr(y, y_pred_all)
overall_spearman_corr, _ = spearmanr(y, y_pred_all)

print(f'Overall Pearson Correlation: {overall_pearson_corr:.4f}, Overall Spearman Correlation: {overall_spearman_corr:.4f}')
