# Final Project Group 52
- Guido Takkenberg
- Neil Bonnard
- Ilyas el Haroui
- Luis Blanco

### 0.1 Imports

In [1]:
import pandas as pd
from collections import Counter 
from nltk.corpus.reader import ConllCorpusReader
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import classification_report
from sklearn import svm
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from scipy.special import expit
from torch.nn.functional import softmax
from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TextClassificationPipeline
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

  from pandas.core import (
  torch.utils._pytree._register_pytree_node(


In [2]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ilyas\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ilyas\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ilyas\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### 0.2 Helper methods

In [3]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\W', ' ', text)
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    cleaned_text = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(cleaned_text)

In [4]:
def pos_tag_text(text):
    tokens = word_tokenize(text)
    return pos_tag(tokens)

## 1. NERC

### 1.1 Train Data preprocessing

In [23]:
train_data_nerc = ConllCorpusReader('./datasets/nerc_dataset/train', 'train.txt', ['words', 'pos', 'ignore', 'chunk'])

train_features = []
train_gold_labels = []

for token, pos, ne_label in train_data_nerc.iob_words():
    train_features.append({"token": token, "pos": pos})
    train_gold_labels.append(ne_label)

### 1.2 Test Data preprocessing

In [32]:
nerc_test_dataset = './datasets/nerc_dataset/test/NER-test.tsv'
test_data = pd.read_csv(nerc_test_dataset, sep='\t', header=0)

# Preprocess tokens
test_data['preprocessed_token'] = test_data['token'].apply(preprocess_text)

# Map from test labels to training labels
label_mapping = {
    'B-PERSON': 'B-PER',
    'I-PERSON': 'I-PER',
    'B-ORG': 'B-ORG',
    'I-ORG': 'I-ORG',
    'B-WORK_OF_ART': 'B-MISC',  
    'I-WORK_OF_ART': 'I-MISC',  
    'B-DATE': 'B-MISC',  
    'I-DATE': 'I-MISC'
}

test_data['BIO NER tag'] = test_data['BIO NER tag'].map(label_mapping).fillna(test_data['BIO NER tag'])

# Apply POS tagging to the token column
test_data['POS'] = test_data['token'].apply(lambda x: pos_tag_text(x)[0][1])

In [34]:
test_features = []
test_gold_labels = []
for _, instance in test_data.iterrows():
    test_features.append({"token": instance['token'], "pos": instance['POS']})

test_gold_labels = list(test_data['BIO NER tag'])

### 1.3 Implementation

In [35]:
concatenated_data = train_features + test_features

vec = DictVectorizer()
transformed_vector = vec.fit_transform(concatenated_data)

train_features_new = transformed_vector[:len(train_features)]
test_features_new = transformed_vector[len(train_features):]

lin_clf = svm.LinearSVC()
lin_clf.fit(train_features_new, train_gold_labels)

predicted_labels = lin_clf.predict(test_features_new)
target_names = list(set(train_gold_labels + test_gold_labels))



### 1.4 Results

In [36]:
unique_labels = set(test_gold_labels + list(predicted_labels))
report = classification_report(test_gold_labels, predicted_labels, labels=list(unique_labels), target_names=list(unique_labels))

print(report)

              precision    recall  f1-score   support

       I-PER       0.00      0.00      0.00         3
       I-LOC       0.00      0.00      0.00         0
       B-ORG       0.00      0.00      0.00         3
           O       0.85      1.00      0.92       160
      I-MISC       0.00      0.00      0.00        10
       B-LOC       0.00      0.00      0.00         0
       I-ORG       0.00      0.00      0.00         6
      B-MISC       0.00      0.00      0.00         5
       B-PER       1.00      0.33      0.50         6

    accuracy                           0.84       193
   macro avg       0.21      0.15      0.16       193
weighted avg       0.73      0.84      0.78       193



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## 2. Sentiment Analysis

### 2.1 Train data preprocessing

In [37]:
tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment")
model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment")

sentiment_pipeline = TextClassificationPipeline(model=model, tokenizer=tokenizer)

### 2.2 Test data preprocessing

In [48]:
sentiment_topic_dataset = './datasets/sentiment_topic_dataset/test/sentiment-topic-test.tsv'
sentiment_topic_test_data = pd.read_csv(sentiment_topic_dataset, sep='\t', header=0)

texts = sentiment_topic_test_data['text'].tolist()

sentiment_gold_labels = sentiment_topic_test_data['sentiment'].tolist()

# Apply preprocessing to each text in the list
preprocessed_texts = [preprocess_text(text) for text in texts]

### 2.3 Implementation

In [49]:
predictions = sentiment_pipeline(preprocessed_texts)

# Map predictions to labels based on the scores
predicted_labels = []
for prediction in predictions:
    label = prediction['label']
    if label == 'LABEL_0':
        predicted_labels.append('negative')
    elif label == 'LABEL_1':
        predicted_labels.append('neutral')
    elif label == 'LABEL_2':
        predicted_labels.append('positive')

### 2.4 Results

In [42]:
report = classification_report(sentiment_gold_labels, predicted_labels, labels=['positive', 'negative', 'neutral'])
print(report)

              precision    recall  f1-score   support

    positive       0.50      0.33      0.40         3
    negative       0.50      0.25      0.33         4
     neutral       0.33      0.67      0.44         3

    accuracy                           0.40        10
   macro avg       0.44      0.42      0.39        10
weighted avg       0.45      0.40      0.39        10



## 3. Topic Analysis using 2 systems: Transformer model & Naive Bayes

### 3.1 BERT

In [43]:
MODEL = "cardiffnlp/tweet-topic-latest-multi"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

class_mapping = model.config.id2label

predicted_topics = []

for text in sentiment_topic_test_data['text']:
    tokens = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    output = model(**tokens)
    scores = output[0][0].detach().numpy()
    scores = expit(scores)
    predictions = (scores >= 0.5) * 1
    
    # Map predictions to classes
    predicted_labels = [class_mapping[i] for i in range(len(predictions)) if predictions[i]]
    predicted_topics.append(predicted_labels)

### 3.1.1 Results

In [44]:
sentiment_topic_test_data['predicted_topics'] = predicted_topics

print(sentiment_topic_test_data[['text', 'predicted_topics']])

                                                text  \
0  I wouldn't be caught dead watching the NFL if ...   
1  Chris O'Donnell stated that while filming for ...   
2  The whole game was a rollercoaster ride, but L...   
3  Zendaya slayed in Dune 2, as she does in all h...   
4  While my favorite player was playing this matc...   
5  My uncle's brother's neighbor's cat's veterina...   
6  He said that The Great Gatsby is the best nove...   
7  I could not look away from this train wrck of ...   
8  The film Everything Everywhere All At Once fol...   
9  I just finished reading pride and prejudice wh...   

                                predicted_topics  
0                                       [sports]  
1     [celebrity_&_pop_culture, film_tv_&_video]  
2                                       [sports]  
3     [celebrity_&_pop_culture, film_tv_&_video]  
4                                       [sports]  
5  [diaries_&_daily_life, news_&_social_concern]  
6     [celebrity_&_pop_cul

### 3.2 Naive Bayes

In [45]:
naive_bayes_dataset = load_dataset("valurank/Topic_Classification")

# Remove None values
naive_bayes_dataset['train'] = naive_bayes_dataset['train'].filter(lambda example: example['article_text'] is not None)

# Apply preprocessing
naive_bayes_dataset['train'] = naive_bayes_dataset['train'].map(lambda x: {'article_text': preprocess_text(x['article_text'])})

train_data, validation_data = train_test_split(naive_bayes_dataset['train'], test_size=0.1, random_state=42)

X_train = train_data['article_text']
y_train = train_data['topic']
X_val = validation_data['article_text']
y_val = validation_data['topic']

X_test = sentiment_topic_test_data['text'].apply(preprocess_text)
y_test = sentiment_topic_test_data['topic']

Found cached dataset csv (C:/Users/ilyas/.cache/huggingface/datasets/valurank___csv/valurank--Topic_Classification-31f87df3854a46bd/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


  0%|          | 0/1 [00:00<?, ?it/s]

Loading cached processed dataset at C:\Users\ilyas\.cache\huggingface\datasets\valurank___csv\valurank--Topic_Classification-31f87df3854a46bd\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-02e088ba313b0923.arrow


Map:   0%|          | 0/22462 [00:00<?, ? examples/s]

In [46]:
nb_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('nb', MultinomialNB())
])

# Train the Naive Bayes model
nb_pipeline.fit(X_train, y_train)

y_val_pred = nb_pipeline.predict(X_val)
y_test_pred = nb_pipeline.predict(X_test)

### 3.2.1 Results

In [47]:
sentiment_topic_test_data['predicted_sentiment'] = y_test_pred

print(sentiment_topic_test_data[['text', 'predicted_sentiment']])

                                                text  \
0  I wouldn't be caught dead watching the NFL if ...   
1  Chris O'Donnell stated that while filming for ...   
2  The whole game was a rollercoaster ride, but L...   
3  Zendaya slayed in Dune 2, as she does in all h...   
4  While my favorite player was playing this matc...   
5  My uncle's brother's neighbor's cat's veterina...   
6  He said that The Great Gatsby is the best nove...   
7  I could not look away from this train wrck of ...   
8  The film Everything Everywhere All At Once fol...   
9  I just finished reading pride and prejudice wh...   

              predicted_sentiment  
0                        Football  
1                          Movies  
2                      Basketball  
3                       Celebrity  
4                        Football  
5  Extreme Weather and Cataclysms  
6                        Football  
7  Extreme Weather and Cataclysms  
8  Extreme Weather and Cataclysms  
9  Extreme Weather and 