# extract features for our stacked ensemble model

extra features:
- amount of "!" exclamation marks
- amount of "@" handles
- amount of "#" hashtags
- lenght of sentences
- number of words
- proportion of uppercase characters to lowercase characters
- number of negation words
- polarity of the post (vader sentiment analysis) 
- amount of personal pronouns
- emotion conveying words

https://aclanthology.org/W18-4418/

In [1]:
!pip install NRCLex


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import pandas as pd
import numpy as np
import nltk
import re
import spacy
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nrclex import NRCLex

In [3]:
nltk.download("vader_lexicon")
nlp = spacy.load("en_core_web_sm")
nltk.download('punkt')
sent_analyzer = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
df_olid = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/subjectivity mining/olid-train.csv") 
df_hasoc = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/subjectivity mining/hasoc-train.csv") 
df_olid_test = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/subjectivity mining/olid-test.csv") 

In [5]:
def add_features(df_dataset):
  df_new = df_dataset.copy()
  sentences_of_df = df_dataset["text"]
  
  #proportion of uppercase characters to lowercase characters
  uppercase_percentage = []
  for i in sentences_of_df:
    x_chars = ''.join(i.split())
    x_upper = sum(map(str.isupper, x_chars)) / (len(x_chars))
    uppercase_percentage.append(x_upper)
  df_new["uppercase_percentage"] = uppercase_percentage
  
  #count of characters
  df_new["amount_of_characters"] = sentences_of_df.str.len()

  #get amount of exclamation marks in sentence
  df_new["amount_of_exclamations"] = sentences_of_df.str.count("!")

  #get amount of hastags marks in sentence
  df_new["amount_of_hashtags"] = sentences_of_df.str.count("#")

  #get amount of handles (@) in sentence
  df_new["amount_of_handles"] = sentences_of_df.str.count("@")

  #get amount of words
  count_words = [len(sentences_of_df[i].split()) for i in range(len(df_dataset["text"]))]
  print(sentences_of_df)
  df_new["amount_of_words"] = count_words

  #get amount of personal pronouns
  import re
  pronounRegex = re.compile(r'\b(I|we|my|ours|(?-i:us))\b',re.I)
  amount_pronous = []
  for i in sentences_of_df:
    amount_pronous.append(len(pronounRegex.findall(i)))

  df_new["amount_personal_pronouns"] = amount_pronous

  #get number of negation words
  negation_words = []

  for i in sentences_of_df:
    doc = nlp(i)
    negation = [tok for tok in doc if tok.dep_ == 'neg']
    negation_words.append(len(negation))
  df_new["amount_negation_words"] = negation_words

  #get emotion conveying words

  df_new['emotions'] = sentences_of_df.apply(lambda x: NRCLex(x).affect_frequencies)
  df_new = pd.concat([df_new.drop(['emotions'], axis = 1), df_new['emotions'].apply(pd.Series)], axis = 1)

  #add polarity of sentence
  polarity = [sent_analyzer.polarity_scores(i) for i in sentences_of_df]
  neg = [i["neg"] for i in polarity]
  neu = [i["neu"] for i in polarity]
  pos = [i["pos"] for i in polarity]
  compound = [i["compound"] for i in polarity]
  df_new["neg_polarity"] = neg
  df_new["neutral_polarity"] = neu
  df_new["pos_polarity"] = pos
  df_new["compound_polarity"] = compound

  df_new = df_new.drop(columns=['text', 'labels'])
  return df_new

In [6]:
new_olid = add_features(df_olid)
new_hasoc = add_features(df_hasoc)
new_olid_test = add_features(df_olid_test)
new_olid_test   

0        @USER She should ask a few native Americans wh...
1        @USER @USER Go home you’re drunk!!! @USER #MAG...
2        Amazon is investigating Chinese employees who ...
3        @USER Someone should'veTaken" this piece of sh...
4        @USER @USER Obama wanted liberals &amp; illega...
                               ...                        
13235    @USER Sometimes I get strong vibes from people...
13236    Benidorm ✅  Creamfields ✅  Maga ✅   Not too sh...
13237    @USER And why report this garbage.  We don't g...
13238                                          @USER Pussy
13239    #Spanishrevenge vs. #justice #HumanRights and ...
Name: text, Length: 13240, dtype: object
0       #DhoniKeepsTheGlove | WATCH: Sports Minister K...
1       @politico No. We should remember very clearly ...
2       @cricketworldcup Guess who would be the winner...
3       Corbyn is too politically intellectual for #Bo...
4       All the best to #TeamIndia for another swimmin...
                    

Unnamed: 0,id,uppercase_percentage,amount_of_characters,amount_of_exclamations,amount_of_hashtags,amount_of_handles,amount_of_words,amount_personal_pronouns,amount_negation_words,fear,...,positive,negative,sadness,disgust,joy,anticipation,neg_polarity,neutral_polarity,pos_polarity,compound_polarity
0,15923,0.269406,245,3,7,1,27,0,0,0.000000,...,0.000000,0.000000,0.00,0.00,0.000000,,0.288,0.638,0.075,-0.8260
1,27014,0.060606,111,0,1,0,13,0,0,1.000000,...,0.000000,0.000000,0.00,0.00,0.000000,,0.223,0.532,0.245,-0.1531
2,30530,0.426901,204,0,14,8,28,0,0,0.000000,...,0.000000,0.000000,0.00,0.00,0.000000,,0.000,1.000,0.000,0.0000
3,13876,0.094017,139,0,3,1,23,0,0,0.000000,...,0.222222,0.000000,0.00,0.00,0.222222,0.222222,0.000,0.810,0.190,0.5719
4,60133,0.121951,98,1,4,0,17,0,0,0.000000,...,0.000000,1.000000,0.00,0.00,0.000000,,0.000,1.000,0.000,0.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
855,73439,0.163462,121,0,1,1,18,0,0,0.000000,...,0.000000,0.250000,0.25,0.25,0.000000,,0.137,0.863,0.000,-0.4019
856,25657,0.138756,248,0,2,1,40,1,0,0.090909,...,0.181818,0.090909,0.00,0.00,0.181818,0.181818,0.051,0.845,0.104,0.2960
857,67018,0.000000,205,0,0,0,37,1,0,0.000000,...,0.200000,0.200000,0.00,0.00,0.200000,0.400000,0.182,0.746,0.072,-0.5267
858,50665,0.031111,267,0,2,0,42,0,0,0.125000,...,0.187500,0.187500,0.00,0.00,0.062500,0.125000,0.131,0.869,0.000,-0.7096


In [7]:
new_olid

Unnamed: 0,id,uppercase_percentage,amount_of_characters,amount_of_exclamations,amount_of_hashtags,amount_of_handles,amount_of_words,amount_personal_pronouns,amount_negation_words,fear,...,positive,negative,sadness,disgust,joy,anticipation,neg_polarity,neutral_polarity,pos_polarity,compound_polarity
0,86426,0.103448,71,0,0,1,14,0,0,0.000000,...,0.0,0.000000,0.0,0.000000,0.0,,0.000,1.000,0.000,0.0000
1,90194,0.368421,67,3,2,3,11,0,0,0.000000,...,0.0,0.000000,0.0,0.000000,0.0,,0.247,0.753,0.000,-0.5067
2,16820,0.141026,182,0,5,0,27,0,0,0.000000,...,0.0,0.000000,0.0,0.000000,0.0,,0.000,0.880,0.120,0.3400
3,62688,0.109091,65,0,0,1,11,0,0,0.166667,...,0.0,0.333333,0.0,0.166667,0.0,,0.286,0.714,0.000,-0.5574
4,43605,0.147541,72,0,0,2,12,0,0,0.000000,...,0.0,0.000000,0.0,0.000000,0.0,,0.000,1.000,0.000,0.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13235,95338,0.076190,129,0,0,1,25,1,0,0.500000,...,0.0,0.500000,0.0,0.000000,0.0,,0.241,0.652,0.108,-0.6240
13236,67210,0.085106,62,0,0,0,12,0,1,0.000000,...,0.0,0.500000,0.0,0.500000,0.0,,0.000,1.000,0.000,0.0000
13237,82921,0.130435,57,0,0,1,11,1,1,0.000000,...,0.0,0.500000,0.0,0.500000,0.0,,0.000,0.805,0.195,0.2924
13238,27429,0.500000,11,0,0,1,2,0,0,0.000000,...,0.0,0.000000,0.0,0.000000,0.0,,0.000,1.000,0.000,0.0000


In [8]:
new_hasoc.to_csv("Hasoc_extra_features.csv", index=False)
new_olid.to_csv("olid_extra_features.csv", index=False)
new_olid_test.to_csv("olid_test_extra_features.csv", index=False)