# Feature engineering

In [44]:
#libraries
import pandas as pd
import numpy as np


In [45]:
# data

# Load the three preprocessed splits in one pass (train, validation, test).
df_train, df_valid, df_test = map(
    pd.read_csv,
    ('train_preprocessed.csv', 'valid_preprocessed.csv', 'test_preprocessed.csv'),
)

# Quick look at the training split only; valid/test have the same layout.
print(df_train.head())

                                                text  label
0  ['state', 'slow', 'to', 'shut', 'down', 'weak'...      0
1  ['drone', 'place', 'fresh', 'kill', 'on', 'ste...      1
2  ['report', ':', 'majority', 'of', 'instance', ...      1
3  ['sole', 'remain', 'lung', 'fill', 'with', 'ri...      1
4      ['the', 'gop', "'s", 'stockholm', 'syndrome']      0


## TF-IDF vectorization


In [46]:
# Text column of each split, read from the non-preprocessed CSVs
# (presumably the raw sentence strings -- confirm against the data files).
df_train_str = pd.read_csv('train.csv')['text']
df_valid_str = pd.read_csv('valid.csv')['text']
df_test_str  = pd.read_csv('test.csv')['text']


from sklearn.feature_extraction.text import TfidfVectorizer

# Keep the fitted TF-IDF model under its own name: the Bag-of-words section
# rebinds `vectorizer` to a CountVectorizer, which would otherwise make the
# fitted TF-IDF vocabulary unreachable for later use.
tfidf_vectorizer = TfidfVectorizer(lowercase=False)
vectorizer = tfidf_vectorizer  # backward-compatible alias for existing references

# Fit on train only; valid/test are projected onto the train vocabulary.
train_tfidf = tfidf_vectorizer.fit_transform(df_train_str)
valid_tfidf = tfidf_vectorizer.transform(df_valid_str)
test_tfidf  = tfidf_vectorizer.transform(df_test_str)

# Sparse (row, feature_index) -> weight entries for the first ten documents.
print(train_tfidf[:10, :])


  (0, 16124)	0.3992580875734305
  (0, 6733)	0.33285362959347153
  (0, 20569)	0.3175994757862391
  (0, 22539)	0.3959532813475212
  (0, 6367)	0.25803768539922634
  (0, 18737)	0.35965961259181517
  (0, 20979)	0.10798595426714692
  (0, 19058)	0.3992580875734305
  (0, 19687)	0.32426307516649433
  (1, 10020)	0.27323780791536617
  (1, 22708)	0.27847832528799155
  (1, 14322)	0.13429432618359566
  (1, 19751)	0.3914062605065885
  (1, 14409)	0.1757311845631749
  (1, 11427)	0.35063042205931494
  (1, 8332)	0.42908915986669816
  (1, 15455)	0.3966897313978762
  (1, 6472)	0.4257390088404465
  (2, 22816)	0.2929305151697847
  (2, 22760)	0.2598197862928088
  (2, 2919)	0.3136336167987538
  (2, 22252)	0.2392239245628198
  (2, 688)	0.14737869534929005
  (2, 10303)	0.2598197862928088
  (2, 14299)	0.3373086685585056
  :	:
  (8, 12252)	0.2920963651267997
  (8, 20928)	0.20738491930618924
  (8, 13479)	0.20220106107946975
  (8, 5220)	0.3292318271196743
  (8, 14339)	0.28874743883760884
  (8, 11832)	0.2887474388376

## Static embeddings - hyperparameter: window size, GloVe vs word2vec
word2vec context is interesting, but rare co-occurrence can also indicate sarcasm

## Sentiment frequency

In [47]:

import ast
from nltk.sentiment import SentimentIntensityAnalyzer
import nltk

# VADER needs its lexicon available locally before the analyzer is built.
nltk.download('vader_lexicon', quiet=True)

# Fresh copies of the preprocessed splits, used only to build sentence strings.
dftrainSent, dfvalidationSent, dftestSent = (
    pd.read_csv(path)
    for path in ("train_preprocessed.csv", "valid_preprocessed.csv", "test_preprocessed.csv")
)

# Each `text` cell is the string form of a token list; parse it and join the
# tokens with single spaces so VADER receives a plain sentence.
for sent_frame in (dftrainSent, dfvalidationSent, dftestSent):
    sent_frame['text'] = sent_frame['text'].apply(
        lambda serialized: " ".join(ast.literal_eval(serialized))
    )

sia = SentimentIntensityAnalyzer()

# Store VADER's 'compound' score as the per-document sentiment feature.
for target, sentences in (
    (df_train, dftrainSent),
    (df_valid, dfvalidationSent),
    (df_test, dftestSent),
):
    target['sentiment'] = sentences['text'].apply(
        lambda sentence: sia.polarity_scores(sentence)['compound']
    )

for target in (df_train, df_valid, df_test):
    print(target.head())

                                                text  label  sentiment
0  ['state', 'slow', 'to', 'shut', 'down', 'weak'...      0    -0.4404
1  ['drone', 'place', 'fresh', 'kill', 'on', 'ste...      1    -0.5267
2  ['report', ':', 'majority', 'of', 'instance', ...      1     0.0000
3  ['sole', 'remain', 'lung', 'fill', 'with', 'ri...      1     0.7650
4      ['the', 'gop', "'s", 'stockholm', 'syndrome']      0     0.0000
                                                text  label  sentiment
0         ['prejudice', 'do', 'not', 'discriminate']      0    -0.5106
1  ['entire', 'house', 'implicate', 'by', 'phish'...      1     0.0000
2  ['lustful', 'man', 'sensually', 'use', 'one', ...      1     0.4939
3  ['area', 'man', 'get', 'terrible', 'creative',...      1    -0.0516
4  ['college', 'graduate', 'first', 'person', 'in...      1    -0.4215
                                                text  label  sentiment
0  ['intuition', 'or', 'ego', '?', '3', 'simple',...      0     0.3400
1  ['p

## Sentence length

In [48]:
import ast  # also imported by the sentiment cell; repeated so this cell runs on its own

# Re-read the splits so `text` is guaranteed to be the serialized token list,
# even if df_*['text'] has already been converted to Python lists elsewhere.
dftrainLen = pd.read_csv("train_preprocessed.csv")
dfvalidationLen = pd.read_csv("valid_preprocessed.csv")
dftestLen = pd.read_csv("test_preprocessed.csv")

# Count tokens by parsing the list literal. Splitting the raw string on
# whitespace reports 1 for an empty list ('[]') and over-counts any token that
# itself contains whitespace.
df_train['length_words'] = dftrainLen['text'].apply(lambda x: len(ast.literal_eval(x)))
df_valid['length_words'] = dfvalidationLen['text'].apply(lambda x: len(ast.literal_eval(x)))
df_test['length_words'] = dftestLen['text'].apply(lambda x: len(ast.literal_eval(x)))
print(df_train.head())
print(df_valid.head())
print(df_test.head())

                                                text  label  sentiment  \
0  ['state', 'slow', 'to', 'shut', 'down', 'weak'...      0    -0.4404   
1  ['drone', 'place', 'fresh', 'kill', 'on', 'ste...      1    -0.5267   
2  ['report', ':', 'majority', 'of', 'instance', ...      1     0.0000   
3  ['sole', 'remain', 'lung', 'fill', 'with', 'ri...      1     0.7650   
4      ['the', 'gop', "'s", 'stockholm', 'syndrome']      0     0.0000   

   length_words  
0             9  
1             9  
2            20  
3             9  
4             5  
                                                text  label  sentiment  \
0         ['prejudice', 'do', 'not', 'discriminate']      0    -0.5106   
1  ['entire', 'house', 'implicate', 'by', 'phish'...      1     0.0000   
2  ['lustful', 'man', 'sensually', 'use', 'one', ...      1     0.4939   
3  ['area', 'man', 'get', 'terrible', 'creative',...      1    -0.0516   
4  ['college', 'graduate', 'first', 'person', 'in...      1    -0.4215   

  

## Punctuation

## First/last word frequency


## Bag of words (n-grams)

In [49]:

# df_train['text'] should be like: [['I', 'love', 'this', '!'], ['So', 'funny', '...'], ...]
from sklearn.feature_extraction.text import CountVectorizer

def identity_tokenizer(tokens):
    """Return ``tokens`` unchanged.

    Supplied to ``CountVectorizer`` as its ``tokenizer`` so that documents
    which are already token lists are used as-is instead of being re-split.
    """
    return tokens

# Bag-of-words counter for documents that are ALREADY token lists:
# both the preprocessing and the tokenizing steps are pass-throughs.
vectorizer = CountVectorizer(
    tokenizer=identity_tokenizer,
    # Named pass-through instead of a lambda so the fitted vectorizer can be
    # pickled (lambdas cannot be serialized by the standard pickle module).
    preprocessor=identity_tokenizer,
    # The default regex is unused once a custom tokenizer is given; setting it
    # to None avoids scikit-learn's "token_pattern will not be used" warning.
    token_pattern=None,
    ngram_range=(1,2),   # unigrams and bigrams
    lowercase=False      # tokens are lists, not strings, so no lowercasing step
)

import ast

# Parse the serialized token lists into real Python lists.
# The isinstance guard makes the cell safe to re-run: once a column holds
# lists, calling ast.literal_eval on them again would raise.
df_train['text'] = df_train['text'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
df_valid['text'] = df_valid['text'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
df_test['text']  = df_test['text'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)


# Learn the n-gram vocabulary from the training split only, then project the
# validation and test splits onto that same vocabulary.
X_train_bow = vectorizer.fit_transform(df_train['text'])
X_valid_bow, X_test_bow = (
    vectorizer.transform(split['text']) for split in (df_valid, df_test)
)

# Vocabulary as an array of n-gram strings, in column order of the matrices.
features = vectorizer.get_feature_names_out()
print("Number of features:", len(features))
print("Some features:", features[:30])

# Column sums of the training matrix = total occurrences of each n-gram.
word_counts = np.asarray(X_train_bow.sum(axis=0)).flatten()
freq_list = list(zip(features, word_counts))

# Most frequent n-grams first.
freq_list_sorted = sorted(freq_list, key=lambda pair: pair[1], reverse=True)

top_terms = freq_list_sorted[:10]
for word, count in top_terms:
    print(word, count)




Number of features: 153371
Some features: ['!' '! !' "! '" "! 'westworld" '! (' '! )' '! ,' '! ;' '! again' '! and'
 '! announce' '! british' '! bronco' '! but' '! change' '! check'
 '! dance' '! die' '! disability' '! dumbledore' '! early' '! episode'
 '! for' '! hillary' '! how' '! huffpost' '! in' '! kill' '! man' '! my']
to 6730
of 4679
the 3894
's 3839
in 3455
' 3298
be 3147
, 2978
a 2972
for 2701


## Parts of speech frequency

In [50]:
import nltk
from nltk.corpus import wordnet
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger_eng')
def get_wordnet_pos(tag):
    """Map a POS tag to a WordNet POS constant by its leading letter.

    Tags starting with 'J', 'V', 'N' or 'R' map to ``wordnet.ADJ``,
    ``wordnet.VERB``, ``wordnet.NOUN`` and ``wordnet.ADV`` respectively.
    Every other tag falls back to ``wordnet.NOUN``.
    """
    prefix_table = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wn_pos in prefix_table:
        if tag.startswith(prefix):
            return wn_pos
    return wordnet.NOUN   # default

def tokens_to_pos_df(df, token_col='tokens'):
    """Tag every token in ``df[token_col]`` and return one row per token.

    Each cell of ``df[token_col]`` is passed to ``nltk.pos_tag``. The
    resulting (word, tag) pairs from all rows are concatenated in order.

    Returns a DataFrame with columns ``word`` and ``pos``.
    """
    tagged_pairs = [
        pair
        for tokens in df[token_col]
        for pair in nltk.pos_tag(tokens)   # list of (word, pos)
    ]
    words = [token for token, _ in tagged_pairs]
    pos_tags = [tag for _, tag in tagged_pairs]
    return pd.DataFrame({'word': words, 'pos': pos_tags})


# One row per training token with its fine-grained POS tag.
pos_df = tokens_to_pos_df(df_train, token_col='text')
print(pos_df.head())
# Distinct tags actually produced by the tagger on the training split.
print(pos_df['pos'].unique())

# POS meanings
'''
NN - Noun, singular (dog, car)
NNS - Noun, plural (dogs, cars)
NNP - Proper noun, singular (London, John)
NNPS - Proper noun, plural (Americans)
PRP - Personal pronoun (I, you, he)
PRP$ - Possessive pronoun (my, your, his)
WP - Wh-pronoun (who, what)
WP$ - Possessive wh-pronoun (whose)
EX - Existential "there" (There is…)

VB - Verb, base form (eat, run)
VBD - Verb, past tense (ate, ran)
VBG - Verb, gerund/present participle (eating, running)
VBN - Verb, past participle (eaten, run)
VBP - Verb, non-3rd person singular present (I eat, they run)
VBZ - Verb, 3rd person singular present (she eats)

JJ - Adjective
JJR - Comparative adjective (bigger)
JJS - Superlative adjective (biggest)
RB - Adverb (quickly)
RBR - Comparative adverb (faster)
RBS - Superlative adverb (fastest)
WRB - Wh-adverb (where, when, why)

IN - Preposition or subordinating conjunction (in, of, because)
TO - "to"
DT - Determiner (the, a)
PDT - Predeterminer (all the kids)
CC - Coordinating conjunction (and, but)
MD - Modal (can, should)

RP - Particle (up, off) — as in "pick up"
POS - Possessive ending ('s)
CD - Cardinal number (1, three)
UH - Interjection (oh, wow)
FW - Foreign word
SYM - Symbol
$ - Dollar sign
# - Pound/hash sign

. - Sentence-final punctuation
, - Comma
: - Colon or semicolon


'''

# I will also add in a high level POS
''' 
N - 
NN
NNS
NNP 
NNPS 
PRP
PRP$
WP 
WP$
EX 

V-
VB 
VBD 
VBG 
VBN
VBP
VBZ

A-
JJ
JJR 
JJS
RB 
RBR
RBS
WRB

P-
IN 
TO 
DT 
PDT 
CC 
MD

O:
RP 
POS 
CD 
UH
FW
SYM 
$
# 

U-
.
, 
:

'''


# Fine-grained POS tag -> coarse category used for the POS count features:
#   N = nouns/pronouns, V = verbs, A = adjectives/adverbs,
#   P = prepositions/determiners/conjunctions/modals, O = other, U = punctuation.
# Tags missing from this table are treated as 'O' by the lookups that use it.
pos_mapping = {
    # Nouns
    'NN': 'N', 'NNS': 'N', 'NNP': 'N', 'NNPS': 'N', 
    'PRP': 'N', 'PRP$': 'N', 'WP': 'N', 'WP$': 'N', 'EX': 'N',
    
    # Verbs
    'VB': 'V', 'VBD': 'V', 'VBG': 'V', 'VBN': 'V', 'VBP': 'V', 'VBZ': 'V',
    
    # Adjectives / Adverbs
    'JJ': 'A', 'JJR': 'A', 'JJS': 'A', 
    'RB': 'A', 'RBR': 'A', 'RBS': 'A', 'WRB': 'A',
    
    # Prepositions / Determiners / Modals / Conjunctions
    # WDT (wh-determiner) occurs in the tagged training data and belongs with
    # the other determiners rather than in the 'O' fallback.
    'IN': 'P', 'TO': 'P', 'DT': 'P', 'PDT': 'P', 'WDT': 'P', 'CC': 'P', 'MD': 'P',
    
    # Other
    'RP': 'O', 'POS': 'O', 'CD': 'O', 'UH': 'O', 'FW': 'O', 'SYM': 'O', '$': 'O', '#': 'O',
    
    # Punctuation
    # Brackets and quote marks also occur in the tagged training data; without
    # explicit entries they would be counted as 'O' instead of punctuation.
    '.': 'U', ',': 'U', ':': 'U',
    '(': 'U', ')': 'U', "''": 'U', '``': 'U'
}

# Convert to DataFrame
def words_with_pos(df, token_col='text'):
    """
    Build a per-token table from a DataFrame of tokenized text.

    Every cell of ``df[token_col]`` is tagged with ``nltk.pos_tag``. Each
    resulting token becomes one row with the columns:
    word | fine_pos | high_level_pos

    ``high_level_pos`` is looked up in ``pos_mapping``; tags that are not in
    the mapping get 'O'.
    """
    records = [
        {
            'word': token,
            'fine_pos': tag,
            'high_level_pos': pos_mapping.get(tag, 'O'),  # default 'O' if not mapped
        }
        for sentence in df[token_col]
        for token, tag in nltk.pos_tag(sentence)
    ]
    return pd.DataFrame(records)

# Per-token table (word, fine tag, coarse tag) for the training split.
df_words = words_with_pos(df_train, 'text')
print(df_words.head())


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\acotl\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\acotl\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


    word pos
0  state  NN
1   slow  NN
2     to  TO
3   shut  VB
4   down  RP
['NN' 'TO' 'VB' 'RP' 'JJ' 'IN' ':' 'NNS' 'VBP' 'RB' 'VBG' ',' 'DT' 'POS'
 'CD' 'PRP$' "''" 'JJR' 'MD' 'PRP' 'PDT' 'VBN' 'WRB' 'CC' 'VBZ' 'NNP'
 'JJS' 'RBS' 'WP' '.' '(' ')' 'SYM' 'VBD' 'RBR' 'WDT' '$' '``' 'UH' '#'
 'EX' 'NNPS' 'FW' 'WP$']
    word fine_pos high_level_pos
0  state       NN              N
1   slow       NN              N
2     to       TO              P
3   shut       VB              V
4   down       RP              O


In [51]:
def pos_counts(tokens):
    """Count coarse POS categories for one tokenized document.

    ``tokens`` is tagged with ``nltk.pos_tag``. Each fine-grained tag is
    mapped through ``pos_mapping``, with unmapped tags counted as 'O'.

    Returns a dict with the keys 'N', 'V', 'A', 'P', 'O', 'U' (in that order),
    each holding the number of tokens in that category.
    """
    tally = dict.fromkeys(('N', 'V', 'A', 'P', 'O', 'U'), 0)
    for _, tag in nltk.pos_tag(tokens):
        category = pos_mapping.get(tag, 'O')  # default 'O' if not mapped
        tally[category] = tally[category] + 1
    return tally

# Ensure all text columns are lists
# (the sentiment cell stored each document as one space-joined string, so a
# string is split back into tokens; values that are already lists pass through).
dftrainSent['text'] = dftrainSent['text'].apply(lambda x: x.split() if isinstance(x, str) else x)
dfvalidationSent['text'] = dfvalidationSent['text'].apply(lambda x: x.split() if isinstance(x, str) else x)
dftestSent['text'] = dftestSent['text'].apply(lambda x: x.split() if isinstance(x, str) else x)

# Compute POS counts: one row of N/V/A/P/O/U counts per document.
# Building the frame once from the list of dicts avoids creating a separate
# pd.Series per row, which is what `.apply(pd.Series)` does.
pos_features_train = pd.DataFrame(
    dftrainSent['text'].apply(pos_counts).tolist(), index=dftrainSent.index
)
pos_features_validation = pd.DataFrame(
    dfvalidationSent['text'].apply(pos_counts).tolist(), index=dfvalidationSent.index
)
pos_features_test = pd.DataFrame(
    dftestSent['text'].apply(pos_counts).tolist(), index=dftestSent.index
)

# Drop POS columns left over from a previous run before concatenating, so
# re-running this cell replaces them instead of appending duplicate columns.
df_train = pd.concat(
    [df_train.drop(columns=pos_features_train.columns, errors='ignore'), pos_features_train],
    axis=1,
)
df_valid = pd.concat(
    [df_valid.drop(columns=pos_features_validation.columns, errors='ignore'), pos_features_validation],
    axis=1,
)
df_test = pd.concat(
    [df_test.drop(columns=pos_features_test.columns, errors='ignore'), pos_features_test],
    axis=1,
)


# Check results
print(df_train.head())
print(df_valid.head())
print(df_test.head())


                                                text  label  sentiment  \
0  [state, slow, to, shut, down, weak, teacher, e...      0    -0.4404   
1  [drone, place, fresh, kill, on, step, of, whit...      1    -0.5267   
2  [report, :, majority, of, instance, of, people...      1     0.0000   
3  [sole, remain, lung, fill, with, rich, ,, sati...      1     0.7650   
4                [the, gop, 's, stockholm, syndrome]      0     0.0000   

   length_words  N  V  A  P  O  U  
0             9  4  1  2  1  1  0  
1             9  5  0  2  2  0  0  
2            20  8  3  3  5  0  1  
3             9  5  0  2  1  0  1  
4             5  2  0  1  1  1  0  
                                                text  label  sentiment  \
0                 [prejudice, do, not, discriminate]      0    -0.5106   
1      [entire, house, implicate, by, phish, poster]      1     0.0000   
2  [lustful, man, sensually, use, one, hand, to, ...      1     0.4939   
3  [area, man, get, terrible, creative, jui