# Feature engineering

In [1]:
#libraries
import pandas as pd
import numpy as np


In [2]:
# Load the three preprocessed splits. The file names are listed in the same
# order as the targets on the left-hand side of the unpacking.
df_train, df_valid, df_test = (
    pd.read_csv(csv_name)
    for csv_name in (
        'train_preprocessed.csv',
        'valid_preprocessed.csv',
        'test_preprocessed.csv',
    )
)

# Peek at the first rows of the training split only; the other two splits
# can be inspected the same way with .head().
print(df_train.head())

                                                text  label
0  ['state', 'slow', 'to', 'shut', 'down', 'weak'...      0
1  ['drone', 'place', 'fresh', 'kill', 'on', 'ste...      1
2  ['report', ':', 'majority', 'of', 'instance', ...      1
3  ['sole', 'remain', 'lung', 'fill', 'with', 'ri...      1
4      ['the', 'gop', "'s", 'stockholm', 'syndrome']      0


## Tf-idf vectorization


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Take the 'text' column of each split's CSV (train.csv / valid.csv / test.csv).
df_train_str, df_valid_str, df_test_str = (
    pd.read_csv(csv_name)['text']
    for csv_name in ('train.csv', 'valid.csv', 'test.csv')
)

# The vectorizer is fitted on the training split only; the validation and test
# splits are transformed with that already-fitted vectorizer.
# lowercase=False: the vectorizer does not lowercase the input itself.
vectorizer = TfidfVectorizer(lowercase=False)
train_tfidf = vectorizer.fit_transform(df_train_str)
valid_tfidf, test_tfidf = (
    vectorizer.transform(split_text)
    for split_text in (df_valid_str, df_test_str)
)

# Show the stored entries of the first ten training rows.
print(train_tfidf[:10, :])


  (0, 16124)	0.3992580875734305
  (0, 6733)	0.33285362959347153
  (0, 20569)	0.3175994757862391
  (0, 22539)	0.3959532813475212
  (0, 6367)	0.25803768539922634
  (0, 18737)	0.35965961259181517
  (0, 20979)	0.10798595426714692
  (0, 19058)	0.3992580875734305
  (0, 19687)	0.32426307516649433
  (1, 10020)	0.27323780791536617
  (1, 22708)	0.27847832528799155
  (1, 14322)	0.13429432618359566
  (1, 19751)	0.3914062605065885
  (1, 14409)	0.1757311845631749
  (1, 11427)	0.35063042205931494
  (1, 8332)	0.42908915986669816
  (1, 15455)	0.3966897313978762
  (1, 6472)	0.4257390088404465
  (2, 22816)	0.2929305151697847
  (2, 22760)	0.2598197862928088
  (2, 2919)	0.3136336167987538
  (2, 22252)	0.2392239245628198
  (2, 688)	0.14737869534929005
  (2, 10303)	0.2598197862928088
  (2, 14299)	0.3373086685585056
  :	:
  (8, 12252)	0.2920963651267997
  (8, 20928)	0.20738491930618924
  (8, 13479)	0.20220106107946975
  (8, 5220)	0.3292318271196743
  (8, 14339)	0.28874743883760884
  (8, 11832)	0.2887474388376

## Static embeddings - hyperparameter: window size, GloVe vs word2vec
word2vec context is interesting, but rare co-occurrence can also indicate sarcasm.

## Sentiment frequency

In [8]:

import ast
from nltk.sentiment import SentimentIntensityAnalyzer
import nltk

# Make sure the VADER lexicon needed by SentimentIntensityAnalyzer is available.
nltk.download('vader_lexicon', quiet = True)

sia = SentimentIntensityAnalyzer()


def _load_with_sentiment(csv_path):
    """Read one preprocessed split and attach a VADER compound score to each row.

    The 'text' column of the CSV holds a stringified Python list of tokens. Each
    value is parsed with ast.literal_eval and re-joined with single spaces
    (list -> sentence) before it is scored.

    Parameters
    ----------
    csv_path : str
        Path of the preprocessed CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The frame read from ``csv_path`` with 'text' replaced by the joined
        sentence and a new 'sentiment' column holding the 'compound' value of
        ``sia.polarity_scores`` for that sentence.
    """
    frame = pd.read_csv(csv_path)
    # Convert list -> sentence
    frame['text'] = frame['text'].apply(lambda tokens: " ".join(ast.literal_eval(tokens)))
    # Keep only the 'compound' entry of the score dict as the feature.
    frame['sentiment'] = frame['text'].apply(lambda sentence: sia.polarity_scores(sentence)['compound'])
    return frame


# The same pipeline is applied to every split through the single helper above.
dftrainSent = _load_with_sentiment("train_preprocessed.csv")
dfvalidationSent = _load_with_sentiment("valid_preprocessed.csv")
dftestSent = _load_with_sentiment("test_preprocessed.csv")


print(dftrainSent.head())
print(dfvalidationSent.head())
print(dftestSent.head())

                                                text  label  sentiment
0  state slow to shut down weak teacher education...      0    -0.4404
1      drone place fresh kill on step of white house      1    -0.5267
2  report : majority of instance of people get li...      1     0.0000
3   sole remain lung fill with rich , satisfy flavor      1     0.7650
4                      the gop 's stockholm syndrome      0     0.0000
                                                text  label  sentiment
0                      prejudice do not discriminate      0    -0.5106
1             entire house implicate by phish poster      1     0.0000
2  lustful man sensually use one hand to unhook c...      1     0.4939
3          area man get terrible creative juice flow      1    -0.0516
4  college graduate first person in family to was...      1    -0.4215
                                                text  label  sentiment
0    intuition or ego ? 3 simple step to reach truth      0     0.3400
1  phy

## Sentence length

In [10]:
# Word count per row: number of whitespace-separated pieces in 'text'.
# The column is added in place to each of the three sentiment frames.
for _split_frame in (dftrainSent, dfvalidationSent, dftestSent):
    _split_frame['length_words'] = _split_frame['text'].map(
        lambda sentence: len(sentence.split())
    )

# Print the heads only after every frame has its new column, in the order
# train, validation, test.
for _split_frame in (dftrainSent, dfvalidationSent, dftestSent):
    print(_split_frame.head())

                                                text  label  sentiment  \
0  state slow to shut down weak teacher education...      0    -0.4404   
1      drone place fresh kill on step of white house      1    -0.5267   
2  report : majority of instance of people get li...      1     0.0000   
3   sole remain lung fill with rich , satisfy flavor      1     0.7650   
4                      the gop 's stockholm syndrome      0     0.0000   

   length_words  
0             9  
1             9  
2            20  
3             9  
4             5  
                                                text  label  sentiment  \
0                      prejudice do not discriminate      0    -0.5106   
1             entire house implicate by phish poster      1     0.0000   
2  lustful man sensually use one hand to unhook c...      1     0.4939   
3          area man get terrible creative juice flow      1    -0.0516   
4  college graduate first person in family to was...      1    -0.4215   

  

## Punctuation

## First/last word frequency


## Bag of words (n-grams)

## Parts of speech frequency