# Feature engineering

In [193]:
#libraries
import pandas as pd
import numpy as np
import re
from collections import Counter
import ast


In [194]:
# data

df_train = pd.read_csv('train_preprocessed.csv')
df_valid = pd.read_csv('valid_preprocessed.csv')
df_test = pd.read_csv('test_preprocessed.csv')

print(df_train.head())
# print(df_valid.head())
# print(df_test.head())

                                                text  label
0  ['state', 'slow', 'to', 'shut', 'down', 'weak'...      0
1  ['drone', 'place', 'fresh', 'kill', 'on', 'ste...      1
2  ['report', ':', 'majority', 'of', 'instance', ...      1
3  ['sole', 'remain', 'lung', 'fill', 'with', 'ri...      1
4      ['the', 'gop', "'s", 'stockholm', 'syndrome']      0


## Tf-idf vectorization


In [195]:
df_train_str = pd.read_csv('train.csv')['text']
df_valid_str = pd.read_csv('valid.csv')['text']
df_test_str  = pd.read_csv('test.csv')['text']


from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(lowercase=False)
train_tfidf = vectorizer.fit_transform(df_train_str)
valid_tfidf= vectorizer.transform(df_valid_str)
test_tfidf  = vectorizer.transform(df_test_str)

print(train_tfidf[:10, :]) 


<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 96 stored elements and shape (10, 23222)>
  Coords	Values
  (0, 19687)	0.32426307516649433
  (0, 19058)	0.3992580875734305
  (0, 20979)	0.10798595426714692
  (0, 18737)	0.35965961259181517
  (0, 6367)	0.25803768539922634
  (0, 22539)	0.3959532813475212
  (0, 20569)	0.3175994757862391
  (0, 6733)	0.33285362959347153
  (0, 16124)	0.3992580875734305
  (1, 6472)	0.4257390088404465
  (1, 15455)	0.3966897313978762
  (1, 8332)	0.42908915986669816
  (1, 11427)	0.35063042205931494
  (1, 14409)	0.1757311845631749
  (1, 19751)	0.3914062605065885
  (1, 14322)	0.13429432618359566
  (1, 22708)	0.27847832528799155
  (1, 10020)	0.27323780791536617
  (2, 20979)	0.0770000428999051
  (2, 14409)	0.11562957690780681
  (2, 14322)	0.1767289756377389
  (2, 17158)	0.16578859502292492
  (2, 12480)	0.24121415038283242
  (2, 10690)	0.35115768604059283
  (2, 15151)	0.17922721469222705
  :	:
  (8, 7472)	0.370173061967635
  (8, 15007)	0.3839689469174912
 

## Static embeddings - hyperparameter: window size, GloVe vs word2vec
word2vec context is interesting, but rare co-occurence can also indicate sarcasm

## Sentiment frequency

In [196]:

import ast
from nltk.sentiment import SentimentIntensityAnalyzer
import nltk

nltk.download('vader_lexicon', quiet = True)

dftrainSent = pd.read_csv("train_preprocessed.csv")
dfvalidationSent = pd.read_csv("valid_preprocessed.csv")
dftestSent = pd.read_csv("test_preprocessed.csv")


# Convert list → sentence
dftrainSent['text'] = dftrainSent['text'].apply(lambda x: " ".join(ast.literal_eval(x)))
dfvalidationSent['text'] = dfvalidationSent['text'].apply(lambda x: " ".join(ast.literal_eval(x)))
dftestSent['text'] = dftestSent['text'].apply(lambda x: " ".join(ast.literal_eval(x)))


sia = SentimentIntensityAnalyzer()

df_train['sentiment'] = dftrainSent['text'].apply(lambda x: sia.polarity_scores(x)['compound'])
df_valid['sentiment'] = dfvalidationSent['text'].apply(lambda x: sia.polarity_scores(x)['compound'])
df_test['sentiment'] = dftestSent['text'].apply(lambda x: sia.polarity_scores(x)['compound'])


print(df_train.head())
print(df_valid.head())
print(df_test.head())

                                                text  label  sentiment
0  ['state', 'slow', 'to', 'shut', 'down', 'weak'...      0    -0.4404
1  ['drone', 'place', 'fresh', 'kill', 'on', 'ste...      1    -0.5267
2  ['report', ':', 'majority', 'of', 'instance', ...      1     0.0000
3  ['sole', 'remain', 'lung', 'fill', 'with', 'ri...      1     0.7650
4      ['the', 'gop', "'s", 'stockholm', 'syndrome']      0     0.0000
                                                text  label  sentiment
0         ['prejudice', 'do', 'not', 'discriminate']      0    -0.5106
1  ['entire', 'house', 'implicate', 'by', 'phish'...      1     0.0000
2  ['lustful', 'man', 'sensually', 'use', 'one', ...      1     0.4939
3  ['area', 'man', 'get', 'terrible', 'creative',...      1    -0.0516
4  ['college', 'graduate', 'first', 'person', 'in...      1    -0.4215
                                                text  label  sentiment
0  ['intuition', 'or', 'ego', '?', '3', 'simple',...      0     0.3400
1  ['p

## Sentence length

In [197]:
dftrainLen = pd.read_csv("train_preprocessed.csv")
dfvalidationLen = pd.read_csv("valid_preprocessed.csv")
dftestLen = pd.read_csv("test_preprocessed.csv")

df_train['length_words'] = dftrainLen['text'].apply(lambda x: len(x.split()))
df_valid['length_words'] = dfvalidationLen['text'].apply(lambda x: len(x.split()))
df_test['length_words'] = dftestLen['text'].apply(lambda x: len(x.split()))
print(df_train.head())
print(df_valid.head())
print(df_test.head())

                                                text  label  sentiment  \
0  ['state', 'slow', 'to', 'shut', 'down', 'weak'...      0    -0.4404   
1  ['drone', 'place', 'fresh', 'kill', 'on', 'ste...      1    -0.5267   
2  ['report', ':', 'majority', 'of', 'instance', ...      1     0.0000   
3  ['sole', 'remain', 'lung', 'fill', 'with', 'ri...      1     0.7650   
4      ['the', 'gop', "'s", 'stockholm', 'syndrome']      0     0.0000   

   length_words  
0             9  
1             9  
2            20  
3             9  
4             5  
                                                text  label  sentiment  \
0         ['prejudice', 'do', 'not', 'discriminate']      0    -0.5106   
1  ['entire', 'house', 'implicate', 'by', 'phish'...      1     0.0000   
2  ['lustful', 'man', 'sensually', 'use', 'one', ...      1     0.4939   
3  ['area', 'man', 'get', 'terrible', 'creative',...      1    -0.0516   
4  ['college', 'graduate', 'first', 'person', 'in...      1    -0.4215   

  

## Punctuation

In [198]:
df_train_str = pd.read_csv('train.csv')['text']
df_valid_str = pd.read_csv('valid.csv')['text']
df_test_str  = pd.read_csv('test.csv')['text']

def extract_punctuation_features(text):
    features = {}
    features['exclamation_count'] = text.count('!')
    features['question_count'] = text.count('?')
    features['ellipsis_count'] = len(re.findall(r'\.{2,}', text))  # Two or more dots
    features['quote_count'] = text.count('"') + text.count("'")
    features['comma_count'] = text.count(',')
    features['period_count'] = text.count('.')
    features['semicolon_count'] = text.count(';')
    features['colon_count'] = text.count(':')
    features['dash_count'] = text.count('-') + text.count('—')
    features['multiple_exclamation'] = len(re.findall(r'!{2,}', text))
    features['multiple_question'] = len(re.findall(r'\?{2,}', text))
    features['mixed_punctuation'] = len(re.findall(r'[!?]{2,}', text))
    total_punct = sum([features['exclamation_count'], features['question_count'],
                       features['comma_count'], features['period_count']])
    text_length = len(text.split())
    features['punct_density'] = total_punct / max(text_length, 1)
    features['all_caps_words'] = len(re.findall(r'\b[A-Z]{2,}\b', text))
    
    return features

def create_punctuation_features(text_series):
    punct_features = text_series.apply(extract_punctuation_features) 
    return pd.DataFrame(punct_features.tolist())

train_punct = create_punctuation_features(df_train_str)
valid_punct = create_punctuation_features(df_valid_str)
test_punct = create_punctuation_features(df_test_str)

print("Punctuation features shape:", train_punct.shape)
print("\nSample punctuation features:")
print(train_punct.head())

Punctuation features shape: (21464, 14)

Sample punctuation features:
   exclamation_count  question_count  ellipsis_count  quote_count  \
0                  0               0               0            0   
1                  0               0               0            0   
2                  0               0               0            0   
3                  0               0               0            0   
4                  0               0               0            1   

   comma_count  period_count  semicolon_count  colon_count  dash_count  \
0            0             0                0            0           0   
1            0             0                0            0           0   
2            0             0                0            1           0   
3            1             0                0            0           0   
4            0             0                0            0           0   

   multiple_exclamation  multiple_question  mixed_punctuation  punct_d

## first/last word frequency


In [199]:
# df_train = pd.read_csv('train_preprocessed.csv')
# df_valid = pd.read_csv('valid_preprocessed.csv')
# df_test = pd.read_csv('test_preprocessed.csv')
def parse_token_list(text_str):
    try:
        return ast.literal_eval(text_str)
    except:
        return []
    
def get_first_last_words(df):
    tokens = df['text'].apply(parse_token_list)
    
    first_words = tokens.apply(lambda x: x[0] if len(x) > 0 else '<EMPTY>')
    last_words = tokens.apply(lambda x: x[-1] if len(x) > 0 else '<EMPTY>')
    
    return first_words, last_words

print("Extracting first/last words...")
train_first, train_last = get_first_last_words(df_train)
valid_first, valid_last = get_first_last_words(df_valid)
test_first, test_last = get_first_last_words(df_test)


def build_position_vocabulary(words_series, top_n=200, min_freq=5):
    
    word_counts = Counter(words_series)
    
    filtered_words = {word: count for word, count in word_counts.items() 
                     if count >= min_freq}
    
    vocab = [word for word, _ in sorted(filtered_words.items(), 
                                       key=lambda x: x[1], 
                                       reverse=True)[:top_n]]
    
    return vocab

first_word_vocab = build_position_vocabulary(train_first, top_n=200, min_freq=5)
last_word_vocab = build_position_vocabulary(train_last, top_n=200, min_freq=5)

print(f"\nFirst word vocabulary size: {len(first_word_vocab)}")
print(f"Last word vocabulary size: {len(last_word_vocab)}")

print("\nMost common first words:", first_word_vocab[:20])
print("\nMost common last words:", last_word_vocab[:20])

def create_position_features(first_words, last_words, first_vocab, last_vocab):
    
    features = {}
    
    for word in first_vocab:
        features[f'first_{word}'] = (first_words == word).astype(int)
    
    for word in last_vocab:
        features[f'last_{word}'] = (last_words == word).astype(int)
    
    return pd.DataFrame(features)

train_position = create_position_features(train_first, train_last, 
                                         first_word_vocab, last_word_vocab)
valid_position = create_position_features(valid_first, valid_last,
                                         first_word_vocab, last_word_vocab)
test_position = create_position_features(test_first, test_last,
                                        first_word_vocab, last_word_vocab)

print("Position features shape:", train_position.shape)
print("\nSample position features:")
print(train_position.head())


Extracting first/last words...

First word vocabulary size: 200
Last word vocabulary size: 200

Most common first words: ['the', 'man', 'new', 'report', 'trump', 'how', 'area', 'this', 'why', 'a', 'woman', 'what', 'nation', 'obama', 'donald', 'study', 'watch', 'here', 'u.s.', '5']

Most common last words: ["'", '?', '.', 'day', 'year', ')', 'time', 'it', 'life', 'say', 'trump', '!', 'woman', 'him', 'again', 'now', 'week', 'you', 'child', 'report']
Position features shape: (21464, 400)

Sample position features:
   first_the  first_man  first_new  first_report  first_trump  first_how  \
0          0          0          0             0            0          0   
1          0          0          0             0            0          0   
2          0          0          0             1            0          0   
3          0          0          0             0            0          0   
4          1          0          0             0            0          0   

   first_area  first_this 

## Bag of words (n-grams)

In [200]:

# df_train['text'] should be like: [['I', 'love', 'this', '!'], ['So', 'funny', '...'], ...]
from sklearn.feature_extraction.text import CountVectorizer

def identity_tokenizer(tokens):
    return tokens  

vectorizer = CountVectorizer(
    tokenizer=identity_tokenizer,
    preprocessor=lambda x: x, 
    ngram_range=(1,2),
    lowercase=False
)

import ast

df_train['text'] = df_train['text'].apply(ast.literal_eval)
df_valid['text'] = df_valid['text'].apply(ast.literal_eval)
df_test['text']  = df_test['text'].apply(ast.literal_eval)


X_train_bow = vectorizer.fit_transform(df_train['text'])
X_valid_bow = vectorizer.transform(df_valid['text'])
X_test_bow  = vectorizer.transform(df_test['text'])


print("Number of features:", len(vectorizer.get_feature_names_out()))
print("Some features:", vectorizer.get_feature_names_out()[:30])
word_counts = np.asarray(X_train_bow.sum(axis=0)).flatten()

features = vectorizer.get_feature_names_out()
freq_list = list(zip(features, word_counts))

freq_list_sorted = sorted(freq_list, key=lambda x: x[1], reverse=True)

for word, count in freq_list_sorted[:10]:
    print(word, count)




Number of features: 153371
Some features: ['!' '! !' "! '" "! 'westworld" '! (' '! )' '! ,' '! ;' '! again' '! and'
 '! announce' '! british' '! bronco' '! but' '! change' '! check'
 '! dance' '! die' '! disability' '! dumbledore' '! early' '! episode'
 '! for' '! hillary' '! how' '! huffpost' '! in' '! kill' '! man' '! my']
to 6730
of 4679
the 3894
's 3839
in 3455
' 3298
be 3147
, 2978
a 2972
for 2701


## Parts of speech frequency

In [201]:
import nltk
from nltk.corpus import wordnet
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger_eng')
def get_wordnet_pos(tag):
    if tag.startswith('J'): 
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN   # default

def tokens_to_pos_df(df, token_col='tokens'):
    words = []
    pos_tags = []

    for tokens in df[token_col]:
        tagged = nltk.pos_tag(tokens)   # list of (word, pos)
        for word, pos in tagged:
            words.append(word)
            pos_tags.append(pos)

    return pd.DataFrame({'word': words, 'pos': pos_tags})


pos_df = tokens_to_pos_df(df_train, 'text')
print(pos_df.head())
print(pos_df['pos'].unique())

# POS meanings
'''
NN - Noun, singular (dog, car)
NNS - Noun, plural (dogs, cars)
NNP - Proper noun, singular (London, John)
NNPS - Proper noun, plural (Americans)
PRP - Personal pronoun (I, you, he)
PRP$ - Possessive pronoun (my, your, his)
WP - Wh-pronoun (who, what)
WP$ - Possessive wh-pronoun (whose)
EX - Existential "there" (There is…)

VB - Verb, base form (eat, run)
VBD - Verb, past tense (ate, ran)
VBG - Verb, gerund/present participle (eating, running)
VBN - Verb, past participle (eaten, run)
VBP - Verb, non-3rd person singular present (I eat, they run)
VBZ - Verb, 3rd person singular present (she eats)

JJ - Adjective
JJR - Comparative adjective (bigger)
JJS - Superlative adjective (biggest)
RB - Adverb (quickly)
RBR - Comparative adverb (faster)
RBS - Superlative adverb (fastest)
WRB - Wh-adverb (where, when, why)

IN - Preposition or subordinating conjunction (in, of, because)
TO - "to"
DT - Determiner (the, a)
PDT - Predeterminer (all the kids)
CC - Coordinating conjunction (and, but)
MD - Modal (can, should)

RP - Particle (up, off) — as in "pick up"
POS - Possessive ending ('s)
CD - Cardinal number (1, three)
UH - Interjection (oh, wow)
FW - Foreign word
SYM - Symbol
$ - Dollar sign
# - Pound/hash sign

. - Sentence-final punctuation
, - Comma
: - Colon or semicolon


'''

# I will also add in a high level POS
''' 
N - 
NN
NNS
NNP 
NNPS 
PRP
PRP$
WP 
WP$
EX 

V-
VB 
VBD 
VBG 
VBN
VBP
VBZ

A-
JJ
JJR 
JJS
RB 
RBR
RBS
WRB

P-
IN 
TO 
DT 
PDT 
CC 
MD

O:
RP 
POS 
CD 
UH
FW
SYM 
$
# 

U-
.
, 
:

'''


pos_mapping = {
    # Nouns
    'NN': 'N', 'NNS': 'N', 'NNP': 'N', 'NNPS': 'N', 
    'PRP': 'N', 'PRP$': 'N', 'WP': 'N', 'WP$': 'N', 'EX': 'N',
    
    # Verbs
    'VB': 'V', 'VBD': 'V', 'VBG': 'V', 'VBN': 'V', 'VBP': 'V', 'VBZ': 'V',
    
    # Adjectives / Adverbs
    'JJ': 'A', 'JJR': 'A', 'JJS': 'A', 
    'RB': 'A', 'RBR': 'A', 'RBS': 'A', 'WRB': 'A',
    
    # Prepositions / Determiners / Modals / Conjunctions
    'IN': 'P', 'TO': 'P', 'DT': 'P', 'PDT': 'P', 'CC': 'P', 'MD': 'P',
    
    # Other
    'RP': 'O', 'POS': 'O', 'CD': 'O', 'UH': 'O', 'FW': 'O', 'SYM': 'O', '$': 'O', '#': 'O',
    
    # Punctuation
    '.': 'U', ',': 'U', ':': 'U'
}

# Convert to DataFrame
def words_with_pos(df, token_col='text'):
    """
    Takes a DataFrame with tokenized text and returns a DataFrame with:
    word | fine_pos | high_level_pos
    """
    rows = []
    for tokens in df[token_col]:
        pos_tags = nltk.pos_tag(tokens)
        for word, fine_pos in pos_tags:
            high_pos = pos_mapping.get(fine_pos, 'O')  # default 'O' if not mapped
            rows.append({'word': word, 'fine_pos': fine_pos, 'high_level_pos': high_pos})
    return pd.DataFrame(rows)

# Usage
df_words = words_with_pos(df_train, token_col='text')
print(df_words.head())


[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/michelleshlivko/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/michelleshlivko/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


    word pos
0  state  NN
1   slow  NN
2     to  TO
3   shut  VB
4   down  RP
['NN' 'TO' 'VB' 'RP' 'JJ' 'IN' ':' 'NNS' 'VBP' 'RB' 'VBG' ',' 'DT' 'POS'
 'CD' 'PRP$' "''" 'JJR' 'MD' 'PRP' 'PDT' 'VBN' 'WRB' 'CC' 'VBZ' 'NNP'
 'JJS' 'RBS' 'WP' '.' '(' ')' 'SYM' 'VBD' 'RBR' 'WDT' '$' '``' 'UH' '#'
 'EX' 'NNPS' 'FW' 'WP$']
    word fine_pos high_level_pos
0  state       NN              N
1   slow       NN              N
2     to       TO              P
3   shut       VB              V
4   down       RP              O


In [202]:


import ast
from nltk import pos_tag

nltk.download('averaged_perceptron_tagger_eng')



# Convert string representation of list into actual list
# df_train['tokens'] = df_train['text'].apply(ast.literal_eval)

# Function to convert POS tags to high-level POS
def get_high_level_pos(tag):
    if tag.startswith('N'):
        return 'N'
    elif tag.startswith('V'):
        return 'V'
    elif tag.startswith('J') or tag.startswith('R'):
        return 'A'
    elif tag in ['IN','TO','DT','PDT','CC','MD']:
        return 'P'
    elif tag in ['RP','POS','CD','UH','FW','SYM','$','#']:
        return 'O'
    elif tag in ['.',';',',',':']:
        return 'U'
    else:
        return 'O'

# Tag tokens in context and create new column
def tag_tokens(tokens):
    tagged = pos_tag(tokens)  # [('word','POS'), ...]
    high_level = [get_high_level_pos(tag) for _, tag in tagged]
    return high_level

df_train['pos_seq'] = df_train['text'].apply(tag_tokens)

count = 0
for tokens, pos_seq in zip(df_train['text'], df_train['pos_seq']):
    if count < 5:
        print(list(zip(tokens, pos_seq)))
        count += 1
    else:
        break



[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/michelleshlivko/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


[('state', 'N'), ('slow', 'N'), ('to', 'P'), ('shut', 'V'), ('down', 'A'), ('weak', 'A'), ('teacher', 'A'), ('education', 'N'), ('program', 'N')]
[('drone', 'N'), ('place', 'N'), ('fresh', 'A'), ('kill', 'N'), ('on', 'P'), ('step', 'N'), ('of', 'P'), ('white', 'A'), ('house', 'N')]
[('report', 'N'), (':', 'U'), ('majority', 'N'), ('of', 'P'), ('instance', 'N'), ('of', 'P'), ('people', 'N'), ('get', 'V'), ('life', 'N'), ('back', 'A'), ('on', 'P'), ('track', 'N'), ('occur', 'N'), ('immediately', 'A'), ('after', 'P'), ('visit', 'N'), ('to', 'P'), ('buffalo', 'V'), ('wild', 'A'), ('wing', 'V')]
[('sole', 'N'), ('remain', 'N'), ('lung', 'N'), ('fill', 'N'), ('with', 'P'), ('rich', 'A'), (',', 'U'), ('satisfy', 'A'), ('flavor', 'N')]
[('the', 'P'), ('gop', 'N'), ("'s", 'O'), ('stockholm', 'A'), ('syndrome', 'N')]


In [203]:
def pos_counts(tokens):
    counts = {'N':0, 'V':0, 'A':0, 'P':0, 'O':0, 'U':0}
    for word, fine_pos in nltk.pos_tag(tokens):
        high_pos = pos_mapping.get(fine_pos, 'O')  # default 'O' if not mapped
        counts[high_pos] += 1
    return counts

# Ensure all text columns are lists
dftrainSent['text'] = dftrainSent['text'].apply(lambda x: x.split() if isinstance(x, str) else x)
dfvalidationSent['text'] = dfvalidationSent['text'].apply(lambda x: x.split() if isinstance(x, str) else x)
dftestSent['text'] = dftestSent['text'].apply(lambda x: x.split() if isinstance(x, str) else x)


# Compute POS counts
pos_features_train = dftrainSent['text'].apply(pos_counts).apply(pd.Series)
pos_features_validation = dfvalidationSent['text'].apply(pos_counts).apply(pd.Series)
pos_features_test = dftestSent['text'].apply(pos_counts).apply(pd.Series)

df_train = pd.concat([df_train, pos_features_train], axis=1)
df_valid = pd.concat([df_valid, pos_features_validation], axis=1)
df_test = pd.concat([df_test, pos_features_test], axis=1)


# Check results
print(df_train.head())
print(df_valid.head())
print(df_test.head())


                                                text  label  sentiment  \
0  [state, slow, to, shut, down, weak, teacher, e...      0    -0.4404   
1  [drone, place, fresh, kill, on, step, of, whit...      1    -0.5267   
2  [report, :, majority, of, instance, of, people...      1     0.0000   
3  [sole, remain, lung, fill, with, rich, ,, sati...      1     0.7650   
4                [the, gop, 's, stockholm, syndrome]      0     0.0000   

   length_words                                            pos_seq  N  V  A  \
0             9                        [N, N, P, V, A, A, A, N, N]  4  1  2   
1             9                        [N, N, A, N, P, N, P, A, N]  5  0  2   
2            20  [N, U, N, P, N, P, N, V, N, A, P, N, N, A, P, ...  8  3  3   
3             9                        [N, N, N, N, P, A, U, A, N]  5  0  2   
4             5                                    [P, N, O, A, N]  2  0  1   

   P  O  U  
0  1  1  0  
1  2  0  0  
2  5  0  1  
3  1  0  1  
4  1  1  0  
  

In [204]:
df_train.to_csv('train_features.csv', index=False)
df_valid.to_csv('valid_features.csv', index=False)
df_test.to_csv('test_features.csv', index=False)    