## Feature Engineering

In [22]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import hstack
from pickle import load, dump
from sklearn.metrics import accuracy_score

# Read the cleaned train and test frames back from their pickle files.
# NOTE(review): unpickling executes arbitrary code, so only load files
# produced by this project's own pipeline.
clean_train, clean_test = map(pd.read_pickle, ("clean_train.pkl", "clean_test.pkl"))

Next, we add more features to improve predictive power. To start, we add a binary indicator that flags whether the location is missing (null) in the corresponding observation.

In [23]:
# Flag rows whose location is missing: 1 when 'location' is null, else 0.
# The same column is added to both the train and the test frame.
for frame in (clean_train, clean_test):
    frame['location_null'] = frame['location'].isnull().astype(int)

In [24]:
# Word count of a tweet
def count_words(text):
    """Return the number of whitespace-separated tokens in ``text``.

    Any entry that is not a string counts as 0 words.
    """
    if not isinstance(text, str):
        return 0
    tokens = text.split()  # split() with no argument splits on runs of whitespace
    return len(tokens)
    

# Add the per-tweet word count to both the train and the test frame.
for frame in (clean_train, clean_test):
    frame['text_word_count'] = frame['text'].apply(count_words)

In [25]:
# Capitalization ratio
def capitalization_ratio(text):
    """Return the share of uppercase characters among alphabetic characters.

    The numerator counts characters for which ``str.isupper()`` is true and
    the denominator counts characters for which ``str.isalpha()`` is true.
    Returns 0 for non-string input, for an empty string, and for text that
    contains no alphabetic characters.
    """
    if not isinstance(text, str) or not text:
        return 0

    capitals = 0
    letters = 0
    for ch in text:
        if ch.isupper():
            capitals += 1
        if ch.isalpha():
            letters += 1

    # Guard against division by zero (e.g. text made only of digits/punctuation).
    if letters == 0:
        return 0
    return capitals / letters

# Fill the column with capitalization_ratio: applying count_words here would
# only duplicate the 'text_word_count' feature under a misleading name.
clean_train['capitalization_ratio'] = clean_train['text'].apply(capitalization_ratio)
clean_test['capitalization_ratio'] = clean_test['text'].apply(capitalization_ratio)

In [26]:
# Add binary indicators for whether a row's `keyword` value appears in the
# word collections associated with real and with fake news.

# Load both word collections (real first, then fake) from their pickle files.
words_assoc_real_news, words_assoc_fake_news = (
    pd.read_pickle(pickle_name)
    for pickle_name in ("words_assoc_real_news.pkl", "words_assoc_fake_news.pkl")
)

def check_keyword_match_real(keyword):
    """Return 1 if ``keyword`` is a member of ``words_assoc_real_news``, else 0."""
    # NOTE(review): if words_assoc_real_news is a pandas Series, `in` tests the
    # index labels rather than the values -- confirm the pickled object's type.
    return int(keyword in words_assoc_real_news)

def check_keyword_match_fake(keyword):
    """Return 1 if ``keyword`` is a member of ``words_assoc_fake_news``, else 0."""
    # NOTE(review): if words_assoc_fake_news is a pandas Series, `in` tests the
    # index labels rather than the values -- confirm the pickled object's type.
    return int(keyword in words_assoc_fake_news)

# Add both keyword indicators (real first, then fake) to the train frame and
# then to the test frame.
for frame in (clean_train, clean_test):
    frame['keyword_assoc_real_news'] = frame['keyword'].apply(check_keyword_match_real)
    frame['keyword_assoc_fake_news'] = frame['keyword'].apply(check_keyword_match_fake)

# Sanity check: number of rows flagged by each indicator, printed as
# train/real, train/fake, test/real, test/fake.
for frame in (clean_train, clean_test):
    for column in ('keyword_assoc_real_news', 'keyword_assoc_fake_news'):
        print(frame[column].sum())

1145
1990
447
843


Finally, save the clean train and test datasets with the new features included.

In [27]:
# Persist the feature-engineered frames once every feature has been added.
for frame, pickle_name in (
    (clean_train, "clean_train_FE.pkl"),
    (clean_test, "clean_test_FE.pkl"),
):
    frame.to_pickle(pickle_name)