## Feature Engineering

In [27]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import hstack
from pickle import load, dump
from sklearn.metrics import accuracy_score

# Load the cleaned train and test DataFrames from local pickle files.
# NOTE(review): presumably these files are written by an earlier cleaning notebook — confirm.
# NOTE: read_pickle unpickles arbitrary Python objects; only load files from a trusted source.
clean_train = pd.read_pickle("clean_train.pkl")
clean_test = pd.read_pickle("clean_test.pkl")

Next, we engineer additional features to improve the model's predictive power. To start, we add a binary indicator of whether `location` is missing (null) for each observation.

In [22]:
# Missing-location indicator: 1 where 'location' is null, 0 otherwise.
# The same transformation is applied to the train and test frames in turn.
for _frame in (clean_train, clean_test):
    _frame['location_null'] = _frame['location'].isnull().astype(int)

In [23]:
# how many words in a tweet
def count_words(text):
    """Return the number of whitespace-separated tokens in ``text``.

    Any value that is not a string (for example a missing value) counts
    as zero words.
    """
    # Guard clause: non-string entries contribute no words.
    if not isinstance(text, str):
        return 0
    # str.split() without a separator splits on runs of whitespace.
    words = text.split()
    return len(words)
    

# Add the per-row word count of 'text' to both the train and test frames.
for _frame in (clean_train, clean_test):
    _frame['text_word_count'] = _frame['text'].apply(count_words)

In [24]:
# Binary Capitalization Ratio Function
def binary_capitalization_ratio(text, threshold=0.2):
    """Flag text whose share of uppercase characters exceeds ``threshold``.

    The share is the number of characters for which ``str.isupper`` is true
    divided by the number of characters for which ``str.isalpha`` is true.

    Parameters
    ----------
    text : object
        Value to inspect. Anything that is not a non-empty string yields 0.
    threshold : float, default 0.2
        Cut-off that the ratio must strictly exceed for the flag to be 1.

    Returns
    -------
    int
        1 if the ratio is strictly greater than ``threshold``, otherwise 0.
    """
    # Non-strings and empty strings are never flagged.
    if not isinstance(text, str) or not text:
        return 0

    n_upper = sum(map(str.isupper, text))
    n_alpha = sum(map(str.isalpha, text))

    # Text without any alphabetic character gets a ratio of 0.
    share = n_upper / n_alpha if n_alpha else 0
    return int(share > threshold)

# Share of uppercase letters above which a text is flagged as heavily capitalized.
# Defined once so the train and test columns are guaranteed to use the same cut-off.
CAPITALIZATION_THRESHOLD = 0.2

# Create the binary capitalization column in both the training and testing datasets.
# Series.apply forwards extra keyword arguments to the function, so no lambda wrapper is needed.
clean_train['capitalization_binary'] = clean_train['text'].apply(
    binary_capitalization_ratio, threshold=CAPITALIZATION_THRESHOLD
)
clean_test['capitalization_binary'] = clean_test['text'].apply(
    binary_capitalization_ratio, threshold=CAPITALIZATION_THRESHOLD
)

# Preview the first rows of each dataset to sanity-check the new column.
print(clean_train[['text', 'capitalization_binary']].head())
print(clean_test[['text', 'capitalization_binary']].head())

                                                text  capitalization_binary
0  Our Deeds are the Reason of this #earthquake M...                      0
1             Forest fire near La Ronge Sask. Canada                      0
2  All residents asked to 'shelter in place' are ...                      0
3  13,000 people receive #wildfires evacuation or...                      0
4  Just got sent this photo from Ruby #Alaska as ...                      0
                                                text  capitalization_binary
0                 Just happened a terrible car crash                      0
1  Heard about #earthquake is different cities, s...                      0
2  there is a forest fire at spot pond, geese are...                      0
3           Apocalypse lighting. #Spokane #wildfires                      0
4      Typhoon Soudelor kills 28 in China and Taiwan                      0


In [25]:
# TODO: add other engineered features here

Finally, we save the clean train and test datasets, now including the new features, to new pickle files (`clean_train_FE.pkl` and `clean_test_FE.pkl`).

In [26]:
# Persist both frames, with every engineered feature column, under the "_FE" file names.
for _frame, _pickle_name in (
    (clean_train, "clean_train_FE.pkl"),
    (clean_test, "clean_test_FE.pkl"),
):
    _frame.to_pickle(_pickle_name)