## TF-IDF and Count Vectorizer Implementation

In [1]:
import numpy as np
import pandas as pd
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import hstack
from pickle import load, dump
from sklearn.metrics import accuracy_score
from joblib import load, dump

# Load the cleaned (not yet feature-engineered) training and test frames.
# NOTE: pd.read_pickle unpickles arbitrary objects; only load files produced by this project.
clean_train = pd.read_pickle("../pkl_files/clean_train.pkl")
clean_test = pd.read_pickle("../pkl_files/clean_test.pkl")

# Keep the training labels under a dedicated name and persist them for later use.
# joblib.dump is spelled out because the bare `dump` in this notebook is joblib's
# (the `from joblib import load, dump` line shadows the earlier pickle import),
# so this file is written in joblib format despite its .pkl extension.
target = clean_train["target"]
joblib.dump(target, "../pkl_files/target.pkl")

# Load the feature-engineered train and test frames.
# NOTE(review): these are presumably row-aligned with clean_train / clean_test;
# the later hstack ignores the index, so row order must match -- confirm.
clean_train_FE = pd.read_pickle("../pkl_files/clean_train_FE.pkl")
clean_test_FE = pd.read_pickle("../pkl_files/clean_test_FE.pkl")

# Engineered columns to append to the vectorized text features
# (selected once for train and once for test).
new_feature_columns = [
    'capitalization_ratio',
    'text_word_count',
    'location_null',
    'keyword_assoc_real_news',
    'keyword_assoc_fake_news',
]

# Keep only the engineered columns; a missing column raises KeyError here.
clean_train_FE_features = clean_train_FE[new_feature_columns]
clean_test_FE_features = clean_test_FE[new_feature_columns]

# These frames are stacked column-wise next to the vectorized text later on,
# which requires identical row counts. Fail here with a clear message instead.
assert len(clean_train_FE_features) == len(clean_train), \
    "clean_train_FE and clean_train have different numbers of rows"
assert len(clean_test_FE_features) == len(clean_test), \
    "clean_test_FE and clean_test have different numbers of rows"

# Persist the engineered feature frames in joblib format.
joblib.dump(clean_train_FE_features, "../joblib_files/clean_train_FE_features.joblib")
joblib.dump(clean_test_FE_features, "../joblib_files/clean_test_FE_features.joblib")

['../joblib_files/clean_test_FE_features.joblib']

## Apply TF-IDF Vectorizer
This section vectorizes the cleaned data before feature engineering; the engineered features are appended in a later step.

In [2]:
# Build a TF-IDF vectorizer that drops scikit-learn's built-in English stop words.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

# Join the three text columns into one space-separated document per row,
# since the vectorizer takes a single string per sample.
# NOTE(review): ' '.join raises TypeError on non-string cells (e.g. NaN); this
# relies on the cleaned data having no nulls in these columns -- confirm.
data_combined = clean_train[['keyword', 'location', 'text']].agg(' '.join, axis=1)

# Fit on the training documents only and vectorize them in the same pass.
X_train_tfidf = tfidf_vectorizer.fit_transform(data_combined)
joblib.dump(X_train_tfidf, '../joblib_files/X_train_tfidf.joblib') # Save for EDA

# Report the shape of the TF-IDF matrix (rows = documents, columns = vocabulary terms).
print(f"Shape of combined feature matrix: {X_train_tfidf.shape}")

# Persist the fitted vectorizer so the same vocabulary can be reused later.
# joblib.dump is spelled out: the bare `dump` in this notebook already resolves to
# joblib's (it shadows the pickle import), so the file is in joblib format.
with open('../pkl_files/tfidf_vectorizer.pkl', 'wb') as f:
    joblib.dump(tfidf_vectorizer, f)

Shape of combined feature matrix: (7613, 23211)


Now, we will combine the feature engineered training data with the TF-IDF vectorized training data.

In [3]:
# Append the engineered training features as extra columns to the right of the
# TF-IDF training matrix, then save the combined matrix.
# format="csr" pins the result format: without it SciPy chooses the format itself
# (typically COO here, which does not support row slicing/indexing).
model_train_tfidf = hstack([X_train_tfidf, clean_train_FE_features], format="csr")

# joblib.dump is spelled out: the bare `dump` in this notebook already resolves to
# joblib's, so the .pkl file is in joblib format.
with open('../pkl_files/model_train_tfidf.pkl', 'wb') as f:
    joblib.dump(model_train_tfidf, f)

Using the same TF-IDF vectorizer fitted above, vectorize the text columns in the test dataset.

In [4]:
# Build one space-separated document per test row from the three text columns.
text_columns_test = ['keyword', 'location', 'text']
data_combined_test = clean_test[text_columns_test].agg(' '.join, axis=1)

# Apply the already-fitted TF-IDF vectorizer (transform only, no refit).
X_test_tfidf = tfidf_vectorizer.transform(data_combined_test)

# Report the shape of the test TF-IDF matrix.
print(f"Shape of combined feature matrix: {X_test_tfidf.shape}")

Shape of combined feature matrix: (3263, 23211)


Now, combine the feature-engineered test data with the TF-IDF vectorized test data.

In [5]:
# Append the engineered test features as extra columns to the right of the
# TF-IDF test matrix, then save the combined matrix.
# format="csr" pins the result format: without it SciPy chooses the format itself
# (typically COO here, which does not support row slicing/indexing).
model_test_tfidf = hstack([X_test_tfidf, clean_test_FE_features], format="csr")

# joblib.dump is spelled out: the bare `dump` in this notebook already resolves to
# joblib's, so the .pkl file is in joblib format.
with open('../pkl_files/model_test_tfidf.pkl', 'wb') as f:
    joblib.dump(model_test_tfidf, f)

## Apply CountVectorizer

In [6]:
# Build a bag-of-words count vectorizer that drops scikit-learn's built-in English stop words.
count_vectorizer = CountVectorizer(stop_words='english')

# Fit on the combined training text built in the TF-IDF section (`data_combined`)
# and vectorize it in the same pass.
X_train_count = count_vectorizer.fit_transform(data_combined)
joblib.dump(X_train_count, '../joblib_files/X_train_count.joblib') # Save for EDA

# Report the shape of the count matrix (rows = documents, columns = vocabulary terms).
print(f"Shape of combined feature matrix: {X_train_count.shape}")

# Persist the fitted vectorizer so the same vocabulary can be reused later.
# joblib.dump is spelled out: the bare `dump` in this notebook already resolves to
# joblib's (it shadows the pickle import), so the file is in joblib format.
with open('../pkl_files/count_vectorizer.pkl', 'wb') as f:
    joblib.dump(count_vectorizer, f)

Shape of combined feature matrix: (7613, 23211)


Now, we will combine the feature engineered training data with the Count vectorized training data.

In [7]:
# Append the engineered training features as extra columns to the right of the
# count-vectorized training matrix, then save the combined matrix.
# format="csr" pins the result format: without it SciPy chooses the format itself
# (typically COO here, which does not support row slicing/indexing).
model_train_count = hstack([X_train_count, clean_train_FE_features], format="csr")

# joblib.dump is spelled out: the bare `dump` in this notebook already resolves to
# joblib's, so the .pkl file is in joblib format.
with open('../pkl_files/model_train_count.pkl', 'wb') as f:
    joblib.dump(model_train_count, f)

In [8]:
# Apply the already-fitted CountVectorizer (transform only, no refit) to the
# combined test text prepared earlier (`data_combined_test`).
X_test_count = count_vectorizer.transform(raw_documents=data_combined_test)

# Report the shape of the test count matrix.
print(f"Shape of combined feature matrix: {X_test_count.shape}")

Shape of combined feature matrix: (3263, 23211)


Now, we will combine the feature engineered test data with the Count vectorized test data.

In [9]:
# Append the engineered test features as extra columns to the right of the
# count-vectorized test matrix, then save the combined matrix.
# format="csr" pins the result format: without it SciPy chooses the format itself
# (typically COO here, which does not support row slicing/indexing).
model_test_count = hstack([X_test_count, clean_test_FE_features], format="csr")

# joblib.dump is spelled out: the bare `dump` in this notebook already resolves to
# joblib's, so the .pkl file is in joblib format.
with open('../pkl_files/model_test_count.pkl', 'wb') as f:
    joblib.dump(model_test_count, f)