## TF-IDF and Count Vectorizer Implementation

In [None]:
from pickle import dump, load

import numpy as np
import pandas as pd
from scipy.sparse import hstack
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score

# Load the previously pickled, cleaned training and test sets.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only run this against files produced by this project.
with open('clean_train.pkl', 'rb') as f:
    clean_train = load(f)

with open('clean_test.pkl', 'rb') as f:
    clean_test = load(f)

# Keep the training labels under their own name for clarity and save them to the repo.
target = clean_train["target"]

# pickle's dump() writes to an open binary file handle, not to a path string.
with open('target.pkl', 'wb') as f:
    dump(target, f)

# load in clean_train.fe and clean_test.fe 1 time


## Apply TF-IDF Vectorizer


In [None]:
# Initialize the TfidfVectorizer for text columns (English stop words removed)
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

# Combine the columns containing text into one string per row before applying
# the TF-IDF vectorizer.
# NOTE(review): ' '.join needs every cell to be a str; assumes the cleaning step
# left no NaN in these columns — TODO confirm.
data_combined = clean_train[['keyword', 'location', 'text']].agg(' '.join, axis=1)

# Fit on the training text and vectorize 'keyword', 'location', and 'text'
X_train_tfidf = tfidf_vectorizer.fit_transform(data_combined)

# Check the shape of the resulting feature matrix
print(f"Shape of combined feature matrix: {X_train_tfidf.shape}")

# Save the TF-IDF feature matrix (only the matrix is saved; the fitted
# vectorizer stays in memory). Written as a standard pickle; the filename is
# kept unchanged so existing paths still resolve.
with open('X_train_tfidf.joblib', 'wb') as f:
    dump(X_train_tfidf, f)

Now, we will combine the feature engineered training data with the TF-IDF vectorized training data.

In [None]:
# TODO: merge with clean train fe and save

Using the same TF-IDF vectorizer as above, vectorize the text columns in the test dataset.

In [None]:
# Combine the columns containing text into one string per row before applying
# the TF-IDF vectorizer.
# NOTE(review): ' '.join needs every cell to be a str; assumes the cleaning step
# left no NaN in these columns — TODO confirm.
data_combined_test = clean_test[['keyword', 'location', 'text']].agg(' '.join, axis=1)

# Vectorize 'keyword', 'location', and 'text' for the test data with the
# already-fitted vectorizer (transform only, no re-fitting on test text)
X_test_tfidf = tfidf_vectorizer.transform(data_combined_test)

# Check the shape of the resulting feature matrix
print(f"Shape of combined feature matrix: {X_test_tfidf.shape}")

# Save the TF-IDF test feature matrix (only the matrix is saved here).
# Written as a standard pickle; the filename is kept unchanged so existing
# paths still resolve.
with open('X_test_tfidf.joblib', 'wb') as f:
    dump(X_test_tfidf, f)

Now, we will combine the feature engineered test data with the TF-IDF vectorized test data.

In [None]:
# merge with clean test fe and save

## Apply CountVectorizer

In [None]:
# Initialize the CountVectorizer for text columns (English stop words removed)
count_vectorizer = CountVectorizer(stop_words='english')

# Fit on and vectorize the training text, reusing the combined
# 'keyword' + 'location' + 'text' strings built for the TF-IDF step
X_train_count = count_vectorizer.fit_transform(data_combined)

# Check the shape of the resulting feature matrix
print(f"Shape of combined feature matrix: {X_train_count.shape}")

# Save the count feature matrix (only the matrix is saved; the fitted
# vectorizer stays in memory). Written as a standard pickle; the filename is
# kept unchanged so existing paths still resolve.
with open('X_train_count.joblib', 'wb') as f:
    dump(X_train_count, f)

Now, we will combine the feature engineered training data with the Count vectorized training data.

In [None]:
# do above and save as object

In [None]:
# Vectorize the test text with the already-fitted CountVectorizer, reusing the
# combined 'keyword' + 'location' + 'text' strings built for the TF-IDF step
X_test_count = count_vectorizer.transform(data_combined_test)

# Check the shape of the resulting feature matrix
print(f"Shape of combined feature matrix: {X_test_count.shape}")

# Save the count test feature matrix (only the matrix is saved here).
# Written as a standard pickle; the filename is kept unchanged so existing
# paths still resolve.
with open('X_test_count.joblib', 'wb') as f:
    dump(X_test_count, f)

Now, we will combine the feature engineered test data with the Count vectorized test data.

In [None]:
# do above and save as object