In [26]:
# Load in Kaggle datasets from https://www.kaggle.com/competitions/nlp-getting-started/data
import csv

import numpy as np
import pandas as pd
from joblib import dump, load
from scipy.sparse import hstack
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.impute import SimpleImputer

# Read the three competition CSV files from the local ./Data directory,
# in the order: training set, test set, sample submission.
train, test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./Data/train.csv",
        "./Data/test.csv",
        "./Data/sample_submission.csv",
    )
)

# Preview the first rows of the test set as plain text.
print(test.head())

   id keyword location                                               text
0   0     NaN      NaN                 Just happened a terrible car crash
1   2     NaN      NaN  Heard about #earthquake is different cities, s...
2   3     NaN      NaN  there is a forest fire at spot pond, geese are...
3   9     NaN      NaN           Apocalypse lighting. #Spokane #wildfires
4  11     NaN      NaN      Typhoon Soudelor kills 28 in China and Taiwan


In [27]:
# Build working frames with a fresh positional index; reset_index returns new
# objects, so `train` and `test` stay exactly as they were loaded.
train_data, test_data = (
    frame.reset_index(drop=True) for frame in (train, test)
)

# Show the first rows of the training working frame.
train_data.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


# Data Cleaning
To better understand the structure of our data before vectorizing, we will do a basic analysis to check for null values and duplicates. We will start by analyzing the null values in the training dataset.

In [28]:
# Number of observations in the training data
length_train = len(train_data)
print("Observations in train data:\n", length_train)

# Count missing (NaN) values per column in the training data
nan_counts_column_train = train_data.isna().sum()
print("NaN counts per column in training data:\n", nan_counts_column_train)

# What percentage of the observations have missing values for location?
location_null_percentage_train = (nan_counts_column_train['location'] / length_train) * 100
print(f"Percentage of missing values for 'location' in training data: {location_null_percentage_train:.2f}%")

# What percentage of the observations have missing values for keyword?
# Kept under its own name so the 'location' percentage above is not overwritten.
keyword_null_percentage_train = (nan_counts_column_train['keyword'] / length_train) * 100
print(f"Percentage of missing values for 'keyword' in training data: {keyword_null_percentage_train:.2f}%")


Observations in train data:
 7613
NaN counts per column in training data:
 id             0
keyword       61
location    2533
text           0
target         0
dtype: int64
Percentage of missing values for 'location' in training data: 33.27%
Percentage of missing values for 'keyword' in training data: 0.80%


Now, we will find the missing values in the testing dataset.

In [29]:
# Number of observations in the test data
length_test = len(test_data)
print("Observations in test data:\n", length_test)

# Count missing (NaN) values per column in the test data
nan_counts_column_test = test_data.isna().sum()
print("NaN counts per column in test data:\n", nan_counts_column_test)

# What percentage of the observations have missing values for location?
location_null_percentage_test = (nan_counts_column_test['location'] / length_test) * 100
print(f"Percentage of missing values for 'location' in test dataset: {location_null_percentage_test:.2f}%")

# What percentage of the observations have missing values for keyword?
# Kept under its own name so the 'location' percentage above is not overwritten.
keyword_null_percentage_test = (nan_counts_column_test['keyword'] / length_test) * 100
print(f"Percentage of missing values for 'keyword' in test dataset: {keyword_null_percentage_test:.2f}%")

Observations in test data:
 3263
NaN counts per column in test data:
 id             0
keyword       26
location    1105
text           0
dtype: int64
Percentage of missing values for 'location' in test dataset: 33.86%
Percentage of missing values for 'keyword' in test dataset: 0.80%


The following code checks for duplicated rows in the training data.

In [30]:
# Boolean mask over the training rows: True where a row repeats an earlier one
duplicates = train_data.duplicated()

# How many observations are duplicates?
print(np.count_nonzero(duplicates))

# Sanity check: list the duplicated rows themselves (empty frame if none)
print(train_data.loc[duplicates])

0
Empty DataFrame
Columns: [id, keyword, location, text, target]
Index: []


The following code chunk checks for duplicated rows in the test data.

In [31]:
# Boolean mask over the test rows: True where a row repeats an earlier one.
# This rebinds `duplicates`, replacing the mask computed for the training data.
duplicates = test_data.duplicated()

# How many observations are duplicates?
print(np.count_nonzero(duplicates))

# Sanity check: list the duplicated rows themselves (empty frame if none)
print(test_data.loc[duplicates])

0
Empty DataFrame
Columns: [id, keyword, location, text]
Index: []


Because only a small percentage of 'keyword' values are missing, we will drop those observations from our testing and training datasets.

In [32]:
# Keep only the rows whose 'keyword' is present, for both working frames.
# NOTE(review): this removes rows from the test set as well, so features built
# from `test_data` cover fewer ids than test.csv contains — confirm this is
# acceptable for whatever consumes the test features downstream.
train_data, test_data = (
    frame.dropna(subset=['keyword']) for frame in (train_data, test_data)
)

# Report the resulting shapes to confirm how many rows remain
print(f"Training data shape after removing null 'keyword': {train_data.shape}")
print(f"Testing data shape after removing null 'keyword': {test_data.shape}")

Training data shape after removing null 'keyword': (7552, 5)
Testing data shape after removing null 'keyword': (3237, 4)


Now, impute the missing values in the location column of the training and testing datasets with the word "missing" for consistency.

In [33]:
# impute missing values using the placeholder "missing"
clean_train = train_data.fillna("missing")
clean_test = test_data.fillna("missing")

clean_train.head()

Unnamed: 0,id,keyword,location,text,target
31,48,ablaze,Birmingham,@bbcmtd Wholesale Markets ablaze http://t.co/l...,1
32,49,ablaze,Est. September 2012 - Bristol,We always try to bring the heavy. #metal #RT h...,0
33,50,ablaze,AFRICA,#AFRICANBAZE: Breaking news:Nigeria flag set a...,1
34,52,ablaze,"Philadelphia, PA",Crying out for more! Set me ablaze,0
35,53,ablaze,"London, UK",On plus side LOOK AT THE SKY LAST NIGHT IT WAS...,0


## Apply TF-IDF Vectorizer

In [34]:
# Join 'keyword', 'location' and 'text' into one space-separated string per row,
# so a single vectorizer sees all three text fields together.
data_combined = clean_train[['keyword', 'location', 'text']].agg(' '.join, axis=1)

# Create the TF-IDF vectorizer (English stop words removed) and fit it on the
# combined training text, producing the training feature matrix.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
X_train_tfidf = tfidf_vectorizer.fit_transform(data_combined)

# Labels for classification, taken from the 'target' column
y_train_tfidf = clean_train['target']

# Report the dimensions of the training feature matrix
print(f"Shape of combined feature matrix: {X_train_tfidf.shape}")

# Persist the training feature matrix and labels with joblib
# (the fitted vectorizer itself is not saved in this cell).
dump(X_train_tfidf, 'X_train_tfidf.joblib')
dump(y_train_tfidf, 'y_train_tfidf.joblib')

Shape of combined feature matrix: (7552, 23168)


['y_train_tfidf.joblib']

Using the same TF-IDF vectorizer as above, vectorize the text columns in the test dataset.

In [35]:
# Join 'keyword', 'location' and 'text' into one space-separated string per test row
data_combined_test = clean_test[['keyword', 'location', 'text']].agg(' '.join, axis=1)

# Transform (not re-fit) the test text with the already-fitted TF-IDF vectorizer
X_test_tfidf = tfidf_vectorizer.transform(data_combined_test)

# Report the dimensions of the test feature matrix
print(f"Shape of combined feature matrix: {X_test_tfidf.shape}")

# Persist the fitted TF-IDF vectorizer and the test feature matrix with joblib
dump(tfidf_vectorizer, 'tfidf_vectorizer.joblib')
dump(X_test_tfidf, 'X_test_tfidf.joblib')

Shape of combined feature matrix: (3237, 23168)


['X_test_tfidf.joblib']

## Apply CountVectorizer

In [36]:
# CountVectorizer is imported in the notebook's top import cell alongside
# TfidfVectorizer, so this cell no longer carries its own import.

# Initialize the CountVectorizer (English stop words removed) for the text columns
count_vectorizer = CountVectorizer(stop_words='english')

# Combine the columns containing text into one string per row before vectorizing
data_combined = clean_train[['keyword', 'location', 'text']].agg(' '.join, axis=1)

# Fit the CountVectorizer on the combined 'keyword', 'location', and 'text'
# training text and build the training count matrix
X_train_count = count_vectorizer.fit_transform(data_combined)

# Target variable (assuming 'target' column is the label for classification)
y_train_count = clean_train['target']

# Check the shape of the resulting feature matrix
print(f"Shape of combined feature matrix: {X_train_count.shape}")

# Save the training count matrix and labels with joblib
# (the fitted CountVectorizer itself is not saved in this cell).
dump(X_train_count, 'X_train_count.joblib')
dump(y_train_count, 'y_train_count.joblib')

Shape of combined feature matrix: (7552, 23168)


['y_train_count.joblib']

In [37]:
# Join 'keyword', 'location' and 'text' into one space-separated string per test row
data_combined_test = clean_test[['keyword', 'location', 'text']].agg(' '.join, axis=1)

# Transform (not re-fit) the test text with the already-fitted CountVectorizer
X_test_count = count_vectorizer.transform(data_combined_test)

# Report the dimensions of the test count matrix
print(f"Shape of combined feature matrix: {X_test_count.shape}")

# Persist the fitted CountVectorizer and the test count matrix with joblib
dump(count_vectorizer, 'count_vectorizer.joblib')
dump(X_test_count, 'X_test_count.joblib')

Shape of combined feature matrix: (3237, 23168)


['X_test_count.joblib']