## Preliminary Data Loading and Cleaning
After running this file, run 'Feature_Engineering.ipynb' before fitting the vectorizers.

In [16]:
# Load in Kaggle datasets from https://www.kaggle.com/competitions/nlp-getting-started/data
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from joblib import dump, load
from sklearn.metrics import accuracy_score

# Read the three competition CSV files (paths are relative to the current working directory).
train, test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./Data/train.csv",
        "./Data/test.csv",
        "./Data/sample_submission.csv",
    )
)

# Preview the first rows of the test split.
print(test.head())

   id keyword location                                               text
0   0     NaN      NaN                 Just happened a terrible car crash
1   2     NaN      NaN  Heard about #earthquake is different cities, s...
2   3     NaN      NaN  there is a forest fire at spot pond, geese are...
3   9     NaN      NaN           Apocalypse lighting. #Spokane #wildfires
4  11     NaN      NaN      Typhoon Soudelor kills 28 in China and Taiwan


In [17]:
# Build new frames with a fresh default index; drop=True discards the old index
# instead of keeping it as an extra column.
train_data, test_data = (frame.reset_index(drop=True) for frame in (train, test))

# Preview the first rows of the training frame.
train_data.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


# Data Cleaning
To better understand the structure of our data before vectorizing, we will do a basic analysis to check for null values and duplicates. We will start by analyzing the null values in the training dataset.

In [18]:
# Number of observations in the training data
length_train = len(train_data)
print("Observations in train data:\n", length_train)

# Count the NaN values in each column of the training data
nan_counts_column_train = train_data.isna().sum()
print("NaN counts per column in training data:\n", nan_counts_column_train)

# What percentage of the observations have missing values for location?
location_null_percentage_train = (nan_counts_column_train['location'] / length_train) * 100
print(f"Percentage of missing values for 'location' in training data: {location_null_percentage_train:.2f}%")

# What percentage of the observations have missing values for keyword?
# Stored under its own name so the 'location' percentage above is not overwritten.
keyword_null_percentage_train = (nan_counts_column_train['keyword'] / length_train) * 100
print(f"Percentage of missing values for 'keyword' in training data: {keyword_null_percentage_train:.2f}%")


Observations in train data:
 7613
NaN counts per column in training data:
 id             0
keyword       61
location    2533
text           0
target         0
dtype: int64
Percentage of missing values for 'location' in training data: 33.27%
Percentage of missing values for 'keyword' in training data: 0.80%


Now, we will find the missing values in the testing dataset.

In [19]:
# Number of observations in the test data
length_test = len(test_data)
print("Observations in test data:\n", length_test)

# Count the NaN values in each column of the test data
nan_counts_column_test = test_data.isna().sum()
print("NaN counts per column in test data:\n", nan_counts_column_test)

# What percentage of the observations have missing values for location?
location_null_percentage_test = (nan_counts_column_test['location'] / length_test) * 100
print(f"Percentage of missing values for 'location' in test dataset: {location_null_percentage_test:.2f}%")

# What percentage of the observations have missing values for keyword?
# Stored under its own name so the 'location' percentage above is not overwritten.
keyword_null_percentage_test = (nan_counts_column_test['keyword'] / length_test) * 100
print(f"Percentage of missing values for 'keyword' in test dataset: {keyword_null_percentage_test:.2f}%")

Observations in test data:
 3263
NaN counts per column in test data:
 id             0
keyword       26
location    1105
text           0
dtype: int64
Percentage of missing values for 'location' in test dataset: 33.86%
Percentage of missing values for 'keyword' in test dataset: 0.80%


The following code checks for duplicated rows in the training data.

In [20]:
# Boolean mask over the training data: True where a row repeats an earlier row in every column.
# NOTE(review): the comparison includes the `id` column, so if ids are unique no row can ever
# be flagged - confirm whether duplicates were meant to be detected on `text` instead.
duplicates = train_data.duplicated()

# How many observations are duplicates?
print(np.count_nonzero(duplicates))

# Sanity check: show all duplicated rows
print(train_data.loc[duplicates])

0
Empty DataFrame
Columns: [id, keyword, location, text, target]
Index: []


The following code chunk checks for duplicated rows in the test data.

In [21]:
# Boolean mask over the test data: True where a row repeats an earlier row in every column.
# NOTE(review): the comparison includes the `id` column, so if ids are unique no row can ever
# be flagged - confirm whether duplicates were meant to be detected on `text` instead.
duplicates = test_data.duplicated()

# How many observations are duplicates?
print(np.count_nonzero(duplicates))

# Sanity check: show all duplicated rows
print(test_data.loc[duplicates])

0
Empty DataFrame
Columns: [id, keyword, location, text]
Index: []


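Now, impute the missing values in the training and testing datasets with the placeholder string "missing" for consistency. The fill is applied to the whole DataFrame, so it covers both the keyword and location columns (the only columns with missing values). The cleaned datasets are then saved as pickle files.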

In [22]:
# Impute missing values using the placeholder "missing".
# fillna is called on the whole frame, so every column containing NaN is filled,
# not only 'location'.
clean_train = train_data.fillna("missing")
clean_test = test_data.fillna("missing")

# Save clean train and test data as objects (pickle files) in the current working directory
clean_train.to_pickle("clean_train.pkl")
clean_test.to_pickle("clean_test.pkl")

# Preview the cleaned training data. This must be the last expression in the cell:
# a bare expression placed earlier is evaluated and its result silently discarded.
clean_train.head()