In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in
import numpy as napi # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

In [26]:
from nltk.stem import PorterStemmer, LancasterStemmer

In [2]:
# Read the competition CSVs; the paths are relative to the working directory.
print("loading data")
sample_submission = pd.read_csv("sample_submission.csv")
test = pd.read_csv("test.csv")
train = pd.read_csv("train.csv")

# The label is the "target" column; the features are every column between
# the first and the last one (selected by position).
y = train.loc[:, "target"]
X = train.iloc[:, 1:-1]

# Hold out 10% of the labelled rows; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)
X.head()

loading data


Unnamed: 0,keyword,location,text
0,,,Our Deeds are the Reason of this #earthquake M...
1,,,Forest fire near La Ronge Sask. Canada
2,,,All residents asked to 'shelter in place' are ...
3,,,"13,000 people receive #wildfires evacuation or..."
4,,,Just got sent this photo from Ruby #Alaska as ...


In [3]:
# Keyword vocabulary, built from the training split only.
# (A CountVectorizer could be used instead.)
keyword_col = X_train["keyword"]
keyword_not_null = keyword_col[keyword_col.notnull()]
# napi.unique removes duplicates and returns the distinct keywords sorted.
keyword_arr = napi.unique(keyword_not_null.values)
KEYWORD_ARRAY_SIZE = keyword_arr.size

# TODO: Convert keywords to feature vectors
# One-hot matrix of the first feature column (selected by position).
keyword_features = pd.get_dummies(X_train.iloc[:, 0]).values
print("length of keyword feature vector", KEYWORD_ARRAY_SIZE)  # 221

def keyword_to_vector(key, keyword_arr):
    '''
    keyword_to_vector
    Description: one-hot encode a keyword against a fixed vocabulary
    Inputs: key -- keyword to convert (string, or NaN/None when missing)
            keyword_arr -- "dictionary" to be used (sorted array of strings)
                           Always use keyword_arr in this project
    Returns: a feature vector (array of ints, 0 or 1) with one entry per
             vocabulary item; all zeros when key is missing or is not in
             the vocabulary
    '''
    # Coerce to an ndarray so a plain list also compares element-wise
    # (list == str would yield a single bool instead of a vector).
    vocabulary = napi.asarray(keyword_arr)
    if pd.isna(key):
        # Same integer dtype as the match branch, so both paths agree.
        return napi.zeros(len(vocabulary), dtype=int)
    return (vocabulary == key).astype(int)


# CountVectorizer Approach:
# keywords = X_train['text'].tolist()
# keyword_cv = CountVectorizer()
# keyword_vect = keyword_cv.fit_transform(keywords).toarray(float)    
# print(keyword_to_vector("ablaze", keyword_arr))

# Skip this part now (ignore location)
# # location dictionary
# location_arr = X_train["location"][X_train["location"].notnull()].values
# # location_arr = location_arr[location_arr.notnull()]
# print(location_arr.size)
# location_arr = napi.unique(location_arr)
# print(location_arr.size)
# LOCATION_ARRAY_SIZE = location_arr.size
# # print(location_arr)


length of keyword feature vector 221


In [45]:
# NOTE: Try to decrease number of features associated with keywords
# Each stemmed keyword is replaced by a single number: the mean target of the
# rows that share that stem. keyword_dict maps stem -> [sum of targets, row count].
keyword_dict = {}
# Right now we are using the entire training set (including train, cv, test) to do this;
# This is NOT good practice (the held-out labels leak into the feature),
# but we may leave it because we are lazy.
# Do not rebind keyword_col here: it is the training-split keyword Series
# defined by the keyword-dictionary cell.
keyword_full_arr = train[["keyword", "target"]].values
stemmer = PorterStemmer()  # could also try other stemmers


def _stem_or_nan(raw_keyword):
    '''
    Return the dictionary key for a raw keyword: its stem, or napi.nan when
    the keyword is missing.

    The same napi.nan object is always returned for missing keywords, because
    NaN != NaN and the dictionary lookup only succeeds through object identity.
    '''
    if pd.isna(raw_keyword):
        return napi.nan
    return stemmer.stem(raw_keyword)


# Accumulate the target sum and the row count per stemmed keyword.
for raw_keyword, target in keyword_full_arr:
    totals = keyword_dict.setdefault(_stem_or_nan(raw_keyword), [0, 0])
    totals[0] += target
    totals[1] += 1

# Next, transform keywords to numerical values:
numerical_keyword_features = []
for keyword in X_train["keyword"].values:
    target_sum, row_count = keyword_dict[_stem_or_nan(keyword)]
    numerical_keyword_features.append(target_sum / row_count)

# Column vector (n_rows, 1) so it can be concatenated with the other features.
numerical_keyword_features = napi.array(numerical_keyword_features).reshape(-1, 1)
print(numerical_keyword_features.shape)

(6851, 1)


In [38]:

# TODO: Build a dictionary on text
# Bag-of-words counts of the tweet text. A float min_df is a proportion of
# documents, so terms seen in fewer than 0.08% of the training tweets are dropped.
sentences = X_train['text'].tolist()
cv = CountVectorizer(min_df=0.0008)
# toarray() takes (order, out), not a dtype: densify first, then cast to float.
text_features = cv.fit_transform(sentences).toarray().astype(float)
# vocabulary_ has one entry per retained term; unlike get_feature_names(),
# it is available in every scikit-learn version.
print("length of text feature vector: ", len(cv.vocabulary_))


length of text feature vector:  2175


In [46]:

# Merge all the features column-wise.
# The first print shows the one-hot keyword matrix for reference only; the
# merge itself uses the single-column numerical keyword encoding.
print(keyword_features.shape, text_features.shape)
All_features = napi.hstack([numerical_keyword_features, text_features])
print(All_features.shape)

(6851, 221) (6851, 2175)
(6851, 2176)


In [47]:
# Apply PCA if necessary
# A float n_components keeps as many components as are needed to explain that
# fraction (95%) of the variance.
print("Doing PCA...")
# The seed belongs on the estimator that is actually fitted; a separate
# PCA(...) expression would only build and discard another object.
pca = PCA(0.95, random_state=42)
pca.fit(All_features)

train_X_pca = pca.transform(All_features)
print("n components: ", pca.n_components_)

Doing PCA...
n components:  1151


In [49]:
# Recorded results of earlier runs (3-fold cross-validation, F1 score).
#
# Without PCA:
#   'fit_time': array([48.45315099, 49.18679333, 49.14396572]),
#   'score_time': array([22.64499092, 23.42073154, 23.34550667]),
#   'test_score': array([0.64055591, 0.68082524, 0.67186563]),
#   'train_score': array([0.70620605, 0.69862595, 0.71303824])
#
# With PCA, using numerical_keyword_features
# (svm_C = 1, svm_kernel = "rbf", svm_gamma = "scale"):
#   'fit_time': array([21.40067339, 21.66516542, 21.43422079]),
#   'score_time': array([ 9.91457796, 10.0454669 ,  9.95311713]),
#   'test_score': array([0.72622478, 0.75140607, 0.72983644]),
#   'train_score': array([0.7757987 , 0.77254902, 0.77514124])

# SVM hyper-parameters
svm_C = 1
svm_kernel = "rbf"
svm_gamma = "scale"  # "auto" or "scale"
svm_1 = SVC(C=svm_C, kernel=svm_kernel, gamma=svm_gamma)

# Ask for the train scores explicitly: they are compared with the test scores
# above, and cross_validate does not return them by default in current
# scikit-learn releases.
score_1 = cross_validate(
    svm_1,
    train_X_pca,
    y_train,
    cv=3,
    scoring="f1",
    return_train_score=True,
)
print(score_1)



{'fit_time': array([21.99435639, 22.20287919, 22.42193151]), 'score_time': array([10.21057177, 10.32464886, 10.29132986]), 'test_score': array([0.72190035, 0.74901297, 0.72675737]), 'train_score': array([0.77115275, 0.7673699 , 0.77119365])}
