In [1]:
import re
import math
import heapq
import string
import numpy as np
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import matplotlib.pyplot as plt
import nltk
from nltk.stem import WordNetLemmatizer

# 1. Binary Classification on Text Data

## a. Download the data

In [2]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

In [3]:
print(df_train.shape)
print(df_test.shape)

(7613, 5)
(3263, 4)


### 1. There are 7613 training data points and 3263 test data points

In [4]:
target = df_train['target']
real = [t for t in target if t == 1]

print(len(real) / target.shape[0])

0.4296597924602653


### 2. ~ 43% are of real disasters and ~57% are not of real disasters

## b. Split the training data

In [5]:
y = df_train.target
X = df_train.drop('target', axis=1)
 
X_train, X_dev, y_train, y_dev = train_test_split(X, y, test_size=0.3)

## c. Preprocess the data

We converted all the words to lowercase in order to properly be processed by the lemmatizer (for example, we noticed that Dogs was not converted to Dog in the lemmatizer step)

In [6]:
X_train['text'] = X_train['text'].map(lambda s: s.lower())
X_dev['text'] = X_dev['text'].map(lambda s: s.lower())
df_test['text'] = df_test['text'].map(lambda s: s.lower())

We lemmatized the words in order to consolidate the features and combine the weights of related words

In [7]:
def lemmatize(s):
    wordnet_lemmatizer = WordNetLemmatizer()
    tokenization = nltk.word_tokenize(s)
    for w in tokenization:
        s = s.replace(w, wordnet_lemmatizer.lemmatize(w))
    return s
    
X_train['text'] = X_train['text'].map(lambda s: lemmatize(s))
X_dev['text'] = X_dev['text'].map(lambda s: lemmatize(s))
df_test['text'] = df_test['text'].map(lambda s: lemmatize(s))

We stripped punctuation to remove noisy text that does not contribute to the model effectiveness (an example is an exclamation point which could indicate an emergency but it could also indicate excitement)

In [8]:
# Source: https://stackoverflow.com/questions/265960/best-way-to-strip-punctuation-from-a-string
X_train['text'] = X_train['text'].map(lambda s: s.translate(str.maketrans('', '', string.punctuation)))
X_dev['text'] = X_dev['text'].map(lambda s: s.translate(str.maketrans('', '', string.punctuation)))
df_test['text'] = df_test['text'].map(lambda s: s.translate(str.maketrans('', '', string.punctuation)))

We stripped the stop words in the call to CountVectorizer below

We stripped direct mentions (a) because they are mainly for individuals and do not indicate the presence of an emergency.

We stripped urls because it is hard to capture value from a URL and they likely won't make it into our vocabulary in the CountVectorizer.  Additionally, in an emergency people might not have the time to copy and paste a URL in the tweet.

In [9]:
# remove @...
X_train['text'] = X_train['text'].map(lambda s: re.sub(r'\B\@\w+', '', s))
X_dev['text'] = X_dev['text'].map(lambda s: re.sub(r'\B\@\w+', '', s))
df_test['text'] = df_test['text'].map(lambda s: re.sub(r'\B\@\w+', '', s))

# remove urls
X_train['text'] = X_train['text'].map(lambda s: re.sub(r'http\S+', '', s))
X_dev['text'] = X_dev['text'].map(lambda s: re.sub(r'http\S+', '', s))
df_test['text'] = df_test['text'].map(lambda s: re.sub(r'http\S+', '', s))

We removed numerical data because it was being captured in our CountVectorizer and we don't think it correlates with an emergency

In [10]:
# remove numerical data
X_train['text'] = X_train['text'].map(lambda s: re.sub(r'[\d-]', '', s))
X_dev['text'] = X_dev['text'].map(lambda s: re.sub(r'[\d-]', '', s))
df_test['text'] = df_test['text'].map(lambda s: re.sub(r'[\d-]', '', s))

We also decided to drop unnecessary columns because they do not contribute to the model

In [11]:
# drop unnecessary columns
X_train.drop('id', axis=1, inplace=True)
X_train.drop('keyword', axis=1, inplace=True)
X_train.drop('location', axis=1, inplace=True)

X_dev.drop('id', axis=1, inplace=True)
X_dev.drop('keyword', axis=1, inplace=True)
X_dev.drop('location', axis=1, inplace=True)

test_id_cols = df_test.id
df_test.drop('id', axis=1, inplace=True)
df_test.drop('keyword', axis=1, inplace=True)
df_test.drop('location', axis=1, inplace=True)

# save this for testing at the end
X_total = pd.concat([X_train, X_dev])
y_total = pd.concat([y_train, y_dev])

## d. Bag of words model

We decided to use M=10 as the threshold because it gave us the best results in e) and f).  Additionally, given the sample size of ~5000 points for the training set we believe if a word appears in at least 10 different tweets it is important to consider in the model.

Training set

In [12]:
vectorizer = CountVectorizer(binary=True, min_df=10, strip_accents='ascii', stop_words=['the', 'and', 'or', 'an'])
count_vectorized = vectorizer.fit_transform(X_train['text'])
columns = vectorizer.get_feature_names_out()
count_vectorized_df = pd.DataFrame(data=count_vectorized.toarray(), columns=columns)
print(f'Number of features (vectorized words) in training set: {len(count_vectorized.toarray()[0])}')
count_vectorized_df.head()

# with np.printoptions(threshold=np.inf):
#     print(vectorizer.get_feature_names_out())

Number of features (vectorized words) in training set: 1140


Unnamed: 0,aba,abc,ablaze,about,absolutely,accident,account,across,act,action,...,you,youll,your,youre,yours,yourself,youth,youtube,yr,zone
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,1,0,1,0,0,0,0,0,0,0


Development Set

In [13]:
count_vectorized_dev = vectorizer.transform(X_dev['text'])
count_vectorized_dev_df = pd.DataFrame(data=count_vectorized_dev.toarray(), columns=columns)
print(f'Number of features (vectorized words) in development set: {len(count_vectorized_dev.toarray()[0])}')
count_vectorized_dev_df.head()

Number of features (vectorized words) in development set: 1140


Unnamed: 0,aba,abc,ablaze,about,absolutely,accident,account,across,act,action,...,you,youll,your,youre,yours,yourself,youth,youtube,yr,zone
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## e. Logistic regression

### i.

In [14]:
clf = LogisticRegression(penalty='none').fit(count_vectorized_df, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Training Set F1 Score: 

In [15]:
y_pred = clf.predict(count_vectorized_df)
print(f1_score(y_train, y_pred))

0.8628687232605269


Development Set F1 Score:

In [16]:
y_pred = clf.predict(count_vectorized_dev_df)
print(f1_score(y_dev, y_pred))

0.731083844580777


It seems like it is overfitting due to the noticeable difference in performance between the training and development sets.

### ii.

In [17]:
clf_l1 = LogisticRegression(penalty='l1', solver='liblinear').fit(count_vectorized_df, y_train)

Training Set F1 Score: 

In [18]:
y_pred = clf_l1.predict(count_vectorized_df)
print(f1_score(y_train, y_pred))

0.8230376219228983


Development Set F1 Score:

In [19]:
y_pred = clf_l1.predict(count_vectorized_dev_df)
print(f1_score(y_dev, y_pred))

0.7482327351821643


### iii.

In [20]:
clf_l2 = LogisticRegression(penalty='l2').fit(count_vectorized_df, y_train)

Training Set F1 Score: 

In [21]:
y_pred = clf_l2.predict(count_vectorized_df)
print(f1_score(y_train, y_pred))

0.830484988452656


Development Set F1 Score:

In [22]:
y_pred = clf_l2.predict(count_vectorized_dev_df)
print(f1_score(y_dev, y_pred))

0.7508055853920516


### iv.

The first classifier performed best on the training dataset but it seems to be slightly overfitting since the first classfier performs the worst on the development dataset.  However, the L2 classifier performed the best on the develompent dataset which indicates the regularization did in fact reduce overfitting.  Additionally, the L2 classifier performed well enough on the training dataset as well which indicates this classifier performed the best overall.

### v.

In [23]:
sorted_indices = sorted(range(len(clf_l1.coef_[0])), key=lambda k: clf_l1.coef_[0][k])
largest_indices = sorted_indices[:3]
smallest_indices = sorted_indices[-3:]

print(f"Most important words indicating a tweet is an emergency: {columns[smallest_indices]}")
print(f"Most important words indicating a tweet is NOT an emergency: {columns[largest_indices]}")

Most important words indicating a tweet is an emergency: ['bombing' 'hiroshima' 'wildfire']
Most important words indicating a tweet is NOT an emergency: ['wedding' 'ebay' 'finally']


## f. Bernoulli Naive Bayes

In [24]:
# Source: lecture notes - https://github.com/kuleshov/cornell-cs5785-2022-applied-ml/blob/main/lecture-notes/lecture6-naive-bayes.ipynb
def get_naive_bayes_theta_vals(X_df, y_df, K):
    n, d = X_df.shape
    alpha = 1

    psis, phis = np.zeros([K, d]), np.zeros([K])

    y_arr = np.array(y_df)
    X_arr = X_df.to_numpy()

    for k in range(K):
        X_k = X_arr[y_arr == k]
        psis[k] = (np.sum(X_k, axis=0) + alpha) / (X_k.shape[0] + 2*alpha)   # Laplace smoothing
        phis[k] = X_k.shape[0] / float(n)
        
    return psis, phis

In [25]:
# Source: lecture notes - https://github.com/kuleshov/cornell-cs5785-2022-applied-ml/blob/main/lecture-notes/lecture6-naive-bayes.ipynb
def naive_bayes_predict(x, psis, phis, K):
    n, d = x.shape
    x = np.reshape(x, (1, n, d))
    psis = np.reshape(psis, (K, 1, d))
    
    psis = psis.clip(1e-14, 1-1e-14)
    
    logpy = np.log(phis).reshape([K, 1])
    logpxy = x * np.log(psis) + (1 - x) * np.log(1 - psis)
    logpyx = logpxy.sum(axis=2) + logpy

    return logpyx.argmax(axis=0).flatten(), logpyx.reshape([K,n])

In [26]:
# train
psis, phis = get_naive_bayes_theta_vals(count_vectorized_df, y_train, 2)

Training Set F1 Score

In [27]:
idx, logpyx = naive_bayes_predict(np.array(count_vectorized_df), psis, phis, 2)
print(f1_score(y_train, idx))

0.7702417272940624


Development Set F1 Score

In [28]:
idx, logpyx = naive_bayes_predict(np.array(count_vectorized_dev_df), psis, phis, 2)
print(f1_score(y_dev, idx))

0.7406181015452538


## g. Model comparison

The L2-regularized logistic regression classifier and the Naive Bayes classifier both performed the best in determining whether a tweet was an emergency or not.  

The pros of using a discriminative model is that it is more robust to outliers, requires less data, and is computationally cheaper.  The cons of using a discriminative model include it is more difficult to interpret than generative models.

The pros of using a generative model is that it is good at unsupervised learning and it gives us a good idea of the underlying data distribution.  A con is that it is more computationally expensive. 

The assumption of Naive Bayes is that each feature (word in this example) is independent of one another.  From our results, it looks like the Naive Bayes classifier and the Logistic Regression classifier performs similary so that confirms it is valid and efficient to use Bernoulli Naive Bayes classifier for natural language texts.

## h. N-gram model

We decided to use M=5 because it is half of our optimal value from the 1-gram version (M=10) and the frequency of pairs of words is lower than the frequency of single words so wanted the M-value to be low enough to capture a good amount of the 2-grams.

Training set

In [29]:
vectorizer = CountVectorizer(binary=True, ngram_range=(1,2), min_df=5, strip_accents='ascii', stop_words=['the', 'and', 'or', 'an'])
count_vectorized = vectorizer.fit_transform(X_train['text'])
columns = vectorizer.get_feature_names_out()
count_vectorized_ngram_df = pd.DataFrame(data=count_vectorized.toarray(), columns=columns)

count_vectorized_row = count_vectorized.toarray()[0]
print(f'Number of total features in training set: {len(count_vectorized_row)}')
two_gram_features = [c for c in columns if len(c.split()) > 1]
print(f'Number of 2-gram features in training set: {len(two_gram_features)}')
print(f'Number of 1-gram features in training set: {len([c for c in columns if len(c.split()) == 1])}')

print(f'10 2-gram features: {two_gram_features[:10]}')

count_vectorized_ngram_df.head()

# with np.printoptions(threshold=np.inf):
#     print(vectorizer.get_feature_names_out())

Number of total features in training set: 3228
Number of 2-gram features in training set: 1250
Number of 1-gram features in training set: 1978
10 2-gram features: ['aba woman', 'abc news', 'able to', 'about it', 'about to', 'about trapped', 'according to', 'account of', 'action year', 'added video']


Unnamed: 0,aba,aba woman,abandoned,abc,abc news,abcnews,ablaze,able,able to,about,...,youth,youth saved,youtube,youtube playlist,youtube video,youve,yr,yr old,yyc,zone
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Development Set

In [30]:
count_vectorized_dev = vectorizer.transform(X_dev['text'])
count_vectorized_ngram_dev_df = pd.DataFrame(data=count_vectorized_dev.toarray(), columns=columns)

count_vectorized_row = count_vectorized.toarray()[0]
print(f'Number of total features in training set: {len(count_vectorized_row)}')
two_gram_features = [c for c in columns if len(c.split()) > 1]
print(f'Number of 2-gram features in training set: {len(two_gram_features)}')
print(f'Number of 1-gram features in training set: {len([c for c in columns if len(c.split()) == 1])}')

print(f'10 2-gram features: {two_gram_features[:10]}')

count_vectorized_ngram_dev_df.head()

Number of total features in training set: 3228
Number of 2-gram features in training set: 1250
Number of 1-gram features in training set: 1978
10 2-gram features: ['aba woman', 'abc news', 'able to', 'about it', 'about to', 'about trapped', 'according to', 'account of', 'action year', 'added video']


Unnamed: 0,aba,aba woman,abandoned,abc,abc news,abcnews,ablaze,able,able to,about,...,youth,youth saved,youtube,youtube playlist,youtube video,youve,yr,yr old,yyc,zone
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Logistic Regression

In [31]:
clf_l2 = LogisticRegression(penalty='l2').fit(count_vectorized_ngram_df, y_train)

Training Set F1 Score: 

In [32]:
y_pred = clf_l2.predict(count_vectorized_ngram_df)
print(f1_score(y_train, y_pred))

0.8953117888029131


Development Set F1 Score:

In [33]:
y_pred = clf_l2.predict(count_vectorized_ngram_dev_df)
print(f1_score(y_dev, y_pred))

0.7506760411032991


#### Naive Bayes

In [34]:
# train
psis, phis = get_naive_bayes_theta_vals(count_vectorized_ngram_df, y_train, 2)

Training Set F1 Score: 

In [35]:
idx, logpyx = naive_bayes_predict(np.array(count_vectorized_ngram_df), psis, phis, 2)
print(f1_score(y_train, idx))

0.7755


Development Set F1 Score:

In [36]:
idx, logpyx = naive_bayes_predict(np.array(count_vectorized_ngram_dev_df), psis, phis, 2)
print(f1_score(y_dev, idx))

0.7397590361445783


For logistic regression, the model was able to predict the training set more accurately with the 2-gram model compared to the bag of words model and it performed similarly to the bag of words model for the development set.

For Naive Bayes, the 2-gram model performed similarly on the training and develpoment sets compared to bag of words.

This implies the Naive Bayes assumption stands true because words contribute independently to the performance of the model and as they became conditionally dependent, the results did not change significantly.

Therefore, Naive Bayes generally performs well on this task because Naive Bayes assumes each word is independent of one another.

## i. Determine performance with the test set

We decided to use L2-regularized Logistic Regression with bag of words

In [37]:
vectorizer = CountVectorizer(binary=True, min_df=10, strip_accents='ascii', stop_words=['the', 'and', 'or', 'an'])
count_vectorized = vectorizer.fit_transform(X_total['text'])
columns = vectorizer.get_feature_names_out()
count_vectorized_total_df = pd.DataFrame(data=count_vectorized.toarray(), columns=columns)
count_vectorized_total_df.head()

Unnamed: 0,aba,abandoned,abc,ablaze,able,about,absolutely,accident,according,account,...,young,your,youre,yours,yourself,youth,youtube,yr,yyc,zone
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [39]:
count_vectorized_test = vectorizer.transform(df_test['text'])
count_vectorized_test_df = pd.DataFrame(data=count_vectorized_test.toarray(), columns=columns)
count_vectorized_test_df.head()

Unnamed: 0,aba,abandoned,abc,ablaze,able,about,absolutely,accident,according,account,...,young,your,youre,yours,yourself,youth,youtube,yr,yyc,zone
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [40]:
clf_l2 = LogisticRegression(penalty='l2').fit(count_vectorized_total_df, y_total)

In [41]:
# predict
y_pred = clf_l2.predict(count_vectorized_test_df)
y_pred_final = pd.DataFrame(data = y_pred, index = test_id_cols, columns = ['target'])

In [42]:
# create CSV with predictions
y_pred_final.to_csv('predictions.csv')

When we submitted to Kaggle, we got a score of .79466

This was higher than we expected because it is a better score than our tests on the development set.  However, we trained using both the traning and development sets so it makes sense that our Kaggle predictions were higher.