In [18]:
# e.g. if using google colab import drive, uncomment lines below
# from google.colab import drive
# drive.mount('/content/drive')

In [19]:
# import packages

import os
import re
import sklearn
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from scipy.sparse import hstack
from scipy.sparse import vstack

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LinearRegression as sk_OLS
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score


import torch
import torch.nn.functional as F
import math

# Part (a): Download the data

In [20]:
#====================================================#
# YOUR CODE HERE:
#   Import train and test csv files.
#   You should use the pd.read_csv function.
#   You should set the index_col parameter to equal 'id'.
#====================================================#

train_data = pd.read_csv('./train.csv', index_col='id')
test_data  = pd.read_csv('./test.csv', index_col='id')

#====================================================#
# END YOUR CODE
#====================================================#


In [21]:
#====================================================#
# YOUR CODE HERE:
#   Get the index values for X_train and y_train.
#   Get the data values for X_train and y_train.
#   Get the index values for X_test.
#   Get the index values for y_test.
#====================================================#

# get train indices
X_train_id = train_data.index.to_list()
y_train_id = train_data.index.to_list()
# get train data
X_train    = train_data['text']
y_train    = train_data['target']

# get test indices
X_test_id  = test_data.index.to_list()
# get test data
X_test     = test_data['text']

#====================================================#
# END YOUR CODE
#====================================================#

print(f"Train Data Shape: {X_train.shape}")
print(f"Test Data Shape: {X_test.shape}")

print(f"Number of labels = 1 in train dataset as percentage: {((y_train == 1).sum() / (X_train.shape[0])) * 100:0.2f}%")
print(f"Number of labels = 0 in train dataset as percentage: {((y_train == 0).sum() / (X_train.shape[0])) * 100:0.2f}%")

Train Data Shape: (7613,)
Test Data Shape: (3263,)
Number of labels = 1 in train dataset as percentage: 42.97%
Number of labels = 0 in train dataset as percentage: 57.03%


### Part (a), Question 1: How many training and test data points are there?

### Answer:
There are 7613 training data points and 3263 test data points.

### Part (a), Question 2: what percentage of the training tweets are of real disasters, and what percentage is not?

### Answer:
In the training data, 42.97% of tweets are of real disasters, and 57.03% of tweets are not.

# Part (b): Split the training data.

In [22]:
#====================================================#
# YOUR CODE HERE:
#  You should use the sklearn.model_selection.train_test_split
#     parameter to perform the train/development split
#   Set the test_size to 0.30.
#   Set the random_stat parameter to 42.
#====================================================#

x_train_split, x_test_split, y_train_split, y_test_split = train_test_split(X_train, y_train, test_size=0.3, random_state= 42)
X_train_orig   = x_train_split
X_develop_orig = x_test_split
y_train_orig   = y_train_split
y_develop_orig = y_test_split



#====================================================#
# END YOUR CODE
#====================================================#

# Part (c): Preprocess the data.

In [23]:
#=======================================================================+#
# YOUR CODE HERE:
#  You should complete the following function to obtain the pre-processed
#  X_train and X_develop
#  Note that we suggest you to do every sub-question in a dedicated Python
#  function to make the code more structured and less error-prone.
#  With a function, you can clearly test each part when you encounter an error.
#  You can also create your own simple input data (e.g. just one sample) to
#  test the correctness of a function.
#========================================================================#


nltk.download("stopwords")
nltk.download("punkt")

def pre_process(data):
    pped_data = pd.DataFrame(data)
    
    # Remove URL and @
    url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    pped_data['text'] = pped_data['text'].str.replace(url_pattern, '', regex=True)
    pped_data['text'] = pped_data['text'].str.replace(r'@\w+', '', regex=True)
    
    # Remove Punctuation
    pped_data['text'] = pped_data['text'].str.replace(r'[^a-zA-Z\s]', '', regex=True)
    # Convert to Lower Case
    pped_data['text'] = pped_data['text'].str.lower()
    
    # Remove Stop Word
    stop_words = set(stopwords.words('english'))
    pped_data['text'] = pped_data['text'].apply(lambda x: ' '.join([word for word in word_tokenize(x) if word not in stop_words]))
    
    return pped_data

# Assuming X_train_orig and X_develop_orig are defined somewhere above in your code
X_train_preproc   = pre_process(X_train_orig)
X_develop_preproc = pre_process(X_develop_orig)


# get the preprocessed data

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/gareth/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/gareth/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Part (d): Bag of words model.

In [24]:
#=======================================================================+#
# YOUR CODE HERE:
#  You should complete the following function to obtain X_train and X_develop,
#  whose "text" feature only contains 1 and 0 to indicate whether a word is in
#  the tweet. At this point, you should only be constructing feature vectors
#  for each data point using the text in the “text” column.
#  You should ignore the “keyword” and “location” columns for now.
#========================================================================#

def bag_of_word(data, vectorizer=None):
    if vectorizer is None:
        vectorizer = CountVectorizer(max_features=5000, binary=True, min_df=5)
        bow_list = vectorizer.fit_transform(data['text'])
    else:
        bow_list = vectorizer.transform(data['text'])
    return bow_list, vectorizer # Feel free to change the variable name
  
# get the featurized data
X_train, vectorizer_train = bag_of_word(X_train_preproc)
X_develop, _ = bag_of_word(X_develop_preproc, vectorizer_train)


print(X_train.shape)
print(X_develop.shape)
print(y_train_orig.shape)
print(y_develop_orig.shape)


(5329, 1929)
(2284, 1929)
(5329,)
(2284,)


# Part (e): Logistic regression.

In [25]:
#=======================================================================+#
# YOUR CODE HERE:
#  You should complete the following function for logistic regression
#  without regularization terms.
#  You will be training logistic regression models using bag of words
#  feature vectors obtained in part (d).
#========================================================================#

def logistic_without_regularization(X_train, Y_train, X_develop, Y_develop):
    # initialize your logistic regression model
    model = LogisticRegression(penalty=None, solver='saga', max_iter=1000000)

    # fit your model to the train data
    model.fit(X_train, Y_train)

    # generate your prediction for the training set
    y_train_no_reg = model.predict(X_train)
    
    # generate your prediction for the development set
    y_develop_no_reg = model.predict(X_develop)
    
    return y_train_no_reg, y_develop_no_reg

# get the F1 train and develop scores
y_train_no_reg, y_develop_no_reg = logistic_without_regularization(X_train,y_train_orig,X_develop,y_develop_orig)
F1_train_no_reg = f1_score(y_train_orig, y_train_no_reg)
F1_develop_no_reg = f1_score(y_develop_orig, y_develop_no_reg)

print(f"F1 for training set: {F1_train_no_reg:.2f}")
print(f"F1 for development set: {F1_develop_no_reg:.2f}")


F1 for training set: 0.96
F1 for development set: 0.66


In [26]:
#=======================================================================+#
# YOUR CODE HERE:
#  You should complete the following function for logistic regression
#  with L1 regularization.
#  You will be training logistic regression models using bag of words
#  feature vectors obtained in part (d).
#========================================================================#

def logistic_L1_regularization(X_train, Y_train, X_develop, Y_develop):
    # initialize your logistic regression model
    model = LogisticRegression(penalty='l1', solver='liblinear', max_iter=10000)
    # then fit your model to the train data
    model.fit(X_train, Y_train)
    # then generate your prediction for the training set
    y_train_L1_reg = model.predict(X_train)
    # then generate your prediction for the development set
    y_develop_L1_reg = model.predict(X_develop)
    # then get the weight vector 
    L1_weight = model.coef_
    L1_intercept = model.intercept_

    #========================================================================#
    #  This function should train a logistic regression model without
    #  regularization terms.
    #  Report the F1 score in your training and in your development sets.
    #========================================================================#
    return y_train_L1_reg, y_develop_L1_reg, L1_weight, L1_intercept

# get the F1 train and develop scores
y_train_L1_reg, y_develop_L1_reg, L1_weight, L1_intercept = logistic_L1_regularization(X_train,y_train_orig,X_develop,y_develop_orig)
F1_train_L1_reg = sklearn.metrics.f1_score(y_train_orig, y_train_L1_reg)
F1_develop_L1_reg = sklearn.metrics.f1_score(y_develop_orig, y_develop_L1_reg)

# print the F1 train and develop scores
print(f"F1 for training set: {F1_train_L1_reg:.2f}")
print(f"F1 for development set: {F1_develop_L1_reg:.2f}")

F1 for training set: 0.83
F1 for development set: 0.73


In [27]:
#=======================================================================+#
# YOUR CODE HERE:
#  You should complete the following function for logistic regression
#  with L2 regularization.
#  You will be training logistic regression models using bag of words
#  feature vectors obtained in part (d).
#========================================================================#
def logistic_L2_regularization(X_train, Y_train, X_develop, Y_develop):
    # initialize your logistic regression model
    model = LogisticRegression(penalty='l2', solver='saga', max_iter=10000)
    # then fit your model to the train data
    model.fit(X_train, Y_train)
    # then generate your prediction for the training set
    y_train_L2_reg = model.predict(X_train)
    # then generate your prediction for the development set
    y_develop_L2_reg = model.predict(X_develop)
    #========================================================================#
    #  This function should train a logistic regression model without
    #  regularization terms.
    #  Report the F1 score in your training and in your development sets.
    #========================================================================#
    return y_train_L2_reg, y_develop_L2_reg

# get the F1 train and develop scores
y_train_L2_reg, y_develop_L2_reg = logistic_L2_regularization(X_train,y_train_orig,X_develop,y_develop_orig)
F1_train_L2_reg = sklearn.metrics.f1_score(y_train_orig, y_train_L2_reg)
F1_develop_L2_reg = sklearn.metrics.f1_score(y_develop_orig, y_develop_L2_reg)

# print the F1 train and develop scores

print(f"F1 for training set: {F1_train_L2_reg:.2f}")
print(f"F1 for development set: {F1_develop_L2_reg:.2f}")

F1 for training set: 0.86
F1 for development set: 0.75


### Which one of the three classifiers performed the best on your training and development set? Did you observe any overfitting and did regularization help reduce it? Support your answers with the classifier performance you got.

### Answer:

In the training data, the None-regularization model performed the best. Its F1 score for training set is 0.97.
In the development data, the L2-regularization model performed the best. Its F1 score for development set is 0.75.

Based on my view, I think there exists the problem of overfitting in the None-regularization model. Because its F1 score reached 0.96 in the training set but performed poorly in the development set, only 0.66. And with the help of the regularization, the performance in the development set enhanced. In the development set, with L1 regularization, the F1 score reached 0.73 and with the L2 regularization, the F1 score reached 0.75. So, the regularization did help reduce the overfitting.

### Inspect the weight vector of the classifier with L1 regularization (in other words, look at the θ you got after training). You can access the weight vector of the trained model using the coef_attribute of a LogisticRegression instance. What are the most important words for deciding whether a tweet is about a real disaster or not? You might need to run some code (feel free to insert a code cell below).

In [28]:
weights = L1_weight[0]
index_max = np.argmax(weights)
feature_names = vectorizer_train.get_feature_names_out()
key_feature = feature_names[index_max]
print("The most important word is\n", key_feature)

The most important word is
 derailment


### Answer:
The most important word for deciding whether a tweet is about a real disaster is **derailment**.

# Part (f): Bernoulli Naive Bayes.

In [29]:
class BernoulliNB(object):
    def __init__(self, alpha=1.0):
        self.alpha = alpha # alpha to ensure that there will be no zero prior probability
        
    def fit(self, X, y):
        #====================================================#
        # YOUR CODE HERE:
        #  You should build the Bernoully NB model from scratch
        #  Do not use sklearn, use numpy and other basic packages
        #    only.
        #  Please update and save the parameters
        #    "self.class_log_prior_" and "self.feature_prob_"
        #  These variables are just a suggestion to help
        #    structure your code - you do not need to use them
        #    if you would prefer not to
        #====================================================#
        n_samples, n_features = X.shape
        self.classes_ = np.unique(y)
        n_classes = len(self.classes_)
        self.class_log_prior_ = np.log(np.bincount(y)+ self.alpha) / (n_samples + n_classes * self.alpha)
        # log_prior is P(y)
        self.feature_prob_ = np.zeros((n_classes, n_features))
        for class_ in self.classes_:
            rows_to_select = np.where(y == class_)[0]
            X_class = X[rows_to_select]
            self.feature_prob_[class_, :] = (np.sum(X_class, axis=0) + self.alpha) / (X_class.shape[0] + 2 * self.alpha)

            # This is P(x|y)

        #====================================================#
        # END YOUR CODE
        #====================================================#
        return self
        
    
    def predict(self, X):
        X = X.toarray()
        n_samples = X.shape[0]
        self.pred_log_prob_ = np.zeros((n_samples, len(self.classes_)))
        
        for idx, c in enumerate(self.classes_):
            # get P(y)
            log_prob = np.full(X.shape[0], self.class_log_prior_[c])
            # get P(y|x) 在给定特征的情况下，Y类别出现的概率
            log_prob += np.log((X * self.feature_prob_[c]).sum(axis=1) + 1e-10)
            # 先把y=c的情况下x=1的特征求和 再把x=0的特征求和
            log_prob += np.log(((1 - X) * (1 - self.feature_prob_[c])).sum(axis=1))
            # 因为有些单词可能只出现在了disaster，但是可能从来没有出现在非disaster之中
            self.pred_log_prob_[:, idx] = log_prob
        # Search the max value in every row and get the index
        y_pred = self.classes_[np.argmax(self.pred_log_prob_, axis=1)]
        #====================================================#
        # END YOUR CODE
        #====================================================#
        return y_pred

# get the predictions y_train_NB and y_develop_NB
nb = BernoulliNB(alpha=1)
nb.fit(X_train, y_train_orig)
print(X_train.shape)
y_train_NB = nb.predict(X_train) # prediction from X_train using model
y_develop_NB = nb.predict(X_develop) # prediction from X_develop using model

# get the F1 train and develop scores
F1_train_NB = sklearn.metrics.f1_score(y_train_orig, y_train_NB)
F1_develop_NB = sklearn.metrics.f1_score(y_develop_orig, y_develop_NB)

# print the F1 train and develop scores
print(f"F1 for training set: {F1_train_NB:.2f}")
print(f"F1 for development set: {F1_develop_NB:.2f}")

(5329, 1929)
F1 for training set: 0.77
F1 for development set: 0.72


# Part (g): Model comparison.

Question: Which model performed the best in predicting whether a tweet is of a real disaster or not? Include your performance metric in your response. Comment on the pros and cons of using generative vs discriminative models.

Answer: In this project, the logistic regression model performed better. The logistic regression model achieved an F1 score of 0.86 on the training set and 0.75 on the development set, while the Bernoulli naive Bayes model achieved an F1 score of 0.60 on the training set and 0.59 on the development set.

Question: hink about the assumptions that Naive Bayes makes. How are the assumptions different from logistic regressions? Discuss whether it is valid and efficient to use Bernoulli Naive Bayes classifier for natural language texts.

Answer:


# Part (h): N-gram model.

In [30]:
def n_gram(data, vectorizer = None):
    if vectorizer is None:
        vectorizer = CountVectorizer(max_features=5000, binary=True, min_df=5, ngram_range=(2,2))
        n_gram = vectorizer.fit_transform(data['text'])
    else:
        n_gram = vectorizer.transform(data['text'])
    return n_gram, vectorizer # Feel free to change the variable name
    #========================================================================#
    # END CODE HERE
    #  This function should return the new data whose "text" feature contains
    #  only 0 and 1
    #========================================================================#

# get the featurized data
X_train_gram, train_n_gram  = n_gram(X_train_preproc)
X_develop_gram, _ = n_gram(X_develop_preproc, train_n_gram)
X_train_gram.toarray()
X_develop_gram.toarray()
matrix_1 = hstack((X_train,X_train_gram))
matrix_2 = hstack((X_develop,X_develop_gram))
X_train_gram = matrix_1
X_develop_gram = matrix_2



In [31]:
#=======================================================================+#
# YOUR CODE HERE:
#  Featurized the preprocessed data: X_train_preproc and X_develop_preproc
#  using the N=2 gram model
#========================================================================#


#=======================================================================+#
# YOUR CODE HERE:
#  Use the functions you already defined "X_train_gram" and "X_develop_gram"
#  to re-run:
#  Logistic Regression with no regularization Model
#  Logistic Regression with L1 regularization Model
#  Logistic Regression with L2 regularization Model
#========================================================================#
y_train_gram_no_reg, y_develop_gram_no_reg = logistic_without_regularization(X_train_gram,y_train_orig,X_develop_gram,y_develop_orig)
y_train_gram_L1_reg, y_develop_gram_L1_reg, _, _ = logistic_L1_regularization(X_train_gram,y_train_orig,X_develop_gram,y_develop_orig)
y_train_gram_L2_reg, y_develop_gram_L2_reg = logistic_L2_regularization(X_train_gram,y_train_orig,X_develop_gram,y_develop_orig)

nb.fit(X_train_gram, y_train_orig)
y_train_gram_NB = nb.predict(X_train_gram) # prediction from X_train using model
y_develop_gram_NB = nb.predict(X_develop_gram) # prediction from X_develop using model

#========================================================================#
# END CODE HERE
#========================================================================#

# get the F1 train and develop scores for no regularization model
F1_train_gram_no_reg = sklearn.metrics.f1_score(y_train_orig, y_train_gram_no_reg)
F1_develop_gram_no_reg = sklearn.metrics.f1_score(y_develop_orig, y_develop_gram_no_reg)
# get the F1 train and develop scores for L1 regularization model
F1_train_gram_L1_reg = sklearn.metrics.f1_score(y_train_orig, y_train_gram_L1_reg)
F1_develop_gram_L1_reg = sklearn.metrics.f1_score(y_develop_orig, y_develop_gram_L1_reg)
# get the F1 train and develop scores for L2 regularization model
F1_train_gram_L2_reg = sklearn.metrics.f1_score(y_train_orig, y_train_gram_L2_reg)
F1_develop_gram_L2_reg = sklearn.metrics.f1_score(y_develop_orig, y_develop_gram_L2_reg)
# get the F1 train and develop scores for Bernoulli NB model
F1_train_gram_NB = sklearn.metrics.f1_score(y_train_orig, y_train_gram_NB)
F1_develop_gram_NB = sklearn.metrics.f1_score(y_develop_orig, y_develop_gram_NB)

# print the F1 train and develop scores for no regularization model
print(f"F1 for training set: {F1_train_gram_NB:.2f}")
print(f"F1 for development set: {F1_develop_gram_NB:.2f}")
# print the F1 train and develop scores for L1 regularization model
print(f"F1 for training set: {F1_train_gram_L1_reg:.2f}")
print(f"F1 for development set: {F1_develop_gram_L1_reg:.2f}")
# print the F1 train and develop scores for L2 regularization model
print(f"F1 for training set: {F1_train_gram_L2_reg:.2f}")
print(f"F1 for development set: {F1_develop_gram_L2_reg:.2f}")
# print the F1 train and develop scores for Bernoulli NB model
print(f"F1 for training set: {F1_train_gram_NB:.2f}")
print(f"F1 for development set: {F1_develop_gram_NB:.2f}")

F1 for training set: 0.77
F1 for development set: 0.73
F1 for training set: 0.84
F1 for development set: 0.73
F1 for training set: 0.87
F1 for development set: 0.75
F1 for training set: 0.77
F1 for development set: 0.73


# Part (i): Determine performance with the test set.

In [32]:
#=======================================================================+#
# YOUR CODE HERE:
#  Re-build your feature vectors on the entire Kaggle train set
#  (i.e. DO NOT split the train set into a further train set and development set)
#========================================================================#
X_train = vstack([X_train_gram, X_develop_gram])
print(X_train.shape)
print(y_train.shape)
#========================================================================#
# END CODE HERE
#========================================================================#

(7613, 2580)
(7613,)


In [33]:
#=======================================================================+#
# YOUR CODE HERE:
#  Re-train your preferred classifier (see below) on the entire train set
#  (i.e. DO NOT split the train set into a further train set and development set)
#  Your preferred classifier may inculde either bag of word or n-gram,
#  and using either logistic regression or Bernoulli naive bayes
#========================================================================#
X_test = test_data['text']
X_final = train_data['text']
y_train = train_data['target']
X_final_orig = X_final
X_test_orig = X_test
X_final_preproc = pre_process(X_final_orig)
X_test_preproc = pre_process(X_test)

def bag_of_word_final(data, vectorizer=None):
    if vectorizer is None:
        vectorizer = CountVectorizer(max_features=5000, binary=True, min_df=5)
        bow_list = vectorizer.fit_transform(data['text'])
    else:
        bow_list = vectorizer.transform(data['text'])
    return bow_list, vectorizer # Feel free to change the variable name
  
X_final, vectorizer_final = bag_of_word_final(X_final_preproc)
print(X_final.shape)
X_test, _ = bag_of_word_final(X_test_preproc, vectorizer_final)

def n_gram_final(data, vectorizer = None):
    if vectorizer is None:
        vectorizer = CountVectorizer(max_features=5000, binary=True, min_df=3, ngram_range=(2,2))
        n_gram = vectorizer.fit_transform(data['text'])
    else:
        n_gram = vectorizer.transform(data['text'])
    return n_gram, vectorizer

X_final_gram, final_n_gram = n_gram_final(X_final_preproc)
X_test_gram, _ = n_gram_final(X_test_preproc, final_n_gram)

matrix_train = hstack((X_final,X_final_gram))
X_final = matrix_train

matrix_test = hstack((X_test,X_test_gram))
X_test = matrix_test

print(X_final.shape)
print(X_test.shape)
print(y_train.shape)


def final_model(X_train, Y_train, X_test):
    # initialize your logistic regression model
    model = LogisticRegression(penalty='l2', solver='liblinear', max_iter=10000)
    # then fit your model to the train data
    model.fit(X_train, Y_train)
    # then generate your prediction for the training set
    y_test_L2_reg = model.predict(X_test)
    return y_test_L2_reg
y_train_pred = final_model(X_final,y_train,X_test)

df = pd.DataFrame(y_train_pred)
df.to_excel("output.xlsx", index=False, engine='openpyxl')
#========================================================================#
# END CODE HERE
#========================================================================#

(7613, 2544)
(7613, 4833)
(3263, 4833)
(7613,)


The final Result is shown as the pic:

F-1 Score: 0.79037

![My Image](./WeChatc8fe774b7206003cd7939b49665c90c9.jpg)

![My Image](./WechatIMG2163.jpg)


In [34]:
#=======================================================================+#
# YOUR CODE HERE:
#  Report the resulting F 1-score on the test data, as reported by Kaggle
#========================================================================#

#========================================================================#
# END CODE HERE
#========================================================================#