# Imports

In [1]:
import nltk
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.corpus import wordnet
from sklearn.pipeline import Pipeline
import pickle
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, train_test_split
from sklearn.decomposition import TruncatedSVD, NMF, LatentDirichletAllocation
from sklearn.naive_bayes import MultinomialNB,ComplementNB
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
import xgboost as xgb

Import the training set:

In [2]:
file_path_train = '...'
df_train = pd.read_csv(file_path_train, sep='\t')

# Data exploration and preparation

The labels have six distinct values, however, we only want to distinguish whether a record has label 'pants-fire', so let us map all other labels to 'other', so that we have only two labels.

In [3]:
labels_mapped = df_train["label"].map(lambda x: x if (x == "pants-fire") else "other")
y = {'label': labels_mapped}
y = pd.DataFrame(y)

Let's check if the set is balanced. As we can see, the label 'pants-fire' makes up only ca. 9% of the set. We will use the ratio counts_val later when fitting Xgboost in order to try to counteract this imbalance.

In [4]:
counts=labels_mapped.value_counts()
counts[1]/(counts[0]+counts[1])
counts_val = counts[1]/counts[0]
counts

other         9426
pants-fire     842
Name: label, dtype: int64

The columns state and speaker_job contain both NaN and None values. I assume that every speaker belongs to a state, we just don't have the information about it, hence NaN and None have the same meaning. When it comes to the job, there are only 3 entries with None: fulton county house of wellness and anonynomous activist. Again, I believe that None just means that the information wasn't obtained and not that any of them was unemployed, so I have decided to map all None values to NaN.

In [5]:
state_mapped = df_train["state"].map(lambda x: x if (x != "None") else np.nan)
speaker_job_mapped = df_train["speaker_job"].map(lambda x: x if (x != "None") else np.nan)
mapped_cols = {'state': state_mapped, 'speaker_job': speaker_job_mapped}
mapped_cols_df = pd.DataFrame(mapped_cols)

Now we need to prepare the columns for their encoding. I have decided to use one hot encoding for the columns 'speaker', 'speaker_job', 'state', 'party', since they do not contain a lot of text. I want to tokenize the other columns. To facilitate that, I merge the columns 'statement', 'subject' and 'context' into one column that I name 'text'.
Both, the OneHotEncoder and TfIdfVectorizer have a problem with nan values, so I replace all nan values in all columns with 'missing_value'.

In [6]:
df_train["text"] = df_train["statement"]+df_train["subject"]+df_train["context"].fillna('') 
df_train_mapped = pd.concat([df_train["text"], state_mapped, speaker_job_mapped, df_train["speaker"], df_train["party"]], 
                            axis = 1)
df_train_mapped = df_train_mapped.fillna("missing_value")

NOTE: This was my first ML project. Today, I would have just done stratified sampling.

As a final step, I have decided to split the training data into a true training set and a holdout set on which I can test my model. I have chosen a very small holdout set size, because the amount of 
pants-fire' recors is very low and I didn't want risk having all of them in the holdout set.

In [15]:
X_train, X_val, y_train, y_val = train_test_split(df_train_mapped, y, test_size=0.05, random_state=0)

The prepped data looks like that:

In [20]:
X_val.head(5)

Unnamed: 0,text,state,speaker_job,speaker,party
4676,"Since 2010, America has put more people back t...",Illinois,President,barack-obama,democrat
2513,"Even when all other state agencies took cuts, ...",missing_value,missing_value,GaGOP,republican
7634,"""First, he said he would take all of our troop...",Delaware,U.S. senator,joe-biden,democrat
8780,Says $30 million gap in stadium funding forced...,Oregon,Mayor of Portland,sam-adams,democrat
4741,Says Sen. Rand Pauls 2011 budget included a bi...,missing_value,missing_value,doug-muder,democrat


# Text Preprocessing and Conversion to Table Format

Now, let's start preprocessing the 'text' column. I wanted to exclude stop words, very short words, special characters and numbers and to incorporate stemming. Still, the amount of attributed would be huge, so I have decided to set a maximum amount of features. Therefore I have defined the following vectorizer (I use the tf-idf statistic, so I based the vectorizer on TfidfVectorizer):

In [7]:
class StemmingVectorizer(TfidfVectorizer):
    def build_tokenizer(self):
        self.ps = PorterStemmer()
        tokenize = super().build_tokenizer()
        return lambda doc: [self.ps.stem(w) for w in tokenize(doc) if (w not in stopwords.words('english') and len(w)>2)]
    def __init__(self):
        super(StemmingVectorizer, self).__init__()
        self.token_pattern = '[a-zA-Z]*'
        self.max_features = 1000
        self.stop_words = stopwords.words('english')

Let's apply it to the 'text' column of the training set and OneHotEncoder to the other columns:

In [8]:
column_trans = ColumnTransformer([('text_tokenizer',  StemmingVectorizer(), 'text'),
                                    ('one_hot', OneHotEncoder(dtype='int', handle_unknown='ignore'), 
                                     ['state', 'speaker_job', 'speaker', 'party'])],
                                 remainder='drop')

In [16]:
fitted_col_trans = column_trans.fit_transform(X_train)

In [23]:
fitted_col_trans.shape

(9754, 5064)

In [11]:
fitted_col_trans.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

# Dimensionality Reduction and Fitting a Classifier

For convenience, I have created a pipeline that includes dimensionality reduction and classifier fitting. I have decided to start with fitting a Random Forest as a benchmark for the later models since it is efficient for large data sets (allows for parallell processing) and helps avoid overfitting thanks to bagging. I have also tried other models, shown later.

As for dimensionality reduction, I have decided to fit all three methods that have been introcuded during the lectures, because it's easy to check them all. To fit the dimensionality reduction method and the parameters for the classifier I use GridSearchCV.

Since fitting the model to the training set takes quite a long time, I pickled it.

In [None]:
pipe = Pipeline([('reduce_dim', NMF()), ('classify', RandomForestClassifier())])

params = {
    'reduce_dim': [TruncatedSVD(), NMF(), LatentDirichletAllocation()],
    'reduce_dim__n_components': [10, 50, 100],
    'classify__n_estimators': [10, 50, 100, 150],
    'classify__max_features': ['sqrt', 'log2'],
    'classify__min_samples_leaf': [1, 5, 10],
    'classify__max_depth': [50, 100, 200, None]
}
opt = GridSearchCV(pipe, param_grid=params, n_jobs=-1, scoring='roc_auc', cv = 5, verbose = 11)
opt.fit(fitted_col_trans, y_train)
print(opt.best_params_)
est = opt.best_estimator_


Result:

{'classify__max_depth': None, 'classify__max_features': 'sqrt', 'classify__min_samples_leaf': 5, 'classify__n_estimators': 10, 'reduce_dim': TruncatedSVD(algorithm='randomized', n_components=100, n_iter=5,
       random_state=None, tol=0.0), 'reduce_dim__n_components': 100}

In [62]:
with open('RF.pkl', 'wb') as fid:
    pickle.dump(est, fid)   

Now, let us transform the holdout set with the column transformer (i.e. OneHotEncoder and StemmingVEctorizer) and then with the fitted model:

In [9]:
with open('RF.pkl', 'rb') as fp:
    est2 = pickle.load(fp)

In [12]:
fitted_col_trans_val = column_trans.transform(X_val)

In [13]:
predicted_val = est2.predict(fitted_col_trans_val)

Since the model will be markers based on the area under the ROC curve, I wanted to find the AUC on the holdoutset.
Therefore, I had to binarize the labels (1 = 'pants-fire', 0 = 'other') of the predicted labels and the target labels and transform them into lists.

In [14]:
y_val_pred_binary = list(map(lambda x: 1 if (x == "pants-fire") else 0, predicted_val))

y_val_target=y_val.to_numpy()
y_val_target_shaped = y_val_target.transpose()[0]

y_val_target_binary = list(map(lambda x: 1 if (x == "pants-fire") else 0, y_val_target_shaped))

514

In [15]:
roc_score = roc_auc_score(y_val_pred_binary, y_val_target_binary)
roc_score

0.882217847769029

The picked model:
Dimensionality Reduction: TruncatedSVD(algorithm='randomized', n_components=100, n_iter=5, random_state=None, tol=0.0)
Classifier: RandomForestClassifier(max_depth = None, classify_max_features = 'sqrt', classify_mean_samples_leaf = 5,
                                  classify_n_estimators = 10)
AUC on holdout set: 0.89

# Transformation of Test Set and Export of Labels

NOTE: Turns out, that the first model I tried got the best AUC on the holdout set, so I will transform the test set with this model. At the botton of the notebook you can find the other models I have tried and their AUCs, however, I didn't use them in the end.

We have to prepare the test set the same way as the train set, so let us prepare the test set by mapping Nones to nan in the appropriate columns, merging the same columns as for the train set and the using the column transformer that was fitted to the training set:

In [10]:
file_path_test = '...'
df_test = pd.read_csv(file_path_test, sep='\t')

In [11]:
state_mapped_test = df_test["state"].map(lambda x: x if (x != "None") else np.nan)
speaker_job_mapped_test = df_test["speaker_job"].map(lambda x: x if (x != "None") else np.nan)
mapped_cols_test = {'state': state_mapped_test, 'speaker_job': speaker_job_mapped_test}
mapped_cols_df_test = pd.DataFrame(mapped_cols_test)

In [12]:
df_test["text"] = df_test["statement"]+df_test["subject"]+df_test["context"].fillna('') 
df_test_mapped = pd.concat([df_test["text"], state_mapped_test, speaker_job_mapped_test, df_test["speaker"], df_test["party"]], axis = 1)
df_test_mapped = df_test_mapped.fillna("missing_value")

In [13]:
X_test = df_test_mapped

In [None]:
fitted_col_trans_test = column_trans.transform(X_test)

In [23]:
predicted_test = est2.predict(fitted_col_trans_test)

In [34]:
predicted_test_df = pd.DataFrame(predicted_test, columns = ['label'])

The model predicted very few 'pants-fire' labels, which was to be expected, but the ratio of the predicted 'pants-fire' labels to 'other' was much lower than in training set, which is a bit worrisome. There is probably room for improvement, but due to the deadline I have decided to go with this model.

In [57]:
counts_test = predicted_test_df['label'].value_counts()
counts_test[1]/(counts_test[0]+counts_test[1])

0.017153996101364522

In [42]:
with open('predictions.res', 'a') as file:
     file.write(predicted_test_df.to_string(header = False, index = False))

# Other approaches

I have also tried fitting XGboost. I set scale_pos_weight to the ratio of 'other' labels to 'pants-fire' labels in order to counteract the imbalance a bit. I also set subsample to 0.8 to minimize overfitting. However, the AUC on the holdout set was 0.6, which isn't a good result. It would probably take quite some time to find the correct parameter values to be tested in order to beat the RandomForestClassifier, so I decied to stick with the random forest.
I have also tried a Bayes classifier, which was very quick, since I only fitted one parameter, however the resulting AUC was 0.46.

In [None]:
pipe3 = Pipeline([('reduce_dim', NMF()), ('classify', xgb.XGBClassifier(scale_pos_weight = 1/counts_val, subsample = 0.8))])

params3 = {
    'reduce_dim': [TruncatedSVD(), NMF(), LatentDirichletAllocation()],
    'reduce_dim__n_components': [10, 50, 100],
    'classify__n_estimators': [50, 100],
    'classify__eta': [0.001, 0.01, 0.1],
    'classify__max_depth': [3, 6, 9]
}
opt3 = GridSearchCV(pipe3, param_grid=params3, n_jobs=-1, scoring='roc_auc', cv = 5, verbose = 11)
opt3.fit(fitted_col_trans, y_train)
print(opt3.best_params_)
est3 = opt3.best_estimator_
with open('xgboost2.pkl', 'wb') as fid:
    pickle.dump(est3, fid)

Result:

{'classify__eta': 0.1, 'classify__max_depth': 9, 'classify__n_estimators': 100, 'reduce_dim': TruncatedSVD(algorithm='randomized', n_components=100, n_iter=5,
       random_state=None, tol=0.0), 'reduce_dim__n_components': 100}

In [27]:
y_val_predicted3 = est3.predict(fitted_col_trans_val)
y_val_pred_binary3 = list(map(lambda x: 1 if (x == "pants-fire") else 0, y_val_predicted3))

y_val_target=y_val.to_numpy()
y_val_target_shaped = y_val_target.transpose()[0]

y_val_target_binary = list(map(lambda x: 1 if (x == "pants-fire") else 0, y_val_target_shaped))
roc_score3 = roc_auc_score(y_val_pred_binary3, y_val_target_binary)
roc_score3

0.6121598639455782

Naive Bayes approach:

Result:

{'classify__alpha': 10}

In [40]:
y_val_predicted4 = est4.predict(fitted_col_trans_val)
y_val_pred_binary4 = list(map(lambda x: 1 if (x == "pants-fire") else 0, y_val_predicted4))
roc_score4 = roc_auc_score(y_val_pred_binary4, y_val_target_binary)
roc_score4

0.46101364522417154