In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [2]:
# loading data
train =pd.read_csv('reddit_200k_train.csv', \
                   usecols=['body','REMOVED'], encoding = "ISO-8859-1")
test =pd.read_csv('reddit_200k_test.csv', \
                  usecols=['body','REMOVED'], encoding = "ISO-8859-1")
text_trainval, y_trainval = train['body'], train['REMOVED']

print("length of text_train: ", len(text_trainval))
print("class balance: ", np.bincount(y_trainval) )

length of text_train:  167529
class balance:  [102791  64738]


# Task 1 Bag of Words and simple Features

### 1.1 Create a baseline model using a bag-of-words approach and a linear model.

In [3]:
%%time
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

text_train, text_val, y_train, y_val = \
train_test_split(text_trainval, y_trainval, \
                 stratify = y_trainval, random_state = 0)
vect = CountVectorizer()
X_train = vect.fit_transform(text_train)
X_val = vect.transform(text_val)

Wall time: 17.1 s


In [4]:
#%%time

from sklearn.linear_model import LogisticRegressionCV
lr = LogisticRegressionCV(scoring="roc_auc").fit(X_train, y_train)
lr.score(X_val, y_val)

0.7589150906685347

### 1.2 Try using n-grams, characters, tf-idf rescaling and possibly other ways to tune the BoW model. Be aware that you might need to adjust the (regularization of the) linear model for different feature sets.


#### tf-idf rescaling

In [5]:
%%time
#tf-idf
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text \
import TfidfVectorizer, TfidfTransformer
from sklearn.preprocessing import Normalizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV 

param_grid ={"logisticregression__C":[10,1,0.1],
            "tfidfvectorizer__ngram_range":[(1,1),(1,2),(2,4)]}

tfidf_pipe = make_pipeline(TfidfVectorizer(min_df=2,stop_words='english'), 
                        Normalizer(), \
                           LogisticRegression(max_iter=100, \
                                              solver = 'sag'))
tfidf_grid = GridSearchCV(tfidf_pipe, \
                          param_grid=param_grid, cv=3, scoring="roc_auc")

tfidf_grid.fit(text_train,y_train)

Wall time: 11min 42s


In [6]:
print("Best Training roc auc score: {:.3f}".format(tfidf_grid.best_score_))
print("best parameters: {}".format(tfidf_grid.best_params_))
print("validation-set score: {:.3f}".format(tfidf_grid.score(text_val, y_val)))

Best Training roc auc score: 0.762
best parameters: {'logisticregression__C': 1, 'tfidfvectorizer__ngram_range': (1, 2)}
validation-set score: 0.763


#### n-grams included stopwords

In [7]:
%%time

# n-grams included stopwords and min_df =4
param_grid ={"logisticregression__C":[10,1,0.1],
            "countvectorizer__ngram_range":[(1,1),(1,2),(2,4)]}
ngrams_pipe = make_pipeline(CountVectorizer(stop_words='english', min_df=4), 
                            Normalizer(), \
                            LogisticRegression(max_iter=100, \
                                               solver = 'sag'))
ngrams_grid = GridSearchCV(ngrams_pipe, \
                           param_grid = param_grid, cv =3, scoring = 'roc_auc')
ngrams_grid.fit(text_train, y_train)

Wall time: 10min 58s


In [79]:
print("Best Training roc auc score: {:.3f}".format(ngrams_grid.best_score_))
print("best parameters: {}".format(ngrams_grid.best_params_))
print("validation-set score: {:.3f}".\
      format(ngrams_grid.score(text_val, y_val)))

Best Training roc auc score: 0.761
best parameters: {'countvectorizer__ngram_range': (1, 2), 'logisticregression__C': 1}
validation-set score: 0.762


#### Characters n-grams

In [80]:
%%time
#characters n-grams
param_grid ={"logisticregression__C":[10,1,0.1],
            "countvectorizer__ngram_range":[(1,1),(1,2),(2,4)]}
char_pipe = make_pipeline(CountVectorizer(stop_words='english', \
                                          analyzer ="char_wb", min_df=4), 
                            Normalizer(), LogisticRegression\
                          (max_iter=100, solver = 'sag'))
char_grid = GridSearchCV(ngrams_pipe, param_grid = param_grid, \
                         cv =3, scoring = 'roc_auc')
char_grid.fit(text_train, y_train)

Wall time: 11min 34s


In [81]:
print("Best Training roc auc score: {:.3f}".format(char_grid.best_score_))
print("best parameters: {}".format(char_grid.best_params_))
print("validation-set score: {:.3f}".format(char_grid.score(text_val, y_val)))

Best Training roc auc score: 0.761
best parameters: {'countvectorizer__ngram_range': (1, 2), 'logisticregression__C': 1}
validation-set score: 0.762


### 1.3 Explore other features you can derive from the text, such as html, length, punctuation, capitalization or other features you deem important from exploring the dataset

In [29]:
# length
len_train = np.vectorize(len)(text_train)

removed
False    292.768487
True     156.719626
Name: length, dtype: float64

In [5]:
# count the number of captilized words
cap_train = text_train.str.findall(r'[A-Z]').str.len()

In [23]:
# count the number of https
import re
html_train =[]
for i in text_train:
    count = len(re.findall(r'(?=http)', i))
    html_train.append(count)

In [50]:
X_train_f =pd.DataFrame(
    {'length': len_train,
     'cap' : cap_train,
     'html' : html_train
    })

In [59]:
# count the number of words
len_val = np.vectorize(len)(text_val)
# count the number of captilized words
cap_val = text_val.str.findall(r'[A-Z]').str.len()
#count the number of https
html_val =[]
for i in text_val:
    count = len(re.findall(r'(?=http)', i))
    html_val.append(count)

In [60]:
d ={'length': len_val, 'cap' : cap_val, 'html' : html_val}
X_val_f =pd.DataFrame(data = d)

In [85]:
%%time
param_grid ={"logisticregression__C" : [100, 10, 1, 0.1]}
lr_f = make_pipeline(LogisticRegression(solver = 'sag'))
f_grid = GridSearchCV(lr_f, param_grid=param_grid, cv=3, scoring="roc_auc")
f_grid.fit(X_train_f, y_train)

Wall time: 1min 13s


In [84]:
print("Best Training roc auc score: {:.3f}".format(f_grid.best_score_))
print("best parameters: {}".format(f_grid.best_params_))
print("validation-set score: {:.3f}".format(f_grid.score(X_val_f, y_val)))

Best Training roc auc score: 0.654
best parameters: {'logisticregression__C': 100}
validation-set score: 0.656


# Task 2 Word Vectors

### Use a pretrained word-embedding (word2vec, glove or fasttext) instead of the bag-of-words model. Does this improve classification?

In [1]:
import pandas as pd
import numpy as np
train = pd.read_csv('reddit_200k_train.csv', \
                    encoding='ISO-8859-1', index_col=0)
test = pd.read_csv('reddit_200k_test.csv', encoding='ISO-8859-1', index_col=0)

In [2]:
## X_train, y_train, X_test, y_test
X_train = train['body'].values
y_train = (train['REMOVED']*1).values
X_test = test['body'].values
y_test = (test['REMOVED']*1).values

In [3]:
## initialize word2vec model
from gensim import models
w = models.KeyedVectors.load_word2vec\
_format('GoogleNews-vectors-negative300.bin', binary=True)

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
vect_w2v = CountVectorizer(vocabulary=w.index2word)
vect_w2v.fit(X_train)
doc_train = vect_w2v.inverse_transform(vect_w2v.transform(X_train))
doc_test = vect_w2v.inverse_transform(vect_w2v.transform(X_test))

In [5]:
docs = []
y_train_0 = []
for i in range(len(doc_train)):
    if len(doc_train[i]) > 0:
        docs.append(doc_train[i])
        y_train_0.append(y_train[i])
X_train_w2v = np.vstack([np.mean(w[doc], axis=0) for doc in docs])

In [6]:
docs = []
y_test_0 = []
for i in range(len(doc_test)):
    if len(doc_test[i]) > 0:
        docs.append(doc_test[i])
        y_test_0.append(y_test[i])
X_test_w2v = np.vstack([np.mean(w[doc], axis=0) for doc in docs])

In [8]:
## predict using logistic regression
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import cross_validate
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
lr_w2v = cross_validate(LogisticRegression(), \
                        X_train_w2v, y_train_0, scoring='roc_auc', cv=3)
score = np.mean(lr_w2v['test_score'])

In [9]:
print("Cross validation roc-auc score: {:.3f}".format(score))

Cross validation roc-auc score: 0.728


Since the score is lower than what the team obtained in 1.1 and 1.2, we could conclude that the model doesn't improve classification.