In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
import xgboost as xgb
import re
from scipy.sparse import coo_matrix, hstack
from gensim import models
from nltk.tokenize import sent_tokenize, word_tokenize
import nltk
nltk.download('punkt')
from gensim import models
from sklearn.metrics import classification_report

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Inhen\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [68]:
# def print_recall_precision(m):
#     r0 = m[0][0]/sum(m[0,:])
#     p0 = m[0][0]/sum(m[:,0])
#     r1 = m[1][1]/sum(m[1,:])
#     p1 = m[1][1]/sum(m[:,1])
#     print("recall for class 0 is", r0 )
#     print("precision for class 0 is", p0)
#     print("recall for class 1 is", r1 )
#     print("precision for class 1 is", p1)

In [2]:
test_df = pd.read_csv('reddit_200k_test.csv',encoding="latin",usecols=['body','REMOVED'])

In [3]:
train_df = pd.read_csv('reddit_200k_train.csv',encoding="latin",usecols=['body','REMOVED'])

In [4]:
test_df["REMOVED"] = test_df.REMOVED.astype(int)
train_df["REMOVED"] = train_df.REMOVED.astype(int)

In [5]:
train_df.head()

Unnamed: 0,body,REMOVED
0,I've always been taught it emerged from the ea...,0
1,"As an ECE, my first feeling as ""HEY THAT'S NOT...",1
2,Monday: Drug companies stock dives on good new...,1
3,i learned that all hybrids are unfertile i won...,0
4,Well i was wanting to get wasted tonight. Not...,0


In [6]:
test_df.head()

Unnamed: 0,body,REMOVED
0,"Hi Larpo_Nadar, your submission has been remov...",1
1,"So out of every 10,000 children with autism wh...",0
2,"When I was pregnant, I was warned against eati...",0
3,Imagine if this find was the bug that eradicat...,1
4,Is it a myth that the math says it would take ...,0


In [8]:
print('class balance in training set: ',train_df.REMOVED.mean())
print('it can be considered as an inbalanced dataset')

class balance in training set:  0.38642861832876696
it can be considered as an inbalanced dataset


# Task 1

## Task 1.1

##### Train validation Split

In [9]:
text_train, text_val, y_train, y_val = train_test_split(
    train_df['body'],train_df['REMOVED'], random_state=0)
vect = CountVectorizer()
X_train = vect.fit_transform(text_train)
X_val = vect.transform(text_val)
X_train,X_val

(<125646x97465 sparse matrix of type '<class 'numpy.int64'>'
 	with 3762792 stored elements in Compressed Sparse Row format>,
 <41883x97465 sparse matrix of type '<class 'numpy.int64'>'
 	with 1236528 stored elements in Compressed Sparse Row format>)

##### Check some of the words in the features

In [10]:
feature_names = vect.get_feature_names()
print(feature_names[60000:60030])

['mows', 'moxley', 'moy', 'moya', 'moyer', 'moynihan', 'moysiuk', 'mozart', 'mozarts', 'mp', 'mp2011175a', 'mp2016168a', 'mp2016232a', 'mp2016263', 'mp2017155a', 'mp2017201a', 'mp201723a', 'mp201734a', 'mp201744a', 'mp3', 'mp4', 'mpa', 'mpaa', 'mpas', 'mpb', 'mpcâ', 'mpfc', 'mpfeygx', 'mpg', 'mph']


#### Here we choose Logistic Regression as our baseline model

In [11]:
lr = LogisticRegressionCV().fit(X_train, y_train)



In [12]:
print('parameter C =',lr.C_)

parameter C = [0.04641589]


In [13]:
print('baseline accuracy score')
lr.score(X_val, y_val)

baseline accuracy score


0.6934794546713464

In [18]:
pred = lr.predict(X_val)
print(classification_report(pred, y_val))

              precision    recall  f1-score   support

           0       0.81      0.72      0.76     28804
           1       0.51      0.63      0.56     13079

   micro avg       0.69      0.69      0.69     41883
   macro avg       0.66      0.68      0.66     41883
weighted avg       0.72      0.69      0.70     41883



### we will use weighted avg precision as our main focus, so the benchmark has a Avg precision = 0.72

## Task 1.2

### In this task, it would be so slow and difficult to use a pipeline and gridsearch all the parameters, therefore, I will just present some of the results I have tried

### Remove special characters using reg ex and remove stop words

##### Train test split

In [25]:
vect = CountVectorizer(token_pattern=r"\b\w[\w’]+\b",stop_words='english')
X_train = vect.fit_transform(text_train)
X_val = vect.transform(text_val)
X_train,X_val

(<125646x97156 sparse matrix of type '<class 'numpy.int64'>'
 	with 2187591 stored elements in Compressed Sparse Row format>,
 <41883x97156 sparse matrix of type '<class 'numpy.int64'>'
 	with 710379 stored elements in Compressed Sparse Row format>)

##### Check some of the words in the features

In [26]:
feature_names = vect.get_feature_names()
print(feature_names[60000:60030])

['muchâ', 'mucinex', 'muciniphila', 'muck', 'muckenthaler', 'mucking', 'mucky', 'mucoepidermoid', 'mucosa', 'mucosal', 'mucous', 'mucus', 'mud', 'mudcrab', 'muddied', 'muddle', 'muddled', 'muddy', 'muddying', 'muder', 'muderous', 'mudpiles', 'mudslides', 'mudstones', 'mueller', 'muerv', 'mues', 'muesli', 'muffin', 'muffins']


#### Here we also check the result of Logistic Regression compared with our baseline model

In [27]:
lr = LogisticRegressionCV().fit(X_train, y_train)



In [28]:
print('parameter C =',lr.C_)

parameter C = [0.04641589]


##### The result is getting slightly worse

In [29]:
lr.score(X_val, y_val)

0.6845736933839506

In [30]:
pred = lr.predict(X_val)
print(classification_report(pred, y_val))

              precision    recall  f1-score   support

           0       0.82      0.71      0.76     29445
           1       0.48      0.62      0.54     12438

   micro avg       0.68      0.68      0.68     41883
   macro avg       0.65      0.67      0.65     41883
weighted avg       0.72      0.68      0.69     41883



### there is no significant improvement in Avg precision in this model

### Try only remove special characters using reg ex but not remove stop words

In [31]:

vect = CountVectorizer(token_pattern=r"\b\w[\w’]+\b")
X_train = vect.fit_transform(text_train)
X_val = vect.transform(text_val)
X_train,X_val

(<125646x97465 sparse matrix of type '<class 'numpy.int64'>'
 	with 3762792 stored elements in Compressed Sparse Row format>,
 <41883x97465 sparse matrix of type '<class 'numpy.int64'>'
 	with 1236528 stored elements in Compressed Sparse Row format>)

##### Check some of the words in the features

In [32]:
feature_names = vect.get_feature_names()
print(feature_names[60000:60030])

['mows', 'moxley', 'moy', 'moya', 'moyer', 'moynihan', 'moysiuk', 'mozart', 'mozarts', 'mp', 'mp2011175a', 'mp2016168a', 'mp2016232a', 'mp2016263', 'mp2017155a', 'mp2017201a', 'mp201723a', 'mp201734a', 'mp201744a', 'mp3', 'mp4', 'mpa', 'mpaa', 'mpas', 'mpb', 'mpcâ', 'mpfc', 'mpfeygx', 'mpg', 'mph']


#### Here we also check the result of Logistic Regression compared with our baseline model

In [None]:
lr = LogisticRegressionCV().fit(X_train, y_train)

In [34]:
print('parameter C =',lr.C_)

parameter C = [0.04641589]


##### The result is getting slightly better, which indicates that it is useful to remove special characters but removing english stopping words would reduce the information

In [35]:
lr.score(X_val, y_val)

0.6934794546713464

In [36]:
pred = lr.predict(X_val)
print(classification_report(pred, y_val))

              precision    recall  f1-score   support

           0       0.81      0.72      0.76     28804
           1       0.51      0.63      0.56     13079

   micro avg       0.69      0.69      0.69     41883
   macro avg       0.66      0.68      0.66     41883
weighted avg       0.72      0.69      0.70     41883



### Try use Tfidf to rescale the training data

In [37]:

vect = TfidfVectorizer(token_pattern=r"\b\w[\w’]+\b")
X_train = vect.fit_transform(text_train)
X_val = vect.transform(text_val)
X_train,X_val

(<125646x97465 sparse matrix of type '<class 'numpy.float64'>'
 	with 3762792 stored elements in Compressed Sparse Row format>,
 <41883x97465 sparse matrix of type '<class 'numpy.float64'>'
 	with 1236528 stored elements in Compressed Sparse Row format>)

In [38]:
feature_names = vect.get_feature_names()
print(feature_names[60000:60030])

['mows', 'moxley', 'moy', 'moya', 'moyer', 'moynihan', 'moysiuk', 'mozart', 'mozarts', 'mp', 'mp2011175a', 'mp2016168a', 'mp2016232a', 'mp2016263', 'mp2017155a', 'mp2017201a', 'mp201723a', 'mp201734a', 'mp201744a', 'mp3', 'mp4', 'mpa', 'mpaa', 'mpas', 'mpb', 'mpcâ', 'mpfc', 'mpfeygx', 'mpg', 'mph']


#### Here we also check the result of Logistic Regression compared with our baseline model

In [39]:
lr = LogisticRegressionCV().fit(X_train, y_train)



In [40]:
print('parameter C =',lr.C_)

parameter C = [0.35938137]


##### The result is getting slightly better, which indicates that it is useful to remove special characters and use tf-idf to rescale works

In [41]:
lr.score(X_val, y_val)

0.6996633478977151

In [42]:
pred = lr.predict(X_val)
print(classification_report(pred, y_val))

              precision    recall  f1-score   support

           0       0.82      0.72      0.77     29055
           1       0.51      0.64      0.57     12828

   micro avg       0.70      0.70      0.70     41883
   macro avg       0.66      0.68      0.67     41883
weighted avg       0.73      0.70      0.71     41883



### There is a small improvement in weighted avg precision in this model, Avg precision = 0.73

### Use n-gram with tf-idf rescaling, removing special characters, and set min_df lower bound

#### n_gram range = (1,2), min_df = 4

In [43]:

vect = TfidfVectorizer(token_pattern=r"\b\w[\w’]+\b",ngram_range=(1,2), min_df=4)
X_train = vect.fit_transform(text_train)
X_val = vect.transform(text_val)
X_train,X_val

(<125646x175711 sparse matrix of type '<class 'numpy.float64'>'
 	with 6742028 stored elements in Compressed Sparse Row format>,
 <41883x175711 sparse matrix of type '<class 'numpy.float64'>'
 	with 2201371 stored elements in Compressed Sparse Row format>)

##### As we can see, the matrix becomes larger

In [44]:
feature_names = vect.get_feature_names()
print(feature_names[60000:60030])

['getting new', 'getting off', 'getting old', 'getting older', 'getting on', 'getting one', 'getting out', 'getting paid', 'getting people', 'getting phd', 'getting pregnant', 'getting prescribed', 'getting pretty', 'getting proper', 'getting published', 'getting pushed', 'getting raises', 'getting ready', 'getting real', 'getting really', 'getting removed', 'getting rich', 'getting rid', 'getting ridiculous', 'getting screwed', 'getting shit', 'getting shot', 'getting sick', 'getting smarter', 'getting so']


#### Here we also check the result of Logistic Regression compared with our baseline model

In [45]:
lr = LogisticRegressionCV().fit(X_train, y_train)



In [46]:
print('parameter C =',lr.C_)

parameter C = [2.7825594]


##### The result is more or less the same

In [47]:
lr.score(X_val, y_val)

0.6978965212616097

In [48]:
pred = lr.predict(X_val)
print(classification_report(pred, y_val))

              precision    recall  f1-score   support

           0       0.80      0.73      0.76     27799
           1       0.54      0.63      0.58     14084

   micro avg       0.70      0.70      0.70     41883
   macro avg       0.67      0.68      0.67     41883
weighted avg       0.71      0.70      0.70     41883



### Same as the precision

### n_gram range = (1,2), min_df = 4 and use stop words = 'english'

#### In order to reduce the matrix demension, we can try exclude stop words

In [54]:
vect = TfidfVectorizer(token_pattern=r"\b\w[\w’]+\b",ngram_range=(1,2), min_df=4,stop_words='english')
X_train = vect.fit_transform(text_train)
X_val = vect.transform(text_val)
X_train,X_val

(<125646x88000 sparse matrix of type '<class 'numpy.float64'>'
 	with 2731221 stored elements in Compressed Sparse Row format>,
 <41883x88000 sparse matrix of type '<class 'numpy.float64'>'
 	with 880271 stored elements in Compressed Sparse Row format>)

#### Here we also check the result of Logistic Regression compared with our baseline model

In [55]:
lr = LogisticRegressionCV().fit(X_train, y_train)



In [56]:
print('parameter C =',lr.C_)

parameter C = [0.35938137]


##### The result does not have many changes

In [57]:
lr.score(X_val, y_val)

0.6946493804168756

In [58]:
pred = lr.predict(X_val)
print(classification_report(pred, y_val))

              precision    recall  f1-score   support

           0       0.84      0.71      0.77     30085
           1       0.47      0.65      0.54     11798

   micro avg       0.69      0.69      0.69     41883
   macro avg       0.65      0.68      0.66     41883
weighted avg       0.73      0.69      0.71     41883



### n_gram range = (4,4), min_df = 4 and no stopping words

In [59]:
vect = TfidfVectorizer(token_pattern=r"\b\w[\w’]+\b",ngram_range=(4,4), min_df=4)
X_train = vect.fit_transform(text_train)
X_val = vect.transform(text_val)
X_train,X_val

(<125646x28682 sparse matrix of type '<class 'numpy.float64'>'
 	with 301804 stored elements in Compressed Sparse Row format>,
 <41883x28682 sparse matrix of type '<class 'numpy.float64'>'
 	with 88541 stored elements in Compressed Sparse Row format>)

#### Here we also check the result of Logistic Regression compared with our baseline model

In [60]:
lr = LogisticRegressionCV().fit(X_train, y_train)



In [61]:
print('parameter C =',lr.C_)

parameter C = [0.35938137]


##### The result is not so good, 4 gram might be a little bit large for this dataset

In [62]:
lr.score(X_val, y_val)

0.6194637442399064

In [63]:
pred = lr.predict(X_val)
print(classification_report(pred, y_val))

              precision    recall  f1-score   support

           0       0.97      0.62      0.76     40164
           1       0.06      0.59      0.11      1719

   micro avg       0.62      0.62      0.62     41883
   macro avg       0.52      0.61      0.44     41883
weighted avg       0.94      0.62      0.73     41883



### The average precision is amazingly high but that is because the model made a sacrifice on the average recall

### Next Step is to do character-wise vectorizer and use tf-idf scale 

### n_gram range = (2,4),and no stopping words, analyzer = 'char_wb' (add attention to word boundary)

In [64]:
vect = TfidfVectorizer(token_pattern=r"\b\w[\w’]+\b",ngram_range=(2,3), analyzer='char_wb')
X_train = vect.fit_transform(text_train)
X_val = vect.transform(text_val)
X_train,X_val

(<125646x69520 sparse matrix of type '<class 'numpy.float64'>'
 	with 29913226 stored elements in Compressed Sparse Row format>,
 <41883x69520 sparse matrix of type '<class 'numpy.float64'>'
 	with 9957372 stored elements in Compressed Sparse Row format>)

#### Here we also check the result of Logistic Regression compared with our baseline model

In [65]:
feature_names = vect.get_feature_names()
print(feature_names[40000:40020])

['i3-', 'i30', 'i32', 'i34', 'i36', 'i3d', 'i3g', 'i3k', 'i3m', 'i3n', 'i3o', 'i4', 'i4 ', 'i4)', 'i4/', 'i45', 'i48', 'i4h', 'i4k', 'i4z']


In [66]:
lr = LogisticRegressionCV().fit(X_train, y_train)



In [67]:
print('parameter C =',lr.C_)

parameter C = [0.35938137]


##### The accuracy is the best so far. But the features are character based and they are hard to explain

In [68]:
lr.score(X_val, y_val)

0.7088317455769644

In [69]:
pred = lr.predict(X_val)
print(classification_report(pred, y_val))

              precision    recall  f1-score   support

           0       0.82      0.74      0.78     28575
           1       0.53      0.65      0.59     13308

   micro avg       0.71      0.71      0.71     41883
   macro avg       0.68      0.69      0.68     41883
weighted avg       0.73      0.71      0.72     41883



### Both weighted avg and weighted recall is very high

#### Try a nonlinear model: Xgbclassifier, use the same X_train. I will just use the parameter by default since it takes too much time to tune

In [70]:
clf = xgb.XGBClassifier()

In [71]:
clf.fit(X_train,y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [72]:
clf.score(X_val, y_val)

0.6917365040708641

In [73]:
pred = clf.predict(X_val)

In [74]:
print(classification_report(pred, y_val))

              precision    recall  f1-score   support

           0       0.82      0.72      0.76     29193
           1       0.49      0.63      0.55     12690

   micro avg       0.69      0.69      0.69     41883
   macro avg       0.66      0.67      0.66     41883
weighted avg       0.72      0.69      0.70     41883



### Change to xgboost model and we still get a similar result, which means our logistic regression is not bad compared to other models

### The conclusion for Task 1.2
### So far the best two models are logistic regression models with the following parameters:
### 1. tf-idf scale and remove special characters
### 2. use tf-idf scale, n_gram range = (2,4),and no stopping words, analyzer = 'char_wb' (add attention to word boundary)

### We also see the logistic regression is good enough compared with other models i.e. xgboost in this task 

## Task 1.3 

### I'm going to add several features into the model
### 1. length of the body
### 2. count of numbers in the body
### 3. count of upper case characters in the body
### 4. count of total letters in the body

In [75]:
train_df['length'] = train_df.body.apply(lambda x:len(x))
test_df['length'] = test_df.body.apply(lambda x:len(x))
train_df['numbers'] = train_df.body.apply(lambda x: len([1 for y in x if y.isdigit()]))
test_df['numbers'] = test_df.body.apply(lambda x: len([1 for y in x if y.isdigit()]))
train_df['uppercase'] = train_df.body.apply(lambda x: len([1 for y in x if y.isupper()]))
test_df['uppercase'] = test_df.body.apply(lambda x: len([1 for y in x if y.isupper()]))
train_df['characters'] = train_df.body.apply(lambda x: len(re.sub('[^A-Za-z]+', '', x)))
test_df['characters'] = test_df.body.apply(lambda x: len(re.sub('[^A-Za-z]+', '', x)))


In [77]:
train_df.head()

Unnamed: 0,body,REMOVED,length,numbers,uppercase,characters
0,I've always been taught it emerged from the ea...,0,125,0,2,102
1,"As an ECE, my first feeling as ""HEY THAT'S NOT...",1,229,0,17,176
2,Monday: Drug companies stock dives on good new...,1,61,0,2,50
3,i learned that all hybrids are unfertile i won...,0,139,5,0,112
4,Well i was wanting to get wasted tonight. Not...,0,84,0,2,65


In [76]:
text_train, text_val, y_train, y_val = train_test_split(
    train_df[['body','length','numbers','uppercase','characters']],train_df['REMOVED'], random_state=0)


#### Try tf-idf scale with special characters removed

In [78]:
vect = TfidfVectorizer(token_pattern=r"\b\w[\w’]+\b")
X_train = vect.fit_transform(text_train['body'])
X_train = hstack((X_train,text_train[['length','numbers','uppercase','characters']].values))
X_val = vect.transform(text_val['body'])
X_val = hstack((X_val,text_val[['length','numbers','uppercase','characters']].values))


In [79]:
lr = LogisticRegressionCV().fit(X_train, y_train)



In [80]:
print('parameter C =',lr.C_)

parameter C = [10000.]


##### The result shows a small improvement compared to the training set without any features

In [81]:
lr.score(X_val, y_val)

0.700690017429506

##### check the parameters for 'length','numbers','uppercase','characters'

In [82]:
coefficient = lr.coef_
/np.mean(coefficient)

-0.009977240861050259

#### However, the coefficients for new features are not significantly large compared to other coefficients

In [83]:
coefficient[:,-4:]

array([[ 0.00179442, -0.00881818,  0.01316273, -0.00310044]])

In [84]:
pred = lr.predict(X_val)
print(classification_report(pred, y_val))

              precision    recall  f1-score   support

           0       0.79      0.74      0.76     27452
           1       0.56      0.63      0.59     14431

   micro avg       0.70      0.70      0.70     41883
   macro avg       0.67      0.68      0.68     41883
weighted avg       0.71      0.70      0.70     41883



#### Try tf-idf scale with n_gram range = (2,4),and no stopping words, analyzer = 'char_wb' (add attention to word boundary) with additional features

In [85]:
vect = TfidfVectorizer(token_pattern=r"\b\w[\w’]+\b",ngram_range=(2,3), analyzer='char_wb')
X_train = vect.fit_transform(text_train['body'])
X_train = hstack((X_train,text_train[['length','numbers','uppercase','characters']].values))
X_val = vect.transform(text_val['body'])
X_val = hstack((X_val,text_val[['length','numbers','uppercase','characters']].values))


In [86]:
lr = LogisticRegressionCV().fit(X_train, y_train)



In [87]:
print('parameter C =',lr.C_)

parameter C = [2.7825594]


##### Again, the result shows a small improvement compared to the training set without any features

In [88]:
lr.score(X_val, y_val)

0.7050115798772771

##### check the parameters for 'length','numbers','uppercase','characters'

In [89]:
coefficient = lr.coef_
/np.mean(coefficient)

-0.005761817608622296

#### However, the coefficients for new features are not significantly large compared to other coefficients

In [90]:
coefficient[:,-4:]

array([[ 0.00522884, -0.01428815,  0.00757211, -0.00675364]])

In [91]:
pred = lr.predict(X_val)
print(classification_report(pred, y_val))

              precision    recall  f1-score   support

           0       0.82      0.73      0.77     28641
           1       0.53      0.65      0.58     13242

   micro avg       0.71      0.71      0.71     41883
   macro avg       0.67      0.69      0.68     41883
weighted avg       0.73      0.71      0.71     41883



## Conclusion: adding features like body length, numbers count in the body, upper case count in the body, letter count in the body slightly improve the performance 

# Task 2

In [92]:
w = models.KeyedVectors.load_word2vec_format('./GoogleNews-vectors-negative300.bin', binary=True)


In [93]:
def get_vector(sentence):
    word_ls = sentence.split(' ')
    c = 0
    sum_v = np.zeros(300,)
    for wd in word_ls:
        try:
            sum_v = sum_v+w[wd]
            c += 1
        except:
            pass
    if c == 0:
        return sum_v
    return sum_v/c
        


In [94]:
def get_feature10(sentence):
    word_ls = sentence.split(' ')
    c = 0
    v = []
    for wd in word_ls:
        if c >= 10:
            return v
        try:
            v.extend(w[wd])
            c += 1
        except:
            pass
    diff = 10 - c

    v.extend(np.zeros(300*diff,))
    
    return v

In [96]:
# %%timeit
# train_df.head(1000).body.apply(lambda x: get_feature50(x))

In [97]:
train_df["doc_vect"] = train_df.body.apply(lambda x: get_vector(x))

In [98]:
text_train, text_val, y_train, y_val = train_test_split(
    train_df['doc_vect'],train_df['REMOVED'], random_state=0)

X_train = [x.tolist() for x in text_train.values]
X_val = [x.tolist() for x in text_val.values]

In [99]:
lr = LogisticRegressionCV().fit(X_train, y_train)



In [100]:
print('parameter C =',lr.C_)

parameter C = [2.7825594]


### This accuracy is not as good as the models in task 1

In [101]:
lr.score(X_val, y_val)

0.6669531790941432

In [102]:
pred = lr.predict(X_val)
print(classification_report(pred, y_val))

              precision    recall  f1-score   support

           0       0.85      0.68      0.76     32107
           1       0.37      0.62      0.46      9776

   micro avg       0.67      0.67      0.67     41883
   macro avg       0.61      0.65      0.61     41883
weighted avg       0.74      0.67      0.69     41883



### The avg precision and recall are also bad

### Try to add some additional features: like body length, numbers count in the body, upper case count in the body, letter count in the body in the model

In [103]:
text_train, text_val, y_train, y_val = train_test_split(
    train_df[['length','numbers','uppercase','characters','doc_vect']],train_df['REMOVED'], random_state=0)

X_train = [x[:4].tolist()+x[4].tolist() for x in text_train.values]
X_val = [x[:4].tolist()+x[4].tolist() for x in text_val.values]

In [104]:
lr = LogisticRegressionCV().fit(X_train, y_train)



In [105]:
print('parameter C =',lr.C_)

parameter C = [21.5443469]


#### we only see a slight improvement in the result, compared to the model without additional features

In [106]:
lr.score(X_val, y_val)

0.6687677578014947

In [107]:
pred = lr.predict(X_val)
print(classification_report(pred, y_val))

              precision    recall  f1-score   support

           0       0.84      0.69      0.76     31527
           1       0.39      0.61      0.48     10356

   micro avg       0.67      0.67      0.67     41883
   macro avg       0.62      0.65      0.62     41883
weighted avg       0.73      0.67      0.69     41883



### The accuracy, avg precision, avg recall of two models above is not as good as our model in task1, but this maybe because we only use 300 features for each comment. In task 1, we usually use (,100000) sparse as our feature space, so we can increase our feature vector to 10 words * 300 dimensions = 3000 features, the idea is to get the first 50 words and their vector representation. For each comment, if longer than 10 words, we cut, if shorter than 10 words, we use 0's to pad.

In [108]:
train_df["10 features"] = train_df.body.apply(lambda x: get_feature10(x))

In [109]:
text_train, text_val, y_train, y_val = train_test_split(
    train_df["10 features"],train_df['REMOVED'], random_state=0)

X_train = [x for x in text_train.values]
X_val = [x for x in text_val.values]

In [110]:
lr = LogisticRegressionCV().fit(X_train, y_train)



In [111]:
print('parameter C =',lr.C_)

parameter C = [0.00599484]


In [112]:
lr.score(X_val, y_val)

0.6601485089415754

In [113]:
pred = lr.predict(X_val)
print(classification_report(pred, y_val))

              precision    recall  f1-score   support

           0       0.82      0.68      0.75     30836
           1       0.40      0.59      0.48     11047

   micro avg       0.66      0.66      0.66     41883
   macro avg       0.61      0.64      0.61     41883
weighted avg       0.71      0.66      0.68     41883



### as we can see from this result, using first 10 words to represent a row does not get us a better result. It could be better if we use more words but due to the memory issue of my computer, I can only run the model with 10 words

# In conclusion, if we only look at accuracy, the best model in this HW is using WoB and using tf-idf scale with n_gram range = (2,4),and no stopping words, analyzer = 'char_wb' (add attention to word boundary) with 4 additional features. 
## The accuracy is 0.708831
## In terms of other accuracy matrix, this model also has a balanced avg weighted recall and precision.
## weighted avg recall = 0.71
## weighted avg  precision = 0.73 
