# Fake News Classifier

## 0. Setup

In [26]:
import os
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'

In [27]:
import pandas as pd
import re
import sklearn
import xgboost as xgb
import warnings

In [28]:
# Custom functions
%load_ext autoreload
%autoreload 2
from helpers import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [29]:
warnings.filterwarnings("ignore")
seed = 42

## 1. Read the train data

In [30]:
train_df = pd.read_csv("./dataset/training_data.csv", sep="\t", header=None)
train_df.columns = ["label", "corpus"]
train_df

Unnamed: 0,label,corpus
0,0,donald trump sends out embarrassing new year‚s...
1,0,drunk bragging trump staffer started russian c...
2,0,sheriff david clarke becomes an internet joke ...
3,0,trump is so obsessed he even has obama‚s name ...
4,0,pope francis just called out donald trump duri...
...,...,...
34147,1,tears in rain as thais gather for late king's ...
34148,1,pyongyang university needs non-u.s. teachers a...
34149,1,philippine president duterte to visit japan ah...
34150,1,japan's abe may have won election\tbut many do...


In [31]:
# Split between X and y
X = train_df["corpus"]
y = train_df["label"]

## 2. Clean data

### 2.1. Review Data

In [32]:
def find_all(find, text):
    result = re.findall(find, text)
    return result

print("How many JavaScript scripts are contained in the corpus?", X.apply(lambda x: len(find_all(r'<[/]?script[\s>]{1}', x.lower()))).sum())
print("How many CSS scripts are contained in the corpus?", X.apply(lambda x: len(find_all(r'<[/]?style[\s>]{1}', x.lower()))).sum())
print("How many HTML tags are contained in the corpus?", X.apply(lambda x: len(find_all(r'<[/a-zA-Z]{1,}[\s>]{1}', x))).sum())

How many JavaScript scripts are contained in the corpus? 0
How many CSS scripts are contained in the corpus? 0
How many HTML tags are contained in the corpus? 0


**Analysis:**
- We don't need to clean HTML, Javascript or HTML code

### 2.2. Clean Data

In [33]:
def clean_serie(serie):
    # Remove numbers and special numbers
    serie = serie.replace(r"[^a-zA-Z]", " ", regex=True, inplace=False)
    # Remove single chars
    serie = serie.replace(r'^(\w{1}\s)|(\s\w{1}\s)|(\s\w{1})$', " ", regex=True, inplace=False)
    # Remove multiple empty spaces
    serie = serie.replace(r"\s{2,}", " ", regex=True, inplace=False)
    # Make all string lowercase
    serie = serie.str.lower()
    return serie

X = clean_serie(X)

checkline = 3
print("Original:", train_df["corpus"][checkline])
print("Cleaned: ", X[checkline])

Original: trump is so obsessed he even has obama‚s name coded into his website (images)
Cleaned:  trump is so obsessed he even has obama name coded into his website images 


**Analysis**:
- We decided to format the data by:
    - Removing special characters and numbers
    - Remove single chars
    - Remove multiple spaces
    - Transform all the text to lowercase

## 3. Split and vectorize data

In [34]:
# Split Data between train and test
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, test_size=.2, random_state=seed)

# Prepare CountVector
count_vector = sklearn.feature_extraction.text.CountVectorizer(ngram_range=(2,2))

# Transform fit and transform data
train_ds = count_vector.fit_transform(X_train)
test_ds = count_vector.transform(X_test)

## 4. Train the model

In [35]:
# Set and fit Classifier Model
rfc = sklearn.ensemble.RandomForestClassifier(n_estimators=200, n_jobs=-1, criterion='entropy', random_state=seed)
rfc.fit(train_ds, y_train)

0,1,2
,n_estimators,200
,criterion,'entropy'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


## 5. Evaluate the model

In [36]:
# Get Test predictions
predictions = rfc.predict(test_ds)

print(sklearn.metrics.classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.78      0.90      0.83      3529
           1       0.87      0.72      0.79      3302

    accuracy                           0.81      6831
   macro avg       0.82      0.81      0.81      6831
weighted avg       0.82      0.81      0.81      6831



## 6. Improve Predictions

### 6.1 Compare classifiers and vectorizers

We'll compare the next models with different classifiers to see how do they perform (on default settings) with our data.

We'll focus on Accuracy and Recall since the last one is the metric we want to improve in order to avoid False Negatives (we will penalize Fake News classified as Real News)

In [37]:
models = [sklearn.neighbors.KNeighborsClassifier(),
          sklearn.linear_model.LogisticRegression(random_state=seed),
          sklearn.tree.DecisionTreeClassifier(random_state=seed),
          sklearn.ensemble.RandomForestClassifier(random_state=seed, n_jobs=-1),
          sklearn.ensemble.AdaBoostClassifier(random_state=seed),
          xgb.XGBClassifier(random_state=seed),
          sklearn.naive_bayes.BernoulliNB()]

#### Count Vectorizer

In [38]:
vectorizers = [sklearn.feature_extraction.text.CountVectorizer()]
results = compare_vectorizers(vectorizers, models, X_train, y_train, X_test, y_test)
display(results)

Unnamed: 0,model,fit_time,vectorizer,accuracy_train,precision_train,recall_train,f1_train,accuracy_test,precision_test,recall_test,f1_test
0,KNeighborsClassifier,0.000839,CountVectorizer,0.607298,0.955016,0.201461,0.332732,0.573854,0.896552,0.133858,0.232938
1,LogisticRegression,5.037453,CountVectorizer,0.981223,0.976199,0.985389,0.980773,0.949349,0.939619,0.956693,0.948079
2,DecisionTreeClassifier,2.326977,CountVectorizer,1.0,1.0,1.0,1.0,0.879373,0.881701,0.866747,0.87416
3,RandomForestClassifier,1.151162,CountVectorizer,1.0,1.0,1.0,1.0,0.929732,0.938471,0.914597,0.92638
4,AdaBoostClassifier,0.558818,CountVectorizer,0.773508,0.694566,0.95308,0.803543,0.782316,0.701086,0.958207,0.809725
5,XGBClassifier,5.25947,CountVectorizer,0.916841,0.880672,0.958804,0.918079,0.908066,0.87451,0.945488,0.908615
6,BernoulliNB,0.003123,CountVectorizer,0.95271,0.939756,0.964528,0.951981,0.94481,0.933333,0.953967,0.943538


**Analysis**:
- The 2 best behaving models for our data and CountVectorizer are:
    - LogisticRegression (accuracy: 0.944, recall: 0.956)
    - BernoulliNB        (accuracy: 0.937, recall: 0.944)

#### TF-IDF Vectorizer

In [39]:
vectorizers = [sklearn.feature_extraction.text.TfidfVectorizer()] # max_features=1000, stop_words="english"
result = compare_vectorizers(vectorizers, models, X_train, y_train, X_test, y_test)

display(result)
results = pd.concat([results, result], ignore_index=True)

Unnamed: 0,model,fit_time,vectorizer,accuracy_train,precision_train,recall_train,f1_train,accuracy_test,precision_test,recall_test,f1_test
0,KNeighborsClassifier,0.001048,TfidfVectorizer,0.930383,0.919407,0.939072,0.929136,0.894305,0.875655,0.91066,0.892815
1,LogisticRegression,1.923204,TfidfVectorizer,0.961751,0.952973,0.969122,0.96098,0.94481,0.931798,0.955784,0.943639
2,DecisionTreeClassifier,2.217274,TfidfVectorizer,1.0,1.0,1.0,1.0,0.883326,0.872214,0.888855,0.880456
3,RandomForestClassifier,0.998479,TfidfVectorizer,1.0,1.0,1.0,1.0,0.934417,0.92445,0.941248,0.932773
4,AdaBoostClassifier,1.547328,TfidfVectorizer,0.791442,0.715537,0.947582,0.815372,0.798565,0.719963,0.954573,0.820833
5,XGBClassifier,20.88038,TfidfVectorizer,0.93247,0.90311,0.964528,0.932809,0.918021,0.886633,0.95215,0.918224
6,BernoulliNB,0.003043,TfidfVectorizer,0.95271,0.939756,0.964528,0.951981,0.94481,0.933333,0.953967,0.943538


**Analysis**:
- The 2 best performing model for TfidfVectorizer are:
    - LogisticRegression  (accuracy: 0.937, recall: 0.950)
    - BernoulliNB         (accuracy: 0.937, recall: 0.944)
- RandomForestClassifier is a good candidate to be considered since has results very close to BernoulliNB
- While these results are not far, CountVector seems to perform a bit better without changing configurations.

#### Hashing Vectorizer

In [40]:
vectorizers = [sklearn.feature_extraction.text.TfidfVectorizer()]
result = compare_vectorizers(vectorizers, models, X_train, y_train, X_test, y_test)

display(result)
results = pd.concat([results, result], ignore_index=True)

Unnamed: 0,model,fit_time,vectorizer,accuracy_train,precision_train,recall_train,f1_train,accuracy_test,precision_test,recall_test,f1_test
0,KNeighborsClassifier,0.000904,TfidfVectorizer,0.930383,0.919407,0.939072,0.929136,0.894305,0.875655,0.91066,0.892815
1,LogisticRegression,1.648878,TfidfVectorizer,0.961751,0.952973,0.969122,0.96098,0.94481,0.931798,0.955784,0.943639
2,DecisionTreeClassifier,2.198216,TfidfVectorizer,1.0,1.0,1.0,1.0,0.883326,0.872214,0.888855,0.880456
3,RandomForestClassifier,0.994364,TfidfVectorizer,1.0,1.0,1.0,1.0,0.934417,0.92445,0.941248,0.932773
4,AdaBoostClassifier,1.518781,TfidfVectorizer,0.791442,0.715537,0.947582,0.815372,0.798565,0.719963,0.954573,0.820833
5,XGBClassifier,21.289124,TfidfVectorizer,0.93247,0.90311,0.964528,0.932809,0.918021,0.886633,0.95215,0.918224
6,BernoulliNB,0.003031,TfidfVectorizer,0.95271,0.939756,0.964528,0.951981,0.94481,0.933333,0.953967,0.943538


In [None]:
results.to_csv("./output/improve_pred_results.csv")
display(results)

**Analysis**:
- The 2 best performing model for TfidfVectorizer are:
    - RandomForestClassifier (accuracy: 0.948, recall: 0.955): 
    - LogisticRegression (accuracy: 0.929, recall: 0.947):
- Hashing Vectorizer together with Random Forest Generator show the best results for accuracy but slightly less recall than Logistic Regressor with Count Vetorizer.

### 6.2 N-Grams

#### LogisticRegression + CountVectorizer

In [42]:
models = [sklearn.linear_model.LogisticRegression(random_state=seed)]
ngrams = [(1,1), (1,2), (2,2), (1,3), (2,3), (3,3)]
vectorizers = [sklearn.feature_extraction.text.CountVectorizer(ngram_range=ngram) for ngram in ngrams]

results = compare_vectorizers(vectorizers, models, X_train, y_train, X_test, y_test, inc_params=["ngram_range"])
display(results)

Unnamed: 0,model,fit_time,vectorizer,ngram_range,accuracy_train,precision_train,recall_train,f1_train,accuracy_test,precision_test,recall_test,f1_test
0,LogisticRegression,4.858917,CountVectorizer,"(1, 1)",0.981223,0.976199,0.985389,0.980773,0.949349,0.939619,0.956693,0.948079
1,LogisticRegression,4.290006,CountVectorizer,"(1, 2)",0.998243,0.996845,0.999548,0.998195,0.952862,0.940828,0.963053,0.951811
2,LogisticRegression,3.850983,CountVectorizer,"(2, 2)",0.995352,0.991626,0.99887,0.995235,0.901186,0.864962,0.942762,0.902188
3,LogisticRegression,4.602705,CountVectorizer,"(1, 3)",0.999378,0.998721,1.0,0.99936,0.950227,0.937389,0.961236,0.949163
4,LogisticRegression,4.000456,CountVectorizer,"(2, 3)",0.99795,0.9958,1.0,0.997896,0.897087,0.856516,0.945488,0.898805
5,LogisticRegression,4.540161,CountVectorizer,"(3, 3)",0.995205,0.99023,1.0,0.995091,0.76958,0.686932,0.961538,0.801363


**Analysis**:
- Best results for LogisticRegression and CountVectorizer is use unigrams and bigrams (accuracy: 0.950, recall: 0.961)

#### RandomForestClassifier + HashingVectorizer

In [43]:
models = [sklearn.ensemble.RandomForestClassifier(random_state=seed, n_jobs=-1)]
ngrams = [(1,1), (1,2), (2,2), (1,3), (2,3), (3,3)]
vectorizers = [sklearn.feature_extraction.text.TfidfVectorizer(ngram_range=ngram) for ngram in ngrams]

result = compare_vectorizers(vectorizers, models, X_train, y_train, X_test, y_test, inc_params=["ngram_range"])

display(result)
results = pd.concat([results, result], ignore_index=True)

Unnamed: 0,model,fit_time,vectorizer,ngram_range,accuracy_train,precision_train,recall_train,f1_train,accuracy_test,precision_test,recall_test,f1_test
0,RandomForestClassifier,0.982373,TfidfVectorizer,"(1, 1)",1.0,1.0,1.0,1.0,0.934417,0.92445,0.941248,0.932773
1,RandomForestClassifier,4.530415,TfidfVectorizer,"(1, 2)",1.0,1.0,1.0,1.0,0.913043,0.889528,0.936402,0.912364
2,RandomForestClassifier,6.282835,TfidfVectorizer,"(2, 2)",0.999817,1.0,0.999623,0.999812,0.805299,0.856988,0.716838,0.780673
3,RandomForestClassifier,9.813105,TfidfVectorizer,"(1, 3)",1.0,1.0,1.0,1.0,0.904699,0.885208,0.922471,0.903455
4,RandomForestClassifier,27.88311,TfidfVectorizer,"(2, 3)",0.99978,1.0,0.999548,0.999774,0.798273,0.854197,0.702604,0.77102
5,RandomForestClassifier,15.406729,TfidfVectorizer,"(3, 3)",0.998902,1.0,0.997741,0.998869,0.636364,0.90495,0.276802,0.423933


In [None]:
results.to_csv("./output/n-grams_results.csv")
display(results)

**Analysis**:
- RandomForestClassifier + TfidfVectorizer best results are with unigrams being these a bit bellow the other model (Accuracy: 0.932, Recall: 0.936)

### Model Validation - Remove more relevant words

After seeing the results of the premade models and given the best results we got were based on a LogisticRegression we assume there are some words in the dataset hevily influencing the model.

In [45]:
# Let's first train the logistic regressor
lr = sklearn.linear_model.LogisticRegression(random_state=seed)
cv = sklearn.feature_extraction.text.CountVectorizer(ngram_range = (1, 2))

mv_train_df = cv.fit_transform(X_train)
mv_test_df  = cv.transform(X_test)

lr.fit(mv_train_df, y_train)

y_train_predict = lr.predict(mv_train_df)
y_test_predict = lr.predict(mv_test_df)

result = get_classifier_metrics(y_train, y_train_predict , sub_name="_train") | get_classifier_metrics(y_test, y_test_predict , sub_name="_test") 
display(pd.DataFrame.from_dict(result, orient='index').T)

# Get the coeficients and the features from the model and vectorizer
coefs = lr.coef_[0]
features = cv.get_feature_names_out()

Unnamed: 0,accuracy_train,precision_train,recall_train,f1_train,accuracy_test,precision_test,recall_test,f1_test
0,0.998243,0.996845,0.999548,0.998195,0.952862,0.940828,0.963053,0.951811


In [46]:
# Review the most influencial words for the predictions
word_coefs = pd.DataFrame(zip(features, coefs))
word_coefs.columns = ["word", "coef"]

# See the top 10 words with more coeficient
display(word_coefs.sort_values("coef", ascending=False)[0:10])

# See the top 10 words with less coeficient
display(word_coefs.sort_values("coef", ascending=True)[0:10])

Unnamed: 0,word,coef
49321,factbox,3.182219
127033,says,2.709662
157549,urges,2.001461
147551,tillerson,1.741403
82476,lawmakers,1.733302
141728,talks,1.730247
26569,china,1.727194
47074,eu,1.723963
168852,zimbabwe,1.693987
136309,spokesman,1.622523


Unnamed: 0,word,coef
158962,video,-4.307516
19445,breaking,-4.102125
60141,gop,-3.593094
66407,hillary,-3.400183
79005,just,-3.086532
117247,racist,-2.433639
38121,dem,-2.108554
69683,huge,-2.043482
167531,wow,-1.976108
18423,bombshell,-1.911474


In [47]:
# Let's try the model by removing parts of it
def remove_words(serie, words):
    return serie.apply(lambda x: ' '.join([w for w in x.split() if w not in words]))

results = []
for i in range(5, 41, 5):
    c = i / 10
    words = list(word_coefs.loc[(word_coefs["coef"] > c) | (word_coefs["coef"] < c * -1), "word"])
    y_test_predict = lr.predict(cv.transform(remove_words(X_test, words)))
    result["coef_thld_rm"] = c 
    result["num_words_rm"] = len(words)
    result["prop_words_rm"] = len(words) / word_coefs.shape[0]
    results.append(result | get_classifier_metrics(y_test, y_test_predict , sub_name="_test"))

results = pd.DataFrame(results).sort_values("coef_thld_rm", ascending=False)
display(results)

Unnamed: 0,accuracy_train,precision_train,recall_train,f1_train,accuracy_test,precision_test,recall_test,f1_test,coef_thld_rm,num_words_rm,prop_words_rm
7,0.998243,0.996845,0.999548,0.998195,0.940419,0.917027,0.963961,0.939908,4.0,2,1.2e-05
6,0.998243,0.996845,0.999548,0.998195,0.937052,0.91099,0.963961,0.936727,3.5,3,1.8e-05
5,0.998243,0.996845,0.999548,0.998195,0.927536,0.894574,0.963658,0.927832,3.0,6,3.5e-05
4,0.998243,0.996845,0.999548,0.998195,0.925194,0.898828,0.952453,0.924864,2.5,7,4.1e-05
3,0.998243,0.996845,0.999548,0.998195,0.924608,0.897575,0.952756,0.924343,2.0,11,6.5e-05
2,0.998243,0.996845,0.999548,0.998195,0.907627,0.871282,0.949122,0.908537,1.5,43,0.000254
1,0.998243,0.996845,0.999548,0.998195,0.844971,0.78414,0.937311,0.853911,1.0,222,0.001313
0,0.998243,0.996845,0.999548,0.998195,0.701069,0.625498,0.950939,0.754626,0.5,1371,0.008111


**Analysis**:
- After removing the words iterativelly based on the coeficient of the words we see a decrease in accuracy (0.844 by keeping only words with coeficient inbetween -1 and 1)
- The recall stays relevant in between 0.937 and 0.963 
- This test shows that the model is 

## 7. Testing pre-made models

In [55]:

# FAKE NEWS DETECTOR
from transformers import pipeline
pipe = pipeline("text-classification", model="mrm8488/bert-tiny-finetuned-fake-news-detection", device=0)

predictions = []
for text in train_df["corpus"]:
    predictions.append(0 if pipe(text)[0]["label"] == "LABEL_0" else 1)

results = get_classifier_metrics(train_df["label"], predictions)
results = {"model": "mrm8488/bert-tiny-finetuned-fake-news-detection"} | results
results = pd.DataFrame.from_dict(results, orient='index').T
display(results)

Device set to use cuda:0


Unnamed: 0,model,accuracy,precision,recall,f1
0,mrm8488/bert-tiny-finetuned-fake-news-detection,0.465214,0.474819,0.9576,0.634851


In [56]:
MODEL_bert = "jy46604790/Fake-News-Bert-Detect"
clf_bert = pipeline("text-classification", model=MODEL_bert, tokenizer=MODEL_bert, device=0)

text_bert = train_df["corpus"].tolist()
result_bert = clf_bert(text_bert)
predictions_bert = [0 if r["label"] == "LABEL_0" else 1 for r in result_bert]

result = get_classifier_metrics(train_df["label"], predictions_bert)
result = {"model": MODEL_bert} | result
result = pd.DataFrame.from_dict(result, orient='index').T

display(result)
results = pd.concat([results, result], ignore_index=True)

Device set to use cuda:0


Unnamed: 0,model,accuracy,precision,recall,f1
0,jy46604790/Fake-News-Bert-Detect,0.652319,0.983162,0.288782,0.446434


In [57]:
MODEL_dist = "yasmine-11/distilbert_fake_news"
clf_dist = pipeline("text-classification", model=MODEL_dist, tokenizer=MODEL_dist, device=0)

text_dist = X_train.tolist()
result_dist = clf_dist(text_dist)
predictions_dist = [0 if r["label"] == "LABEL_0" else 1 for r in result_dist]

result = get_classifier_metrics(y_train, predictions_dist)
result = {"model": MODEL_dist} | result
result = pd.DataFrame.from_dict(result, orient='index').T

display(result)
results = pd.concat([results, result], ignore_index=True)

Device set to use cuda:0


Unnamed: 0,model,accuracy,precision,recall,f1
0,yasmine-11/distilbert_fake_news,0.484426,0.485065,0.988176,0.650714


In [58]:
results.to_csv("./output/pre_trained_models.csv")
display(results)

Unnamed: 0,model,accuracy,precision,recall,f1
0,mrm8488/bert-tiny-finetuned-fake-news-detection,0.465214,0.474819,0.9576,0.634851
1,jy46604790/Fake-News-Bert-Detect,0.652319,0.983162,0.288782,0.446434
2,yasmine-11/distilbert_fake_news,0.484426,0.485065,0.988176,0.650714


### Data Cleaning Strategies

In [71]:
def less_cleaning(serie):
    # Replace apostrofo
    serie = serie.replace(r",s", "'s", regex=True, inplace=False)
    # Remove SOME characters, keeping * ' - ( ) 
    serie = serie.replace(r"[^a-zA-Z0-9*'\-().]", " ", regex=True, inplace=False)
    # Remove multiple empty spaces
    serie = serie.replace(r"\s{2,}", " ", regex=True, inplace=False)
    # Make all string lowercase
    serie = serie.str.lower()
    return serie

X2_train, X2_test, y2_train, y2_test = sklearn.model_selection.train_test_split(train_df["corpus"], train_df["label"], test_size=.2, random_state=seed)

results=[]
result = evaluate_model(model = sklearn.linear_model.LogisticRegression(random_state=seed), 
                        vectorizer = sklearn.feature_extraction.text.CountVectorizer(ngram_range = (1, 2)),
                        X_train    = less_cleaning(X2_train), 
                        y_train    = y2_train, 
                        X_test     = less_cleaning(X2_test), 
                        y_test     = y2_test)
result["name"] = "less_cleaning"
results.append(result)

result = evaluate_model(model      = sklearn.linear_model.LogisticRegression(random_state=seed), 
                        vectorizer = sklearn.feature_extraction.text.CountVectorizer(ngram_range = (1, 2)),
                        X_train    = clean_serie(X2_train), 
                        y_train    = y2_train, 
                        X_test     = clean_serie(X2_test), 
                        y_test     = y2_test)
result["name"] = "clean_serie"
results.append(result)

result = evaluate_model(model      = sklearn.linear_model.LogisticRegression(random_state=seed), 
                        vectorizer = sklearn.feature_extraction.text.CountVectorizer(ngram_range = (1, 2)),
                        X_train    = X2_train, 
                        y_train    = y2_train, 
                        X_test     = X2_test, 
                        y_test     = y2_test)
result["name"] = "no_cleanning"
results.append(result)

results = pd.DataFrame(results)
results.to_csv("./output/alternative_data_cleanning.csv")
display(results)

Unnamed: 0,model,fit_time,vectorizer,accuracy_train,precision_train,recall_train,f1_train,accuracy_test,precision_test,recall_test,f1_test,name
0,LogisticRegression,4.653295,CountVectorizer,0.998426,0.99722,0.999548,0.998383,0.952716,0.94029,0.963356,0.951683,less_cleaning
1,LogisticRegression,4.532718,CountVectorizer,0.998243,0.996845,0.999548,0.998195,0.952862,0.940828,0.963053,0.951811,clean_serie
2,LogisticRegression,4.935427,CountVectorizer,0.998426,0.99722,0.999548,0.998383,0.952569,0.939752,0.963658,0.951555,no_cleanning


## 8. Predict labels for testing data

In [68]:
# Best Model
bmodel = sklearn.linear_model.LogisticRegression(random_state=seed)
bvect  = sklearn.feature_extraction.text.CountVectorizer(ngram_range = (1, 2))

# Train the model with all data
bm_train_ds = bvect.fit_transform(clean_serie(train_df["corpus"]))

# Fit the model
bmodel.fit(bm_train_ds, train_df["label"])

bmodel_train_predict = bmodel.predict(bm_train_ds)

result = get_classifier_metrics(train_df["label"], bmodel_train_predict , sub_name="_train")
display(pd.DataFrame.from_dict(result, orient='index').T)

Unnamed: 0,accuracy_train,precision_train,recall_train,f1_train
0,0.998536,0.997352,0.999638,0.998494


In [69]:
val_df = pd.read_csv("./dataset/testing_data.csv", sep="\t", header=None)
val_df.columns = ["label", "corpus"]

X_val = clean_serie(val_df["corpus"])
val_ds = bvect.transform(X_val)

val_predict = bmodel.predict(val_ds)

### Save the results in a new file

In [70]:
val_df["label"] = pd.Series(val_predict)
display(val_df)
val_df.to_csv("./output/testing_data.csv", sep="\t", header=False, index=False)

Unnamed: 0,label,corpus
0,0,copycat muslim terrorist arrested with assault...
1,0,wow! chicago protester caught on camera admits...
2,1,germany's fdp look to fill schaeuble's big shoes
3,0,mi school sends welcome back packet warning ki...
4,1,u.n. seeks 'massive' aid boost amid rohingya '...
...,...,...
9979,0,boom! fox news leftist chris wallace attempts ...
9980,0,here it is: list of democrat hypocrites who vo...
9981,1,new fires ravage rohingya villages in northwes...
9982,0,meals on wheels shuts the lyin‚ lefties up wit...
