## FakeNews - BaseModel Building On Content (CountVectorizer ONLY)

## Read CSV files, drop columns, combine dataframes and pre-model preparation

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime as dt
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, average_precision_score

In [3]:
# Read the WELFake table that carries the cleaned text columns, the label and
# the pre-computed sentiment features, then preview its first two rows.
df = pd.read_csv(filepath_or_buffer="WELFake_sentiment_features.csv")
df.head(n=2)

Unnamed: 0.1,Unnamed: 0,clean_content,clean_title,clean_text,label,titletext_blob_polarity,titletext_blob_subjectivity,title_sentiment_label,content_blob_polarity,content_subjectivity,content_sentiment_label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1,0.08,0.32,1,0.034641,0.296824,1
1,1,Did they post their votes for Hillary already?,,Did they post their votes for Hillary already?,1,0.0,0.0,0,0.0,0.0,0


In [4]:
# Read the per-title linguistic-analysis table (word/char/sentence counts and
# normalized category columns) and preview its first two rows.
df2 = pd.read_csv(filepath_or_buffer="welfake_title_linguistic_analysis_normalized.csv")
df2.head(n=2)

Unnamed: 0,clean_title,label,num_words,num_special_char,num_sentence,liwc_results,number,verb,negate,ppron,...,motion,space,time,work,achieve,leisure,home,money,relig,death
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,1,18,5,1,"{'cogmech': 0.15789473684210525, 'certain': 0....",0.0,0.0,0.0,0.0,...,0.052632,0.157895,0.0,0.0,0.0,0.052632,0.0,0.0,0.0,0.0
1,,1,0,0,0,{},0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# Remove the raw "liwc_results" column (shown in the preview as a dict-like
# value per row); the remaining numeric columns are kept.
# NOTE(review): re-running this cell raises KeyError because the column is
# already gone after the first run.
df2 = df2.drop(columns=["liwc_results"])
df2.head(n=2)

Unnamed: 0,clean_title,label,num_words,num_special_char,num_sentence,number,verb,negate,ppron,swear,...,motion,space,time,work,achieve,leisure,home,money,relig,death
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,1,18,5,1,0.0,0.0,0.0,0.0,0.0,...,0.052632,0.157895,0.0,0.0,0.0,0.052632,0.0,0.0,0.0,0.0
1,,1,0,0,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Combine the sentiment table (df) with the linguistic-feature table (df2).
#
# Keep only the df2 columns that df does not already have. Selecting by
# position (`df2.iloc[:, 2:]`) skipped "Unnamed: 0" and "clean_title" but kept
# "label", so the combined frame ended up with two "label" columns and
# `combined_df['label']` returned a two-column DataFrame instead of a Series.
df2_selected = df2.loc[:, ~df2.columns.isin(df.columns)]

# Concatenate df and df2_selected horizontally (rows are matched on the index)
combined_df = pd.concat([df, df2_selected], axis=1)

combined_df.head(2)

Unnamed: 0.1,Unnamed: 0,clean_content,clean_title,clean_text,label,titletext_blob_polarity,titletext_blob_subjectivity,title_sentiment_label,content_blob_polarity,content_subjectivity,...,motion,space,time,work,achieve,leisure,home,money,relig,death
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1,0.08,0.32,1,0.034641,0.296824,...,0.052632,0.157895,0.0,0.0,0.0,0.052632,0.0,0.0,0.0,0.0
1,1,Did they post their votes for Hillary already?,,Did they post their votes for Hillary already?,1,0.0,0.0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# Discard the "Unnamed: 0" column (presumably an index saved into the CSV —
# TODO confirm). Not idempotent: a second run raises KeyError.
combined_df = combined_df.drop(columns=["Unnamed: 0"])

In [7]:
#combined_df.to_csv("WELFake_combinedLA&SAfeatures.csv")

In [8]:
# Preview the combined frame after the column drop.
combined_df.head(n=2)

Unnamed: 0,clean_content,clean_title,clean_text,label,titletext_blob_polarity,titletext_blob_subjectivity,title_sentiment_label,content_blob_polarity,content_subjectivity,content_sentiment_label,...,motion,space,time,work,achieve,leisure,home,money,relig,death
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1,0.08,0.32,1,0.034641,0.296824,1,...,0.052632,0.157895,0.0,0.0,0.0,0.052632,0.0,0.0,0.0,0.0
1,Did they post their votes for Hillary already?,,Did they post their votes for Hillary already?,1,0.0,0.0,0,0.0,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Get word vectors for model building

In [9]:
# Build bag-of-words (raw term count) features from the article content.
# Note: this uses CountVectorizer on `clean_content`, not TF-IDF on the title.
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
# CountVectorizer rejects NaN documents, so missing content is treated as an
# empty string; this adds no tokens and leaves the vocabulary unchanged.
X_text = vectorizer.fit_transform(df['clean_content'].fillna(""))

## Split the dataset into Train (80%) and Test (20%) sets

In [10]:
# Fraction of the samples held out for the test set; passed as `test_size`
# to train_test_split in the next cell (0.2 -> 80% train / 20% test).
test_ratio = 0.2

In [11]:
from sklearn.model_selection import train_test_split

# Split the word-count features and labels into training and test sets.
# The labels are taken from `df`, the same frame X_text was vectorised from, so
# features and labels are row-aligned and `y` is always a 1-D Series.
# (`combined_df` is a column-wise concat and can carry "label" more than once,
# in which case `combined_df['label']` is a DataFrame, not a Series.)
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(
    X_text, y, test_size=test_ratio, random_state=42
)

## Building Random Forest base model with only word vectors (CountVectorizer only)

In [None]:
import time

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Base estimator for the search. The seed is fixed so that repeated runs of
# the grid search select the same model (an unseeded forest is stochastic).
rf_classifier = RandomForestClassifier(random_state=42)

# Parameter grid for the grid search (arbitrary numbers are selected first,
# based on other projects): 2 * 3 * 1 * 3 = 18 candidates, each fitted 5 times.
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [5, 10, 15],
    'max_features': ['sqrt'],
    'min_samples_split': [2, 5, 10]  # Adjusted for classification
}

# 5-fold cross-validated grid search, selecting on accuracy
rfgrid_search_classifier = GridSearchCV(rf_classifier, param_grid, cv=5, scoring='accuracy')

# Time the search with a monotonic clock (unaffected by system clock changes)
start_time = time.perf_counter()

# Fit the grid search on the training split only
rfgrid_search_classifier.fit(X_train, y_train)

end_time = time.perf_counter()

print(f"Total runtime of the grid search is {(end_time - start_time)/60} minutes.")

# Best estimator found by the search (refit on the full training split by GridSearchCV)
best_rf_classifier = rfgrid_search_classifier.best_estimator_


In [None]:
# Evaluate the best Random Forest on the TRAINING split (used to compare
# against the test-split metrics and spot over-fitting).
RF_y_train_pred = best_rf_classifier.predict(X_train)

# Positive-class probabilities, computed once and shared by both ranking metrics
RF_train_proba = best_rf_classifier.predict_proba(X_train)[:, 1]

# Threshold-based metrics use the hard predictions
RF_accuracy = accuracy_score(y_train, RF_y_train_pred)
RF_precision = precision_score(y_train, RF_y_train_pred)
RF_recall = recall_score(y_train, RF_y_train_pred)
RF_f1 = f1_score(y_train, RF_y_train_pred)
# Ranking metrics use the true labels and the predicted probabilities
RF_roc_auc = roc_auc_score(y_train, RF_train_proba)
RF_average_precision = average_precision_score(y_train, RF_train_proba)

# Print the metrics
print("RF TRAIN Accuracy:", RF_accuracy)
print("RF TRAIN Precision:", RF_precision)
print("RF TRAIN Recall:", RF_recall)
print("RF TRAIN F1 Score:", RF_f1)
print("RF TRAIN ROC AUC:", RF_roc_auc)
print("RF TRAIN Average Precision:", RF_average_precision)

# Print the best hyperparameters found by grid search
print("Best TRAIN Hyperparameters:", rfgrid_search_classifier.best_params_)

In [None]:
# Score the tuned Random Forest on the held-out test split.
RF_y_test_pred = best_rf_classifier.predict(X_test)
RF_test_proba = best_rf_classifier.predict_proba(X_test)[:, 1]

# Hard-label metrics
RF_accuracy = accuracy_score(y_test, RF_y_test_pred)
RF_precision = precision_score(y_test, RF_y_test_pred)
RF_recall = recall_score(y_test, RF_y_test_pred)
RF_f1 = f1_score(y_test, RF_y_test_pred)
# Probability-based metrics
RF_roc_auc = roc_auc_score(y_test, RF_test_proba)
RF_average_precision = average_precision_score(y_test, RF_test_proba)

# Report every metric, one per line, in a fixed order
rf_test_report = (
    ("RF TEST Accuracy:", RF_accuracy),
    ("RF TEST Precision:", RF_precision),
    ("RF TEST Recall:", RF_recall),
    ("RF TEST F1 Score:", RF_f1),
    ("RF TEST ROC AUC:", RF_roc_auc),
    ("RF TEST Average Precision:", RF_average_precision),
)
for report_label, report_value in rf_test_report:
    print(report_label, report_value)

# Hyperparameters chosen by the grid search
print("Best Hyperparameters:", rfgrid_search_classifier.best_params_)

In [None]:
# import joblib

# # Save the best Random Forest classifier model to a file
# model_filename = 'best_rf_classifier_model.pkl'
# joblib.dump(best_rf_classifier, model_filename)


In [1]:
import joblib

# Load a previously saved Random Forest model from disk (presumably so the
# grid search above does not have to be re-run — TODO confirm).
# NOTE(review): joblib.load unpickles the file and can execute arbitrary code;
# only load model files that come from a trusted source.
# NOTE(review): the commented-out save cell writes 'best_rf_classifier_model.pkl',
# while this loads 'best_rf_classifier_modelCV.pkl' — confirm the file name.
loaded_model = joblib.load('best_rf_classifier_modelCV.pkl')


In [29]:
# Show the terms the loaded Random Forest relies on most.
feature_importances = loaded_model.feature_importances_

# Feature indices ordered from most to least important
sorted_indices = feature_importances.argsort()[::-1]

# Vocabulary of the fitted CountVectorizer (raw term counts, not TF-IDF).
# NOTE(review): assumes the loaded model was trained on features produced by
# this same vocabulary, so index i maps to feature_names[i] — confirm.
feature_names = vectorizer.get_feature_names_out()

# Print the top 10 features, their importances, and corresponding terms.
# Slicing (rather than indexing 0..9) stays safe if fewer than 10 features exist.
for feature_index in sorted_indices[:10]:
    importance = feature_importances[feature_index]
    term = feature_names[feature_index]
    print(f"RF Feature {feature_index}: Importance = {importance}, Term: {term}")

RF Feature 144234: Importance = 0.0396275754684096, Term: reuters
RF Feature 148600: Importance = 0.025205228819521988, Term: said
RF Feature 180731: Importance = 0.018501967073902573, Term: via
RF Feature 62687: Importance = 0.01708029507809903, Term: featured
RF Feature 183237: Importance = 0.01496628061846447, Term: washington
RF Feature 83088: Importance = 0.012004692643646873, Term: image
RF Feature 189096: Importance = 0.011272999060868734, Term: you
RF Feature 140495: Importance = 0.0087744336146004, Term: re
RF Feature 180896: Importance = 0.008530623714421509, Term: video
RF Feature 69953: Importance = 0.007941200748327297, Term: getty


In [28]:
# Score the loaded Random Forest on the training split.
train_predictions = loaded_model.predict(X_train)
train_positive_proba = loaded_model.predict_proba(X_train)[:, 1]

# Hard-label metrics
train_accuracy = accuracy_score(y_train, train_predictions)
train_precision = precision_score(y_train, train_predictions)
train_recall = recall_score(y_train, train_predictions)
train_f1 = f1_score(y_train, train_predictions)
# Probability-based metrics
train_RF_roc_auc = roc_auc_score(y_train, train_positive_proba)
train_RF_average_precision = average_precision_score(y_train, train_positive_proba)

# Report every metric, one per line, in a fixed order
rf_train_report = (
    ("RF TRAIN Accuracy:", train_accuracy),
    ("RF TRAIN Precision:", train_precision),
    ("RF TRAIN Recall:", train_recall),
    ("RF TRAIN F1 Score:", train_f1),
    ("RF TRAIN ROC AUC:", train_RF_roc_auc),
    ("RF TRAIN Average Precision:", train_RF_average_precision),
)
for report_label, report_value in rf_train_report:
    print(report_label, report_value)

RF TRAIN Accuracy: 0.9248849971753692
RF TRAIN Precision: 0.9668530757498729
RF TRAIN Recall: 0.8609715242881072
RF TRAIN F1 Score: 0.9108455662252448
RF TRAIN ROC AUC: 0.9865102615933201
RF TRAIN Average Precision: 0.9838438596166958


In [26]:
# Evaluate the loaded Random Forest on the held-out test split.
# `predictions` is computed here so the cell does not depend on a variable
# left over from an earlier (now missing) cell.
predictions = loaded_model.predict(X_test)

# Positive-class probabilities, computed once for both ranking metrics
test_positive_proba = loaded_model.predict_proba(X_test)[:, 1]

# Calculate classification metrics on the test predictions
accuracy = accuracy_score(y_test, predictions)
precision = precision_score(y_test, predictions)
recall = recall_score(y_test, predictions)
f1 = f1_score(y_test, predictions)
RF_roc_auc = roc_auc_score(y_test, test_positive_proba)
RF_average_precision = average_precision_score(y_test, test_positive_proba)

# Print the metrics (all of them are TEST-split values)
print("RF TEST Accuracy:", accuracy)
print("RF TEST Precision:", precision)
print("RF TEST Recall:", recall)
print("RF TEST F1 Score:", f1)
print("RF TEST ROC AUC:", RF_roc_auc)
print("RF TEST Average Precision:", RF_average_precision)



RF TEST Accuracy: 0.891534178032443
RF TEST Precision: 0.9513303049967553
RF TEST Recall: 0.7971723762914628
RF TEST F1 Score: 0.8674556213017751
RF TRAIN ROC AUC: 0.9716839990277362
RF TRAIN Average Precision: 0.9674760567854952


## Build LightGBM base model with only word vectors (CountVectorizer only)

In [33]:
import time

import lightgbm as lgb
# Imported here as well so this cell does not depend on the Random Forest
# grid-search cell having been executed first.
from sklearn.model_selection import GridSearchCV

# Create a LightGBM classifier with default settings as the base estimator
lgb_classifier = lgb.LGBMClassifier()

# Parameter grid for the grid search (arbitrary numbers are selected first,
# based on other projects): 2 * 2 * 2 * 2 = 16 candidates, each fitted 5 times.
param_grid = {
    'n_estimators': [200,230],
    'max_depth': [15, 20],
    'learning_rate': [0.2, 0.3],
    'num_leaves': [20, 30]
}

# 5-fold cross-validated grid search, selecting on accuracy
lgbgrid_search_classifier = GridSearchCV(lgb_classifier, param_grid, cv=5, scoring='accuracy')

# Time the search with a monotonic clock (unaffected by system clock changes)
start_time = time.perf_counter()

# Convert the input data to np.float32
# (presumably because LightGBM expects floating-point input — TODO confirm)
X_train = X_train.astype(np.float32)

# Fit the grid search on the training split only
lgbgrid_search_classifier.fit(X_train, y_train)

end_time = time.perf_counter()

print(f"Total runtime of the grid search is {(end_time - start_time)/60} minutes.")

# Best estimator found by the search
best_lgb_classifier = lgbgrid_search_classifier.best_estimator_

Total runtime of the grid search is 17.22394420703252 minutes.


In [None]:
# pip install --upgrade pandas "dask[complete]"
# if encounter error when running lgbm due to dask and pandas incompatibility. uncomment above and install if needed.

In [34]:
# Make predictions on the TRAIN data using the best classifier
lgb_y_train_pred = best_lgb_classifier.predict(X_train)

# Calculate classification metrics
lgb_accuracy = accuracy_score(y_train, lgb_y_train_pred)
lgb_precision = precision_score(y_train, lgb_y_train_pred)
lgb_recall = recall_score(y_train, lgb_y_train_pred)
lgb_f1 = f1_score(y_train, lgb_y_train_pred)
lgb_roc_auc = roc_auc_score(lgb_y_train_pred, best_lgb_classifier.predict_proba(X_train)[:, 1])
lgb_average_precision = average_precision_score(lgb_y_train_pred, best_lgb_classifier.predict_proba(X_train)[:, 1])

# Print the metrics
print("LGB TRAIN Accuracy:", lgb_accuracy)
print("LGB TRAIN Precision:", lgb_precision)
print("LGB TRAIN Recall:", lgb_recall)
print("LGB TRAIN F1 Score:", lgb_f1)
print("LGB TRAIN ROC AUC:", lgb_roc_auc)
print("LGB TRAIN Average Precision:", lgb_average_precision)

# Print the best hyperparameters found by grid search
print("Best Hyperparameters:", lgbgrid_search_classifier.best_params_)

LGB TRAIN Accuracy: 1.0
LGB TRAIN Precision: 1.0
LGB TRAIN Recall: 1.0
LGB TRAIN F1 Score: 1.0
LGB TRAIN ROC AUC: 1.0
LGB TRAIN Average Precision: 1.0
Best Hyperparameters: {'learning_rate': 0.2, 'max_depth': 20, 'n_estimators': 230, 'num_leaves': 30}


In [22]:
# Convert the input data to np.float32
X_test = X_test.astype(np.float32)

# Make predictions on the TEST data using the best classifier
lgb_y_test_pred = lgbloaded_model.predict(X_test)

# Calculate classification metrics
test_accuracy = accuracy_score(y_test, lgb_y_test_pred)
test_precision = precision_score(y_test, lgb_y_test_pred)
test_recall = recall_score(y_test, lgb_y_test_pred)
test_f1 = f1_score(y_test, lgb_y_test_pred)
test_roc_auc = roc_auc_score(lgb_y_test_pred, lgbloaded_model.predict_proba(X_test)[:, 1])
test_average_precision = average_precision_score(lgb_y_test_pred, lgbloaded_model.predict_proba(X_test)[:, 1])

# Print the metrics
print("LGB TEST Accuracy:", test_accuracy)
print("LGB TEST Precision:", test_precision)
print("LGB TEST Recall:", test_recall)
print("LGB TEST F1 Score:", test_f1)
print("LGB TEST ROC AUC:", test_roc_auc)
print("LGB TEST Average Precision:", test_average_precision)

# # Print the best hyperparameters found by grid search
# print("Best Hyperparameters:", lgbgrid_search_classifier.best_params_)

LGB TEST Accuracy: 0.9817609555322412
LGB TEST Precision: 0.9786502623484712
LGB TEST Recall: 0.9804241435562806
LGB TEST F1 Score: 0.979536399855125
LGB TEST ROC AUC: 1.0
LGB TEST Average Precision: 1.0


In [14]:
import joblib

# # Save the best LightGBM classifier model to a file
# joblib.dump(best_lgb_classifier, 'best_lgb_classifier_model.pkl')

# Load a previously saved LightGBM model from disk.
# NOTE(review): joblib.load unpickles the file and can execute arbitrary code;
# only load model files that come from a trusted source.
# NOTE(review): the commented-out save above writes 'best_lgb_classifier_model.pkl',
# while this loads 'best_lgb_classifier_modelCV.pkl' — confirm the file name.
lgbloaded_model = joblib.load('best_lgb_classifier_modelCV.pkl')



In [30]:
# Get feature importances
lgbfeature_importances = lgbloaded_model.feature_importances_

# Get the indices that would sort the feature importances
lgbsorted_indices = lgbfeature_importances.argsort()[::-1]

# Get the feature names (vocabulary) corresponding to the CountVectorizer features
lgbfeature_names = vectorizer.get_feature_names_out()

# Print the top 10 features, their importances, and corresponding terms
for i in range(10):
    lgbfeature_index = lgbsorted_indices[i]
    lgbimportance = lgbfeature_importances[lgbfeature_index]
    lgbterm = lgbfeature_names[lgbfeature_index]
    print(f"Feature {lgbfeature_index}: Importance = {lgbimportance}, Term: {lgbterm}")

Feature 86791: Importance = 117, Term: is
Feature 148600: Importance = 83, Term: said
Feature 144234: Importance = 78, Term: reuters
Feature 29525: Importance = 70, Term: breitbart
Feature 180731: Importance = 69, Term: via
Feature 2584: Importance = 53, Term: 2016
Feature 189040: Importance = 52, Term: york
Feature 78799: Importance = 51, Term: hillary
Feature 178779: Importance = 51, Term: us
Feature 169085: Importance = 40, Term: the
