## FakeNews - Model Building On Content (TD-IDF & Sentiment Features)

## Read CSV files, drop columns, combine dataframes and pre-model preparation

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime as dt
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, average_precision_score

In [2]:
df = pd.read_csv("WELFake_sentiment_features.csv")
df.head(2)

Unnamed: 0.1,Unnamed: 0,clean_content,clean_title,clean_text,label,titletext_blob_polarity,titletext_blob_subjectivity,title_sentiment_label,content_blob_polarity,content_subjectivity,content_sentiment_label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1,0.08,0.32,1,0.034641,0.296824,1
1,1,Did they post their votes for Hillary already?,,Did they post their votes for Hillary already?,1,0.0,0.0,0,0.0,0.0,0


In [3]:
df2 = pd.read_csv("welfake_content_linguistic_analysis_normalized.csv")
df2.head(2)

Unnamed: 0,clean_content,label,num_words,num_special_char,num_sentence,liwc_results,number,ppron,swear,social,...,time,work,achieve,leisure,home,money,relig,death,verb,negate
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,1,889,187,47,"{'cogmech': 0.10741138560687433, 'certain': 0....",0.017186,0.085929,0.006445,0.167562,...,0.044039,0.009667,0.011815,0.019334,0.001074,0.001074,0.001074,0.017186,0.121375,0.001074
1,Did they post their votes for Hillary already?,1,8,1,1,"{'verb': 0.125, 'funct': 0.5, 'auxverb': 0.125...",0.0,0.25,0.0,0.25,...,0.125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125,0.0


In [4]:
df2 = df2.drop("liwc_results", axis=1)
df2.head(2)

Unnamed: 0,clean_content,label,num_words,num_special_char,num_sentence,number,ppron,swear,social,family,...,time,work,achieve,leisure,home,money,relig,death,verb,negate
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,1,889,187,47,0.017186,0.085929,0.006445,0.167562,0.007519,...,0.044039,0.009667,0.011815,0.019334,0.001074,0.001074,0.001074,0.017186,0.121375,0.001074
1,Did they post their votes for Hillary already?,1,8,1,1,0.0,0.25,0.0,0.25,0.0,...,0.125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125,0.0


In [5]:
# combine the dataframes
df2_selected = df2.iloc[:, 2:]

# Concatenate df1 and df2_selected horizontally
combined_df = pd.concat([df, df2_selected], axis=1)

combined_df.head(2)

Unnamed: 0.1,Unnamed: 0,clean_content,clean_title,clean_text,label,titletext_blob_polarity,titletext_blob_subjectivity,title_sentiment_label,content_blob_polarity,content_subjectivity,...,time,work,achieve,leisure,home,money,relig,death,verb,negate
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1,0.08,0.32,1,0.034641,0.296824,...,0.044039,0.009667,0.011815,0.019334,0.001074,0.001074,0.001074,0.017186,0.121375,0.001074
1,1,Did they post their votes for Hillary already?,,Did they post their votes for Hillary already?,1,0.0,0.0,0,0.0,0.0,...,0.125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125,0.0


In [6]:
combined_df= combined_df.drop("Unnamed: 0", axis=1)

In [7]:
#combined_df.to_csv("WELFake_combinedLA&SAfeatures.csv")

In [7]:
combined_df.head(2)

Unnamed: 0,clean_content,clean_title,clean_text,label,titletext_blob_polarity,titletext_blob_subjectivity,title_sentiment_label,content_blob_polarity,content_subjectivity,content_sentiment_label,...,time,work,achieve,leisure,home,money,relig,death,verb,negate
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1,0.08,0.32,1,0.034641,0.296824,1,...,0.044039,0.009667,0.011815,0.019334,0.001074,0.001074,0.001074,0.017186,0.121375,0.001074
1,Did they post their votes for Hillary already?,,Did they post their votes for Hillary already?,1,0.0,0.0,0,0.0,0.0,0,...,0.125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125,0.0


## Get word vectors for model building

In [8]:
#Using Tfidf on clean_title first 
from sklearn.feature_extraction.text import TfidfVectorizer

# # log the time
# import time
# start_time = time.time()

tfidf_vectorizer = TfidfVectorizer()
X = tfidf_vectorizer.fit_transform(combined_df['clean_content'])

# # log the time
# end_time = time.time()

# print(f"Total runtime of the vectoriser is {(end_time - start_time)/60} minutes.")

In [9]:
## Combining with Sentiment Analysis features éxtracted from 'content' column
from scipy.sparse import hstack
sentiment_feat = combined_df.iloc[:,7:10]
sentiment_feat.head(2)

Unnamed: 0,content_blob_polarity,content_subjectivity,content_sentiment_label
0,0.034641,0.296824,1
1,0.0,0.0,0


## Combine the TF-IDF and Sentiment Features (SF)

In [10]:
X_SF = hstack([X, sentiment_feat])
X_SF

<61955x192117 sparse matrix of type '<class 'numpy.float64'>'
	with 16460600 stored elements in COOrdinate format>

In [11]:
# Convert X_LF to a csr_matrix
X_SF_csr = X_SF.tocsr()

# Split X_LF into two parts: one containing TF-IDF features and the other containing linguistic features
tfidf_part = X_SF_csr[:, :X.shape[1]]
sentiment_features = X_SF_csr[:, X.shape[1]:]

## Split the dataset into Train & Test - 80%, Test - 20% ratios

In [12]:
test_ratio = 0.2

In [13]:
from sklearn.model_selection import train_test_split
# Split the selected features and labels into training and test sets
y = combined_df['label']
X_train, X_test, y_train, y_test = train_test_split(X_SF, y, test_size=(test_ratio), random_state=42)

## Building Random Forest model with TF-IDF & Sentiment Features

In [18]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Create a RandomForestClassifier
rf_classifier = RandomForestClassifier()

# Define the parameter grid for grid search (arbitrary numbers are selected first, based on other projects)
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [5, 10, 15],
    'max_features': ['sqrt'],
    'min_samples_split': [2, 5, 10]  # Adjusted for classification
}

# Create a GridSearchCV object
rfgrid_search_classifier = GridSearchCV(rf_classifier, param_grid, cv=5, scoring='accuracy') 

# log the time
import time
start_time = time.time()

# Fit the grid search to your data
rfgrid_search_classifier.fit(X_train, y_train)  

# log the time
end_time = time.time()

print(f"Total runtime of the grid search is {(end_time - start_time)/60} minutes.")

# Get the best classifier from grid search
best_rf_classifier = rfgrid_search_classifier.best_estimator_


Total runtime of the grid search is 31.741450122992198 minutes.


In [19]:
# Make predictions on the test data using the best classifier
RF_y_train_pred = best_rf_classifier.predict(X_train)

# Calculate classification metrics
RF_accuracy = accuracy_score(y_train, RF_y_train_pred)
RF_precision = precision_score(y_train, RF_y_train_pred)
RF_recall = recall_score(y_train, RF_y_train_pred)
RF_f1 = f1_score(y_train, RF_y_train_pred)
RF_roc_auc = roc_auc_score(y_train, best_rf_classifier.predict_proba(X_train)[:, 1])
RF_average_precision = average_precision_score(y_train, best_rf_classifier.predict_proba(X_train)[:, 1])

# Print the metrics
print("RF TRAIN Accuracy:", RF_accuracy)
print("RF TRAIN Precision:", RF_precision)
print("RF TRAIN Recall:", RF_recall)
print("RF TRAIN F1 Score:", RF_f1)
print("RF TRAIN ROC AUC:", RF_roc_auc)
print("RF TRAIN Average Precision:", RF_average_precision)

# Print the best hyperparameters found by grid search
print("Best TRAIN Hyperparameters:", rfgrid_search_classifier.best_params_)

RF TRAIN Accuracy: 0.913647001856186
RF TRAIN Precision: 0.9691270217586007
RF TRAIN Recall: 0.8327674408076419
RF TRAIN F1 Score: 0.8957876795714634
RF TRAIN ROC AUC: 0.9845215782904225
RF TRAIN Average Precision: 0.9816756631062727
Best TRAIN Hyperparameters: {'max_depth': 15, 'max_features': 'sqrt', 'min_samples_split': 2, 'n_estimators': 200}


In [20]:
# Make predictions on the test data using the best classifier
RF_y_test_pred = best_rf_classifier.predict(X_test)

# Calculate classification metrics
RF_accuracy = accuracy_score(y_test, RF_y_test_pred)
RF_precision = precision_score(y_test, RF_y_test_pred)
RF_recall = recall_score(y_test, RF_y_test_pred)
RF_f1 = f1_score(y_test, RF_y_test_pred)
RF_roc_auc = roc_auc_score(y_test, best_rf_classifier.predict_proba(X_test)[:, 1])
RF_average_precision = average_precision_score(y_test, best_rf_classifier.predict_proba(X_test)[:, 1])

# Print the metrics
print("RF TEST Accuracy:", RF_accuracy)
print("RF TEST Precision:", RF_precision)
print("RF TEST Recall:", RF_recall)
print("RF TEST F1 Score:", RF_f1)
print("RF TEST ROC AUC:", RF_roc_auc)
print("RF TEST Average Precision:", RF_average_precision)

# Print the best hyperparameters found by grid search
print("Best Hyperparameters:", rfgrid_search_classifier.best_params_)

RF TEST Accuracy: 0.8713478611783696
RF TEST Precision: 0.9361888111888111
RF TEST Recall: 0.7669172932330827
RF TEST F1 Score: 0.843141113953946
RF TEST ROC AUC: 0.9593736377561018
RF TEST Average Precision: 0.9531781596066293
Best Hyperparameters: {'max_depth': 15, 'max_features': 'sqrt', 'min_samples_split': 2, 'n_estimators': 200}


In [15]:
from sklearn.ensemble import RandomForestClassifier

# Define the best parameters
best_params = {
    'n_estimators': 200,
    'max_depth': 15,
    'max_features': 'sqrt',
    'min_samples_split': 2
}

# Create a new RandomForestClassifier with the best parameters
best_rf_classifier = RandomForestClassifier(**best_params)

# log the time
import time
start_time = time.time()

# Fit the model to training data
best_rf_classifier.fit(X_train, y_train)

# log the time
end_time = time.time()

print(f"Total runtime of training the model is {(end_time - start_time)/60} minutes.")

Total runtime of training the model is 0.9795362750689188 minutes.


In [19]:
# Get feature importances
feature_importances = best_rf_classifier.feature_importances_

# Get the indices that would sort the feature importances
sorted_indices = feature_importances.argsort()[::-1]

# Define the number of top features you want to retrieve
top_n = 10

# Get the feature names (vocabulary) corresponding to the TF-IDF features
feature_names = tfidf_vectorizer.get_feature_names_out() 

# Access the known linguistic feature names from your DataFrame
sentiment_feature_names = list(sentiment_feat.columns)

# Print the top 20 features, their importances, and corresponding terms
for i in range(top_n):
    feature_index = sorted_indices[i]
    importance = feature_importances[feature_index]
    
    # Determine if the feature is from TF-IDF or linguistic features
    if feature_index < X.shape[1]:
        term = feature_names[feature_index]
        feature_type = "TF-IDF"
    else:
        term = sentiment_feature_names[feature_index - X.shape[1]]
        feature_type = "Sentiment"
    
    print(f"Feature {feature_index}: Importance = {importance}, Type: {feature_type}, Term: {term}")

Feature 148600: Importance = 0.031118232576071877, Type: TF-IDF, Term: said
Feature 144234: Importance = 0.021202841411258357, Type: TF-IDF, Term: reuters
Feature 180731: Importance = 0.015141207357168698, Type: TF-IDF, Term: via
Feature 62687: Importance = 0.015059166196010623, Type: TF-IDF, Term: featured
Feature 83088: Importance = 0.012425107246414798, Type: TF-IDF, Term: image
Feature 124102: Importance = 0.011035994265768394, Type: TF-IDF, Term: on
Feature 83098: Importance = 0.009564039634126569, Type: TF-IDF, Term: images
Feature 169849: Importance = 0.008949749585721166, Type: TF-IDF, Term: this
Feature 183322: Importance = 0.008908385156204443, Type: TF-IDF, Term: watch
Feature 192115: Importance = 0.008273361735319071, Type: Sentiment, Term: content_subjectivity


## Save/load best RF model

In [16]:
import joblib

# # Save the best LightGBM classifier model to a file
# joblib.dump(best_rf_classifier, 'best_rf_classifier_modelTF-IDF_SF.pkl')

# Load the saved model
best_rf_classifier = joblib.load('best_rf_classifier_modelTF-IDF_SF.pkl')

['best_rf_classifier_modelTF-IDF_SF.pkl']

## Build LightGBM basemodel with TF-IDF & Sentiment Features

In [22]:
import lightgbm as lgb
# Create a LightGBM classifier
lgb_classifier = lgb.LGBMClassifier()

# Define the parameter grid for grid search (arbitrary numbers are selected first, based on other projects)
param_grid = {
    'n_estimators': [200,230],
    'max_depth': [15, 20],
    'learning_rate': [0.2, 0.3],
    'num_leaves': [20, 30]
}

# Create a GridSearchCV object
lgbgrid_search_classifier = GridSearchCV(lgb_classifier, param_grid, cv=5, scoring='accuracy')

# log the time
import time
start_time = time.time()

# Fit the grid search to your training data
lgbgrid_search_classifier.fit(X_train, y_train)

# log the time
end_time = time.time()

print(f"Total runtime of the grid search is {(end_time - start_time)/60} minutes.")

# Get the best classifier from grid search
best_lgb_classifier = lgbgrid_search_classifier.best_estimator_

Total runtime of the grid search is 114.76828529040019 minutes.


In [23]:
# pip install --upgrade pandas "dask[complete]"
# if encounter error when running lgbm due to dask and pandas incompatibility. uncomment above and install if needed.

In [24]:
# Make predictions on the TRAIN data using the best classifier
lgb_y_train_pred = best_lgb_classifier.predict(X_train)

# Calculate classification metrics
lgb_accuracy = accuracy_score(y_train, lgb_y_train_pred)
lgb_precision = precision_score(y_train, lgb_y_train_pred)
lgb_recall = recall_score(y_train, lgb_y_train_pred)
lgb_f1 = f1_score(y_train, lgb_y_train_pred)
lgb_roc_auc = roc_auc_score(lgb_y_train_pred, best_lgb_classifier.predict_proba(X_train)[:, 1])
lgb_average_precision = average_precision_score(lgb_y_train_pred, best_lgb_classifier.predict_proba(X_train)[:, 1])

# Print the metrics
print("LGB TRAIN Accuracy:", lgb_accuracy)
print("LGB TRAIN Precision:", lgb_precision)
print("LGB TRAIN Recall:", lgb_recall)
print("LGB TRAIN F1 Score:", lgb_f1)
print("LGB TRAIN ROC AUC:", lgb_roc_auc)
print("LGB TRAIN Average Precision:", lgb_average_precision)

# Print the best hyperparameters found by grid search
print("Best Hyperparameters:", lgbgrid_search_classifier.best_params_)

LGB TRAIN Accuracy: 1.0
LGB TRAIN Precision: 1.0
LGB TRAIN Recall: 1.0
LGB TRAIN F1 Score: 1.0
LGB TRAIN ROC AUC: 1.0
LGB TRAIN Average Precision: 1.0
Best Hyperparameters: {'learning_rate': 0.3, 'max_depth': 20, 'n_estimators': 230, 'num_leaves': 30}


In [25]:
# Make predictions on the TEST data using the best classifier
lgb_y_test_pred = best_lgb_classifier.predict(X_test)

# Calculate classification metrics
test_accuracy = accuracy_score(y_test, lgb_y_test_pred)
test_precision = precision_score(y_test, lgb_y_test_pred)
test_recall = recall_score(y_test, lgb_y_test_pred)
test_f1 = f1_score(y_test, lgb_y_test_pred)
test_roc_auc = roc_auc_score(lgb_y_test_pred, best_lgb_classifier.predict_proba(X_test)[:, 1])
test_average_precision = average_precision_score(lgb_y_test_pred, best_lgb_classifier.predict_proba(X_test)[:, 1])

# Print the metrics
print("LGB TEST Accuracy:", test_accuracy)
print("LGB TEST Precision:", test_precision)
print("LGB TEST Recall:", test_recall)
print("LGB TEST F1 Score:", test_f1)
print("LGB TEST ROC AUC:", test_roc_auc)
print("LGB TEST Average Precision:", test_average_precision)

# Print the best hyperparameters found by grid search
print("Best Hyperparameters:", lgbgrid_search_classifier.best_params_)

LGB TEST Accuracy: 0.9785310734463277
LGB TEST Precision: 0.9777298850574713
LGB TEST Recall: 0.974579305406373
LGB TEST F1 Score: 0.97615205307513
LGB TEST ROC AUC: 1.0
LGB TEST Average Precision: 1.0
Best Hyperparameters: {'learning_rate': 0.3, 'max_depth': 20, 'n_estimators': 230, 'num_leaves': 30}


In [20]:
import lightgbm as lgb
# Create a LightGBM classifier
lgb_classifier = lgb.LGBMClassifier()

# Define the parameter grid for grid search (arbitrary numbers are selected first, based on other projects)
param_grid = {
    'n_estimators': 230,
    'max_depth': 20,
    'learning_rate': 0.3,
    'num_leaves': 30
}

# log the time
import time
start_time = time.time()

# Fit the grid search to your training data
lgb_classifier.fit(X_train, y_train)

# log the time
end_time = time.time()

print(f"Total runtime of the training is {(end_time - start_time)/60} minutes.")

Total runtime of the training is 1.172638463973999 minutes.


In [22]:
# Get feature importances
feature_importances = lgb_classifier.feature_importances_

# Get the indices that would sort the feature importances
sorted_indices = feature_importances.argsort()[::-1]

# Define the number of top features you want to retrieve
top_n = 10

# Get the feature names (vocabulary) corresponding to the TF-IDF features
feature_names = tfidf_vectorizer.get_feature_names_out() 

# Access the known linguistic feature names from your DataFrame
sentiment_feature_names = list(sentiment_feat.columns)

# Print the top 20 features, their importances, and corresponding terms
for i in range(top_n):
    feature_index = sorted_indices[i]
    importance = feature_importances[feature_index]
    
    # Determine if the feature is from TF-IDF or linguistic features
    if feature_index < X.shape[1]:
        term = feature_names[feature_index]
        feature_type = "TF-IDF"
    else:
        term = sentiment_feature_names[feature_index - X.shape[1]]
        feature_type = "Sentiment"
    
    print(f"Feature {feature_index}: Importance = {importance}, Type: {feature_type}, Term: {term}")

Feature 86791: Importance = 92, Type: TF-IDF, Term: is
Feature 180731: Importance = 85, Type: TF-IDF, Term: via
Feature 144234: Importance = 72, Type: TF-IDF, Term: reuters
Feature 29525: Importance = 69, Type: TF-IDF, Term: breitbart
Feature 148600: Importance = 67, Type: TF-IDF, Term: said
Feature 78799: Importance = 62, Type: TF-IDF, Term: hillary
Feature 178779: Importance = 59, Type: TF-IDF, Term: us
Feature 2584: Importance = 58, Type: TF-IDF, Term: 2016
Feature 65247: Importance = 46, Type: TF-IDF, Term: follow
Feature 123075: Importance = 46, Type: TF-IDF, Term: october


## Save/load Best LGBM model

In [21]:
import joblib

# # Save the best LightGBM classifier model to a file
# joblib.dump(lgb_classifier, 'lgb_classifier_modelTF-IDF_SF.pkl')

# Load the saved model
lgb_classifier = joblib.load('lgb_classifier_modelTF-IDF_SF.pkl')

['lgb_classifier_modelTF-IDF_SF.pkl']