<a href="https://colab.research.google.com/github/HenriettePlane/fake_or_real/blob/main/models_building.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import dataset

In [1]:
# Mount Google Drive so files stored there are reachable from this Colab runtime
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [2]:
import pandas as pd

# Import dataset
# If needed, please adjust the path according to your Google Drive folder path
file_path = '/content/drive/MyDrive/Colab Notebooks/preprocessed_df.csv'

# Load the file into a DataFrame and set the first ('Unnamed: 0') column as the index.
# pd.read_csv raises FileNotFoundError on its own, so no separate open() probe is needed.
try:
  df = pd.read_csv(file_path, index_col=0)

except FileNotFoundError:
  # Every later cell needs `df`; report the problem and stop here rather than
  # carrying on with `df` undefined.
  print('File not found.')
  raise

except Exception as e:
  print(f'An error occurred: {e}')
  raise

print('File opened successfully!')

# Display the DataFrame info.
# DataFrame.info() prints the summary itself and returns None, so wrapping it in
# print() only appended a stray "None" to the output.
df.info()

File opened successfully!
<class 'pandas.core.frame.DataFrame'>
Index: 44182 entries, 0 to 44181
Data columns (total 8 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   subject                          44182 non-null  object 
 1   label                            44182 non-null  int64  
 2   contractions missing apostrophe  44182 non-null  int64  
 3   url counts                       44182 non-null  int64  
 4   quote pairs                      44182 non-null  int64  
 5   cleaned text                     44182 non-null  object 
 6   sentiment                        44182 non-null  float64
 7   flesch_reading_ease              44182 non-null  float64
dtypes: float64(2), int64(4), object(2)
memory usage: 3.0+ MB
None


In [3]:
# Mount Google Drive at /content/drive.
# NOTE(review): the first cell of this notebook already performs the same mount, and the
# recorded output of this cell reports "Drive already mounted", so this call looks
# redundant — confirm before removing.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Note:
* label: 1 is fake news, 0 is true news
* contractions missing apostrophe: 1 is missing, 0 is not missing
* sentiment (compound_score): Ranges from -1 to 1. A negative score indicates negative sentiment, a positive score indicates positive sentiment, and 0 is neutral.
* flesch_reading_ease: There is no lower limit on the score, so a negative score is valid. Higher values indicate easier readability.  

In [4]:
# Preview the first 10 rows of df (rendered as the cell's output)
df.head(10)

Unnamed: 0,subject,label,contractions missing apostrophe,url counts,quote pairs,cleaned text,sentiment,flesch_reading_ease
0,News,1,1,0,0,donald trump wish american happy new year leav...,0.9678,43.69
1,News,1,1,0,0,house intelligence committee chairman devin nu...,0.63,18.42
2,News,1,1,1,0,friday revealed former milwaukee sheriff david...,-0.9933,44.0
3,News,1,1,4,0,christmas day donald trump announced would bac...,0.1316,33.92
4,News,1,0,0,0,pope francis used annual christmas day message...,0.9105,43.29
5,News,1,0,0,0,number case cop brutalizing killing people col...,-0.9899,44.51
6,News,1,1,0,0,donald trump spent good portion day golf club ...,-0.4966,25.86
7,News,1,1,0,0,wake yet another court decision derailed donal...,-0.9827,22.21
8,News,1,0,0,0,many people raised alarm regarding fact donald...,0.8807,4.68
9,News,1,1,0,0,might thought get break watching people kiss d...,0.977,36.15


In [5]:
# Drop the 'subject' column by name.
# Slicing by position (df.iloc[:, 1:]) removed whatever column happened to be first, so
# re-running the cell would strip 'label' next. Dropping by name with errors='ignore'
# makes the cell safe to run any number of times.
df = df.drop(columns=['subject'], errors='ignore')

# Bare expression instead of print() so the notebook renders the frame as a table
df.head(10)

   label  contractions missing apostrophe  url counts  quote pairs  \
0      1                                1           0            0   
1      1                                1           0            0   
2      1                                1           1            0   
3      1                                1           4            0   
4      1                                0           0            0   
5      1                                0           0            0   
6      1                                1           0            0   
7      1                                1           0            0   
8      1                                0           0            0   
9      1                                1           0            0   

                                        cleaned text  sentiment  \
0  donald trump wish american happy new year leav...     0.9678   
1  house intelligence committee chairman devin nu...     0.6300   
2  friday revealed former mi

# Create pipeline

In [6]:
# Import tools used for creating the pipeline and for tuning/evaluating the models using sklearn
from scipy.sparse import hstack
from sklearn import metrics
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder, FunctionTransformer
from sklearn.utils import shuffle

### Preprocessing data
Assign different steps to preprocess different data types

In [None]:
# This cell is commented out because the only nominal column (subject) is dropped
## Initialize the pipeline for Nominal features (Categorical)
# nom_cat_transformer = Pipeline([
#                         ('imputer', SimpleImputer(strategy='most_frequent')),
#                         ('encoder', OneHotEncoder(handle_unknown='ignore'))

# ])
# # Assign feature as column names
# nominal_features = ['subject']

# nom_cat_transformer

In [7]:
# Names of the numeric columns handled by the numeric pipeline
numeric_features = [
    'contractions missing apostrophe',
    'url counts',
    'quote pairs',
    'sentiment',
    'flesch_reading_ease',
]

# Pipeline for the numeric columns: a single StandardScaler step
numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])

# Show the pipeline as the cell output
numeric_transformer

In [8]:
# Import TfidfVectorizer for TF-IDF task
from sklearn.feature_extraction.text import TfidfVectorizer

# Name of the column that holds the text to vectorize (passed as a plain string)
text_features = 'cleaned text'

# TF-IDF settings collected in one place
tfidf_settings = {
    'min_df': 10,                          # skip terms found in fewer than 10 documents
    'max_df': .9,                          # skip terms found in more than 90% of documents
    'max_features': 1000,                  # cap on the vocabulary size
    'ngram_range': (1, 2),                 # unigrams and bigrams
    'token_pattern': r'\b[a-zA-Z]{3,}\b',  # tokens are runs of 3 or more letters
}

# Transformer for the text feature
text_transformer = TfidfVectorizer(**tfidf_settings)

### Split dataset into training and testing set

In [9]:
# Shuffle the dataset to ensure it is mixed well.
# The shuffled rows go into a new variable instead of overwriting `df`: with `df = shuffle(df)`
# every re-run of this cell reshuffled the already shuffled frame, so the train/test split
# changed depending on how many times the cell had been executed.
df_shuffled = shuffle(df, random_state=42)

# Define features and target variable
X = df_shuffled.drop(columns=['label'])  # Features
y = df_shuffled['label']                 # Target variable

# Split into train (80%) and test (20%) sets with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42)

### Build the ColumnTransformer

In [10]:
# Bundle the per-type transformers into one preprocessor.
# Nothing is fitted yet: this only declares which transformer handles which columns.
column_steps = [
    # ('nom_pipe', nom_cat_transformer, nominal_features),
    ('numeric_pipe', numeric_transformer, numeric_features),
    ('text_pipe', text_transformer, text_features),
]
preprocessor = ColumnTransformer(transformers=column_steps)

# Show the assembled preprocessor as the cell output
preprocessor

### Fit and Transform the Training and Test Sets

In [11]:
# Fit only on training data, then transform both sets.
# This is the step where the preprocessor defined earlier is actually run on the data.
# The order matters: the preprocessor must be fitted on X_train before X_test is transformed.

X_train_transformed = preprocessor.fit_transform(X_train) # fit & transform on X_train
X_test_transformed = preprocessor.transform(X_test) # transform only on X_test to prevent data leakage

# Build the Classifier Models

# Naive Bayes

In [None]:
# Creating a Naive Bayes Model
# credit to the tutorial from https://www.datacamp.com/datalab/w/204594a4-4343-4ffa-be7f-7f61700fc18d/edit for the correct steps to take
# import the model
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
# Fit the model on the training set only. The transformed matrix is sparse, so toarray()
# is used to turn it into a dense numpy array before it is passed to the model.
# NOTE(review): an earlier version of this comment mentioned a 10000-word vocabulary, but the
# TfidfVectorizer defined in this notebook sets max_features=1000 — confirm which setting
# produced the recorded results.
gnb.fit(X_train_transformed.toarray(), y_train)

In [None]:
# Predict labels for the test set with the fitted Naive Bayes model
# (the test matrix is densified with toarray(), the same way the training matrix was)
y_pred = gnb.predict(X_test_transformed.toarray())

In [None]:
# Import the scikit-learn metrics module used for all evaluation scores
from sklearn import metrics

# Label / scoring-function pairs, printed in this order for the Naive Bayes predictions
nb_scores = [
    ("Accuracy:", metrics.accuracy_score),
    ('Precision:', metrics.precision_score),
    ('Recall:', metrics.recall_score),
    ('F1:', metrics.f1_score),
]
for caption, score_fn in nb_scores:
    print(caption, score_fn(y_test, y_pred))

# Author's note: the metrics looked suspiciously good when using a vocab of 10k;
# the guess is that the model is severely overfitting with that large number of features

Accuracy: 0.9395722530270454
Precision: 0.9510002247696111
Recall: 0.9305036287662195
F1: 0.9406402845709204


In [None]:
# Tuning hyperparameters: variance smoothing
# (credit to: https://www.restack.io/p/hyperparameter-tuning-answer-naive-bayes-sklearn-cat-ai)
from sklearn.model_selection import GridSearchCV

# Candidate var_smoothing values to compare
param_grid = {
    'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6],
}

# 5-fold grid search over the candidates; no scoring argument is passed here
grid_search = GridSearchCV(estimator=gnb, param_grid=param_grid, cv=5)

# Run the search on the densified training matrix
grid_search.fit(X_train_transformed.toarray(), y_train)

# Show the winning setting
print(grid_search.best_params_)


{'var_smoothing': 1e-09}


In [None]:
# Tuning hyperparameters: prior probabilities
# Candidate prior pairs to compare
param_grid = {
    'priors': [
        [0.1, 0.9],
        [0.3, 0.7],
        [0.5, 0.5],
        [0.7, 0.3],
        [0.9, 0.1],
    ],
}

# 5-fold grid search scored by F1, run on the densified training matrix
grid = GridSearchCV(estimator=gnb, param_grid=param_grid, scoring='f1', cv=5)
grid.fit(X_train_transformed.toarray(), y_train)

# Keep the winning parameters and the estimator refitted with them
best_priors = grid.best_params_
best_model = grid.best_estimator_
print("Best priors:", best_priors)

# Score the best estimator on the test set
y_pred = best_model.predict(X_test_transformed.toarray())
f1 = metrics.f1_score(y_test, y_pred)
print("Test f1:", f1)

Best priors: {'priors': [0.3, 0.7]}
Test f1: 0.9404312069348744


# Support Vector machine

In [None]:
# Credit to the tutorial from https://www.datacamp.com/tutorial/svm-classification-scikit-learn-python for the correct steps to take
# Import the svm module
from sklearn import svm

# Create an SVM classifier with a linear kernel.
# NOTE(review): the name `clf` is reused further down for the decision tree, so cells that
# read `clf` depend on which of these cells ran last.
clf = svm.SVC(kernel='linear') # Linear Kernel

# Train the model on the transformed training set
clf.fit(X_train_transformed, y_train)

In [None]:
# Predict labels for the transformed test set with the fitted SVM (`clf`)
y_pred = clf.predict(X_test_transformed)

In [None]:
# Scores for the SVM predictions on the test set, keyed by the label that is printed
svm_report = {
    # Accuracy: how often is the classifier correct?
    "Accuracy": metrics.accuracy_score(y_test, y_pred),
    # Precision: what share of the tuples predicted positive are actually positive?
    "Precision": metrics.precision_score(y_test, y_pred),
    # Recall: what share of the actually positive tuples are predicted positive?
    "Recall": metrics.recall_score(y_test, y_pred),
    "F1": metrics.f1_score(y_test, y_pred),
}
for score_name, score_value in svm_report.items():
    print(f"{score_name}:", score_value)

# Author's note: this model also had suspiciously perfect scores when using a 10k vocab,
# attributed to overfitting when there are many sparse dimensions.
# Two options: reduce the size of the vocabulary (and thus the dimensions) or tune hyperparameters

Accuracy: 0.9927577232092339
Precision: 0.9951402694941462
Recall: 0.990763140532219
F1: 0.9929468811990302


In [None]:
# Tuning hyperparameters
# The search space is a list of two sub-grids rather than one flat grid:
#   * linear kernel: only C is searched. gamma is a coefficient of the 'rbf', 'poly' and
#     'sigmoid' kernels, so crossing it with the linear kernel just refitted the same model
#     once per gamma value (24 of the 48 original candidates).
#   * rbf kernel: C and gamma are searched together.
c_values = [0.1, 1, 10, 100]
param_grid = [
    {'kernel': ['linear'], 'C': c_values},
    {'kernel': ['rbf'], 'C': c_values, 'gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1]},
]

# Set up GridSearchCV: 5-fold CV scored by F1; n_jobs=-1 runs the fits on all available cores
grid = GridSearchCV(clf, param_grid, cv=5, scoring='f1', n_jobs=-1)

# Fit GridSearchCV
grid.fit(X_train_transformed, y_train)

# Report the outcome so the result of the search is visible in the notebook
print("Best Parameters:", grid.best_params_)
print("Best Cross-Validation Score:", grid.best_score_)

# Decision Tree

In [13]:
# credit to the tutorial from https://www.datacamp.com/tutorial/decision-tree-classification-python
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier

In [None]:
# Create Decision Tree classifier object.
# random_state is fixed (same seed as the shuffle/split above) because the tree permutes the
# features randomly at each split; without a seed the fitted tree and the metrics below can
# differ from run to run.
clf = DecisionTreeClassifier(random_state=42)

# Train Decision Tree classifier on the transformed training set
clf = clf.fit(X_train_transformed,y_train)

# Predict the response for the test dataset
y_pred = clf.predict(X_test_transformed)

In [None]:
# Compute the four scores for the decision tree predictions first, then print them
dt_accuracy = metrics.accuracy_score(y_test, y_pred)   # how often is the classifier correct?
dt_precision = metrics.precision_score(y_test, y_pred)
dt_recall = metrics.recall_score(y_test, y_pred)
dt_f1 = metrics.f1_score(y_test, y_pred)

print("Accuracy:", dt_accuracy)
print("Precision:", dt_precision)
print("Recall:", dt_recall)
print('F1:', dt_f1)

# Author's note: like the other two models this one is suspiciously perfect,
# probably due to overfitting on sparse data

Accuracy: 0.9946814529817811
Precision: 0.9933229468061429
Recall: 0.9962053571428572
F1: 0.9947620639696868


In [None]:
# Hyperparameter tuning
# Search space for the decision tree: split criterion plus depth and sample-count limits
param_grid = dict(
    criterion=['gini', 'entropy'],
    max_depth=[None, 5, 10, 15, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

In [None]:
# Set up GridSearchCV with 5-fold cross-validation, scored by F1.
# n_jobs=-1 runs the candidate fits on all available cores.
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, scoring='f1', n_jobs=-1)
grid_search.fit(X_train_transformed, y_train)

# Output the best parameters and best cross-validation score
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Score:", grid_search.best_score_)

# Evaluate the best estimator on the test data and report the score
# (previously the predictions were computed but never scored)
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test_transformed)
print("Test F1:", metrics.f1_score(y_test, y_pred))

# Final Model

In [14]:
# Create the final Decision Tree classifier with the parameters chosen as the winners.
# NOTE(review): the grid-search output that selected these values is not recorded in this
# notebook — confirm they match the search result.
# random_state is fixed so the final tree and its reported metrics are reproducible; the tree
# permutes features randomly at each split, so an unseeded fit can vary between runs.
clf = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=10,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=42,
)

# Train the Decision Tree classifier on the transformed training set
clf = clf.fit(X_train_transformed,y_train)

# Predict the response for the test dataset
y_pred = clf.predict(X_test_transformed)

In [15]:
# Calculate and print the quality metrics of the final model on the test set
final_scorers = (
    ("Accuracy:", metrics.accuracy_score),
    ("Precision:", metrics.precision_score),
    ("Recall:", metrics.recall_score),
    ('F1:', metrics.f1_score),
)
for heading, scorer in final_scorers:
    print(heading, scorer(y_test, y_pred))

Accuracy: 0.9962657010297612
Precision: 0.9966989436619719
Recall: 0.9960413459423796
F1: 0.996370036299637
