# Libraries

In [60]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix
from joblib import dump
from joblib import load


# Data

In [61]:
# Relative path to the CSV consumed by the rest of the notebook
data_path = '../data/combined_data.csv'
# Read the whole file into a DataFrame
data = pd.read_csv(filepath_or_buffer=data_path)

# Modeling

In [62]:
# Disabled experiment: combine title and text into one feature for more contextual data.
# The pipeline below vectorizes 'title' and 'text' separately instead.
#data['full_text'] = data['title'] + " " + data['text']


In [63]:
# Predictors: the four input columns handed to the preprocessing pipeline.
# NOTE(review): the near-perfect test scores reported further down may mean
# that 'subject' or 'date' leak the label -- TODO confirm against the dataset.
features = data.loc[:, ['title', 'text', 'subject', 'date']]
# Response: the class label the model learns to predict
target = data.loc[:, 'label']

In [64]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical across re-runs.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

In [80]:
# TF-IDF vectorization with English stop words removed; this instance is the
# one wired to the 'title' column in the preprocessor.
text_transformer_title = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(stop_words='english')),
])


In [81]:
# Separate TF-IDF vectorizer (English stop words removed) so the 'text'
# column gets its own vocabulary, independent of the title one.
text_transformer_text = Pipeline(steps=[
    ('tfidf_text', TfidfVectorizer(stop_words='english')),
])

In [82]:
# One-hot encoding for the categorical columns; handle_unknown='ignore' lets
# transform() accept categories that were not seen during fit.
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])


In [83]:
# Route each column to its transformer and concatenate the resulting features.
# The text columns are selected with a plain string (1-D input for the
# vectorizers); the categorical columns use a list (2-D input for the encoder).
preprocessor = ColumnTransformer([
    ('title', text_transformer_title, 'title'),
    ('text', text_transformer_text, 'text'),
    ('cat', categorical_transformer, ['subject', 'date']),
])


In [84]:
# End-to-end model: preprocessing followed by a multinomial Naive Bayes
# classifier, so raw DataFrame rows can be passed straight to fit/predict.
nb_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', MultinomialNB()),
])


In [85]:
# Train the preprocessing steps and the classifier on the training split
nb_pipeline.fit(X=X_train, y=y_train)

In [86]:
# Predict labels for the held-out rows
y_pred = nb_pipeline.predict(X_test)
# Confusion matrix of true vs. predicted labels
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
# Per-class precision/recall/F1 as a nested dict rather than a formatted string
report = classification_report(y_true=y_test, y_pred=y_pred, output_dict=True)

In [87]:
# Show the metrics dict and the confusion matrix together as the cell output
(report, conf_matrix)

({'Fake': {'precision': 1.0,
   'recall': 0.99852258336851,
   'f1-score': 0.9992607455908755,
   'support': 4738.0},
  'True': {'precision': 0.9983361064891847,
   'recall': 1.0,
   'f1-score': 0.9991673605328892,
   'support': 4200.0},
  'accuracy': 0.9992168270306556,
  'macro avg': {'precision': 0.9991680532445923,
   'recall': 0.9992612916842549,
   'f1-score': 0.9992140530618824,
   'support': 8938.0},
  'weighted avg': {'precision': 0.9992181301470772,
   'recall': 0.9992168270306556,
   'f1-score': 0.9992168635989822,
   'support': 8938.0}},
 array([[4731,    7],
        [   0, 4200]]))

# Export

In [88]:
# Persist the fitted pipeline (preprocessing + classifier) to the API folder
dump(value=nb_pipeline, filename='../api/model.joblib')


['../api/model.joblib']

In [30]:
# Reload the exported pipeline from disk to confirm the artifact is usable.
# NOTE: joblib.load is pickle-based and can execute arbitrary code -- only
# load model files that come from a trusted source.
model_loaded = load('../api/model.joblib')

# Round-trip sanity check: the reloaded pipeline must reproduce the
# predictions of the in-memory pipeline on the held-out set.
assert (model_loaded.predict(X_test) == y_pred).all(), \
    "reloaded model disagrees with the in-memory pipeline"
