# Libraries

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix
from joblib import dump
from joblib import load


# Data

In [2]:
# Relative path of the CSV file holding the dataset
data_path = '../data/combined_data.csv'
# Read the whole file into a DataFrame
data = pd.read_csv(filepath_or_buffer=data_path)

# Modeling

In [3]:
# Disabled alternative: combine title and text into one feature for more contextual data
#data['full_text'] = data['title'] + " " + data['text']


In [4]:
# Target: the 'label' column of the dataset
target = data['label']
# Inputs: the 'title', 'text' and 'date' columns, kept as a DataFrame
features = data.loc[:, ['title', 'text', 'date']]

In [5]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split reproducible. No `stratify` argument is passed, so the class
# proportions of the two splits are not forced to match.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Single-step pipeline: TF-IDF vectorization with English stop words removed.
# The step name 'tfidf' is part of the pipeline's parameter names.
text_transformer_title = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(stop_words='english')),
])


In [7]:
# Second, independent TF-IDF pipeline (English stop words removed); it is a
# separate object from text_transformer_title and uses the step name 'tfidf_text'.
text_transformer_text = Pipeline(steps=[
    ('tfidf_text', TfidfVectorizer(stop_words='english')),
])

In [8]:
# One-hot encoder for categorical columns. handle_unknown='ignore' means a
# category not seen during fit is ignored at transform time instead of raising.
# NOTE(review): this is applied to the raw 'date' column further down, so every
# distinct date value becomes its own category - confirm this is intended.
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])


In [12]:
# Route each input column to its own transformer and concatenate the results.
# 'title' and 'text' are given as plain strings (each vectorizer receives a
# 1-D column of documents); 'date' is given as a list (the encoder receives
# a 2-D, single-column selection).
preprocessor = ColumnTransformer(
    transformers=[
        # (name, transformer, column selection)
        ('title', text_transformer_title, 'title'),
        ('text', text_transformer_text, 'text'),
        ('cat', categorical_transformer, ['date']),
    ],
)


In [13]:
# End-to-end model: column preprocessing followed by a multinomial
# Naive Bayes classifier with default settings.
nb_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', MultinomialNB()),
])


In [14]:
# Train the preprocessing steps and the classifier on the training split.
# fit() returns the pipeline itself, which the notebook shows as the cell output.
nb_pipeline.fit(X=X_train, y=y_train)

In [15]:
# Predict labels for the held-out test split
y_pred = nb_pipeline.predict(X_test)
# Confusion matrix of true labels against predictions
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
# Per-class precision / recall / F1; output_dict=True returns a nested dict
# instead of a formatted string
report = classification_report(y_true=y_test, y_pred=y_pred, output_dict=True)

In [16]:
# Show the metrics dict and the confusion matrix together as the cell output
(report, conf_matrix)

({'Fake': {'precision': 0.9536082474226805,
   'recall': 0.9761502743773744,
   'f1-score': 0.9647476011681269,
   'support': 4738.0},
  'True': {'precision': 0.9723581213307241,
   'recall': 0.9464285714285714,
   'f1-score': 0.9592181467181468,
   'support': 4200.0},
  'accuracy': 0.9621839337659431,
  'macro avg': {'precision': 0.9629831843767023,
   'recall': 0.9612894229029729,
   'f1-score': 0.9619828739431369,
   'support': 8938.0},
  'weighted avg': {'precision': 0.9624188840767175,
   'recall': 0.9621839337659431,
   'f1-score': 0.9621492896118596,
   'support': 8938.0}},
 array([[4625,  113],
        [ 225, 3975]]))

# Export

In [88]:
# Serialize the fitted pipeline (preprocessing + classifier) to disk;
# dump() returns the list of file names it wrote, shown as the cell output.
dump(value=nb_pipeline, filename='../api/model.joblib')


['../api/model.joblib']

In [30]:
# Read the serialized pipeline back from disk.
# joblib files are pickle-based: only load model files from a trusted source.
model_loaded = load(filename='../api/model.joblib')
