# Preprocessing

## Load and preprocess data
The model is trained on, and makes predictions from, each article's title and summary, which are concatenated into a single lowercase `Text` column.

In [1]:
import pandas as pd

# Load the labelled articles and build the single text field the model consumes.
data = pd.read_csv('bbc_world_train.csv')

# Join title and summary into one lowercase string per article. Missing
# values are replaced with '' first: concatenating a string with NaN yields
# NaN, and a NaN document makes TfidfVectorizer raise during fitting.
data['Text'] = (
    data['Title'].fillna('') + ' ' + data['Summary'].fillna('')
).str.lower()

## Vectorize data

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vocabulary size is capped via max_features (top terms by corpus frequency).
vectorizer_settings = {'max_features': 1000}
tfidf_vectorizer = TfidfVectorizer(**vectorizer_settings)

# Target labels, then the TF-IDF matrix learned and encoded in a single pass.
# NOTE(review): the vectorizer is fitted on every row before the train/test
# split happens, so the held-out rows influence the vocabulary and IDF
# weights - confirm this is acceptable for the reported test scores.
y = data.loc[:, 'Interesting']
X = tfidf_vectorizer.fit_transform(data.loc[:, 'Text'])

## Split data

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the
# partition identical between runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Model performance comparison
Compare several classifiers (logistic regression, random forest, gradient boosting trees, support vector classifier, multilayer perceptron) on the held-out test split to see which one performs best.
## Without grid search

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Baseline candidates with library-default hyperparameters. Estimators that
# draw random numbers while training are seeded so that re-running this cell
# reproduces the same scores.
models = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'Support Vector Classifier': SVC(),
    'Multilayer Perceptron': MLPClassifier(random_state=42)
}

# Report label -> scoring function, listed in the order the report prints them.
scorers = {
    'Accuracy': accuracy_score,
    'Precision': precision_score,
    'Recall': recall_score,
    'F1-Score': f1_score
}

# Fit each candidate on the training split and score it on the held-out split.
results = {}

for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    results[model_name] = {
        label: scorer(y_test, y_pred) for label, scorer in scorers.items()
    }

# Print one block of metrics per model, rounded to two decimals.
for model_name, metrics in results.items():
    print(f"Model: {model_name}")
    for label in scorers:
        print(f"{label}: {metrics[label]:.2f}")
    print("\n")


Model: Logistic Regression
Accuracy: 0.79
Precision: 0.69
Recall: 0.74
F1-Score: 0.72


Model: Random Forest
Accuracy: 0.78
Precision: 0.72
Recall: 0.66
F1-Score: 0.68


Model: Gradient Boosting
Accuracy: 0.76
Precision: 0.67
Recall: 0.67
F1-Score: 0.67


Model: Support Vector Classifier
Accuracy: 0.82
Precision: 0.75
Recall: 0.74
F1-Score: 0.75


Model: Multilayer Perceptron
Accuracy: 0.76
Precision: 0.64
Recall: 0.76
F1-Score: 0.69




## With grid search

In [9]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler, StandardScaler, MaxAbsScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.pipeline import Pipeline


# Candidate estimators paired with the hyperparameter grid searched for each.
# Estimators that draw random numbers are seeded so that re-running this cell
# reproduces the selected parameters and the reported scores.
# The SVC and MLP are wrapped in pipelines so their input is rescaled first;
# the StandardScaler is configured with with_mean=False, i.e. it does not
# centre the features.
models = {
    'Logistic Regression': {
        'model': LogisticRegression(random_state=42),
        'params': {'C': [0.1, 1.0, 10.0], 'solver': ['liblinear', 'lbfgs']}
    },
    'Random Forest': {
        'model': RandomForestClassifier(random_state=42),
        'params': {'n_estimators': [100, 200, 300], 'max_depth': [None, 10, 20]}
    },
    'Gradient Boosting': {
        'model': GradientBoostingClassifier(random_state=42),
        'params': {'n_estimators': [50, 100, 150], 'learning_rate': [0.01, 0.1, 0.2]}
    },
    'Support Vector Classifier': {
        'model': Pipeline([
            ('scaler', MaxAbsScaler()),
            ('svc', SVC())
        ]),
        'params': {'svc__C': [0.1, 1.0, 10.0], 'svc__gamma': [0.1, 1.0, 10.0]}
    },
    'Multilayer Perceptron': {
        'model': Pipeline([
            ('scaler', StandardScaler(with_mean=False)),
            ('mlp', MLPClassifier(random_state=42))
        ]),
        'params': {'mlp__hidden_layer_sizes': [(50, 50), (100, 100), (50, 100)], 'mlp__alpha': [0.0001, 0.001, 0.01]}
    }
}

# Report label -> scoring function, listed in the order the report prints them.
scorers = {
    'Accuracy': accuracy_score,
    'Precision': precision_score,
    'Recall': recall_score,
    'F1-Score': f1_score
}

results = {}

for model_name, model_info in models.items():
    model = model_info['model']
    param_grid = model_info['params']

    # Perform grid search with 3-fold cross-validation, selecting on F1
    grid_search = GridSearchCV(model, param_grid, cv=3, scoring='f1', verbose=0, n_jobs=-1)
    grid_search.fit(X_train, y_train)

    # Get the best estimator (model with the best parameters)
    best_model = grid_search.best_estimator_

    # Evaluate the best model on the held-out split
    y_pred = best_model.predict(X_test)

    results[model_name] = {
        'Best Parameters': grid_search.best_params_,
        **{label: scorer(y_test, y_pred) for label, scorer in scorers.items()}
    }

# Print the results
for model_name, metrics in results.items():
    print(f"Model: {model_name}")
    print(f"Best Parameters: {metrics['Best Parameters']}")
    for label in scorers:
        print(f"{label}: {metrics[label]:.2f}")
    print("\n")


Model: Logistic Regression
Best Parameters: {'C': 10.0, 'solver': 'liblinear'}
Accuracy: 0.80
Precision: 0.69
Recall: 0.81
F1-Score: 0.75


Model: Random Forest
Best Parameters: {'max_depth': None, 'n_estimators': 200}
Accuracy: 0.79
Precision: 0.71
Recall: 0.69
F1-Score: 0.70


Model: Gradient Boosting
Best Parameters: {'learning_rate': 0.2, 'n_estimators': 150}
Accuracy: 0.77
Precision: 0.67
Recall: 0.71
F1-Score: 0.69


Model: Support Vector Classifier
Best Parameters: {'svc__C': 10.0, 'svc__gamma': 0.1}
Accuracy: 0.77
Precision: 0.67
Recall: 0.72
F1-Score: 0.69


Model: Multilayer Perceptron
Best Parameters: {'mlp__alpha': 0.01, 'mlp__hidden_layer_sizes': (100, 100)}
Accuracy: 0.78
Precision: 0.67
Recall: 0.74
F1-Score: 0.70




# Train the model
Because the tuned logistic regression achieved the best F1-score among the grid-searched models, retrain it with its best parameters on the full dataset (training and test splits combined).

In [10]:
from sklearn.linear_model import LogisticRegression

# Final classifier: logistic regression with the hyperparameters reported as
# best by the grid search, fitted on every labelled row (X, y).
final_params = {'C': 10, 'solver': 'liblinear'}
model = LogisticRegression(**final_params)
model.fit(X, y)

# Save the model and the vectorizer to files

In [11]:
import joblib

# Persist the fitted classifier together with the vectorizer that produced
# its input features; both are needed to score new articles.
model_path = "bbc_world_model.pkl"
vectorizer_path = "bbc_world_vectorizer.pkl"

joblib.dump(model, model_path)
joblib.dump(tfidf_vectorizer, vectorizer_path)

['bbc_world_vectorizer.pkl']