## Imports

In [9]:
import os

import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import utils

## Load data from directories

In [10]:
# Build the working frame through the project helper, pointing it at the
# article and summary directories.
data_df = utils.load_data_from_directories(
    articles_dir='./data/news',
    summaries_dir='./data/summaries',
)

# Learn the category -> integer mapping once and keep the fitted encoder
# around: its inverse_transform turns predicted ids back into category names.
label_encoder = LabelEncoder().fit(data_df['category'])
data_df['category_encoded'] = label_encoder.transform(data_df['category'])

# Preview the first rows, including the new encoded column.
data_df.head()

Unnamed: 0,content,type,filename,category,category_encoded
0,Musicians to tackle US red tape\n\nMusicians' ...,article,289.txt,entertainment,1
1,"U2's desire to be number one\n\nU2, who have w...",article,262.txt,entertainment,1
2,Rocker Doherty in on-stage fight\n\nRock singe...,article,276.txt,entertainment,1
3,Snicket tops US box office chart\n\nThe film a...,article,060.txt,entertainment,1
4,Ocean's Twelve raids box office\n\nOcean's Twe...,article,074.txt,entertainment,1


## Processing data

In [11]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the shuffle
# reproducible.
# NOTE(review): the split is not stratified by category -- confirm the class
# balance of the held-out rows is acceptable.
train_df, test_df = train_test_split(
    data_df,
    test_size=0.2,
    random_state=42,
)

# Integer class labels for each split.
train_Y, test_Y = (split['category_encoded'] for split in (train_df, test_df))

# TF-IDF features capped at 1000 vocabulary terms. The vectorizer is fitted on
# the training text only and then reused unchanged on the test text.
vectorizer = TfidfVectorizer(max_features=1000)
train_tfidf = vectorizer.fit_transform(train_df['content'])
test_tfidf = vectorizer.transform(test_df['content'])

# Densify the sparse TF-IDF matrices into plain arrays for the classifier.
train_X, test_X = train_tfidf.toarray(), test_tfidf.toarray()

## Train Random Forest Classifier

In [12]:
# Train a 100-tree random forest on the TF-IDF features; the fixed seed keeps
# the fitted model reproducible across runs.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(train_X, train_Y)

# joblib.dump does not create missing parent directories, so make sure the
# output folder exists first. Without this, a fresh checkout fails here with
# FileNotFoundError only after training has already finished.
os.makedirs('saved_models', exist_ok=True)

# NOTE: the artifact is a joblib pickle despite the '.pt' suffix; the path is
# kept unchanged so anything that already loads it keeps working.
joblib.dump(rf_model, 'saved_models/random_forest_model.pt')

# The forest only accepts vectors produced by this fitted vectorizer and emits
# the integer ids produced by this fitted label encoder, so persist both next
# to the model; otherwise the saved model cannot be used on raw text later.
joblib.dump(vectorizer, 'saved_models/tfidf_vectorizer.joblib')
joblib.dump(label_encoder, 'saved_models/label_encoder.joblib')

print('Model saved to saved_models/random_forest_model.pt')

Model saved to saved_models/random_forest_model.pt


## Evaluate the model

In [13]:
# Score the held-out split with the fitted forest.
rf_predictions = rf_model.predict(test_X)

# Precision, recall and F1 are averaged over classes, weighted by each class's
# support; the fourth returned value (support) is not used.
rf_precision, rf_recall, rf_f1, _ = precision_recall_fscore_support(
    test_Y,
    rf_predictions,
    average='weighted',
)
rf_accuracy = accuracy_score(test_Y, rf_predictions)

# Report accuracy as a percentage and the remaining metrics as fractions.
print(f'Random Forest Accuracy: {rf_accuracy * 100:.2f}%')
print(f'Random Forest Precision: {rf_precision:.4f}')
print(f'Random Forest Recall: {rf_recall:.4f}')
print(f'Random Forest F1 Score: {rf_f1:.4f}')

Random Forest Accuracy: 97.08%
Random Forest Precision: 0.9711
Random Forest Recall: 0.9708
Random Forest F1 Score: 0.9708
