In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer

## Preprocess the data

1. Load the dataset
2. Tokenize
3. Eliminate stop words

In [20]:
newsgroups_train = fetch_20newsgroups(subset='train')
newsgroups_test = fetch_20newsgroups(subset='test')

# Display the target names to see what we need to classify into
newsgroups_train.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [21]:
list(filter(lambda x: x != '', newsgroups_train.data[0].split("\n")))

["From: lerxst@wam.umd.edu (where's my thing)",
 'Subject: WHAT car is this!?',
 'Nntp-Posting-Host: rac3.wam.umd.edu',
 'Organization: University of Maryland, College Park',
 'Lines: 15',
 ' I was wondering if anyone out there could enlighten me on this car I saw',
 'the other day. It was a 2-door sports car, looked to be from the late 60s/',
 'early 70s. It was called a Bricklin. The doors were really small. In addition,',
 'the front bumper was separate from the rest of the body. This is ',
 'all I know. If anyone can tellme a model name, engine specs, years',
 'of production, where this car is made, history, or whatever info you',
 'have on this funky looking car, please e-mail.',
 'Thanks,',
 '- IL',
 '   ---- brought to you by your neighborhood Lerxst ----']

In [22]:
# Create a tf-idf vectorizer
tfidfvectorizer = TfidfVectorizer(analyzer= 'word', stop_words= 'english',
                                  min_df= 0.01, # Ignore terms that appear in less than 1% of the documents
                                  max_df= 0.80) # Ignore terms that appear in more than 80% of the documents

# Transform the data into a tf-idf matrix
Xtr             = tfidfvectorizer.fit_transform(newsgroups_train.data)
features        = tfidfvectorizer.get_feature_names_out()

Xts             = tfidfvectorizer.transform(newsgroups_test.data)

# Create a dataframe from the tf-idf matrix
df_tfidf        = pd.DataFrame(Xtr.toarray(), columns=features)

df_tfidf.head(5)

Unnamed: 0,00,000,01,02,03,04,05,07,08,10,...,wrong,wrote,yeah,year,years,yes,yesterday,york,young,zero
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.097449,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.109088,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.082008,0.0,0.0,0.0,0.0


## Visualize the data
1. Histogram
2. Boxplot

In [5]:
# Plot the top 20 words in the corpus
number_of_words = 20
px.histogram(df_tfidf.sum().sort_values(ascending=False).head(number_of_words),
             x= df_tfidf.sum().sort_values(ascending=False).head(number_of_words).index,
             y= df_tfidf.sum().sort_values(ascending=False).head(number_of_words).values)

In [6]:
# Create a boxplot of the sum of each column
px.box(df_tfidf.sum())

## Construct the neural network

In [7]:
from keras.models import Sequential
from keras.layers import Dense

# Define the model
model = Sequential()

# Create the input layer
model.add(Dense(100, activation='relu', input_shape=(len(features),)))

# Create the hidden layers
model.add(Dense(100, activation='relu'))

# Create the output layer
model.add(Dense(len(newsgroups_train.target_names), activation='softmax'))

# Compile the model
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy', 'Precision', 'Recall', 'f1_score'])

model.summary()


Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.



## Train the model
1. Fit the model
2. Adjust the hyperparameters:
    1. Optimizer
    2. Learning rate
    3. Epochs

In [8]:
# Convert the target into one-hot encoding
ytr = pd.get_dummies(newsgroups_train.target).values

yts = pd.get_dummies(newsgroups_test.target).values

ytr

array([[False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       ...,
       [False, False, False, ..., False, False, False],
       [False,  True, False, ..., False, False, False],
       [False, False, False, ..., False, False, False]])

In [9]:
# Fit the model
model.fit(Xtr, ytr, epochs=10, verbose=1, batch_size=256)

Epoch 1/10
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 20ms/step - Precision: 0.0000e+00 - Recall: 0.0000e+00 - accuracy: 0.1344 - f1_score: 0.1128 - loss: 2.9515
Epoch 2/10
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 20ms/step - Precision: 0.5884 - Recall: 0.0145 - accuracy: 0.6147 - f1_score: 0.5637 - loss: 2.2361
Epoch 3/10
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step - Precision: 0.9629 - Recall: 0.3221 - accuracy: 0.7719 - f1_score: 0.7525 - loss: 1.1598
Epoch 4/10
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 20ms/step - Precision: 0.9525 - Recall: 0.6136 - accuracy: 0.8383 - f1_score: 0.8320 - loss: 0.7167
Epoch 5/10
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 20ms/step - Precision: 0.9570 - Recall: 0.7414 - accuracy: 0.8789 - f1_score: 0.8751 - loss: 0.5234
Epoch 6/10
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 19ms/step - Precision: 0.9603 - Recall: 0.

<keras.src.callbacks.history.History at 0x21299753b30>

## Visualize the learning process
1. Plot the loss function
2. Plot the accuracy

In [10]:
# Plot the loss function
px.line(model.history.history, y=['loss'])

In [11]:
# Plot the metrics of the model [accuracy, precision, recall]
px.line(model.history.history, x=model.history.epoch, y=['accuracy', 'Precision', 'Recall'], title='Metrics of the model')

## Evaluate the model
1. Calculate the accuracy
2. Calculate the precision
3. Calculate the recall
4. Calculate the F1 score

In [12]:
# Calculate the accuracy
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

y_pred = model.predict(Xts)
y_pred = np.argmax(y_pred, axis=1)
y_true = np.argmax(yts, axis=1)

print("Accuracy in the test set: ", accuracy_score(y_true, y_pred))

print("Precision in the test set: ", precision_score(y_true, y_pred, average='weighted'))

print("Recall in the test set: ", recall_score(y_true, y_pred, average='weighted'))

print("F1 score in the test set: ", f1_score(y_true, y_pred, average='weighted'))

[1m236/236[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
Accuracy in the test set:  0.7140201805629315
Precision in the test set:  0.7207633923328405
Recall in the test set:  0.7140201805629315
F1 score in the test set:  0.7155099933193312


## Visualize the confusion matrix

In [13]:
# Plot the confusion matrix
from sklearn.metrics import confusion_matrix
px.imshow(confusion_matrix(y_true, y_pred), text_auto=True)

## Try different architectures

In [14]:
# Create different model architectures
model2 = Sequential()
model2.add(Dense(1000, activation='relu', input_shape=(len(features),)))
model2.add(Dense(1000, activation='relu'))
model2.add(Dense(len(newsgroups_train.target_names), activation='softmax'))
model2.compile(optimizer='adam',
              loss='categorical_crossentropy')

model2.fit(Xtr, ytr, epochs=10, batch_size=256, verbose=0)

y_pred2 = np.argmax(model2.predict(Xts), axis=1)

print("f1_score in the test set with model2: ", f1_score(y_true, y_pred2, average='weighted'))

model2.summary()


Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.



[1m236/236[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step
f1_score in the test set with model2:  0.7207269536288745


In [15]:
model3 = Sequential()
model3.add(Dense(100, activation='relu', input_shape=(len(features),)))
model3.add(Dense(100, activation='relu'))
model3.add(Dense(100, activation='relu'))
model3.add(Dense(len(newsgroups_train.target_names), activation='softmax'))
model3.compile(optimizer='adam',
              loss='categorical_crossentropy')

model3.fit(Xtr, ytr, epochs=10, batch_size=256, verbose=0)

y_pred3 = np.argmax(model3.predict(Xts), axis=1)

print("f1_score in the test set with model3: ", f1_score(y_true, y_pred3, average='weighted'))

model3.summary()


Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.



[1m236/236[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step
f1_score in the test set with model3:  0.6717839373552982


In [16]:
model4 = Sequential()
model4.add(Dense(100, activation='relu', input_shape=(len(features),)))
model4.add(Dense(100, activation='relu'))
model4.add(Dense(100, activation='relu'))
model4.add(Dense(100, activation='relu'))
model4.add(Dense(len(newsgroups_train.target_names), activation='softmax'))
model4.compile(optimizer='adam',
              loss='categorical_crossentropy')

model4.fit(Xtr, ytr, epochs=10, batch_size=256, verbose=0)

y_pred4 = np.argmax(model4.predict(Xts), axis=1)

print("f1_score in the test set with model4: ", f1_score(y_true, y_pred4, average='weighted'))

model4.summary()


Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.



[1m236/236[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
f1_score in the test set with model4:  0.6577017439614047


In [17]:
# Compare the models
px.bar(x=['original model', 'model2', 'model3', 'model4'], y=[f1_score(y_true, y_pred, average='weighted'), f1_score(y_true, y_pred2, average='weighted'), f1_score(y_true, y_pred3, average='weighted'), f1_score(y_true, y_pred4, average='weighted')])

## Try using cross-validation

In [18]:
model_cv = Sequential()

model_cv.add(Dense(100, activation='relu', input_shape=(len(features),)))

model_cv.add(Dense(100, activation='relu'))

model_cv.add(Dense(len(newsgroups_train.target_names), activation='softmax'))

model_cv.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy', 'Precision', 'Recall', 'f1_score'])

model_cv.fit(Xtr, ytr, epochs=10, batch_size=256, validation_split=0.2, verbose=0)

y_pred_cv = np.argmax(model_cv.predict(Xts), axis=1)

print("Accuracy in the test set with model_cv: ", accuracy_score(y_true, y_pred_cv))

#Compare model and model_cv
px.bar(x=['model', 'model_cv'], y=[accuracy_score(y_true, y_pred), accuracy_score(y_true, y_pred_cv)])


Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.



[1m236/236[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
Accuracy in the test set with model_cv:  0.705124800849708


## Visualize the ROC curve and AUC score

In [19]:
# Plot the ROC curve
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize

binary_testLabels = label_binarize(y_true, classes=range(len(newsgroups_train.target_names)))

y_pred = model.predict(Xts)

fig = px.line()

for i in range(len(newsgroups_train.target_names)):
    fpr, tpr, thresholds = roc_curve(binary_testLabels[:, i],
                                     y_pred[:, i])
    roc_auc = auc(fpr, tpr)

    fig.add_scatter(x=fpr, y=tpr, name=f'{newsgroups_train.target_names[i]} (AUC = {roc_auc:.2f})', mode="lines")

fig.update_layout(title="ROC curve", xaxis_title="False Positive Rate", yaxis_title="True Positive Rate", showlegend=True)

fig.show()

[1m236/236[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step




## Describe the findings

### Model Performance

- The original model performs well on the training set, with precision, recall, accuracy, and F1 score all above 90%.
- However, the performance on the test set is lower, with accuracy around 70%. This suggests that the model may be overfitting to the training data.
- The different architectures (model2, model3, model4) have varying levels of performance on the test set, there is a need to explore the architectures further to find the best one.
- The model with cross-validation (Model_cv) performs similarly to the original model, suggesting that cross-validation may not be improving the model's performance in this case.

### ROC Curve

- The ROC curve shows that the model is able to distinguish between the different classes, with the AUC reaching high values for all classes.
- This suggests that the model is able to capture the underlying patterns in the data and make accurate predictions.
- The ROC curve suggests that the model is able to perform well on all classes.

### Overall

- The model appears to be performing well, but there may be room for improvement, particularly in terms of reducing overfitting and improving performance on the test set.
- Further analysis and experimentation may be needed to fully understand the model's performance and to identify areas for improvement.