In [None]:
import pandas as pd
import requests
from io import StringIO
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
import time

In [None]:
# Mount Google Drive inside the Colab runtime so that files stored on the
# drive can be opened through ordinary filesystem paths.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
def read_csv_from_drive(file_path):
    """Read a three-column CSV into a DataFrame.

    The file is parsed with ``header=None``, so every line of the file
    (including a header line, if the file has one) becomes a data row.
    The columns are named ``class``, ``title`` and ``description``.

    Parameters
    ----------
    file_path : str or file-like
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        Frame with columns ``["class", "title", "description"]``.
    """
    column_names = ["class", "title", "description"]
    frame = pd.read_csv(file_path, names=column_names, header=None)
    return frame

# Locations of the train/test CSV files on the mounted Google Drive.
train_file_path = "/content/drive/MyDrive/train.csv"
test_file_path = "/content/drive/MyDrive/test.csv"


def _prepare_split(raw_df):
    """Clean one data split and add the combined ``text`` column.

    Steps, in order:

    1. Cast every column to ``str`` and lower-case it (class labels therefore
       become strings such as ``"1"``).
    2. Drop any row whose ``class`` value is ``"class index"``. The CSV is
       read with ``header=None``, so a header line in the file would show up
       as such a data row.
    3. Build ``text`` as ``title + " " + description``.

    Parameters
    ----------
    raw_df : pandas.DataFrame
        Frame with ``class``, ``title`` and ``description`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) with the extra ``text`` column.
    """
    lowered = raw_df.apply(lambda column: column.astype(str).str.lower())
    rows = lowered[lowered["class"] != "class index"]
    # .assign builds a new frame instead of writing a column into the result
    # of a boolean filter, which is what triggers SettingWithCopyWarning.
    return rows.assign(text=rows["title"] + " " + rows["description"])


train_df = _prepare_split(read_csv_from_drive(train_file_path))
test_df = _prepare_split(read_csv_from_drive(test_file_path))

# Model inputs (combined text) and targets (class label as a string).
X_train = train_df["text"]
y_train = train_df["class"]
X_test = test_df["text"]
y_test = test_df["class"]


In [None]:
# Experiment 1: word-unigram TF-IDF features + Multinomial Naive Bayes.
# The timed section covers vectorisation, training and prediction.
# perf_counter() is a monotonic clock intended for measuring intervals;
# time.time() can jump if the system clock is adjusted mid-run.
start_time = time.perf_counter()

# Unigram model
vectorizer = TfidfVectorizer(ngram_range=(1, 1))
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf1 = vectorizer.transform(X_test)

# Multinomial Naïve Bayes model
clf1 = MultinomialNB()
clf1.fit(X_train_tfidf, y_train)

y_pred1 = clf1.predict(X_test_tfidf1)

end_time = time.perf_counter()

# Dimensionality = number of features in the fitted vocabulary.
names = vectorizer.get_feature_names_out()
dim1 = len(names)
execution_time1 = end_time - start_time

accuracy1 = accuracy_score(y_test, y_pred1)
print("Dimensionality:", dim1)
print("\nAccuracy:\n", accuracy1)
print("\nClassification Report:\n", classification_report(y_test, y_pred1))


Dimensionality: 64999

Accuracy:
 0.9022368421052631

Classification Report:
               precision    recall  f1-score   support

           1       0.91      0.89      0.90      1900
           2       0.95      0.98      0.96      1900
           3       0.87      0.86      0.86      1900
           4       0.88      0.88      0.88      1900

    accuracy                           0.90      7600
   macro avg       0.90      0.90      0.90      7600
weighted avg       0.90      0.90      0.90      7600



In [None]:
# Experiment 2: character-trigram TF-IDF features + Multinomial Naive Bayes.
# The timed section covers vectorisation, training and prediction.
# perf_counter() is a monotonic clock intended for measuring intervals;
# time.time() can jump if the system clock is adjusted mid-run.
start_time = time.perf_counter()

# Trigram-character model
vectorizer = TfidfVectorizer(ngram_range=(3, 3), analyzer='char')
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf3 = vectorizer.transform(X_test)

# Multinomial Naïve Bayes model
clf2 = MultinomialNB()
clf2.fit(X_train_tfidf, y_train)

y_pred2 = clf2.predict(X_test_tfidf3)

end_time = time.perf_counter()

# Dimensionality = number of features in the fitted vocabulary.
names = vectorizer.get_feature_names_out()
dim2 = len(names)
execution_time2 = end_time - start_time

accuracy2 = accuracy_score(y_test, y_pred2)
print("Dimensionality:", dim2)
print("\nAccuracy:\n", accuracy2)
print("\nClassification Report:\n", classification_report(y_test, y_pred2))

Dimensionality: 31074

Accuracy:
 0.8686842105263158

Classification Report:
               precision    recall  f1-score   support

           1       0.87      0.89      0.88      1900
           2       0.90      0.96      0.93      1900
           3       0.86      0.79      0.82      1900
           4       0.84      0.84      0.84      1900

    accuracy                           0.87      7600
   macro avg       0.87      0.87      0.87      7600
weighted avg       0.87      0.87      0.87      7600



In [None]:
# Experiment 3: word-unigram TF-IDF features + linear SVM.
# The timed section covers vectorisation, training and prediction.
# perf_counter() is a monotonic clock intended for measuring intervals;
# time.time() can jump if the system clock is adjusted mid-run.
start_time = time.perf_counter()

# Unigram model
vectorizer = TfidfVectorizer(ngram_range=(1, 1))
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf1 = vectorizer.transform(X_test)

# Support Vector Machine (SVM) model with linear kernel and C=1
# random_state is fixed so that repeated runs of the notebook are
# reproducible (the dual solver shuffles the data pseudo-randomly).
clf3 = LinearSVC(C=1, random_state=42)
clf3.fit(X_train_tfidf, y_train)

y_pred3 = clf3.predict(X_test_tfidf1)

end_time = time.perf_counter()

# Dimensionality = number of features in the fitted vocabulary.
names = vectorizer.get_feature_names_out()
dim3 = len(names)
execution_time3 = end_time - start_time

accuracy3 = accuracy_score(y_test, y_pred3)
print("Dimensionality:", dim3)
print("\nAccuracy:\n", accuracy3)
print("\nClassification Report:\n", classification_report(y_test, y_pred3))

Dimensionality: 64999

Accuracy:
 0.9196052631578947

Classification Report:
               precision    recall  f1-score   support

           1       0.93      0.90      0.92      1900
           2       0.96      0.98      0.97      1900
           3       0.89      0.89      0.89      1900
           4       0.90      0.90      0.90      1900

    accuracy                           0.92      7600
   macro avg       0.92      0.92      0.92      7600
weighted avg       0.92      0.92      0.92      7600



In [None]:
# Experiment 4: character-trigram TF-IDF features + linear SVM.
# The timed section covers vectorisation, training and prediction.
# perf_counter() is a monotonic clock intended for measuring intervals;
# time.time() can jump if the system clock is adjusted mid-run.
start_time = time.perf_counter()

# Trigram-character model
vectorizer = TfidfVectorizer(ngram_range=(3, 3), analyzer='char')
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf3 = vectorizer.transform(X_test)

# Support Vector Machine (SVM) model with linear kernel and C=1
# random_state is fixed so that repeated runs of the notebook are
# reproducible (the dual solver shuffles the data pseudo-randomly).
clf4 = LinearSVC(C=1, random_state=42)
clf4.fit(X_train_tfidf, y_train)

y_pred4 = clf4.predict(X_test_tfidf3)

end_time = time.perf_counter()

# Dimensionality = number of features in the fitted vocabulary.
names = vectorizer.get_feature_names_out()
dim4 = len(names)
execution_time4 = end_time - start_time

accuracy4 = accuracy_score(y_test, y_pred4)
print("Dimensionality:", dim4)
print("\nAccuracy:\n", accuracy4)
print("\nClassification Report:\n", classification_report(y_test, y_pred4))

Dimensionality: 31074

Accuracy:
 0.9121052631578948

Classification Report:
               precision    recall  f1-score   support

           1       0.93      0.91      0.92      1900
           2       0.95      0.97      0.96      1900
           3       0.88      0.88      0.88      1900
           4       0.89      0.89      0.89      1900

    accuracy                           0.91      7600
   macro avg       0.91      0.91      0.91      7600
weighted avg       0.91      0.91      0.91      7600



In [None]:
def format_values(value):
    """Format a float for display; return any other value unchanged.

    Floats with no fractional part are rendered as integers with thousands
    separators (``64999.0`` -> ``"64,999"``); all other floats are rendered
    with three decimals (``6.1912`` -> ``"6.191"``).
    """
    if isinstance(value, float):
        if value.is_integer():
            return f"{int(value):,}"
        return f"{value:.3f}"
    return value


# Raw metrics per model: [accuracy as a fraction, feature count, seconds].
# The first key is spelled with Latin "NB"; it previously used look-alike
# Greek letters, which made the column impossible to select by typing its name.
data = {
    "NB (word 1-grams)": [accuracy1, dim1, execution_time1],
    "NB (char 3-grams)": [accuracy2, dim2, execution_time2],
    "SVM (word 1-grams)": [accuracy3, dim3, execution_time3],
    "SVM (char 3-grams)": [accuracy4, dim4, execution_time4],
}

index = ["Accuracy (%)", "Dimensionality", "Time cost (s)"]

# Format each metric before building the frame. Mixing ints and floats in one
# column would upcast the feature count to float, and formatting up-front also
# avoids the deprecated DataFrame.applymap.
summary_table = pd.DataFrame(
    {
        model: [
            # The row is labelled "(%)", so scale the 0-1 fraction to percent.
            f"{accuracy * 100:.2f}",
            # format_values only formats floats; the cast gives the integer
            # count its thousands separator.
            format_values(float(dimensionality)),
            format_values(float(seconds)),
        ]
        for model, (accuracy, dimensionality, seconds) in data.items()
    },
    index=index,
)

# Bold cells with borders for the rendered HTML table.
df = summary_table.style \
    .set_properties(**{'font-weight': 'bold', 'border': '2px solid black'}) \
    .set_table_styles([dict(selector='th', props=[('font-weight', 'bold'), ('border', '1px solid black')])])


display(df)

Unnamed: 0,ΝΒ (word 1-grams),NB (char 3-grams),SVM (word 1-grams),SVM (char 3-grams)
Accuracy (%),0.902,0.869,0.92,0.912
Dimensionality,64999.0,31074.0,64999.0,31074.0
Time cost (s),6.191,22.452,12.013,37.741


In [None]:

# Error analysis: look at the test samples that ALL four classifiers got wrong.

# Display names for the class labels (labels are strings in y_test / y_pred*).
label_to_category = {
    "1": "1:World",
    "2": "2:Sports",
    "3": "3:Business",
    "4": "4:Sci/Tech",
}

# Which of the commonly-misclassified samples to print as an example
# (position inside misclassified_indices, not a test-set index).
EXAMPLE_POSITION = 10

predictions = list(zip(y_pred1, y_pred2, y_pred3, y_pred4))

# Positional indices of test samples where every model's prediction differs
# from the true label. The true labels are zipped in once instead of being
# looked up with .iloc four times per row.
misclassified_indices = [
    i
    for i, (true_label, model_preds) in enumerate(zip(y_test, predictions))
    if all(pred != true_label for pred in model_preds)
]

# Number of commonly-misclassified samples per true category.
misclassified_counts = {category: 0 for category in y_test.unique()}

# Find the most frequent pair of correct category and wrong prediction.
# The "wrong prediction" recorded here is the one made by clf1 (y_pred1);
# the other three models were also wrong but may have predicted another class.
pair_counts = {}

for idx in misclassified_indices:
    correct_category = y_test.iloc[idx]
    misclassified_counts[correct_category] += 1

    pair = (correct_category, y_pred1[idx])
    pair_counts[pair] = pair_counts.get(pair, 0) + 1

if misclassified_indices:
    # Clamp the example position so a short list cannot raise IndexError.
    example_position = min(EXAMPLE_POSITION, len(misclassified_indices) - 1)
    misclassified_index = misclassified_indices[example_position]

    misclassified_text = X_test.iloc[misclassified_index]

    correct_category = y_test.iloc[misclassified_index]
    predicted_category = y_pred1[misclassified_index]

    print(f"Misclassified Text (Index: {misclassified_index}):")
    print(misclassified_text)
    # Fall back to the raw label if it has no display name.
    print(f"\nShould have been classified as: {label_to_category.get(correct_category, correct_category)}")
    print(f"\nWas classified as: {label_to_category.get(predicted_category, predicted_category)}\n")
else:
    print("No test sample was misclassified by all four classifiers.\n")

for category in sorted(misclassified_counts.keys()):
    category_name = label_to_category.get(category, category)
    count = misclassified_counts[category]
    print(f"{category_name}: {count} samples")

# max() on an empty mapping raises ValueError, so only report when pairs exist.
if pair_counts:
    most_frequent_pair = max(pair_counts, key=pair_counts.get)
    print("\nThe most frequent pair of correct category and wrong prediction:")
    print(f"Correct category: {most_frequent_pair[0]}, Wrong prediction: {most_frequent_pair[1]}, Occurrences: {pair_counts[most_frequent_pair]}")

Misclassified Text (Index: 215):
hurricane center #39;s projection on charley not far off, data show fort lauderdale, fla. - (krt) - despite criticism that it should have better anticipated hurricane charley #39;s rapid intensification and quick turn, the national hurricane center #39;s forecast wasn #39;t that far off, a preliminary post-mortem shows. 

Should have been classified as: 4:Sci/Tech

Was classified as: 3:Business

1:World: 112 samples
2:Sports: 9 samples
3:Business: 135 samples
4:Sci/Tech: 85 samples

The most frequent pair of correct category and wrong prediction:
Correct category: 3, Wrong prediction: 4, Occurrences: 97
