# DDS: AG Newspapers

Implementation of Naive Bayes and Logistic Regression (SVM and RF, which for some or other reason did not make the cut).

# Libraries

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier

# Data

In [2]:
df = pd.read_csv('clean_data.csv')
# but use train data and test data from Max instead for reproducibility!
train_df = pd.read_csv('train_set.csv')
test_df = pd.read_csv('test_set.csv')
train_df.drop(columns=["title", "description", "source", "labels"], inplace=True)
test_df.drop(columns=["title", "description", "source", "labels"], inplace=True)

In [3]:
train_df.head()

Unnamed: 0,article,label_strings
0,title: New bug in open source database MySQL\n...,Sci/Tech
1,title: Football: Wenger looks forward to Chels...,Sports
2,title: Pakistan #39;s Senate passes bill allow...,World
3,title: Genetic Testing Challenges Medical Ethi...,Health
4,title: 'TwoDay' Method Helps Women Avoid Pregn...,Health


In [4]:
print(train_df.shape)
print(test_df.shape)

(221751, 2)
(24639, 2)


In [5]:
train_df["article"] = train_df["article"].str.replace("title:", "")
train_df["article"] = train_df["article"].str.replace("description:", "")
test_df["article"] = test_df["article"].str.replace("title:", "")
test_df["article"] = test_df["article"].str.replace("description:", "")
#df = df[~(df["labels"] == "Italia")]


In [6]:
df["labels"].value_counts()

labels
Sports                       49345
World                        47462
Business                     39478
Sci/Tech                     26572
Europe                       18822
Entertainment                15358
Health                       12430
Italia                       12347
Top News                      9508
U.S.                          7346
Top Stories                   4070
Toons                         2101
Software and Developement      941
Music Feeds                    610
Name: count, dtype: int64

# Split data into training and test sets

In [7]:
def split_train_test(df, size_of_test=0.3):
    X_train, X_test, y_train, y_test = train_test_split(df['article'], df['labels'], test_size=size_of_test, random_state=42)
    return X_train, X_test, y_train, y_test

X_train, X_test, y_train, y_test = split_train_test(df)

def import_train_test(train_set, test_set):
    X_train = train_set["article"]
    y_train = train_set["label_strings"]
    X_test = test_set["article"]
    y_test = test_set["label_strings"]
    return X_train, X_test, y_train, y_test

X_train, X_test, y_train, y_test = import_train_test(train_df, test_df)

In [8]:
X_train.shape

(221751,)

# TF-IDF

In [9]:
tfidf = TfidfVectorizer()
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [10]:
print(X_train_tfidf.shape)
print(X_test_tfidf.shape)

(221751, 114820)
(24639, 114820)


# Naive Bayes

Classic, but bad results.

In [28]:
nb_clf = MultinomialNB()
nb_clf.fit(X_train_tfidf, y_train)
y_pred_nb = nb_clf.predict(X_test_tfidf)
print("Classification Report for Naive Bayes:\n")
print(classification_report(y_test, y_pred_nb))

Classification Report for Naive Bayes:



  _warn_prf(average, modifier, msg_start, len(result))


                           precision    recall  f1-score   support

                 Business       0.72      0.89      0.80      3948
            Entertainment       0.81      0.36      0.50      1536
                   Europe       0.81      0.09      0.16      1882
                   Health       0.88      0.57      0.69      1243
                   Italia       1.00      1.00      1.00      1235
              Music Feeds       0.00      0.00      0.00        61
                 Sci/Tech       0.80      0.77      0.78      2657
Software and Developement       1.00      0.01      0.02        94
                   Sports       0.82      0.99      0.90      4934
                    Toons       1.00      1.00      1.00       210
                 Top News       1.00      0.41      0.58       951
              Top Stories       0.00      0.00      0.00       407
                     U.S.       1.00      0.00      0.01       735
                    World       0.55      0.90      0.68     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Logistic Regression Classifier

Classic with good (or acceptable) results.
Computationally cheap.

In [29]:
# Logistic Regression Classifier
lr_clf = LogisticRegression(max_iter=1000)
lr_clf.fit(X_train_tfidf, y_train)
y_pred_lr = lr_clf.predict(X_test_tfidf)
print("Classification Report for Logistic Regression:\n")
print(classification_report(y_test, y_pred_lr))

Classification Report for Logistic Regression:

                           precision    recall  f1-score   support

                 Business       0.82      0.88      0.85      3948
            Entertainment       0.72      0.59      0.65      1536
                   Europe       0.70      0.62      0.66      1882
                   Health       0.82      0.83      0.83      1243
                   Italia       1.00      1.00      1.00      1235
              Music Feeds       1.00      0.52      0.69        61
                 Sci/Tech       0.80      0.84      0.82      2657
Software and Developement       1.00      0.43      0.60        94
                   Sports       0.90      0.97      0.93      4934
                    Toons       1.00      1.00      1.00       210
                 Top News       0.83      0.58      0.68       951
              Top Stories       0.32      0.02      0.05       407
                     U.S.       0.68      0.53      0.60       735
             

# SVM

Computationally expensive. Results on sampled data are similar to NB and LR.

In [34]:
#from each category, sample x rows or all rows if x > len(category)
# df.groupby("labels").apply(lambda x: x.sample(10000)).reset_index(drop=True)
def sample_data(df, s_size):
    return df.groupby('label_strings').apply(lambda x: x.sample(s_size) if len(x) > s_size else x.sample(len(x)))

df_s = sample_data(train_df, 5000)

In [35]:
X_train, X_test, y_train, y_test = import_train_test(df_s, test_df)

X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

print(X_train_tfidf.shape)
print(X_test_tfidf.shape)


# check the cheap ones first
# Naive Bayes
nb_clf = MultinomialNB()
nb_clf.fit(X_train_tfidf, y_train)
y_pred_nb = nb_clf.predict(X_test_tfidf)
print("Classification Report for Naive Bayes on reduced data dataset:\n")
print(classification_report(y_test, y_pred_nb))

# Logistic Regression
lr_clf = LogisticRegression(max_iter=1000)
lr_clf.fit(X_train_tfidf, y_train)
y_pred_lr = lr_clf.predict(X_test_tfidf)
print("Classification Report for Logistic Regression on reduced data dataset:\n")
print(classification_report(y_test, y_pred_lr))

# SVC
svm_clf = SVC()
svm_clf.fit(X_train_tfidf, y_train)
y_pred_svm = svm_clf.predict(X_test_tfidf)
print("Classification Report for SVM on reduced data dataset:\n")
print(classification_report(y_test, y_pred_svm))

(56950, 71778)
(24639, 71778)
Classification Report for Naive Bayes on reduced data dataset:



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                           precision    recall  f1-score   support

                 Business       0.81      0.80      0.80      3948
            Entertainment       0.60      0.59      0.59      1536
                   Europe       0.62      0.49      0.55      1882
                   Health       0.65      0.90      0.76      1243
                   Italia       1.00      1.00      1.00      1235
              Music Feeds       0.00      0.00      0.00        61
                 Sci/Tech       0.78      0.78      0.78      2657
Software and Developement       1.00      0.09      0.16        94
                   Sports       0.91      0.95      0.93      4934
                    Toons       1.00      1.00      1.00       210
                 Top News       0.94      0.46      0.61       951
              Top Stories       0.16      0.01      0.02       407
                     U.S.       0.30      0.77      0.43       735
                    World       0.77      0.71      0.74     

# Random Forest

In [14]:
# ää ranfo Classifier
rf_clf = RandomForestClassifier()
rf_clf.fit(X_train_tfidf, y_train)
y_pred_rf = rf_clf.predict(X_test_tfidf)
print("Classification Report for Random Forest:\n")
print(classification_report(y_test, y_pred_rf))

# Draft for pipeline

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report


df = pd.read_csv("clean_data.csv")

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(df['article'], df['labels'], test_size=0.2, random_state=42)

# Define a function to create pipelines for different classifiers
def create_pipeline(classifier):
    return Pipeline([
        ('tfidf', TfidfVectorizer()),  # Convert text to TF-IDF features
        ('clf', classifier),  # Classifier (SVM, Naive Bayes, Logistic Regression, etc.)
    ])

# Define the classifiers you want to use
classifiers = {
    'SVM': SVC(kernel='linear'),
    'Naive Bayes': MultinomialNB(),
    'Logistic Regression': LogisticRegression(max_iter=1000)
}

# Train and evaluate each classifier
for name, classifier in classifiers.items():
    print(f"\nTraining and evaluating {name}...")
    pipeline = create_pipeline(classifier)
    pipeline.fit(X_train, y_train)  # Train the model
    y_pred = pipeline.predict(X_test)  # Predict on the test set
    
    # Print evaluation metrics
    print(f"Classification Report for {name}:\n")
    print(classification_report(y_test, y_pred))


Training and evaluating SVM...
