In [1]:
import pyforest

In [31]:
df = pd.read_csv("./datasets/cat_train.csv")
df = df.drop("Unnamed: 0", axis=1)
df.dropna(inplace=True)
df

<IPython.core.display.Javascript object>

Unnamed: 0,Text,Category
0,An Australian police officer has been charged ...,australia
1,Three people have been killed in an auto shop ...,us
2,The trial in the assault and harassment case a...,entertainment
3,Soccer is set to trial sin bins at the higher ...,sport
4,"Bob Iger, neatly dressed in a gray suit and pr...",media
...,...,...
7383,Kevin McCarthy has for now lost the House spea...,politics
7384,"Michael Duane Zack III, who was convicted of t...",us
7385,Seven Starbucks locations across San Francisco...,business
7386,"At least 21 people were killed, including two ...",europe


In [32]:
data = pd.read_csv("./datasets/all_articles.csv")
data.drop("Unnamed: 0", axis=1, inplace=True)

<IPython.core.display.Javascript object>

In [33]:
data.dropna(inplace=True)
data = data[data["cat_label"]!= "Breaking News"]
data["cat_label"].value_counts()

Politics                         455
Entertainment and Lifestyle      156
International News                90
Sports                            78
Health | Science | Technology     24
Business and Economy              20
Law and Justice                   20
Climate and Environment           12
Name: cat_label, dtype: int64

In [34]:
df["Category"] = df["Category"].apply(lambda x: x.lower())

In [35]:
def categorize_categories(x):
    category_mapping = {
        'International News': ['world',"ukraine", 'australia','india', 'china', 'americas', 'middleeast','international','Israel Hamas War', 'africa', 'asia', 'europe'],
        'Politics': ['nation' , 'new york', 'Congress','us', 'politics'],
        'Business and Economy': ['investing', 'business', 'markets', 'money'],
        'Entertainment and Lifestyle': ["lifestyle","travel", 'entertainment', 'cars', 'culture', 'food', 'style', 'tech','advice', 'success', 'books', 'cruise ship', 'wellness', 'family', 'life expectancy'],
        'Climate and Environment': ['climate','energy' 'climate-environment', 'climate-solutions'],
        'Health | Science | Technology': ['health', 'science', 'technology', 'artificial intelligence'],
        'Sports': ['sport', 'sports'],
        'Law and Justice': ['national-security', 'criminajustice', "retail theft", "financial crimes", "crime",  ],
        'del' : ["weather" ] 
    }

    categorized_result = {category: [] for category in category_mapping.keys()}
    matched_category = None
    for main_category, sub_categories in category_mapping.items():
        if any(sub_category in x.lower() for sub_category in sub_categories):
            matched_category = main_category
            break
    
    return matched_category

df["cat_label"] = df["Category"].apply(categorize_categories)

In [36]:
df = df[df["cat_label"]!="del"]
df = df[df["cat_label"]!="politics"]
df.dropna(inplace=True)
data = data[["Text", "cat_label"]]
df = df[["Text", "cat_label"]]

In [37]:
df = pd.concat([data, df])

<IPython.core.display.Javascript object>

In [38]:
df.drop_duplicates(keep="first", inplace=True)
df

Unnamed: 0,Text,cat_label
0,Former President Donald Trump and Florida Gov....,Politics
1,Ashley Renne posted a more than minute-long vi...,Politics
2,Beyond Columbia University’s heavy iron gates ...,Politics
3,The recent debate over Beyoncé’s appearance at...,Politics
6,North Korea has warned any potential interfere...,International News
...,...,...
7383,Kevin McCarthy has for now lost the House spea...,Politics
7384,"Michael Duane Zack III, who was convicted of t...",Politics
7385,Seven Starbucks locations across San Francisco...,Politics
7386,"At least 21 people were killed, including two ...",International News


In [39]:
df["cat_label"].value_counts()

Politics                         1857
Entertainment and Lifestyle       837
International News                396
Sports                            371
Health | Science | Technology     118
Business and Economy               43
Climate and Environment            22
Law and Justice                    21
Name: cat_label, dtype: int64

## Tokenize etc

In [40]:
import nltk #Natural Language tool kit -- this pacakge is quite a mess. Was poorly design and the documentation is not great
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
lemmatizer = WordNetLemmatizer()
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

In [41]:
def tokenizer_and_remove_punctuation(row):
    tokens = word_tokenize(row['Text'])
    return [word.lower() for word in tokens if word.isalpha()]

df['tokenized'] = df.apply(tokenizer_and_remove_punctuation,axis=1)
df.head()

Unnamed: 0,Text,cat_label,tokenized
0,Former President Donald Trump and Florida Gov....,Politics,"[former, president, donald, trump, and, florid..."
1,Ashley Renne posted a more than minute-long vi...,Politics,"[ashley, renne, posted, a, more, than, video, ..."
2,Beyond Columbia University’s heavy iron gates ...,Politics,"[beyond, columbia, university, s, heavy, iron,..."
3,The recent debate over Beyoncé’s appearance at...,Politics,"[the, recent, debate, over, beyoncé, s, appear..."
6,North Korea has warned any potential interfere...,International News,"[north, korea, has, warned, any, potential, in..."


In [42]:
def get_wordnet_pos(word):
    tag = nltk.pos_tag([word], lang='eng')[0][1][0].upper() # gets first letter of POS categorization
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

In [44]:
def lemmatizer_with_pos(row):
      return [lemmatizer.lemmatize(word,get_wordnet_pos(word)) for word in row['tokenized']]

df['lemmatized'] = df.apply(lemmatizer_with_pos,axis=1)
df.head()

Unnamed: 0,Text,cat_label,tokenized,lemmatized
0,Former President Donald Trump and Florida Gov....,Politics,"[former, president, donald, trump, and, florid...","[former, president, donald, trump, and, florid..."
1,Ashley Renne posted a more than minute-long vi...,Politics,"[ashley, renne, posted, a, more, than, video, ...","[ashley, renne, post, a, more, than, video, to..."
2,Beyond Columbia University’s heavy iron gates ...,Politics,"[beyond, columbia, university, s, heavy, iron,...","[beyond, columbia, university, s, heavy, iron,..."
3,The recent debate over Beyoncé’s appearance at...,Politics,"[the, recent, debate, over, beyoncé, s, appear...","[the, recent, debate, over, beyoncé, s, appear..."
6,North Korea has warned any potential interfere...,International News,"[north, korea, has, warned, any, potential, in...","[north, korea, have, warn, any, potential, int..."


In [45]:
# remove stopwords

def remove_sw(row):
      return list(set(row['lemmatized']).difference(stopwords.words()))

df['no_stopwords'] = df.apply(remove_sw,axis=1)
df.head()

Unnamed: 0,Text,cat_label,tokenized,lemmatized,no_stopwords
0,Former President Donald Trump and Florida Gov....,Politics,"[former, president, donald, trump, and, florid...","[former, president, donald, trump, and, florid...","[event, state, recent, south, immigration, pop..."
1,Ashley Renne posted a more than minute-long vi...,Politics,"[ashley, renne, posted, a, more, than, video, ...","[ashley, renne, post, a, more, than, video, to...","[access, dozen, effective, instagram, event, c..."
2,Beyond Columbia University’s heavy iron gates ...,Politics,"[beyond, columbia, university, s, heavy, iron,...","[beyond, columbia, university, s, heavy, iron,...","[problem, harassment, channel, applicable, ins..."
3,The recent debate over Beyoncé’s appearance at...,Politics,"[the, recent, debate, over, beyoncé, s, appear...","[the, recent, debate, over, beyoncé, s, appear...","[internalize, instagram, mom, event, recent, w..."
6,North Korea has warned any potential interfere...,International News,"[north, korea, has, warned, any, potential, in...","[north, korea, have, warn, any, potential, int...","[country, state, south, defense, destroy, veri..."


In [46]:
def re_blob(row):
      return " ".join(row['no_stopwords'])

df['clean_blob'] = df.apply(re_blob,axis=1)
df.head()

Unnamed: 0,Text,cat_label,tokenized,lemmatized,no_stopwords,clean_blob
0,Former President Donald Trump and Florida Gov....,Politics,"[former, president, donald, trump, and, florid...","[former, president, donald, trump, and, florid...","[event, state, recent, south, immigration, pop...",event state recent south immigration popular g...
1,Ashley Renne posted a more than minute-long vi...,Politics,"[ashley, renne, posted, a, more, than, video, ...","[ashley, renne, post, a, more, than, video, to...","[access, dozen, effective, instagram, event, c...",access dozen effective instagram event country...
2,Beyond Columbia University’s heavy iron gates ...,Politics,"[beyond, columbia, university, s, heavy, iron,...","[beyond, columbia, university, s, heavy, iron,...","[problem, harassment, channel, applicable, ins...",problem harassment channel applicable instagra...
3,The recent debate over Beyoncé’s appearance at...,Politics,"[the, recent, debate, over, beyoncé, s, appear...","[the, recent, debate, over, beyoncé, s, appear...","[internalize, instagram, mom, event, recent, w...",internalize instagram mom event recent work pa...
6,North Korea has warned any potential interfere...,International News,"[north, korea, has, warned, any, potential, in...","[north, korea, have, warn, any, potential, int...","[country, state, south, defense, destroy, veri...",country state south defense destroy verify dem...


In [47]:
bow_vect = CountVectorizer(max_features=7500)
# fit creates one entry for each different word seen
X = bow_vect.fit_transform(df['clean_blob']).toarray()

In [126]:
import pickle
with open('bow_vect.pkl', 'wb') as file:
     pickle.dump(bow_vect, file)

In [48]:
y = df["cat_label"]

In [107]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

<IPython.core.display.Javascript object>

In [108]:
y_train.value_counts()

Politics                         1480
Entertainment and Lifestyle       668
International News                322
Sports                            303
Health | Science | Technology      97
Business and Economy               30
Climate and Environment            18
Law and Justice                    14
Name: cat_label, dtype: int64

In [109]:
X_train = pd.DataFrame(X_train)
y_train = y_train.reset_index(drop=True)

<IPython.core.display.Javascript object>

In [110]:
from sklearn.utils import resample
X_train.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)
train = pd.concat([X_train, y_train], axis=1)
politics = train[train["cat_label"]=="Politics"]
not_politics = train[train["cat_label"]!="Politics"]

politics_under = resample(politics,
                   replace=False,
                   n_samples=800,
                    random_state=0)

train_under = pd.concat([politics_under, not_politics])
X_train_under = train_under.drop("cat_label", axis=1)
y_train_under = train_under["cat_label"]

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [111]:
from sklearn.utils import resample
X_train.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)
train = pd.concat([X_train, y_train], axis=1)
health = train[train["cat_label"]=="Health | Science | Technology"]
not_health = train[train["cat_label"]!="Health | Science | Technology"]

health_over = resample(health,
                   replace=True,
                   n_samples=200,
                    random_state=0)

train_under = pd.concat([health_over, not_health])
X_train = train_under.drop("cat_label", axis=1)
y_train = train_under["cat_label"]

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [112]:
from sklearn.utils import resample
X_train.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)
train = pd.concat([X_train, y_train], axis=1)
buisness = train[train["cat_label"]=="Business and Economy"]
not_buisness = train[train["cat_label"]!="Business and Economy"]

buisness_over = resample(buisness,
                   replace=True,
                   n_samples=220,
                    random_state=0)

train_under = pd.concat([buisness_over, not_buisness])
X_train = train_under.drop("cat_label", axis=1)
y_train = train_under["cat_label"]

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [113]:
from sklearn.utils import resample
y_train.reset_index(drop=True, inplace=True)
X_train = X_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
train = pd.concat([X_train, y_train], axis=1)
clim = train[train["cat_label"]=="Climate and Environment"]
not_clim = train[train["cat_label"]!="Climate and Environment"]

clim_over = resample(clim,
                   replace=True,
                   n_samples=100,
                    random_state=0)

train_under = pd.concat([clim_over, not_clim])
X_train = train_under.drop("cat_label", axis=1)
y_train = train_under["cat_label"]

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [114]:
from sklearn.utils import resample
y_train.reset_index(drop=True, inplace=True)
X_train = X_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
train = pd.concat([X_train, y_train], axis=1)
law = train[train["cat_label"]=="Law and Justice"]
not_law = train[train["cat_label"]!="Law and Justice"]

law_over = resample(law,
                   replace=True,
                   n_samples=70,
                    random_state=0)

train_under = pd.concat([law_over, not_law])
X_train = train_under.drop("cat_label", axis=1)
y_train = train_under["cat_label"]

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [115]:
X_train.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)

In [28]:
best_model.fit(X_train, y_train)
y_pred= best_model.predict(X_test)
# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
#print("Classification Report:\n", classification_report(y_test, y_pred))

NameError: name 'best_model' is not defined

In [124]:
log = LogisticRegression()

log.fit(X_train, y_train)
y_pred = log.predict(X_test)
# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

<IPython.core.display.Javascript object>

Accuracy: 0.8308321964529332
Classification Report:
                                precision    recall  f1-score   support

         Business and Economy       0.43      0.23      0.30        13
      Climate and Environment       0.67      0.50      0.57         4
  Entertainment and Lifestyle       0.83      0.79      0.81       169
Health | Science | Technology       0.68      0.71      0.70        21
           International News       0.78      0.73      0.76        74
              Law and Justice       1.00      0.14      0.25         7
                     Politics       0.83      0.90      0.86       377
                       Sports       0.97      0.90      0.93        68

                     accuracy                           0.83       733
                    macro avg       0.77      0.61      0.65       733
                 weighted avg       0.83      0.83      0.83       733



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [125]:
import pickle
with open('cat_model.pkl', 'wb') as file:
    pickle.dump(log, file)

In [121]:
from sklearn.svm import SVC

svc = SVC()

svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)
# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
#print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.8240109140518418


In [120]:
from sklearn.ensemble import RandomForestClassifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=0)
rf_classifier.fit(X_train, y_train)
y_pred = rf_classifier.predict(X_test)
# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.7776261937244202
Classification Report:
                                precision    recall  f1-score   support

         Business and Economy       1.00      0.08      0.14        13
      Climate and Environment       0.67      0.50      0.57         4
  Entertainment and Lifestyle       0.86      0.66      0.75       169
Health | Science | Technology       0.88      0.33      0.48        21
           International News       0.86      0.32      0.47        74
              Law and Justice       0.00      0.00      0.00         7
                     Politics       0.72      0.96      0.83       377
                       Sports       0.98      0.90      0.94        68

                     accuracy                           0.78       733
                    macro avg       0.75      0.47      0.52       733
                 weighted avg       0.80      0.78      0.75       733



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [122]:
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100],
              'penalty': ['l1', 'l2'],
              'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag']}

# Initialize Logistic Regression model
logreg = LogisticRegression()

# Initialize GridSearchCV
grid_search = GridSearchCV(logreg, param_grid, cv=5, scoring='accuracy')

# Fit the model
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

# Make predictions on the test set using the best model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

<IPython.core.display.Javascript object>

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Best Parameters: {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}
Accuracy: 0.83


In [123]:
param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': ['scale', 'auto'],
}

# Create an SVC model
svc = SVC()

# Create a GridSearchCV object
grid_search = GridSearchCV(svc, param_grid, cv=3, scoring='accuracy')

# Fit the model to the data
grid_search.fit(X_train, y_train)

# Get the best parameters and model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Make predictions on the test set
y_pred = best_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)

# Print results
print("Best Parameters:", best_params)
print("Accuracy on Test Set:", accuracy)

Best Parameters: {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}
Accuracy on Test Set: 0.8308321964529332


In [119]:
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.7694406548431105