In [1]:
import pickle
from json import load
from typing import Tuple, Union

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics, preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score


# Locations of the tab-separated data splits and of the pickled model.
train_path = '../Data/Output/Chapter6/ex50-train.txt'
val_path = '../Data/Output/Chapter6/ex50-valid.txt'
test_path = '../Data/Output/Chapter6/ex50-test.txt'
model_path = '../Data/Output/Chapter6/model_logisticregression.pickle'

# Feature-file paths mirror the split paths, swapping '.txt' for
# '_features.npy'.
train_fe_path, valid_fe_path, test_fe_path = (
    split_path.replace('.txt', '_features.npy')
    for split_path in (train_path, val_path, test_path)
)

# Every split is read the same way: tab-separated, first row as header,
# and the 'id' column used as the index.
_read_options = {'sep': '\t', 'header': 0, 'index_col': 'id'}
df_train = pd.read_csv(train_path, **_read_options)
df_valid = pd.read_csv(val_path, **_read_options)
df_test = pd.read_csv(test_path, **_read_options)

# Single module-level vectorizer, used by extract_features below.
vectorizer = TfidfVectorizer()

# def get_feature_datas(train_fe_path, val_fe_path, test_fe_path):
#     x_train = np.load(train_fe_path, allow_pickle=True)
#     x_valid = np.load(val_fe_path, allow_pickle=True)
#     x_test = np.load(test_fe_path, allow_pickle=True)
#     return x_train, x_valid, x_test


def preprocess_label(label_path: str) -> Tuple[np.ndarray, preprocessing.LabelEncoder]:
    """Read a tab-separated file and encode its ``category`` column.

    Args:
        label_path: Path to a tab-separated file that has a ``category``
            column.

    Returns:
        A pair ``(encoded, encoder)``: the labels encoded by a
        ``LabelEncoder`` fitted on that column, and the fitted encoder
        itself (its ``classes_`` maps codes back to the original labels).

    Raises:
        KeyError: If the file has no ``category`` column.
    """
    df = pd.read_csv(label_path, sep='\t')
    le = preprocessing.LabelEncoder()
    # fit_transform learns the classes and encodes the column in one pass.
    encoded = le.fit_transform(df['category'])
    return encoded, le


def extract_features(fe_filepath, df: pd.DataFrame, is_train: bool):
    """Vectorise the ``title`` column of *df* with the module-level vectorizer.

    Args:
        fe_filepath: Not used by this function; kept so existing callers
            keep working.
        df: Frame with a ``title`` column.
        is_train: When True the vectorizer is fitted on *df* before
            transforming it; otherwise the vectorizer is only applied, so it
            must already have been fitted by an earlier training call.

    Returns:
        A pair ``(x, x_display)``: the features as a dense array and the
        same features as the sparse matrix returned by the vectorizer.
    """
    titles = df['title'].tolist()
    # Vectorise once and derive the dense copy from the sparse result,
    # instead of fitting/transforming the same titles twice.
    if is_train:
        x_display = vectorizer.fit_transform(titles)
    else:
        x_display = vectorizer.transform(titles)
    x = x_display.toarray()
    return x, x_display


def get_labels() -> Tuple[pd.Series, pd.Series, pd.Series]:
    """Return the ``category`` columns of the train, validation and test frames.

    The columns are taken from the module-level ``df_train``, ``df_valid``
    and ``df_test`` frames exactly as stored there; no label encoding is
    applied.

    Returns:
        A tuple ``(y_train, y_valid, y_test)``.
    """
    return df_train["category"], df_valid["category"], df_test["category"]


# Each result is the (dense, sparse) pair returned by extract_features.
# The training split must go first: that call is the one that fits the
# shared vectorizer which the validation/test calls then reuse.
feature_x_train = extract_features(train_fe_path, df_train, True)
feature_x_valid = extract_features(valid_fe_path, df_valid, False)
feature_x_test = extract_features(test_fe_path, df_test, False)

y_train, y_valid, y_test = get_labels()

# NOTE(security): unpickling can execute arbitrary code; only load model
# files that come from a trusted source.
# The context manager closes the file handle once the model is loaded.
with open(model_path, 'rb') as model_file:
    loaded_model = pickle.load(model_file)
y_encoded, le = preprocess_label(train_path)
# Vocabulary learned by the vectorizer during the training call above.
feature_names = vectorizer.get_feature_names_out()

# Dump the raw coefficient vector of every class together with its length.
for i, coef in enumerate(loaded_model.coef_):
    print(i, coef)
    print(len(coef))

print("\nThe number of distinct words that the model specifies: ", len(feature_names))
print("\nClasses or categories: ", loaded_model.classes_, "\n")
# Human-readable names, listed in the same order as the class labels
# printed above.
classes_fullNames = ["business", "entertainment",
                     "health", "science and technology"]
for i, coef in enumerate(loaded_model.coef_):
    # Row i of coef_ belongs to loaded_model.classes_[i], so the label is
    # read from the model itself rather than from a separately fitted encoder.
    class_label = loaded_model.classes_[i]
    # argsort returns the indices that would sort coef in ascending order;
    # it is computed once and sliced for both ends of the ranking.
    sorted_indices = coef.argsort()

    # NOTE: the printed numbers are the model's coefficients for each word;
    # the "tf-idf value" wording is kept unchanged so the output format
    # stays the same.
    print('\n===TOP10 weights for class {} ({})==='.format(
        class_label, classes_fullNames[i]
    ))
    # Last ten entries, reversed so the largest weight is printed first.
    top10_indices = sorted_indices[-10:][::-1]
    for idx in top10_indices:
        print(feature_names[idx], "\t tf-idf value: ", coef[idx])

    print('\n===LEAST10 weights for class {} ({})==='.format(
        class_label, classes_fullNames[i]
    ))
    # First ten entries: the smallest weights, most negative first.
    top10_least_indices = sorted_indices[:10]
    for idx in top10_least_indices:
        print(feature_names[idx], "\t tf-idf value: ", coef[idx])

0 [ 0.08539636 -0.0046265   0.08376712 ... -0.07522018 -0.09633064
 -0.01304822]
12858
1 [-0.05872894  0.00946988 -0.05406362 ...  0.1214726   0.12962142
  0.03232156]
12858
2 [-0.01155461 -0.00157579 -0.01246176 ... -0.01603853 -0.01334699
 -0.00721832]
12858
3 [-0.01511281 -0.00326759 -0.01724174 ... -0.03021389 -0.01994379
 -0.01205503]
12858

The number of distinct words that the model specifies:  12858

Classes or categories:  ['b' 'e' 'm' 't'] 


===TOP10 weights for class b (business)===
china 	 tf-idf value:  3.5052499766746004
fed 	 tf-idf value:  3.488303654816818
stocks 	 tf-idf value:  3.253319060231942
bank 	 tf-idf value:  3.213440039369961
ecb 	 tf-idf value:  3.1033591235430773
euro 	 tf-idf value:  2.885142771664616
oil 	 tf-idf value:  2.8323173049250183
update 	 tf-idf value:  2.6262492570052505
ukraine 	 tf-idf value:  2.6227784764210447
yellen 	 tf-idf value:  2.422397553969636

===LEAST10 weights for class b (business)===
and 	 tf-idf value:  -2.280453026627918
th