In [1]:
import pandas as pd
import numpy as np

In [2]:
news = pd.read_csv("News_Category_Dataset_v2_prep_full.csv")

## Сокращение датасета

In [3]:
for_remove = ["WOMEN", "HEALTHY LIVING", "IMPACT", "PARENTS", "PARENTING", "ENTERTAINMENT", "BLACK VOICES", "WELLNESS"]
news = news.drop(news[news["category"].isin(for_remove)].index)

In [4]:
len(news)

121651

In [5]:
counts = news["category"].value_counts() 
drop_counts = counts[counts < 2500].index

In [6]:
news = news.drop(news[news["category"].isin(drop_counts)].index)

In [7]:
len(news)

91428

In [8]:
counts = news["category"].value_counts()
drop_counts = counts[counts > 6000].index

In [9]:
for i in drop_counts:
    n = counts[i]-6000
    sample = news[news["category"] == i].sample(n)
    news = news.drop(sample.index)

In [10]:
len(news)

60780

In [11]:
news["category"].value_counts()

FOOD & DRINK      6000
POLITICS          6000
TRAVEL            6000
STYLE & BEAUTY    6000
QUEER VOICES      5515
BUSINESS          5073
COMEDY            4389
SPORTS            4205
HOME & LIVING     4188
THE WORLDPOST     3662
WEDDINGS          3651
DIVORCE           3423
CRIME             2674
Name: category, dtype: int64

In [12]:
news.head()

Unnamed: 0,category,headline_pr,short_description_pr,union_pr
0,CRIME,2 mass shooting texas last week 1 tv,left husband killed child another day america,2 mass shooting texas last week 1 tvleft husba...
13,POLITICS,trump crackdown immigrant parent put kid alrea...,last month health human service official revea...,trump crackdown immigrant parent put kid alrea...
30,POLITICS,jack johnson pardoned taboo sex still criminal...,new law fight sex trafficking target people os...,jack johnson pardoned taboo sex still criminal...
32,CRIME,rachel dolezal face felony charge welfare fraud,state prosecutor say almost 84 000 deposited b...,rachel dolezal face felony charge welfare frau...
37,COMEDY,trump new maga themed swimwear sink twitter,swimsuit make look racist,trump new maga themed swimwear sink twitterswi...


## Извлекаем признаки

### Tf-idf

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [14]:
vectorizer = TfidfVectorizer(max_df=0.7, min_df=5)

In [None]:
X = vectorizer.fit_transform(news["union_pr"])
X = X.toarray()

In [18]:
with open("Tfidfvocab", "w", encoding="utf-8") as f:
    f.write("\n".join(vectorizer.get_feature_names()))

In [19]:
with open("Tfidfvocab", "r", encoding="utf-8") as f:
    print(len(f.readlines()))

15540


In [20]:
len(vectorizer.get_feature_names())

15540

### Word2Vec

In [119]:
import gensim.downloader
from gensim.models import Word2Vec

In [120]:
news_category = Word2Vec.load("word2vec_News_Category_Dataset.model")

In [121]:
vocab = news_category.wv.index_to_key

In [122]:
def doc2vec(text):
    return np.mean([news_category.wv[x] for x in text.split() if x in vocab], axis=0).reshape(1, -1)

In [123]:
%%time
news["vectorized"] = news["union_pr"].apply(lambda row: doc2vec(row))

Wall time: 1min 52s


In [124]:
X = np.concatenate(news["vectorized"].values)

# Обучение модели

## LinearSVC

In [81]:
from sklearn import feature_extraction, model_selection, metrics

In [82]:
y = news['category']

In [83]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size = 0.33)

In [84]:
from sklearn.svm import LinearSVC
import warnings
warnings.filterwarnings("ignore")

In [85]:
%%time
clf = LinearSVC(C=0.2)
clf.fit(X_train, y_train)

Wall time: 3.5 s


LinearSVC(C=0.2)

In [86]:
y_pred = clf.predict(X_test)
y_pred

array(['POLITICS', 'DIVORCE', 'TRAVEL', ..., 'COMEDY', 'QUEER VOICES',
       'QUEER VOICES'], dtype=object)

In [87]:
print(metrics.classification_report(y_test, y_pred))

                precision    recall  f1-score   support

      BUSINESS       0.73      0.74      0.73      1737
        COMEDY       0.76      0.64      0.70      1448
         CRIME       0.77      0.76      0.76       892
       DIVORCE       0.84      0.78      0.81      1084
  FOOD & DRINK       0.83      0.87      0.85      2005
 HOME & LIVING       0.80      0.80      0.80      1420
      POLITICS       0.71      0.75      0.73      1889
  QUEER VOICES       0.85      0.79      0.82      1835
        SPORTS       0.81      0.84      0.83      1374
STYLE & BEAUTY       0.83      0.86      0.85      1970
 THE WORLDPOST       0.81      0.77      0.79      1217
        TRAVEL       0.76      0.82      0.78      1996
      WEDDINGS       0.83      0.82      0.83      1191

      accuracy                           0.79     20058
     macro avg       0.79      0.79      0.79     20058
  weighted avg       0.79      0.79      0.79     20058



In [90]:
clf.coef_.shape

(13, 15519)

### Наивный Байес

In [99]:
y = news['category']

In [100]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size = 0.33)

In [101]:
from sklearn import naive_bayes

In [102]:
bayes = naive_bayes.MultinomialNB(alpha=0.5)

In [103]:
bayes.fit(X_train, y_train)

MultinomialNB(alpha=0.5)

In [107]:
y_pred = bayes.predict(X_test)
y_pred

array(['STYLE & BEAUTY', 'POLITICS', 'POLITICS', ..., 'POLITICS',
       'FOOD & DRINK', 'QUEER VOICES'], dtype='<U14')

In [110]:
y_pred_proba = bayes.predict_proba(X_test)
y_pred_proba

array([[0.00970324, 0.00458225, 0.00121561, ..., 0.00338306, 0.06796785,
        0.01068257],
       [0.06427525, 0.01797294, 0.05275862, ..., 0.037263  , 0.00851625,
        0.00691566],
       [0.04238632, 0.02829675, 0.01522726, ..., 0.01284707, 0.04519354,
        0.13832167],
       ...,
       [0.01053545, 0.10141462, 0.00295752, ..., 0.00696941, 0.00424752,
        0.0010759 ],
       [0.00371848, 0.00234435, 0.00102478, ..., 0.00046514, 0.01049949,
        0.00384957],
       [0.00934959, 0.00853844, 0.02719363, ..., 0.01393487, 0.00755855,
        0.00294349]])

In [109]:
print(metrics.classification_report(y_test, y_pred))

                precision    recall  f1-score   support

      BUSINESS       0.72      0.76      0.74      1707
        COMEDY       0.79      0.58      0.67      1472
         CRIME       0.81      0.69      0.74       854
       DIVORCE       0.81      0.77      0.79      1109
  FOOD & DRINK       0.80      0.86      0.83      2047
 HOME & LIVING       0.85      0.69      0.76      1362
      POLITICS       0.65      0.78      0.71      1960
  QUEER VOICES       0.70      0.81      0.75      1806
        SPORTS       0.88      0.78      0.83      1408
STYLE & BEAUTY       0.81      0.85      0.83      2002
 THE WORLDPOST       0.83      0.69      0.76      1170
        TRAVEL       0.68      0.83      0.75      1972
      WEDDINGS       0.83      0.65      0.73      1189

      accuracy                           0.76     20058
     macro avg       0.78      0.75      0.76     20058
  weighted avg       0.77      0.76      0.76     20058



In [26]:
import joblib

In [91]:
joblib.dump(clf, "LinearSVCmodel_79%")

['LinearSVCmodel_79%']

In [642]:
joblib.dump(clf, "NaiveBayesmodel_76%")

['NaiveBayesmodel_76%']