# Классификация новостей 
### Иван Людвиг Терешко, Б03-901

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_val_score
import re
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier

In [2]:
df = pd.read_csv('news.csv')

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,news,type
0,0,UK economy facing 'major risks'\n \n The UK ma...,business
1,1,Aids and climate top Davos agenda\n \n Climate...,business
2,2,Asian quake hits European shares\n \n Shares i...,business
3,3,India power shares jump on debut\n \n Shares i...,business
4,4,Lacroix label bought by US firm\n \n Luxury go...,business


In [4]:
categories = df['type'].unique()
categories

array(['business', 'entertainment', 'politics', 'sport', 'tech'],
      dtype=object)

У нас 5 категорий, пронумеруем их

In [5]:
df['type_id'] = LabelEncoder().fit_transform(df['type'].values)

In [6]:
df.head()

Unnamed: 0.1,Unnamed: 0,news,type,type_id
0,0,UK economy facing 'major risks'\n \n The UK ma...,business,0
1,1,Aids and climate top Davos agenda\n \n Climate...,business,0
2,2,Asian quake hits European shares\n \n Shares i...,business,0
3,3,India power shares jump on debut\n \n Shares i...,business,0
4,4,Lacroix label bought by US firm\n \n Luxury go...,business,0


In [7]:
count_vect = CountVectorizer(stop_words='english')
matrix_count = count_vect.fit_transform(df['news'])
words = [x[0] for x in sorted(count_vect.vocabulary_.items(), key=lambda x: x[1])]

y = df['type_id']

x_train, x_test, y_train, y_test = train_test_split(matrix_count, y, test_size=0.3)

Обучим модель с помощью MultinomialNB (наивный байесовский классификатор)

In [8]:
model = MultinomialNB()
model.fit(x_train, y_train)

MultinomialNB()

In [9]:
model.score(x_test, y_test)

0.9775449101796407

Посмотрим на самые важные слова в каждой категории

In [10]:
coefs = model.coef_
for i in range(0, len(categories)):
    print ('\n', f'TOP 3 for {categories[i]}')

    for idx in np.argsort(model.coef_)[i][::-1][:3]: # последние 3 с максимальным коэфициентом
        print ('{} = {:.2f}'.format(words[idx], model.coef_[i][idx]))

    print ('\n', f'BOTTOM 3 for {categories[i]}')
    for idx in np.argsort(model.coef_)[i][:3]: # последние 3 c минимальным коэфициентом
        print ('{} = {:.2f}'.format(words[idx], model.coef_[i][idx]))


 TOP 3 for business
said = -4.41
year = -5.29
mr = -5.41

 BOTTOM 3 for business
00 = -11.49
mira = -11.49
minx = -11.49

 TOP 3 for entertainment
said = -4.90
film = -4.92
best = -5.29

 BOTTOM 3 for entertainment
jd = -11.25
indicator = -11.25
shirakawa = -11.25

 TOP 3 for politics
said = -4.14
mr = -4.39
labour = -5.16

 BOTTOM 3 for politics
jd = -11.49
moguls = -11.49
mogul = -11.49

 TOP 3 for sport
said = -4.95
year = -5.61
england = -5.63

 BOTTOM 3 for sport
jd = -11.41
moustache = -11.41
mouskouri = -11.41

 TOP 3 for tech
said = -4.58
people = -5.03
mr = -5.63

 BOTTOM 3 for tech
jd = -11.54
metatarsal = -11.54
metallica = -11.54


Функция, которая принимает на вход строку и предсказывает, в какой она категории

In [11]:
def predict(title):
    cat = model.predict(count_vect.transform([title]))
    return categories[cat[0]]

In [12]:
predict('stocks are rising')

'business'

In [13]:
predict('the best basketball team')

'sport'

In [14]:
predict('british prime minister')

'politics'

Как видим, работает отлично.   
Теперь Попробуем обучить модель с помощью логистической регрессии

In [15]:
clf = OneVsRestClassifier(LogisticRegression())
clf.fit(x_train, y_train)

OneVsRestClassifier(estimator=LogisticRegression())

In [16]:
clf.score(x_test, y_test)

0.9655688622754491

Получаем результат хуже.

Попробуем обучить те же модели, используя TfidfVectorizer

In [17]:
tfidf_vect = TfidfVectorizer(stop_words='english')
matrix_count = tfidf_vect.fit_transform(df['news'])
x_train, x_test, y_train, y_test = train_test_split(matrix_count, y, test_size=0.3)

In [18]:
model = MultinomialNB()
model.fit(x_train, y_train)
model.score(x_test, y_test)

0.9625748502994012

In [19]:
clf = OneVsRestClassifier(LogisticRegression())
clf.fit(x_train, y_train)
clf.score(x_test, y_test)

0.9655688622754491

Получаем результат хуже для первой модели. Для логистической регрессии без изменений.