In [1]:
import pandas as pd 
from collections import Counter
from sklearn import feature_extraction, model_selection, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [2]:
news = pd.read_json("News_Category_Dataset_v2.json", lines=True)

In [3]:
news.head(10)

Unnamed: 0,category,headline,authors,link,short_description,date
0,CRIME,There Were 2 Mass Shootings In Texas Last Week...,Melissa Jeltsen,https://www.huffingtonpost.com/entry/texas-ama...,She left her husband. He killed their children...,2018-05-26
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...,Andy McDonald,https://www.huffingtonpost.com/entry/will-smit...,Of course it has a song.,2018-05-26
2,ENTERTAINMENT,Hugh Grant Marries For The First Time At Age 57,Ron Dicker,https://www.huffingtonpost.com/entry/hugh-gran...,The actor and his longtime girlfriend Anna Ebe...,2018-05-26
3,ENTERTAINMENT,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,Ron Dicker,https://www.huffingtonpost.com/entry/jim-carre...,The actor gives Dems an ass-kicking for not fi...,2018-05-26
4,ENTERTAINMENT,Julianna Margulies Uses Donald Trump Poop Bags...,Ron Dicker,https://www.huffingtonpost.com/entry/julianna-...,"The ""Dietland"" actress said using the bags is ...",2018-05-26
5,ENTERTAINMENT,Morgan Freeman 'Devastated' That Sexual Harass...,Ron Dicker,https://www.huffingtonpost.com/entry/morgan-fr...,"""It is not right to equate horrific incidents ...",2018-05-26
6,ENTERTAINMENT,Donald Trump Is Lovin' New McDonald's Jingle I...,Ron Dicker,https://www.huffingtonpost.com/entry/donald-tr...,"It's catchy, all right.",2018-05-26
7,ENTERTAINMENT,What To Watch On Amazon Prime That’s New This ...,Todd Van Luling,https://www.huffingtonpost.com/entry/amazon-pr...,There's a great mini-series joining this week.,2018-05-26
8,ENTERTAINMENT,Mike Myers Reveals He'd 'Like To' Do A Fourth ...,Andy McDonald,https://www.huffingtonpost.com/entry/mike-myer...,"Myer's kids may be pushing for a new ""Powers"" ...",2018-05-26
9,ENTERTAINMENT,What To Watch On Hulu That’s New This Week,Todd Van Luling,https://www.huffingtonpost.com/entry/hulu-what...,You're getting a recent Academy Award-winning ...,2018-05-26


In [4]:
news.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200853 entries, 0 to 200852
Data columns (total 6 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   category           200853 non-null  object        
 1   headline           200853 non-null  object        
 2   authors            200853 non-null  object        
 3   link               200853 non-null  object        
 4   short_description  200853 non-null  object        
 5   date               200853 non-null  datetime64[ns]
dtypes: datetime64[ns](1), object(5)
memory usage: 9.2+ MB


In [5]:
news = news.drop(["authors", "link", "date"], axis=1)

## Удаление строк

- Если заголовок и описание - пустые
- Дубликаты заголовка

Удалим записи, в которых пустой либо заголовк, либо описание

In [6]:
news = news.drop(news[(news["headline"] == "") | (news["short_description"] == "")].index, 
                 axis=0)

In [7]:
news["headline"].value_counts()

Sunday Roundup                                                                  90
The 20 Funniest Tweets From Women This Week                                     75
Weekly Roundup of eBay Vintage Clothing Finds (PHOTOS)                          59
Weekly Roundup of eBay Vintage Home Finds (PHOTOS)                              54
Watch The Top 9 YouTube Videos Of The Week                                      46
                                                                                ..
The Best Buffalo Wings in America                                                1
Here's Why A Higher SPF Isn't Always Better                                      1
When Love is Too Much But Not Enough                                             1
GMO Food Labeling Bill Passes In The Senate                                      1
Ex-Cons Pick Up Inmates On 1st Day Out To Help Them Transition Into New Life     1
Name: headline, Length: 179765, dtype: int64

Удалили все новости с заголовком Sunday Roundup, кроме первой 

In [8]:
locs_for_delete = list(news[news["headline"] == "Sunday Roundup"].index[1:])

In [9]:
news = news.drop(locs_for_delete, 
                 axis=0)

In [10]:
news

Unnamed: 0,category,headline,short_description
0,CRIME,There Were 2 Mass Shootings In Texas Last Week...,She left her husband. He killed their children...
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...,Of course it has a song.
2,ENTERTAINMENT,Hugh Grant Marries For The First Time At Age 57,The actor and his longtime girlfriend Anna Ebe...
3,ENTERTAINMENT,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,The actor gives Dems an ass-kicking for not fi...
4,ENTERTAINMENT,Julianna Margulies Uses Donald Trump Poop Bags...,"The ""Dietland"" actress said using the bags is ..."
...,...,...,...
200848,TECH,RIM CEO Thorsten Heins' 'Significant' Plans Fo...,Verizon Wireless and AT&T are already promotin...
200849,SPORTS,Maria Sharapova Stunned By Victoria Azarenka I...,"Afterward, Azarenka, more effusive with the pr..."
200850,SPORTS,"Giants Over Patriots, Jets Over Colts Among M...","Leading up to Super Bowl XLVI, the most talked..."
200851,SPORTS,Aldon Smith Arrested: 49ers Linebacker Busted ...,CORRECTION: An earlier version of this story i...


In [11]:
news[(news["headline"] == "The 20 Funniest Tweets From Women This Week")]

Unnamed: 0,category,headline,short_description
68,WOMEN,The 20 Funniest Tweets From Women This Week,"""Welcome to adulthood. You have a favorite spa..."
429,WOMEN,The 20 Funniest Tweets From Women This Week,"""Coffee mugs do a lot of heavy lifting when it..."
786,WOMEN,The 20 Funniest Tweets From Women This Week,"""On a scale of Kanye to Donald Glover, how are..."
1171,WOMEN,The 20 Funniest Tweets From Women This Week,"""I'm already stressed out about summer being o..."
1942,WOMEN,The 20 Funniest Tweets From Women This Week,"""Running shoes? No, I don't run. These are my ..."
...,...,...,...
74185,WOMEN,The 20 Funniest Tweets From Women This Week,The ladies of Twitter never fail to brighten o...
76082,WOMEN,The 20 Funniest Tweets From Women This Week,The ladies of Twitter never fail to brighten o...
81129,WOMEN,The 20 Funniest Tweets From Women This Week,The ladies of Twitter never fail to brighten o...
81724,WOMEN,The 20 Funniest Tweets From Women This Week,The ladies of Twitter never fail to brighten o...


In [12]:
news["short_description"].value_counts()

Welcome to the HuffPost Rise Morning Newsbrief, a short wrap-up of the news to help you start your day.                              192
The stress and strain of constantly being connected can sometimes take your life -- and your well-being -- off course. GPS           125
Want more? Be sure to check out HuffPost Style on Twitter, Facebook, Tumblr, Pinterest and Instagram at @HuffPostStyle. -- Do         91
Do you have a home story idea or tip? Email us at homesubmissions@huffingtonpost.com. (PR pitches sent to this address will           75
We all need help maintaining our personal spiritual practice. We hope that these Daily Meditations, prayers and mindful awareness     71
                                                                                                                                    ... 
Turns out fairytales ARE real after all. By day, Gorge de Coaticook is one of Quebec's most precious nature parks with the             1
"Eyebrows are deeply important to me, and

In [13]:
news[news["short_description"] == "Welcome to the HuffPost Rise Morning Newsbrief, a short wrap-up of the news to help you start your day."]

Unnamed: 0,category,headline,short_description
51112,POLITICS,HuffPost Rise: What You Need To Know On August 9,Welcome to the HuffPost Rise Morning Newsbrief...
51228,POLITICS,HuffPost Rise: What You Need To Know On August 8,Welcome to the HuffPost Rise Morning Newsbrief...
51435,POLITICS,HuffPost Rise: What You Need To Know On August 5,Welcome to the HuffPost Rise Morning Newsbrief...
51569,POLITICS,HuffPost Rise: What You Need To Know On August 4,Welcome to the HuffPost Rise Morning Newsbrief...
51662,POLITICS,HuffPost Rise: What You Need To Know On August 3,Welcome to the HuffPost Rise Morning Newsbrief...
...,...,...,...
78329,POLITICS,"HuffPost Rise Morning Newsbrief, October 5",Welcome to the HuffPost Rise Morning Newsbrief...
78611,POLITICS,"HuffPost Rise Morning Newsbrief, October 2",Welcome to the HuffPost Rise Morning Newsbrief...
78666,POLITICS,"HuffPost Rise Morning Newsbrief, October 1",Welcome to the HuffPost Rise Morning Newsbrief...
78758,POLITICS,"HuffPost Rise Morning Newsbrief, September 30",Welcome to the HuffPost Rise Morning Newsbrief...


In [14]:
news[news["headline"].str.contains("HuffPost Rise: What You Need To Know On") # выбрать записи, в которых headline содержит HuffPost Rise: What You Need To Know On
    ]

Unnamed: 0,category,headline,short_description
49607,POLITICS,HuffPost Rise: What You Need To Know On August...,Donald Trump’s campaign chief is under scrutin...
51112,POLITICS,HuffPost Rise: What You Need To Know On August 9,Welcome to the HuffPost Rise Morning Newsbrief...
51228,POLITICS,HuffPost Rise: What You Need To Know On August 8,Welcome to the HuffPost Rise Morning Newsbrief...
51435,POLITICS,HuffPost Rise: What You Need To Know On August 5,Welcome to the HuffPost Rise Morning Newsbrief...
51569,POLITICS,HuffPost Rise: What You Need To Know On August 4,Welcome to the HuffPost Rise Morning Newsbrief...
...,...,...,...
72673,POLITICS,HuffPost Rise: What You Need To Know On Decemb...,Welcome to the HuffPost Rise Morning Newsbrief...
72946,POLITICS,HuffPost Rise: What You Need To Know On Decemb...,Welcome to the HuffPost Rise Morning Newsbrief...
73024,POLITICS,HuffPost Rise: What You Need To Know On Decemb...,Welcome to the HuffPost Rise Morning Newsbrief...
73101,POLITICS,HuffPost Rise: What You Need To Know On Decemb...,Welcome to the HuffPost Rise Morning Newsbrief...


In [15]:
news["headline"] = news["headline"].apply(lambda x: "HuffPost Rise: What You Need To Know On" if "HuffPost Rise: What You Need To Know On" in x else x)

In [16]:
news["headline"] = news["headline"].apply(lambda x: "HuffPost Rise Morning Newsbrief" if "HuffPost Rise Morning Newsbrief" in x else x)

## Предобработка

In [17]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/anastasialobkina/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/anastasialobkina/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [18]:
contractions = { 
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he'll've": "he will have",
"he's": "he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how does",
"i'd": "i would",
"i'd've": "i would have",
"i'll": "i will",
"i'll've": "i will have",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'd've": "it would have",
"it'll": "it will",
"it'll've": "it will have",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she would",
"she'd've": "she would have",
"she'll": "she will",
"she'll've": "she will have",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so is",
"that'd": "that would",
"that'd've": "that would have",
"that's": "that is",
"there'd": "there would",
"there'd've": "there would have",
"there's": "there is",
"they'd": "they would",
"they'd've": "they would have",
"they'll": "they will",
"they'll've": "they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
" u ": " you ",
" ur ": " your ",
" n ": " and ",
"won't": "would not",
"you're": "you are",
"you'll": "you will",
"you've": "you have",
"you'd": "you would",
"you'd've": "you would have",
"you'll've": "you will have",
"here's" : "here is",
"there's" : "there is",
"where's": "where is"
}

In [19]:
def proprocess(text):
    text = re.sub("[\W ]+", " ", text)
    text = text.lower()
    for cont in contractions:
        if cont in text:
            text = text.replace(cont, contractions[cont])
    return text

In [20]:
stop_words = stopwords.words('english')

def remove_stops(text):
    text = word_tokenize(text)
    text = [word for word in text if word not in stop_words]
    return text

In [22]:
text = "Metropolitan Community Churches are proud to say marriage equality has been part of our DNA for almost 50 years. One of the first things our founder, Rev. Elder Troy Perry, did after forming Metropolitan Community Church in 1968 was to begin performing marriage ceremonies."

In [22]:
news["headline_pr"] = news["headline"].apply(lambda x: remove_stops(proprocess(x)))
news["short_description_pr"] = news["short_description"].apply(lambda x: remove_stops(proprocess(x)))
news

Unnamed: 0,category,headline,short_description,headline_pr,short_description_pr
0,CRIME,There Were 2 Mass Shootings In Texas Last Week...,She left her husband. He killed their children...,"[2, mass, shootings, texas, last, week, 1, tv]","[left, husband, killed, children, another, day..."
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...,Of course it has a song.,"[smith, joins, diplo, nicky, jam, 2018, world,...","[course, song]"
2,ENTERTAINMENT,Hugh Grant Marries For The First Time At Age 57,The actor and his longtime girlfriend Anna Ebe...,"[hugh, grant, marries, first, time, age, 57]","[actor, longtime, girlfriend, anna, eberstein,..."
3,ENTERTAINMENT,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,The actor gives Dems an ass-kicking for not fi...,"[jim, carrey, blasts, castrato, adam, schiff, ...","[actor, gives, dems, ass, kicking, fighting, h..."
4,ENTERTAINMENT,Julianna Margulies Uses Donald Trump Poop Bags...,"The ""Dietland"" actress said using the bags is ...","[julianna, margulies, uses, donald, trump, poo...","[dietland, actress, said, using, bags, really,..."
...,...,...,...,...,...
200848,TECH,RIM CEO Thorsten Heins' 'Significant' Plans Fo...,Verizon Wireless and AT&T are already promotin...,"[rim, ceo, thorsten, heins, significant, plans...","[verizon, wireless, already, promoting, lte, d..."
200849,SPORTS,Maria Sharapova Stunned By Victoria Azarenka I...,"Afterward, Azarenka, more effusive with the pr...","[maria, sharapova, stunned, victoria, azarenka...","[afterward, azarenka, effusive, press, normal,..."
200850,SPORTS,"Giants Over Patriots, Jets Over Colts Among M...","Leading up to Super Bowl XLVI, the most talked...","[giants, patriots, jets, colts, among, improba...","[leading, super, bowl, xlvi, talked, game, cou..."
200851,SPORTS,Aldon Smith Arrested: 49ers Linebacker Busted ...,CORRECTION: An earlier version of this story i...,"[aldon, smith, arrested, 49ers, linebacker, bu...","[correction, earlier, version, story, incorrec..."


### Лемматизация

In [23]:
lemmatizer = nltk.stem.WordNetLemmatizer()

In [24]:
def lemmatize(text):
    text = [lemmatizer.lemmatize(word) for word in text]
    return text

In [25]:
news["headline_pr"] = news["headline_pr"].apply(lambda x: lemmatize(x))
news["short_description_pr"] = news["short_description_pr"].apply(lambda x: lemmatize(x))

**Объединим столбцы заголовков и описания**

In [46]:
news["union_pr"] = news.apply(lambda x: " ".join(x["headline_pr"]) + " ".join(x["short_description_pr"]), axis=1)

## Получим вектора

In [47]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(news["union_pr"])
X.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [59]:
tf_idf = TfidfVectorizer()
X = vectorizer.fit_transform(news["union_pr"])

## LinearSVC

### Разделим данные 

In [48]:
news['category'].value_counts()

POLITICS          29489
WELLNESS          17825
ENTERTAINMENT     13470
STYLE & BEAUTY     9637
TRAVEL             9408
PARENTING          8676
FOOD & DRINK       6217
QUEER VOICES       5570
HEALTHY LIVING     5265
BUSINESS           5077
COMEDY             4417
SPORTS             4221
HOME & LIVING      4192
BLACK VOICES       4122
THE WORLDPOST      3664
WEDDINGS           3651
PARENTS            3556
DIVORCE            3426
WOMEN              3102
IMPACT             3061
CRIME              2675
MEDIA              2275
WEIRD NEWS         2209
WORLD NEWS         2175
TECH               2081
GREEN              2046
TASTE              1940
RELIGION           1857
SCIENCE            1775
MONEY              1706
STYLE              1567
ARTS & CULTURE     1339
ENVIRONMENT        1321
WORLDPOST          1242
FIFTY              1042
GOOD NEWS          1039
LATINO VOICES      1021
CULTURE & ARTS     1019
COLLEGE             921
EDUCATION           892
ARTS                863
Name: category, 

In [60]:
y = news['category']

In [61]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size = 0.33) 

In [66]:
from sklearn.svm import LinearSVC
clf = LinearSVC(C=3) # Попробовать разные значения C

In [67]:
clf.fit(X_train, y_train)



LinearSVC(C=3, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [68]:
y_pred = clf.predict(X_test)
y_pred

array(['TRAVEL', 'POLITICS', 'HEALTHY LIVING', ..., 'STYLE & BEAUTY',
       'SPORTS', 'SCIENCE'], dtype=object)

In [55]:
print(metrics.classification_report(y_test, y_pred)) # Count, C = 0.01

                precision    recall  f1-score   support

          ARTS       0.48      0.14      0.22       298
ARTS & CULTURE       0.42      0.14      0.20       428
  BLACK VOICES       0.53      0.35      0.43      1364
      BUSINESS       0.52      0.43      0.47      1670
       COLLEGE       0.45      0.28      0.34       313
        COMEDY       0.62      0.38      0.47      1467
         CRIME       0.55      0.55      0.55       858
CULTURE & ARTS       0.51      0.23      0.32       347
       DIVORCE       0.79      0.66      0.72      1165
     EDUCATION       0.52      0.29      0.38       292
 ENTERTAINMENT       0.54      0.73      0.62      4473
   ENVIRONMENT       0.52      0.23      0.32       454
         FIFTY       0.47      0.09      0.15       346
  FOOD & DRINK       0.60      0.76      0.67      2028
     GOOD NEWS       0.51      0.06      0.11       348
         GREEN       0.48      0.34      0.40       672
HEALTHY LIVING       0.40      0.08      0.13  

In [65]:
print(metrics.classification_report(y_test, y_pred)) # Tf-idf, C = 0.01

                precision    recall  f1-score   support

          ARTS       0.51      0.13      0.20       281
ARTS & CULTURE       0.46      0.13      0.20       449
  BLACK VOICES       0.56      0.35      0.43      1340
      BUSINESS       0.52      0.42      0.47      1733
       COLLEGE       0.48      0.31      0.38       309
        COMEDY       0.59      0.39      0.47      1392
         CRIME       0.56      0.53      0.55       876
CULTURE & ARTS       0.48      0.25      0.33       358
       DIVORCE       0.80      0.66      0.72      1085
     EDUCATION       0.44      0.24      0.31       292
 ENTERTAINMENT       0.56      0.74      0.64      4523
   ENVIRONMENT       0.49      0.22      0.30       453
         FIFTY       0.57      0.09      0.16       344
  FOOD & DRINK       0.60      0.75      0.66      2092
     GOOD NEWS       0.58      0.06      0.11       347
         GREEN       0.41      0.32      0.36       668
HEALTHY LIVING       0.40      0.08      0.14  

In [69]:
print(metrics.classification_report(y_test, y_pred))

                precision    recall  f1-score   support

          ARTS       0.28      0.13      0.18       281
ARTS & CULTURE       0.29      0.24      0.26       449
  BLACK VOICES       0.45      0.41      0.43      1340
      BUSINESS       0.40      0.36      0.38      1733
       COLLEGE       0.38      0.29      0.33       309
        COMEDY       0.43      0.41      0.42      1392
         CRIME       0.49      0.47      0.48       876
CULTURE & ARTS       0.36      0.22      0.28       358
       DIVORCE       0.70      0.67      0.68      1085
     EDUCATION       0.34      0.27      0.30       292
 ENTERTAINMENT       0.60      0.65      0.62      4523
   ENVIRONMENT       0.38      0.24      0.30       453
         FIFTY       0.27      0.17      0.21       344
  FOOD & DRINK       0.57      0.62      0.59      2092
     GOOD NEWS       0.24      0.20      0.22       347
         GREEN       0.32      0.32      0.32       668
HEALTHY LIVING       0.23      0.21      0.22  

In [57]:
import joblib

In [58]:
joblib.dump(clf, "LinearSVCmodel_v1")

['LinearSVCmodel_v1']