In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import prep
from sklearn.feature_extraction.text import TfidfVectorizer
import acquire

import unicodedata
import re
import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
import acquire2

In [2]:
original = acquire2.get_news_articles()

In [3]:
df = original.copy()

In [5]:
df.head(2)

Unnamed: 0,author,body,category,published_date,title
0,Krishna Veera Vanamali,After a US jury found that Elon Musk did not d...,business,2019-12-07T16:34:59.000Z,My faith in humanity is restored: Musk after w...
1,Krishna Veera Vanamali,"British cave explorer Vernon Unsworth, who los...",business,2019-12-07T15:31:16.000Z,I'll take it on the chin: Cave explorer after ...


In [6]:
df['article_clean'] = df.body.apply(prep.normalize2)

In [7]:
df.head(2)

Unnamed: 0,author,body,category,published_date,title,article_clean
0,Krishna Veera Vanamali,After a US jury found that Elon Musk did not d...,business,2019-12-07T16:34:59.000Z,My faith in humanity is restored: Musk after w...,after jury found that elon musk did not defa...
1,Krishna Veera Vanamali,"British cave explorer Vernon Unsworth, who los...",business,2019-12-07T15:31:16.000Z,I'll take it on the chin: Cave explorer after ...,british cave explorer vernon unsworth who los...


In [9]:
df.drop(columns=['title','author','published_date','body'],inplace=True)

In [10]:
df.head(2)

Unnamed: 0,category,article_clean
0,business,after jury found that elon musk did not defa...
1,business,british cave explorer vernon unsworth who los...


In [None]:
df.article_clean = df.article_clean.apply(prep.stem2)

In [13]:
df = df[['article_clean','category']]
df = df.assign(original = df.article_clean)

df = df.assign(normalized = df.original.apply(prep.normalize2))

df = df.assign(stemmed = df.normalized.apply(prep.stem2))
#df = df.assign(lemmatized = df.normalized.apply(lemmatize))

df = df.assign(cleaned = df.stemmed.apply(prep.remove_stopwords2))
df.head()

Unnamed: 0,article_clean,category,original,normalized,stemmed,cleaned
0,after jury found that elon musk did not defa...,business,after jury found that elon musk did not defa...,after jury found that elon musk did not defa...,after juri found that elon musk did not defam ...,juri found elon musk defam british cave explor...
1,british cave explorer vernon unsworth who los...,business,british cave explorer vernon unsworth who los...,british cave explorer vernon unsworth who los...,british cave explor vernon unsworth who lost t...,british cave explor vernon unsworth lost defam...
2,lucknow based customer has filed fir against ...,business,lucknow based customer has filed fir against ...,lucknow based customer has filed fir against ...,lucknow base custom ha file fir against chines...,lucknow base custom ha file fir chines commerc...
3,price onion has shot per bengaluru du...,business,price onion has shot per bengaluru du...,price onion has shot per bengaluru du...,price onion ha shot per bengaluru due sever sh...,price onion ha shot per bengaluru due sever sh...
4,former rbi governor raghuram rajan article f...,business,former rbi governor raghuram rajan article f...,former rbi governor raghuram rajan article f...,former rbi governor raghuram rajan articl for ...,former rbi governor raghuram rajan articl indi...


In [14]:
df.category.value_counts(normalize=True)

entertainment    0.252525
sports           0.252525
technology       0.252525
business         0.242424
Name: category, dtype: float64

In [31]:
raw_count = pd.Series(" ".join(df.cleaned).split()).value_counts()

In [33]:
raw_count = raw_count.reset_index()

In [38]:
raw_count = raw_count[raw_count['index'].str.len() >2]
raw_count

Unnamed: 0,index,0
0,said,79
5,year,27
6,call,21
7,user,19
8,one,17
9,india,17
10,cricket,17
11,also,17
12,actor,16
13,like,15


In [39]:
def idf(word):
    n_occurrences = sum([1 for doc in df.cleaned if word in doc])
    n_docs = len(df.cleaned)
    idf = np.log(n_docs/n_occurrences)
    return idf

In [41]:
df['idf'] = df.cleaned.apply(idf)

In [43]:
unique_words = pd.Series(' '.join(df.cleaned).split()).unique()

In [45]:
len(unique_words)

1605

In [46]:
idf_df = pd.DataFrame(dict(word=unique_words))\
    .assign(idf = lambda df: df.word.apply(idf))\
    .set_index('word').sort_values(by='idf',ascending=False)
    

In [48]:
idf_df.head()

Unnamed: 0_level_0,idf
word,Unnamed: 1_level_1
blockbust,4.59512
iiser,4.59512
pune,4.59512
nba,4.59512
guerschon,4.59512


In [49]:
documents = df.cleaned.to_dict()
tfidf = TfidfVectorizer()
tfidfs = tfidf.fit_transform(documents.values())

In [50]:
tfidfs

<99x1605 sparse matrix of type '<class 'numpy.float64'>'
	with 3131 stored elements in Compressed Sparse Row format>

In [52]:
df.head(2)

Unnamed: 0,article_clean,category,original,normalized,stemmed,cleaned,idf
0,after jury found that elon musk did not defa...,business,after jury found that elon musk did not defa...,after jury found that elon musk did not defa...,after juri found that elon musk did not defam ...,juri found elon musk defam british cave explor...,3.901973
1,british cave explorer vernon unsworth who los...,business,british cave explorer vernon unsworth who los...,british cave explorer vernon unsworth who los...,british cave explor vernon unsworth who lost t...,british cave explor vernon unsworth lost defam...,3.901973


In [51]:
features_df = pd.DataFrame(tfidfs.todense(), columns=tfidf.get_feature_names())
features_df.head()

Unnamed: 0,aaryan,ab,abbey,abil,abl,abus,accent,accept,accompani,account,...,ye,year,young,younger,yourstori,zero,zilingo,zima,zomato,zurich
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.089602,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [53]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

In [56]:
X = tfidf.fit_transform(df.cleaned)
y = df.category

X_train,X_test,y_train,y_test = train_test_split(X,y,stratify=y,test_size=.3)

In [57]:
train = pd.DataFrame(dict(actual=y_train))
test = pd.DataFrame(dict(actual=y_test))

lm = LogisticRegression().fit(X_train, y_train)

train['predicted'] = lm.predict(X_train)
test['predicted'] = lm.predict(X_test)

<69x1605 sparse matrix of type '<class 'numpy.float64'>'
	with 2183 stored elements in Compressed Sparse Row format>