In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

In [2]:
# Load the raw articles CSV into a DataFrame and preview its first rows.
df = pd.read_csv('data/articles1.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,id,title,publication,author,date,year,month,url,content
0,0,17283,House Republicans Fret About Winning Their Hea...,New York Times,Carl Hulse,2016-12-31,2016.0,12.0,,WASHINGTON — Congressional Republicans have...
1,1,17284,Rift Between Officers and Residents as Killing...,New York Times,Benjamin Mueller and Al Baker,2017-06-19,2017.0,6.0,,"After the bullet shells get counted, the blood..."
2,2,17285,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...",New York Times,Margalit Fox,2017-01-06,2017.0,1.0,,"When Walt Disney’s “Bambi” opened in 1942, cri..."
3,3,17286,"Among Deaths in 2016, a Heavy Toll in Pop Musi...",New York Times,William McDonald,2017-04-10,2017.0,4.0,,"Death may be the great equalizer, but it isn’t..."
4,4,17287,Kim Jong-un Says North Korea Is Preparing to T...,New York Times,Choe Sang-Hun,2017-01-02,2017.0,1.0,,"SEOUL, South Korea — North Korea’s leader, ..."


In [3]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re

In [25]:
def tokenize_batch(series_batch):
    '''
    Lowercase every text in the batch and split it into tokens.

    Missing texts (``None``/``NaN``) are treated as empty strings, so they
    produce an empty token list instead of making ``str.lower`` raise
    ``TypeError``.

    Parameters
    ----------
    series_batch : pandas.Series
        Series of raw texts (str); may contain missing values.

    Returns
    -------
    tokenized : pandas.Series
        pandas.Series of list of str
    '''
    # Empty CSV cells are read back as NaN (a float), which str.lower rejects,
    # so replace missing values before lowercasing.
    return series_batch.fillna('').map(str.lower).map(word_tokenize)


# Run the English stop words through the same tokenizer as the articles, so
# that any entry the tokenizer splits is stored as its individual tokens.
tokenized_stopwords = {
    token
    for stopword in stopwords.words('english')
    for token in word_tokenize(stopword)
}
# A run of word characters and/or apostrophes reaching the end of the string.
reg_exp = re.compile(r"[\w']+$")


def is_bad_word(word):
    '''
    Decide whether a token should be dropped from a text.

    A token is "bad" when it is a stop word, when it contains anything other
    than word characters and apostrophes, or when it contains no letter or
    digit at all (for example the `''` token that NLTK's tokenizer emits for
    a closing double quote, which the pattern alone would let through).

    Parameters
    ----------
    word : str
        A single token.

    Returns
    -------
    bool
        True if the token must be filtered out, False if it is kept.
    '''
    if word in tokenized_stopwords:
        return True
    # fullmatch: the whole token has to match (re.match with a trailing `$`
    # would also accept a token that ends in a newline).
    if reg_exp.fullmatch(word) is None:
        return True
    # Apostrophes and underscores satisfy the pattern on their own, so also
    # require at least one alphanumeric character.
    return not any(char.isalnum() for char in word)
    
    
def filter_tokenized_batch(series_batch):
    '''
    Drop unwanted tokens from every tokenized text in the batch.

    A token is dropped when ``is_bad_word`` returns True for it.

    Parameters
    ----------
    series_batch : pandas.Series
        pandas.Series of list of str

    Returns
    -------
    filtered : pandas.Series
        pandas.Series of list of str
    '''
    def keep_good_words(tokens):
        return [token for token in tokens if not is_bad_word(token)]

    return series_batch.map(keep_good_words)

def stem_batch(series_batch):
    '''
    Stem every token of every text in the batch with NLTK's PorterStemmer.

    Parameters
    ----------
    series_batch : pandas.Series
        pandas.Series of list of str

    Returns
    -------
    stemmed : pandas.Series
        pandas.Series of list of str
    '''
    from nltk.stem import PorterStemmer

    stemmer = PorterStemmer()

    def stem_tokens(tokens):
        return list(map(stemmer.stem, tokens))

    return series_batch.map(stem_tokens)


def preprocess_batch(df_batch):
    '''
    Run the full text pipeline on the 'content' column of a batch.

    The stages are applied in order: tokenize, filter, stem.

    Parameters
    ----------
    df_batch : pandas.DataFrame
        Batch of rows with a 'content' column.

    Returns
    -------
    pandas.Series
        Whatever ``stem_batch`` returns for the filtered tokens.
    '''
    tokens = tokenize_batch(df_batch['content'])
    kept_tokens = filter_tokenized_batch(tokens)
    return stem_batch(kept_tokens)

In [27]:
# Try the whole pipeline on the first ten articles.
preprocess_batch(df.head(10))

0    [washington, congression, republican, new, fea...
1    [bullet, shell, get, count, blood, dri, votiv,...
2    [walt, disney, bambi, open, 1942, critic, prai...
3    [death, may, great, equal, necessarili, evenha...
4    [seoul, south, korea, north, korea, leader, ki...
5    [london, queen, elizabeth, ii, battl, cold, we...
6    [beij, presid, tsai, taiwan, sharpli, critic, ...
7    [danni, cahil, stood, slightli, daze, blizzard...
8    [hillari, kerr, founder, digit, media, compani...
9    [angel, everywher, muñiz, famili, apart, bronx...
Name: content, dtype: object

In [29]:
# Stream the articles through the pipeline chunk by chunk and write each
# processed chunk (one token list per row, no header, no index) to disk.
# The output file is opened once in write mode, so re-running this cell
# replaces 'data/stemmed.csv' instead of appending another copy of every row.
# newline='' is what pandas expects when to_csv receives an open text handle.
with open('data/stemmed.csv', 'w', newline='', encoding='utf-8') as stemmed_file:
    for batch in pd.read_csv('data/articles1.csv', chunksize=10):
        preprocess_batch(batch).to_csv(stemmed_file, index=False, header=False)

In [30]:
# Reload the stemmed output. There is no header row, so the single column is
# labelled 0, and each cell holds the textual form of a token list (a str).
df2 = pd.read_csv('data/stemmed.csv', index_col=False, header=None)

In [31]:
# Preview the first rows of the reloaded data.
df2.head()

Unnamed: 0,0
0,"['washington', 'congression', 'republican', 'n..."
1,"['bullet', 'shell', 'get', 'count', 'blood', '..."
2,"['walt', 'disney', 'bambi', 'open', '1942', 'c..."
3,"['death', 'may', 'great', 'equal', 'necessaril..."
4,"['seoul', 'south', 'korea', 'north', 'korea', ..."


In [20]:
# Probe how the tokenizer splits a string with a period inside a contraction-like word.
word_tokenize('''Doesn. t dead''')

['Doesn', '.', 't', 'dead']

In [59]:
# Check the Python type of the first cell of the reloaded data.
type(df2.iat[0, 0])

str

In [38]:
# Print the values of the first row joined into a single string.
# NOTE(review): the recorded output shows un-lowercased tokens and punctuation,
# unlike the pipeline's current output — presumably left over from an earlier
# run; re-run to refresh.
print(' '.join(df2.iloc[0].values.tolist()))

['WASHINGTON', '—', 'Congressional', 'Republicans', 'have', 'a', 'new', 'fear', 'when', 'it', 'comes', 'to', 'their', 'health', 'care', 'lawsuit', 'against', 'the', 'Obama', 'administration', ':', 'They', 'might', 'win', '.', 'The', 'incoming', 'Trump', 'administration', 'could', 'choose', 'to', 'no', 'longer', 'defend', 'the', 'executive', 'branch', 'against', 'the', 'suit', ',', 'which', 'challenges', 'the', 'administration', '’', 's', 'authority', 'to', 'spend', 'billions', 'of', 'dollars', 'on', 'health', 'insurance', 'subsidies', 'for', 'and', 'Americans', ',', 'handing', 'House', 'Republicans', 'a', 'big', 'victory', 'on', 'issues', '.', 'But', 'a', 'sudden', 'loss', 'of', 'the', 'disputed', 'subsidies', 'could', 'conceivably', 'cause', 'the', 'health', 'care', 'program', 'to', 'implode', ',', 'leaving', 'millions', 'of', 'people', 'without', 'access', 'to', 'health', 'insurance', 'before', 'Republicans', 'have', 'prepared', 'a', 'replacement', '.', 'That', 'could', 'lead', 'to',

In [7]:
# Download the NLTK 'punkt' and 'stopwords' packages (presumably the data that
# word_tokenize and stopwords.words rely on — confirm). The last call's return
# value is what the cell displays.
import nltk
nltk.download('punkt')
nltk.download("stopwords")

[nltk_data] Downloading package punkt to /home/makuhich/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/makuhich/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [7]:
# NOTE(review): displays the whole DataFrame. The recorded output is a NameError,
# so presumably this cell ran in a kernel where `df` had not been loaded yet —
# confirm, and consider showing df.head() instead.
df

NameError: name 'df' is not defined

In [15]:
# Tokenize the raw (not lowercased) text of the first ten articles.
df['content'].iloc[:10].map(word_tokenize)

0    [WASHINGTON, —, Congressional, Republicans, ha...
1    [After, the, bullet, shells, get, counted, ,, ...
2    [When, Walt, Disney, ’, s, “, Bambi, ”, opened...
3    [Death, may, be, the, great, equalizer, ,, but...
4    [SEOUL, ,, South, Korea, —, North, Korea, ’, s...
5    [LONDON, —, Queen, Elizabeth, II, ,, who, has,...
6    [BEIJING, —, President, Tsai, of, Taiwan, shar...
7    [Danny, Cahill, stood, ,, slightly, dazed, ,, ...
8    [Just, how, is, Hillary, Kerr, ,, the, founder...
9    [Angels, are, everywhere, in, the, Muñiz, fami...
Name: content, dtype: object

In [16]:
# Show the full text of the article with index label 2.
df.loc[2, 'content']

'When Walt Disney’s “Bambi” opened in 1942, critics praised its spare, haunting visual style, vastly different from anything Disney had done before. But what they did not know was that the film’s striking appearance had been created by a Chinese immigrant artist, who took as his inspiration the landscape paintings of the Song dynasty. The extent of his contribution to “Bambi,” which remains a   mark for film animation, would not be widely known for decades. Like the film’s title character, the artist, Tyrus Wong, weathered irrevocable separation from his mother  —   and, in the hope of making a life in America, incarceration, isolation and rigorous interrogation  —   all when he was still a child. In the years that followed, he endured poverty, discrimination and chronic lack of recognition, not only for his work at Disney but also for his fine art, before finding acclaim in his 90s. Mr. Wong died on Friday at 106. A Hollywood studio artist, painter, printmaker, calligrapher,   illustr

In [19]:
# Tokenize the raw text of the article with index label 2.
word_tokenize(df.loc[2, 'content'])

['When',
 'Walt',
 'Disney',
 '’',
 's',
 '“',
 'Bambi',
 '”',
 'opened',
 'in',
 '1942',
 ',',
 'critics',
 'praised',
 'its',
 'spare',
 ',',
 'haunting',
 'visual',
 'style',
 ',',
 'vastly',
 'different',
 'from',
 'anything',
 'Disney',
 'had',
 'done',
 'before',
 '.',
 'But',
 'what',
 'they',
 'did',
 'not',
 'know',
 'was',
 'that',
 'the',
 'film',
 '’',
 's',
 'striking',
 'appearance',
 'had',
 'been',
 'created',
 'by',
 'a',
 'Chinese',
 'immigrant',
 'artist',
 ',',
 'who',
 'took',
 'as',
 'his',
 'inspiration',
 'the',
 'landscape',
 'paintings',
 'of',
 'the',
 'Song',
 'dynasty',
 '.',
 'The',
 'extent',
 'of',
 'his',
 'contribution',
 'to',
 '“',
 'Bambi',
 ',',
 '”',
 'which',
 'remains',
 'a',
 'mark',
 'for',
 'film',
 'animation',
 ',',
 'would',
 'not',
 'be',
 'widely',
 'known',
 'for',
 'decades',
 '.',
 'Like',
 'the',
 'film',
 '’',
 's',
 'title',
 'character',
 ',',
 'the',
 'artist',
 ',',
 'Tyrus',
 'Wong',
 ',',
 'weathered',
 'irrevocable',
 'separa