In [200]:
#Preprocessing Steps

#remove stopwords
#remove white space and punctuation
#lemmatized words
#ngrams

In [201]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [202]:
import pandas as pd
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [203]:
#checking the columns of the dataset
file_path = '/content/drive/My Drive/ML Bootcamp/Final Project'
features = pd.read_csv(file_path + '/features.csv')
print(list(features))

['title', 'text']


In [204]:
#converting everything to lowercase
features['title'] = features['title'].str.lower()
features['text'] = features['text'].str.lower()

In [205]:
#make new columns in our dataframe with all tokenized words, remove whitespace and punctuation
def identify_tokens_for_title(row):
    title = row['title']
    tokens_for_title = nltk.word_tokenize(title)
    # taken only words and numbers (not punctuation)
    token_words_for_title = [w for w in tokens_for_title if w.isalnum()]
    return token_words_for_title

def identify_tokens_for_text(row):
    text = row['text']
    tokens_for_text = nltk.word_tokenize(text)
    # taken only words and numbers (not punctuation)
    token_words_for_text = [w for w in tokens_for_text if w.isalnum()]
    return token_words_for_text

features['title_tokenized'] = features.apply(identify_tokens_for_title, axis=1)
features['text_tokenized'] = features.apply(identify_tokens_for_text, axis=1)

features

Unnamed: 0,title,text,title_tokenized,text_tokenized
0,donald trump sends out embarrassing new year’...,donald trump just couldn t wish all americans ...,"[donald, trump, sends, out, embarrassing, new,...","[donald, trump, just, couldn, t, wish, all, am..."
1,drunk bragging trump staffer started russian ...,house intelligence committee chairman devin nu...,"[drunk, bragging, trump, staffer, started, rus...","[house, intelligence, committee, chairman, dev..."
2,sheriff david clarke becomes an internet joke...,"on friday, it was revealed that former milwauk...","[sheriff, david, clarke, becomes, an, internet...","[on, friday, it, was, revealed, that, former, ..."
3,trump is so obsessed he even has obama’s name...,"on christmas day, donald trump announced that ...","[trump, is, so, obsessed, he, even, has, obama...","[on, christmas, day, donald, trump, announced,..."
4,pope francis just called out donald trump dur...,pope francis used his annual christmas day mes...,"[pope, francis, just, called, out, donald, tru...","[pope, francis, used, his, annual, christmas, ..."
...,...,...,...,...
44262,'fully committed' nato backs new u.s. approach...,brussels (reuters) - nato allies on tuesday we...,"[committed, nato, backs, new, approach, on, af...","[brussels, reuters, nato, allies, on, tuesday,..."
44263,lexisnexis withdrew two products from chinese ...,"london (reuters) - lexisnexis, a provider of l...","[lexisnexis, withdrew, two, products, from, ch...","[london, reuters, lexisnexis, a, provider, of,..."
44264,minsk cultural hub becomes haven from authorities,minsk (reuters) - in the shadow of disused sov...,"[minsk, cultural, hub, becomes, haven, from, a...","[minsk, reuters, in, the, shadow, of, disused,..."
44265,vatican upbeat on possibility of pope francis ...,moscow (reuters) - vatican secretary of state ...,"[vatican, upbeat, on, possibility, of, pope, f...","[moscow, reuters, vatican, secretary, of, stat..."


In [206]:
#removing stopwords
from nltk.corpus import stopwords
stops = set(stopwords.words("english"))                  

def remove_stops_for_title(row):
    my_list = row['title_tokenized']
    meaningful_words = [w for w in my_list if not w in stops]
    return (meaningful_words)

def remove_stops_for_text(row):
    my_list = row['text_tokenized']
    meaningful_words = [w for w in my_list if not w in stops]
    return (meaningful_words)

features['title_no_stopwords'] = features.apply(remove_stops_for_title, axis=1)
features['text_no_stopwords'] = features.apply(remove_stops_for_text, axis=1)

features

Unnamed: 0,title,text,title_tokenized,text_tokenized,title_no_stopwords,text_no_stopwords
0,donald trump sends out embarrassing new year’...,donald trump just couldn t wish all americans ...,"[donald, trump, sends, out, embarrassing, new,...","[donald, trump, just, couldn, t, wish, all, am...","[donald, trump, sends, embarrassing, new, year...","[donald, trump, wish, americans, happy, new, y..."
1,drunk bragging trump staffer started russian ...,house intelligence committee chairman devin nu...,"[drunk, bragging, trump, staffer, started, rus...","[house, intelligence, committee, chairman, dev...","[drunk, bragging, trump, staffer, started, rus...","[house, intelligence, committee, chairman, dev..."
2,sheriff david clarke becomes an internet joke...,"on friday, it was revealed that former milwauk...","[sheriff, david, clarke, becomes, an, internet...","[on, friday, it, was, revealed, that, former, ...","[sheriff, david, clarke, becomes, internet, jo...","[friday, revealed, former, milwaukee, sheriff,..."
3,trump is so obsessed he even has obama’s name...,"on christmas day, donald trump announced that ...","[trump, is, so, obsessed, he, even, has, obama...","[on, christmas, day, donald, trump, announced,...","[trump, obsessed, even, obama, name, coded, we...","[christmas, day, donald, trump, announced, wou..."
4,pope francis just called out donald trump dur...,pope francis used his annual christmas day mes...,"[pope, francis, just, called, out, donald, tru...","[pope, francis, used, his, annual, christmas, ...","[pope, francis, called, donald, trump, christm...","[pope, francis, used, annual, christmas, day, ..."
...,...,...,...,...,...,...
44262,'fully committed' nato backs new u.s. approach...,brussels (reuters) - nato allies on tuesday we...,"[committed, nato, backs, new, approach, on, af...","[brussels, reuters, nato, allies, on, tuesday,...","[committed, nato, backs, new, approach, afghan...","[brussels, reuters, nato, allies, tuesday, wel..."
44263,lexisnexis withdrew two products from chinese ...,"london (reuters) - lexisnexis, a provider of l...","[lexisnexis, withdrew, two, products, from, ch...","[london, reuters, lexisnexis, a, provider, of,...","[lexisnexis, withdrew, two, products, chinese,...","[london, reuters, lexisnexis, provider, legal,..."
44264,minsk cultural hub becomes haven from authorities,minsk (reuters) - in the shadow of disused sov...,"[minsk, cultural, hub, becomes, haven, from, a...","[minsk, reuters, in, the, shadow, of, disused,...","[minsk, cultural, hub, becomes, authorities]","[minsk, reuters, shadow, disused, factories, m..."
44265,vatican upbeat on possibility of pope francis ...,moscow (reuters) - vatican secretary of state ...,"[vatican, upbeat, on, possibility, of, pope, f...","[moscow, reuters, vatican, secretary, of, stat...","[vatican, upbeat, possibility, pope, francis, ...","[moscow, reuters, vatican, secretary, state, c..."


In [207]:
#lemmatizing words
from nltk.stem import WordNetLemmatizer 
lemmatizer = WordNetLemmatizer() 

def lemmatize_list_for_title(row):
    my_list = row['title_tokenized']
    lemmatized_list = [lemmatizer.lemmatize(word) for word in my_list]
    return (lemmatized_list)

def lemmatize_list_for_text(row):
    my_list = row['text_tokenized']
    lemmatized_list = [lemmatizer.lemmatize(word) for word in my_list]
    return (lemmatized_list)

features['lemmatized_title'] = features.apply(lemmatize_list_for_title, axis=1)
features['lemmatized_text'] = features.apply(lemmatize_list_for_text, axis=1)

features

Unnamed: 0,title,text,title_tokenized,text_tokenized,title_no_stopwords,text_no_stopwords,lemmatized_title,lemmatized_text
0,donald trump sends out embarrassing new year’...,donald trump just couldn t wish all americans ...,"[donald, trump, sends, out, embarrassing, new,...","[donald, trump, just, couldn, t, wish, all, am...","[donald, trump, sends, embarrassing, new, year...","[donald, trump, wish, americans, happy, new, y...","[donald, trump, sends, out, embarrassing, new,...","[donald, trump, just, couldn, t, wish, all, am..."
1,drunk bragging trump staffer started russian ...,house intelligence committee chairman devin nu...,"[drunk, bragging, trump, staffer, started, rus...","[house, intelligence, committee, chairman, dev...","[drunk, bragging, trump, staffer, started, rus...","[house, intelligence, committee, chairman, dev...","[drunk, bragging, trump, staffer, started, rus...","[house, intelligence, committee, chairman, dev..."
2,sheriff david clarke becomes an internet joke...,"on friday, it was revealed that former milwauk...","[sheriff, david, clarke, becomes, an, internet...","[on, friday, it, was, revealed, that, former, ...","[sheriff, david, clarke, becomes, internet, jo...","[friday, revealed, former, milwaukee, sheriff,...","[sheriff, david, clarke, becomes, an, internet...","[on, friday, it, wa, revealed, that, former, m..."
3,trump is so obsessed he even has obama’s name...,"on christmas day, donald trump announced that ...","[trump, is, so, obsessed, he, even, has, obama...","[on, christmas, day, donald, trump, announced,...","[trump, obsessed, even, obama, name, coded, we...","[christmas, day, donald, trump, announced, wou...","[trump, is, so, obsessed, he, even, ha, obama,...","[on, christmas, day, donald, trump, announced,..."
4,pope francis just called out donald trump dur...,pope francis used his annual christmas day mes...,"[pope, francis, just, called, out, donald, tru...","[pope, francis, used, his, annual, christmas, ...","[pope, francis, called, donald, trump, christm...","[pope, francis, used, annual, christmas, day, ...","[pope, francis, just, called, out, donald, tru...","[pope, francis, used, his, annual, christmas, ..."
...,...,...,...,...,...,...,...,...
44262,'fully committed' nato backs new u.s. approach...,brussels (reuters) - nato allies on tuesday we...,"[committed, nato, backs, new, approach, on, af...","[brussels, reuters, nato, allies, on, tuesday,...","[committed, nato, backs, new, approach, afghan...","[brussels, reuters, nato, allies, tuesday, wel...","[committed, nato, back, new, approach, on, afg...","[brussels, reuters, nato, ally, on, tuesday, w..."
44263,lexisnexis withdrew two products from chinese ...,"london (reuters) - lexisnexis, a provider of l...","[lexisnexis, withdrew, two, products, from, ch...","[london, reuters, lexisnexis, a, provider, of,...","[lexisnexis, withdrew, two, products, chinese,...","[london, reuters, lexisnexis, provider, legal,...","[lexisnexis, withdrew, two, product, from, chi...","[london, reuters, lexisnexis, a, provider, of,..."
44264,minsk cultural hub becomes haven from authorities,minsk (reuters) - in the shadow of disused sov...,"[minsk, cultural, hub, becomes, haven, from, a...","[minsk, reuters, in, the, shadow, of, disused,...","[minsk, cultural, hub, becomes, authorities]","[minsk, reuters, shadow, disused, factories, m...","[minsk, cultural, hub, becomes, haven, from, a...","[minsk, reuters, in, the, shadow, of, disused,..."
44265,vatican upbeat on possibility of pope francis ...,moscow (reuters) - vatican secretary of state ...,"[vatican, upbeat, on, possibility, of, pope, f...","[moscow, reuters, vatican, secretary, of, stat...","[vatican, upbeat, possibility, pope, francis, ...","[moscow, reuters, vatican, secretary, state, c...","[vatican, upbeat, on, possibility, of, pope, f...","[moscow, reuters, vatican, secretary, of, stat..."


In [208]:
def rejoin_words_in_title(row):
    my_list = row['lemmatized_title']
    joined_words = ( " ".join(my_list))
    return joined_words

def rejoin_words_in_text(row):
    my_list = row['lemmatized_text']
    joined_words = ( " ".join(my_list))
    return joined_words

features['processed_title'] = features.apply(rejoin_words_in_title, axis=1)
features['processed_text'] = features.apply(rejoin_words_in_text, axis=1)

features

Unnamed: 0,title,text,title_tokenized,text_tokenized,title_no_stopwords,text_no_stopwords,lemmatized_title,lemmatized_text,processed_title,processed_text
0,donald trump sends out embarrassing new year’...,donald trump just couldn t wish all americans ...,"[donald, trump, sends, out, embarrassing, new,...","[donald, trump, just, couldn, t, wish, all, am...","[donald, trump, sends, embarrassing, new, year...","[donald, trump, wish, americans, happy, new, y...","[donald, trump, sends, out, embarrassing, new,...","[donald, trump, just, couldn, t, wish, all, am...",donald trump sends out embarrassing new year s...,donald trump just couldn t wish all american a...
1,drunk bragging trump staffer started russian ...,house intelligence committee chairman devin nu...,"[drunk, bragging, trump, staffer, started, rus...","[house, intelligence, committee, chairman, dev...","[drunk, bragging, trump, staffer, started, rus...","[house, intelligence, committee, chairman, dev...","[drunk, bragging, trump, staffer, started, rus...","[house, intelligence, committee, chairman, dev...",drunk bragging trump staffer started russian c...,house intelligence committee chairman devin nu...
2,sheriff david clarke becomes an internet joke...,"on friday, it was revealed that former milwauk...","[sheriff, david, clarke, becomes, an, internet...","[on, friday, it, was, revealed, that, former, ...","[sheriff, david, clarke, becomes, internet, jo...","[friday, revealed, former, milwaukee, sheriff,...","[sheriff, david, clarke, becomes, an, internet...","[on, friday, it, wa, revealed, that, former, m...",sheriff david clarke becomes an internet joke ...,on friday it wa revealed that former milwaukee...
3,trump is so obsessed he even has obama’s name...,"on christmas day, donald trump announced that ...","[trump, is, so, obsessed, he, even, has, obama...","[on, christmas, day, donald, trump, announced,...","[trump, obsessed, even, obama, name, coded, we...","[christmas, day, donald, trump, announced, wou...","[trump, is, so, obsessed, he, even, ha, obama,...","[on, christmas, day, donald, trump, announced,...",trump is so obsessed he even ha obama s name c...,on christmas day donald trump announced that h...
4,pope francis just called out donald trump dur...,pope francis used his annual christmas day mes...,"[pope, francis, just, called, out, donald, tru...","[pope, francis, used, his, annual, christmas, ...","[pope, francis, called, donald, trump, christm...","[pope, francis, used, annual, christmas, day, ...","[pope, francis, just, called, out, donald, tru...","[pope, francis, used, his, annual, christmas, ...",pope francis just called out donald trump duri...,pope francis used his annual christmas day mes...
...,...,...,...,...,...,...,...,...,...,...
44262,'fully committed' nato backs new u.s. approach...,brussels (reuters) - nato allies on tuesday we...,"[committed, nato, backs, new, approach, on, af...","[brussels, reuters, nato, allies, on, tuesday,...","[committed, nato, backs, new, approach, afghan...","[brussels, reuters, nato, allies, tuesday, wel...","[committed, nato, back, new, approach, on, afg...","[brussels, reuters, nato, ally, on, tuesday, w...",committed nato back new approach on afghanistan,brussels reuters nato ally on tuesday welcomed...
44263,lexisnexis withdrew two products from chinese ...,"london (reuters) - lexisnexis, a provider of l...","[lexisnexis, withdrew, two, products, from, ch...","[london, reuters, lexisnexis, a, provider, of,...","[lexisnexis, withdrew, two, products, chinese,...","[london, reuters, lexisnexis, provider, legal,...","[lexisnexis, withdrew, two, product, from, chi...","[london, reuters, lexisnexis, a, provider, of,...",lexisnexis withdrew two product from chinese m...,london reuters lexisnexis a provider of legal ...
44264,minsk cultural hub becomes haven from authorities,minsk (reuters) - in the shadow of disused sov...,"[minsk, cultural, hub, becomes, haven, from, a...","[minsk, reuters, in, the, shadow, of, disused,...","[minsk, cultural, hub, becomes, authorities]","[minsk, reuters, shadow, disused, factories, m...","[minsk, cultural, hub, becomes, haven, from, a...","[minsk, reuters, in, the, shadow, of, disused,...",minsk cultural hub becomes haven from authority,minsk reuters in the shadow of disused factory...
44265,vatican upbeat on possibility of pope francis ...,moscow (reuters) - vatican secretary of state ...,"[vatican, upbeat, on, possibility, of, pope, f...","[moscow, reuters, vatican, secretary, of, stat...","[vatican, upbeat, possibility, pope, francis, ...","[moscow, reuters, vatican, secretary, state, c...","[vatican, upbeat, on, possibility, of, pope, f...","[moscow, reuters, vatican, secretary, of, stat...",vatican upbeat on possibility of pope francis ...,moscow reuters vatican secretary of state card...


In [209]:
cols_to_drop = ['title', 'text', 'title_tokenized', 'text_tokenized', 'title_no_stopwords', 'text_no_stopwords', 'lemmatized_title', 'lemmatized_text']
features.drop(cols_to_drop, axis =1, inplace=True)

features

Unnamed: 0,processed_title,processed_text
0,donald trump sends out embarrassing new year s...,donald trump just couldn t wish all american a...
1,drunk bragging trump staffer started russian c...,house intelligence committee chairman devin nu...
2,sheriff david clarke becomes an internet joke ...,on friday it wa revealed that former milwaukee...
3,trump is so obsessed he even ha obama s name c...,on christmas day donald trump announced that h...
4,pope francis just called out donald trump duri...,pope francis used his annual christmas day mes...
...,...,...
44262,committed nato back new approach on afghanistan,brussels reuters nato ally on tuesday welcomed...
44263,lexisnexis withdrew two product from chinese m...,london reuters lexisnexis a provider of legal ...
44264,minsk cultural hub becomes haven from authority,minsk reuters in the shadow of disused factory...
44265,vatican upbeat on possibility of pope francis ...,moscow reuters vatican secretary of state card...


In [199]:
features.to_csv('features_processed.csv', index=False)
!cp features_processed.csv drive/My\ Drive/ML\ Bootcamp/Final\ Project

In [210]:
from google.colab import files
files.download("features_processed.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>