In [2]:
import csv
import os
from collections import defaultdict
import pandas as pd
from nltk.corpus import stopwords
from textblob import TextBlob, Word
from gensim.scripts.glove2word2vec import glove2word2vec
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from gensim.models import KeyedVectors # load the Stanford GloVe model
#nltk.download()



In [3]:
os.getcwd()
os.chdir("C:\\Users\\Naini\\final-project\\News-Headline-Generation\\data")

### 1. Basic Feature Extraction

In [4]:
#reading csv
train = pd.read_csv('output.csv', encoding='ISO-8859-1',low_memory=False)

In [5]:
train = train[train['content'].notnull()]

In [6]:
train

Unnamed: 0,title,content
0,House Republicans Fret About Winning Their Hea...,WASHINGTON Ã¢ÂÂ Congressional Republicans...
1,Rift Between Officers and Residents as Killing...,"After the bullet shells get counted, the blood..."
2,"Tyrus Wong, Ã¢ÂÂBambiÃ¢ÂÂ Artist Thwarted ...",When Walt DisneyÃ¢ÂÂs Ã¢ÂÂBambiÃ¢ÂÂ open...
3,"Among Deaths in 2016, a Heavy Toll in Pop Musi...","Death may be the great equalizer, but it isnÃ¢..."
4,Kim Jong-un Says North Korea Is Preparing to T...,"SEOUL, South Korea Ã¢ÂÂ North KoreaÃ¢ÂÂ..."
5,"Sick With a Cold, Queen Elizabeth Misses New Y...","LONDON Ã¢ÂÂ Queen Elizabeth II, who has b..."
6,TaiwanÃ¢ÂÂs President Accuses China of Renew...,BEIJING Ã¢ÂÂ President Tsai of Taiwan s...
7,"After Ã¢ÂÂThe Biggest Loser,Ã¢ÂÂ Their Bod...","Danny Cahill stood, slightly dazed, in a blizz..."
8,"First, a Mixtape. Then a Romance. - The New Yo...","Just how is Hillary Kerr, the founder of ..."
9,Calling on Angels While Enduring the Trials of...,Angels are everywhere in the MuÃÂ±iz familyÃ¢...


In [7]:
#Number of stopwords
stop = stopwords.words('english')

In [8]:
train['stopwords'] = train['content'].apply(lambda x: len([x for x in x.split() if x in stop]))
train[['content','stopwords']].head()

Unnamed: 0,content,stopwords
0,WASHINGTON Ã¢ÂÂ Congressional Republicans...,340
1,"After the bullet shells get counted, the blood...",1888
2,When Walt DisneyÃ¢ÂÂs Ã¢ÂÂBambiÃ¢ÂÂ open...,883
3,"Death may be the great equalizer, but it isnÃ¢...",787
4,"SEOUL, South Korea Ã¢ÂÂ North KoreaÃ¢ÂÂ...",236


### 2. Basic Pre-processing

In [9]:
#transform data into lower case
train['content'] = train['content'].apply(lambda x: " ".join(x.lower() for x in x.split()))
train['content'].head()

0    washington ã¢ââ congressional republicans ha...
1    after the bullet shells get counted, the blood...
2    when walt disneyã¢ââs ã¢ââbambiã¢ââ open...
3    death may be the great equalizer, but it isnã¢...
4    seoul, south korea ã¢ââ north koreaã¢ââs l...
Name: content, dtype: object

In [10]:
#Removing Punctuation
train['content'] = train['content'].str.replace('[^\w\s]','')
train['content']

0        washington ãââ congressional republicans have ...
1        after the bullet shells get counted the blood ...
2        when walt disneyãââs ãââbambiãââ opened in 194...
3        death may be the great equalizer but it isnãââ...
4        seoul south korea ãââ north koreaãââs leader k...
5        london ãââ queen elizabeth ii who has been bat...
6        beijing ãââ president tsai of taiwan sharply c...
7        danny cahill stood slightly dazed in a blizzar...
8        just how is hillary kerr the founder of a digi...
9        angels are everywhere in the muãâiz familyãââs...
10       with donald j trump about to take control of t...
11       thompsons tex ãââ can one of the most promisin...
12       west palm beach fla ãââ when donald j trump ra...
13       this article is part of a series aimed at help...
14       itãââs the season for family travel and photos...
15       finally the second avenue subway opened in new...
16       pages into the journal found in dylann s roofã.

In [11]:
#Removal of Stop Words
train['content'] = train['content'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
train['content'].head()

0    washington ãââ congressional republicans new f...
1    bullet shells get counted blood dries votive c...
2    walt disneyãââs ãââbambiãââ opened 1942 critic...
3    death may great equalizer isnãâât necessarily ...
4    seoul south korea ãââ north koreaãââs leader k...
Name: content, dtype: object

In [12]:
#Common word removal
freq = pd.Series(' '.join(train['content']).split()).value_counts()[:10]
freq

ãââ          408209
said         206231
trump        114363
mr            85617
would         73516
one           72906
people        71186
new           61662
also          56970
president     55776
dtype: int64

In [13]:
freq = list(freq.index)
train['content'] = train['content'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))
train['content'].head()

0    washington congressional republicans fear come...
1    bullet shells get counted blood dries votive c...
2    walt disneyãââs ãââbambiãââ opened 1942 critic...
3    death may great equalizer isnãâât necessarily ...
4    seoul south korea north koreaãââs leader kim s...
Name: content, dtype: object

In [14]:
#Rare words removal
rare = pd.Series(' '.join(train['content']).split()).value_counts()[-10:]
rare

boondogles            1
turkishness           1
hentie                1
scamardo              1
bruel                 1
ãââpussyfootingãââ    1
judit                 1
bushrob               1
hamoodãââs            1
zhongãââs             1
dtype: int64

In [15]:
rare = list(rare.index)
train['content'] = train['content'].apply(lambda x: " ".join(x for x in x.split() if x not in rare))
train['content'].head()

0    washington congressional republicans fear come...
1    bullet shells get counted blood dries votive c...
2    walt disneyãââs ãââbambiãââ opened 1942 critic...
3    death may great equalizer isnãâât necessarily ...
4    seoul south korea north koreaãââs leader kim s...
Name: content, dtype: object

In [16]:
#Tokenization - dividing the text into a sequence of words or sentences
#we have used the textblob library to first transform our data into a blob and then converted them into a series of words
tokenized_words=[]
for i,x in enumerate(train['content']):
    if(len(x) > 1 ):
       tokenized_words = TextBlob(x).words

In [17]:
#Stemming -  removal of suffices, like “ing”, “ly”, “s”
st = PorterStemmer()
train['content'].apply(lambda x: " ".join([st.stem(word) for word in x.split()]))

0        washington congression republican fear come he...
1        bullet shell get count blood dri votiv candl b...
2        walt disneyãââ ãââbambiãââ open 1942 critic pr...
3        death may great equal isnãâât necessarili even...
4        seoul south korea north koreaãââ leader kim su...
5        london queen elizabeth ii battl cold week miss...
6        beij tsai taiwan sharpli critic chinaãââ leade...
7        danni cahil stood slightli daze blizzard confe...
8        hillari kerr founder digit media compani lo an...
9        angel everywher muãâiz familyãââ apart bronx p...
10       donald j take control white hous seem dark tim...
11       thompson tex promis troubl technolog fight glo...
12       west palm beach fla donald j rang year weekend...
13       articl part seri aim help navig lifeãââ opport...
14       itãââ season famili travel photo perhap enlarg...
15       final second avenu subway open york citi sunda...
16       page journal found dylann roofãââ car assert b.

In [18]:
#Lemmatization - it converts the word into its root word
train['content'] = train['content'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
train['content'].head()

0    washington congressional republican fear come ...
1    bullet shell get counted blood dry votive cand...
2    walt disneyãââs ãââbambiãââ opened 1942 critic...
3    death may great equalizer isnãâât necessarily ...
4    seoul south korea north koreaãââs leader kim s...
Name: content, dtype: object

### 3. Advance Text Processing
 

In [20]:
#N-grams - combination of multiple words used together.
j=[]
for i,x in enumerate(train['content']):
    j = TextBlob(x).ngrams(2)




KeyboardInterrupt: 

In [None]:
# Term frequency - ratio of the count of a word present in a sentence, to the length of the sentence
tf1 = (train['content']).apply(lambda x: pd.value_counts(x.split(" "))).sum(axis = 0).reset_index()
tf1.columns = ['words','tf']
tf1

In [None]:
#Inverse Document Frequency - log of the ratio of the total number of rows to the number of rows in which that word is present
import numpy as np
for i,word in enumerate(tf1['words']):
  tf1.loc[i, 'idf'] = np.log(train.shape[0]/(len(train[train['content'].str.contains(word)])))

tf1

In [None]:
#Term Frequency – Inverse Document Frequency (TF-IDF) - multiplication of the TF and IDF 
tf1['tfidf'] = tf1['tf'] * tf1['idf']
tf1


In [None]:
tfidf = TfidfVectorizer(max_features=1000, lowercase=True, analyzer='word',
 stop_words= 'english',ngram_range=(1,1))
train_vect = tfidf.fit_transform(train['content'])

train_vect


In [None]:
#Bag of Words - representation of text which describes the presence of words within the text data
bow = CountVectorizer(max_features=1000, lowercase=True, ngram_range=(1,1),analyzer = "word")
train_bow = bow.fit_transform(train['content'])
train_bow


In [None]:
# Word Embeddings
glove_input_file = 'glove.6B.100d.txt'
word2vec_output_file = 'glove.6B.100d.txt.word2vec'
# convert it into the word2vec format
glove2word2vec(glove_input_file, word2vec_output_file)

In [None]:
#load the above word2vec file as a model
filename = 'glove.6B.100d.txt.word2vec'
model = KeyedVectors.load_word2vec_format(filename, binary=False)

In [None]:
model['music']


In [None]:
model['family']


In [None]:
# take the average to represent the string ‘go away’ in the form of vectors having 100 dimensions
(model['music'] + model['family'])/2