In [62]:
import csv
import os
from collections import defaultdict
import pandas as pd
from nltk.corpus import stopwords
from textblob import TextBlob, Word
from gensim.scripts.glove2word2vec import glove2word2vec
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from gensim.models import KeyedVectors # load the Stanford GloVe model
#nltk.download()

In [63]:
os.getcwd()
os.chdir("C:\\Users\\Naini\\final-project\\News-Headline-Generation\\data")

In [None]:
#columns = defaultdict(list) # each value in each column is appended to a list


In [None]:
'''with open('articles_small.csv', errors='ignore') as f:
    reader = csv.DictReader(f) # read rows into a dictionary format
    for row in reader: # read a row as {column1: value1, column2: value2,...}
        for (k,v) in row.items(): # go over each column name and value 
            columns[k].append(v) # append the value into the appropriate list
                                 # based on column name k '''
 

In [None]:
#print(type(columns['title']))
#print('---------------------------------------------------------------------------------------------------')   




In [None]:
''' for i,x in enumerate(columns['title']):
    print(x)
    print('---------------------------------------------------------------------------------------------------')  ''' 

### 1. Basic Feature Extraction

In [64]:
#reading csv
train = pd.read_csv('articles_small.csv')

In [65]:
#Number of Words
train['word_count'] = train['title'].apply(lambda x: len(str(x).split(" ")))
train[['title','word_count']]

Unnamed: 0,title,word_count
0,House Republicans Fret About Winning Their Hea...,14
1,Rift Between Officers and Residents as Killing...,16
2,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...",16
3,"Among Deaths in 2016, a Heavy Toll in Pop Musi...",15
4,Kim Jong-un Says North Korea Is Preparing to T...,16
5,"Sick With a Cold, Queen Elizabeth Misses New Y...",15
6,Taiwan’s President Accuses China of Renewed In...,12
7,"After ‘The Biggest Loser,’ Their Bodies Fought...",15
8,"First, a Mixtape. Then a Romance. - The New Yo...",11
9,Calling on Angels While Enduring the Trials of...,14


In [66]:
#Number of characters
train['char_count'] = train['title'].str.len() ## this also includes spaces
train[['title','char_count']].head()

Unnamed: 0,title,char_count
0,House Republicans Fret About Winning Their Hea...,80
1,Rift Between Officers and Residents as Killing...,91
2,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...",84
3,"Among Deaths in 2016, a Heavy Toll in Pop Musi...",68
4,Kim Jong-un Says North Korea Is Preparing to T...,89


In [67]:
# Average Word Length
def avg_word(sentence):
  words = sentence.split()
  return (sum(len(word) for word in words)/len(words))

In [68]:
train['avg_word'] = train['title'].apply(lambda x: avg_word(x))
train[['title','avg_word']].head()

Unnamed: 0,title,avg_word
0,House Republicans Fret About Winning Their Hea...,4.785714
1,Rift Between Officers and Residents as Killing...,4.75
2,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...",4.3125
3,"Among Deaths in 2016, a Heavy Toll in Pop Musi...",3.6
4,Kim Jong-un Says North Korea Is Preparing to T...,4.625


In [69]:
#Number of stopwords
stop = stopwords.words('english')

In [70]:
train['stopwords'] = train['title'].apply(lambda x: len([x for x in x.split() if x in stop]))
train[['title','stopwords']].head()

Unnamed: 0,title,stopwords
0,House Republicans Fret About Winning Their Hea...,0
1,Rift Between Officers and Residents as Killing...,3
2,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...",2
3,"Among Deaths in 2016, a Heavy Toll in Pop Musi...",3
4,Kim Jong-un Says North Korea Is Preparing to T...,1


In [71]:
#Number of numerics
train['numerics'] = train['title'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))
train[['title','numerics']].head()

Unnamed: 0,title,numerics
0,House Republicans Fret About Winning Their Hea...,0
1,Rift Between Officers and Residents as Killing...,0
2,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...",1
3,"Among Deaths in 2016, a Heavy Toll in Pop Musi...",0
4,Kim Jong-un Says North Korea Is Preparing to T...,0


In [72]:
#Number of Uppercase words
train['upper'] = train['title'].apply(lambda x: len([x for x in x.split() if x.isupper()]))
train[['title','upper']].head()

Unnamed: 0,title,upper
0,House Republicans Fret About Winning Their Hea...,0
1,Rift Between Officers and Residents as Killing...,0
2,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...",0
3,"Among Deaths in 2016, a Heavy Toll in Pop Musi...",0
4,Kim Jong-un Says North Korea Is Preparing to T...,0


### 2. Basic Pre-processing

In [73]:
#transform data into lower case
train['title'] = train['title'].apply(lambda x: " ".join(x.lower() for x in x.split()))
train['title'].head()

0    house republicans fret about winning their hea...
1    rift between officers and residents as killing...
2    tyrus wong, ‘bambi’ artist thwarted by racial ...
3    among deaths in 2016, a heavy toll in pop musi...
4    kim jong-un says north korea is preparing to t...
Name: title, dtype: object

In [74]:
#Removing Punctuation
train['title'] = train['title'].str.replace('[^\w\s]','')
train['title'].head()

0    house republicans fret about winning their hea...
1    rift between officers and residents as killing...
2    tyrus wong bambi artist thwarted by racial bia...
3    among deaths in 2016 a heavy toll in pop music...
4    kim jongun says north korea is preparing to te...
Name: title, dtype: object

In [14]:
#Removal of Stop Words
train['title'] = train['title'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
train['title'].head()

0    house republicans fret winning health care sui...
1    rift officers residents killings persist south...
2    tyrus wong bambi artist thwarted racial bias d...
3    among deaths 2016 heavy toll pop music new yor...
4    kim jongun says north korea preparing test lon...
Name: title, dtype: object

In [19]:
#Common word removal
freq = pd.Series(' '.join(train['title']).split()).value_counts()[:10]
freq

briefing    2
big         2
subway      2
first       2
syrias      2
fox         2
france      2
kills       2
benjamin    2
could       2
dtype: int64

In [20]:
freq = list(freq.index)
train['title'] = train['title'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))
train['title'].head()

0                                    fret winning suit
1    rift officers residents killings persist south...
2    tyrus wong bambi artist thwarted racial bias d...
3               among deaths 2016 heavy toll pop music
4    kim jongun north korea preparing test longrang...
Name: title, dtype: object

In [21]:
#Rare words removal
rare = pd.Series(' '.join(train['title']).split()).value_counts()[-10:]
rare

residents      1
anew           1
stonewalled    1
official       1
heavy          1
jonathan       1
prompting      1
fecal          1
twitter        1
faster         1
dtype: int64

In [22]:
rare = list(rare.index)
train['title'] = train['title'].apply(lambda x: " ".join(x for x in x.split() if x not in rare))
train['title'].head()

0                                    fret winning suit
1           rift officers killings persist south bronx
2    tyrus wong bambi artist thwarted racial bias d...
3                     among deaths 2016 toll pop music
4    kim jongun north korea preparing test longrang...
Name: title, dtype: object

In [23]:
#Spelling correction
train['title'].apply(lambda x: str(TextBlob(x).correct()))

0                                     fret winning suit
1              rift officers killing persist south bone
2     cyrus long baby artist hearted racial bias die...
3                      among deaths 2016 toll pop music
4     him tongue north more preparing test longrange...
5              sick cold queen elizabeth misses service
6                  taiwans accuses renewed intimidation
7             biggest lower bodies fought regain weight
8                                       mistake romance
9                    calling angels enduring trials job
10    weak federal powers limit troops climatepolicy...
11                    carbon capture technology prosper
12             maralago future winter white home calmer
13                              form healthy habits was
14                     turning vacation photo works art
15     second avenue opens train delay ends happy tears
16                  roof rejects best defense execution
17    modes cash ban brings pain corruptionweary

In [24]:
#Tokenization - dividing the text into a sequence of words or sentences
#we have used the textblob library to first transform our data into a blob and then converted them into a series of words
TextBlob(train['title'][1]).words


WordList(['rift', 'officers', 'killings', 'persist', 'south', 'bronx'])

In [26]:
#Stemming -  removal of suffices, like “ing”, “ly”, “s”
from nltk.stem import PorterStemmer
st = PorterStemmer()
train['title'].apply(lambda x: " ".join([st.stem(word) for word in x.split()]))

0                                         fret win suit
1                   rift offic kill persist south bronx
2      tyru wong bambi artist thwart racial bia die 106
3                       among death 2016 toll pop music
4     kim jongun north korea prepar test longrang mi...
5                 sick cold queen elizabeth miss servic
6                            taiwan accus renew intimid
7               biggest loser bodi fought regain weight
8                                         mixtap romanc
9                            call angel endur trial job
10    weak feder power limit trump climatepolici rol...
11                      carbon captur technolog prosper
12              maralago futur winter white home calmer
13                                form healthi habit 20
14                            turn vacat photo work art
15         second avenu open train delay end happi tear
16                       roof reject best defens execut
17    modi cash ban bring pain corruptionweari i

In [28]:
#Lemmatization - it converts the word into its root word
from textblob import Word
train['title'] = train['title'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
train['title'].head()

0                                    fret winning suit
1             rift officer killing persist south bronx
2    tyrus wong bambi artist thwarted racial bias d...
3                      among death 2016 toll pop music
4    kim jongun north korea preparing test longrang...
Name: title, dtype: object

### 3. Advance Text Processing
 

In [29]:
#N-grams - combination of multiple words used together.
TextBlob(train['title'][0]).ngrams(2)


[WordList(['fret', 'winning']), WordList(['winning', 'suit'])]

In [33]:
# Term frequency - ratio of the count of a word present in a sentence, to the length of the sentence
tf1 = (train['title'][1:2]).apply(lambda x: pd.value_counts(x.split(" "))).sum(axis = 0).reset_index()
tf1.columns = ['words','tf']
tf1

Unnamed: 0,words,tf
0,rift,1
1,bronx,1
2,persist,1
3,south,1
4,killing,1
5,officer,1


In [35]:
#Inverse Document Frequency - log of the ratio of the total number of rows to the number of rows in which that word is present
import numpy as np
for i,word in enumerate(tf1['words']):
  tf1.loc[i, 'idf'] = np.log(train.shape[0]/(len(train[train['title'].str.contains(word)])))

tf1

Unnamed: 0,words,tf,idf
0,rift,1,4.564348
1,bronx,1,4.564348
2,persist,1,4.564348
3,south,1,4.564348
4,killing,1,4.564348
5,officer,1,4.564348


In [36]:
#Term Frequency – Inverse Document Frequency (TF-IDF) - multiplication of the TF and IDF 
tf1['tfidf'] = tf1['tf'] * tf1['idf']
tf1


Unnamed: 0,words,tf,idf,tfidf
0,rift,1,4.564348,4.564348
1,bronx,1,4.564348,4.564348
2,persist,1,4.564348,4.564348
3,south,1,4.564348,4.564348
4,killing,1,4.564348,4.564348
5,officer,1,4.564348,4.564348


In [38]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_features=1000, lowercase=True, analyzer='word',
 stop_words= 'english',ngram_range=(1,1))
train_vect = tfidf.fit_transform(train['title'])

train_vect


<96x445 sparse matrix of type '<class 'numpy.float64'>'
	with 490 stored elements in Compressed Sparse Row format>

In [39]:
#Bag of Words - representation of text which describes the presence of words within the text data
from sklearn.feature_extraction.text import CountVectorizer
bow = CountVectorizer(max_features=1000, lowercase=True, ngram_range=(1,1),analyzer = "word")
train_bow = bow.fit_transform(train['title'])
train_bow


<96x470 sparse matrix of type '<class 'numpy.int64'>'
	with 522 stored elements in Compressed Sparse Row format>

In [53]:
# Word Embeddings
from gensim.scripts.glove2word2vec import glove2word2vec
glove_input_file = 'glove.6B.100d.txt'
word2vec_output_file = 'glove.6B.100d.txt.word2vec'
# convert it into the word2vec format
glove2word2vec(glove_input_file, word2vec_output_file)

(400000, 100)

In [55]:
#load the above word2vec file as a model
from gensim.models import KeyedVectors # load the Stanford GloVe model
filename = 'glove.6B.100d.txt.word2vec'
model = KeyedVectors.load_word2vec_format(filename, binary=False)

In [56]:
model['go']


array([-0.078894,  0.4616  ,  0.57779 , -0.71637 , -0.13121 ,  0.4186  ,
       -0.29156 ,  0.52006 ,  0.089986, -0.35062 ,  0.51755 ,  0.51998 ,
        0.15218 ,  0.41485 , -0.12377 , -0.37222 ,  0.0273  ,  0.75673 ,
       -0.8739  ,  0.58935 ,  0.46662 ,  0.62918 ,  0.092603, -0.012868,
       -0.015169,  0.25567 , -0.43025 , -0.77668 ,  0.71449 , -0.3834  ,
       -0.69638 ,  0.23522 ,  0.11396 ,  0.02778 ,  0.071357,  0.87409 ,
       -0.1281  ,  0.063576,  0.067867, -0.50181 , -0.28523 , -0.072536,
       -0.50738 , -0.6914  , -0.53579 , -0.11361 , -0.38234 , -0.12414 ,
        0.011214, -1.1622  ,  0.037057, -0.18495 ,  0.01416 ,  0.87193 ,
       -0.097309, -2.3565  , -0.14554 ,  0.28275 ,  2.0053  ,  0.23439 ,
       -0.38298 ,  0.69539 , -0.44916 , -0.094157,  0.90527 ,  0.65764 ,
        0.27628 ,  0.30688 , -0.57781 , -0.22987 , -0.083043, -0.57236 ,
       -0.299   , -0.81112 ,  0.039752, -0.05681 , -0.48879 , -0.18091 ,
       -0.28152 , -0.20559 ,  0.4932  , -0.033999, 

In [57]:
model['away']


array([-0.10379 , -0.014792,  0.59933 , -0.51316 , -0.036463,  0.6588  ,
       -0.57906 ,  0.17819 ,  0.23663 , -0.21384 ,  0.55339 ,  0.53597 ,
        0.041444,  0.16095 ,  0.017093, -0.37242 ,  0.017974,  0.39268 ,
       -0.23265 ,  0.1818  ,  0.66405 ,  0.98163 ,  0.42339 ,  0.030581,
        0.35015 ,  0.25519 , -0.71182 , -0.42184 ,  0.13068 , -0.47452 ,
       -0.08175 ,  0.1574  , -0.13262 ,  0.22679 , -0.16885 , -0.11122 ,
       -0.32272 , -0.020978, -0.43345 ,  0.172   , -0.67366 , -0.79052 ,
        0.10556 , -0.4219  , -0.12385 , -0.063486, -0.17843 ,  0.56359 ,
        0.16986 , -0.17804 ,  0.13956 , -0.20169 ,  0.078985,  1.4497  ,
        0.23556 , -2.6014  , -0.5286  , -0.11636 ,  1.7184  ,  0.33254 ,
        0.12136 ,  1.1602  , -0.2914  ,  0.47125 ,  0.41869 ,  0.35271 ,
        0.47869 , -0.042281, -0.18294 ,  0.1796  , -0.24431 , -0.34042 ,
        0.20337 , -0.93676 ,  0.013077,  0.080339, -0.36604 , -0.44005 ,
       -0.35393 ,  0.15907 ,  0.55807 ,  0.1492  , 

In [59]:
# take the average to represent the string ‘go away’ in the form of vectors having 100 dimensions
(model['go'] + model['away'])/2

array([-0.091342  ,  0.223404  ,  0.58856   , -0.614765  , -0.0838365 ,
        0.5387    , -0.43531   ,  0.349125  ,  0.163308  , -0.28223   ,
        0.53547   ,  0.52797496,  0.096812  ,  0.2879    , -0.0533385 ,
       -0.37232   ,  0.022637  ,  0.574705  , -0.553275  ,  0.385575  ,
        0.565335  ,  0.805405  ,  0.2579965 ,  0.0088565 ,  0.1674905 ,
        0.25543   , -0.571035  , -0.59926   ,  0.422585  , -0.42896   ,
       -0.389065  ,  0.19631   , -0.00933   ,  0.127285  , -0.0487465 ,
        0.381435  , -0.22540998,  0.021299  , -0.1827915 , -0.16490501,
       -0.47944498, -0.431528  , -0.20091   , -0.55665   , -0.32982   ,
       -0.088548  , -0.28038502,  0.219725  ,  0.090537  , -0.67012   ,
        0.0883085 , -0.19332   ,  0.0465725 ,  1.160815  ,  0.0691255 ,
       -2.47895   , -0.33707   ,  0.083195  ,  1.86185   ,  0.283465  ,
       -0.13081   ,  0.927795  , -0.37028   ,  0.1885465 ,  0.66198   ,
        0.505175  ,  0.37748498,  0.1322995 , -0.380375  , -0.02