In [1]:
import csv
import os
from collections import defaultdict
import pandas as pd
from nltk.corpus import stopwords
from textblob import TextBlob, Word
from gensim.scripts.glove2word2vec import glove2word2vec
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from gensim.models import KeyedVectors # load the Stanford GloVe model
#nltk.download()



In [2]:
os.getcwd()
os.chdir("C:\\Users\\Naini\\final-project\\News-Headline-Generation\\data")

In [None]:
#columns = defaultdict(list) # each value in each column is appended to a list


In [None]:
'''with open('articles_small.csv', errors='ignore') as f:
    reader = csv.DictReader(f) # read rows into a dictionary format
    for row in reader: # read a row as {column1: value1, column2: value2,...}
        for (k,v) in row.items(): # go over each column name and value 
            columns[k].append(v) # append the value into the appropriate list
                                 # based on column name k '''
 

In [None]:
#print(type(columns['title']))
#print('---------------------------------------------------------------------------------------------------')   




In [None]:
''' for i,x in enumerate(columns['title']):
    print(x)
    print('---------------------------------------------------------------------------------------------------')  ''' 

### 1. Basic Feature Extraction

In [3]:
#reading csv
train = pd.read_csv('articles_small.csv')

In [4]:
#Number of Words
train['word_count'] = train['content'].apply(lambda x: len(str(x).split(" ")))
train[['content','word_count']].head()

Unnamed: 0,content,word_count
0,WASHINGTON — Congressional Republicans have...,920
1,"After the bullet shells get counted, the blood...",4905
2,"When Walt Disney’s “Bambi” opened in 1942, cri...",2521
3,"Death may be the great equalizer, but it isn’t...",2212
4,"SEOUL, South Korea — North Korea’s leader, ...",741


In [5]:
#Number of characters
train['char_count'] = train['content'].str.len() ## this also includes spaces
train[['content','char_count']].head()

Unnamed: 0,content,char_count
0,WASHINGTON — Congressional Republicans have...,5607
1,"After the bullet shells get counted, the blood...",27834
2,"When Walt Disney’s “Bambi” opened in 1942, cri...",14018
3,"Death may be the great equalizer, but it isn’t...",12274
4,"SEOUL, South Korea — North Korea’s leader, ...",4195


In [6]:
# Average Word Length
def avg_word(sentence):
  words = sentence.split()
  return (sum(len(word) for word in words)/len(words))

In [7]:
train['avg_word'] = train['content'].apply(lambda x: avg_word(x))
train[['content','avg_word']].head()

Unnamed: 0,content,avg_word
0,WASHINGTON — Congressional Republicans have...,5.303167
1,"After the bullet shells get counted, the blood...",4.807128
2,"When Walt Disney’s “Bambi” opened in 1942, cri...",4.812892
3,"Death may be the great equalizer, but it isn’t...",4.748938
4,"SEOUL, South Korea — North Korea’s leader, ...",4.978386


In [8]:
#Number of stopwords
stop = stopwords.words('english')

In [9]:
train['stopwords'] = train['content'].apply(lambda x: len([x for x in x.split() if x in stop]))
train[['content','stopwords']].head()

Unnamed: 0,content,stopwords
0,WASHINGTON — Congressional Republicans have...,340
1,"After the bullet shells get counted, the blood...",1888
2,"When Walt Disney’s “Bambi” opened in 1942, cri...",883
3,"Death may be the great equalizer, but it isn’t...",787
4,"SEOUL, South Korea — North Korea’s leader, ...",236


In [10]:
#Number of numerics
train['numerics'] = train['content'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))
train[['content','numerics']].head()

Unnamed: 0,content,numerics
0,WASHINGTON — Congressional Republicans have...,0
1,"After the bullet shells get counted, the blood...",56
2,"When Walt Disney’s “Bambi” opened in 1942, cri...",11
3,"Death may be the great equalizer, but it isn’t...",13
4,"SEOUL, South Korea — North Korea’s leader, ...",5


In [11]:
#Number of Uppercase words
train['upper'] = train['content'].apply(lambda x: len([x for x in x.split() if x.isupper()]))
train[['content','upper']].head()

Unnamed: 0,content,upper
0,WASHINGTON — Congressional Republicans have...,5
1,"After the bullet shells get counted, the blood...",29
2,"When Walt Disney’s “Bambi” opened in 1942, cri...",14
3,"Death may be the great equalizer, but it isn’t...",16
4,"SEOUL, South Korea — North Korea’s leader, ...",9


### 2. Basic Pre-processing

In [12]:
#transform data into lower case
train['content'] = train['content'].apply(lambda x: " ".join(x.lower() for x in x.split()))
train['content'].head()

0    washington — congressional republicans have a ...
1    after the bullet shells get counted, the blood...
2    when walt disney’s “bambi” opened in 1942, cri...
3    death may be the great equalizer, but it isn’t...
4    seoul, south korea — north korea’s leader, kim...
Name: content, dtype: object

In [13]:
#Removing Punctuation
train['content'] = train['content'].str.replace('[^\w\s]','')
train['content'].head()

0    washington  congressional republicans have a n...
1    after the bullet shells get counted the blood ...
2    when walt disneys bambi opened in 1942 critics...
3    death may be the great equalizer but it isnt n...
4    seoul south korea  north koreas leader kim sai...
Name: content, dtype: object

In [14]:
#Removal of Stop Words
train['content'] = train['content'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
train['content'].head()

0    washington congressional republicans new fear ...
1    bullet shells get counted blood dries votive c...
2    walt disneys bambi opened 1942 critics praised...
3    death may great equalizer isnt necessarily eve...
4    seoul south korea north koreas leader kim said...
Name: content, dtype: object

In [15]:
#Common word removal
freq = pd.Series(' '.join(train['content']).split()).value_counts()[:10]
freq

mr        985
said      834
new       311
one       280
would     260
trump     241
ms        230
people    229
like      210
years     200
dtype: int64

In [16]:
freq = list(freq.index)
train['content'] = train['content'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))
train['content'].head()

0    washington congressional republicans fear come...
1    bullet shells get counted blood dries votive c...
2    walt disneys bambi opened 1942 critics praised...
3    death may great equalizer isnt necessarily eve...
4    seoul south korea north koreas leader kim sund...
Name: content, dtype: object

In [17]:
#Rare words removal
rare = pd.Series(' '.join(train['content']).split()).value_counts()[-10:]
rare

jeopardy        1
wei             1
vineyard        1
scarred         1
dress           1
backfire        1
hatabs          1
polluting       1
philharmonic    1
unity           1
dtype: int64

In [18]:
rare = list(rare.index)
train['content'] = train['content'].apply(lambda x: " ".join(x for x in x.split() if x not in rare))
train['content'].head()

0    washington congressional republicans fear come...
1    bullet shells get counted blood dries votive c...
2    walt disneys bambi opened 1942 critics praised...
3    death may great equalizer isnt necessarily eve...
4    seoul south korea north koreas leader kim sund...
Name: content, dtype: object

In [19]:
#Spelling correction
train['content'][:10].apply(lambda x: str(TextBlob(x).correct()))

0    washington congressional republicans fear come...
1    bullet shells get counted blood dries votive c...
2    walt dinners baby opened 1942 critics praised ...
3    death may great equalizer isn necessarily even...
4    soul south more north sores leader him sunday ...
5    london queen elizabeth ii rattling cold week m...
6    being president tsar taiwan sharply criticized...
7    dandy chill stood slightly dazed blizzard conf...
8    axillary herr founder digital media company lo...
9    angels everywhere music family apartment bone ...
Name: content, dtype: object

In [38]:
#Tokenization - dividing the text into a sequence of words or sentences
#we have used the textblob library to first transform our data into a blob and then converted them into a series of words
for i,x in enumerate(train['content']):
    TextBlob(train['content'][i]).words





In [39]:
#Stemming -  removal of suffices, like “ing”, “ly”, “s”
st = PorterStemmer()
train['content'].apply(lambda x: " ".join([st.stem(word) for word in x.split()]))

0     washington congression republican fear come he...
1     bullet shell get count blood dri votiv candl b...
2     walt disney bambi open 1942 critic prais spare...
3     death may great equal isnt necessarili evenhan...
4     seoul south korea north korea leader kim sunda...
5     london queen elizabeth ii battl cold week miss...
6     beij presid tsai taiwan sharpli critic china l...
7     danni cahil stood slightli daze blizzard confe...
8     hillari kerr founder digit media compani lo an...
9     angel everywher muñiz famili apart bronx paint...
10    donald j take control white hous seem dark tim...
11    thompson tex promis troubl technolog fight glo...
12    west palm beach fla donald j rang year weekend...
13    articl part seri aim help navig life opportun ...
14    season famili travel photo perhap enlarg imag ...
15    final second avenu subway open york citi sunda...
16    page journal found dylann roof car assert blac...
17    mumbai india bold riski gambl prime minist

In [40]:
#Lemmatization - it converts the word into its root word
train['content'] = train['content'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
train['content'].head()

0    washington congressional republican fear come ...
1    bullet shell get counted blood dry votive cand...
2    walt disney bambi opened 1942 critic praised s...
3    death may great equalizer isnt necessarily eve...
4    seoul south korea north korea leader kim sunda...
Name: content, dtype: object

### 3. Advance Text Processing
 

In [46]:
#N-grams - combination of multiple words used together.
for i,x in enumerate(train['content']):
    TextBlob(train['content'][i]).ngrams(2)




In [48]:
# Term frequency - ratio of the count of a word present in a sentence, to the length of the sentence
tf1 = (train['content']).apply(lambda x: pd.value_counts(x.split(" "))).sum(axis = 0).reset_index()
tf1.columns = ['words','tf']
tf1

Unnamed: 0,words,tf
0,0,13.0
1,000,49.0
2,025,1.0
3,053,1.0
4,1,51.0
5,10,39.0
6,100,25.0
7,101,1.0
8,1036,1.0
9,104,1.0


In [49]:
#Inverse Document Frequency - log of the ratio of the total number of rows to the number of rows in which that word is present
import numpy as np
for i,word in enumerate(tf1['words']):
  tf1.loc[i, 'idf'] = np.log(train.shape[0]/(len(train[train['content'].str.contains(word)])))

tf1

Unnamed: 0,words,tf,idf
0,0,13.0,0.053489
1,000,49.0,1.428854
2,025,1.0,3.871201
3,053,1.0,4.564348
4,1,51.0,0.064539
5,10,39.0,0.632523
6,100,25.0,1.519826
7,101,1.0,4.564348
8,1036,1.0,4.564348
9,104,1.0,4.564348


In [50]:
#Term Frequency – Inverse Document Frequency (TF-IDF) - multiplication of the TF and IDF 
tf1['tfidf'] = tf1['tf'] * tf1['idf']
tf1


Unnamed: 0,words,tf,idf,tfidf
0,0,13.0,0.053489,0.695353
1,000,49.0,1.428854,70.013845
2,025,1.0,3.871201,3.871201
3,053,1.0,4.564348,4.564348
4,1,51.0,0.064539,3.291465
5,10,39.0,0.632523,24.668380
6,100,25.0,1.519826,37.995644
7,101,1.0,4.564348,4.564348
8,1036,1.0,4.564348,4.564348
9,104,1.0,4.564348,4.564348


In [51]:
tfidf = TfidfVectorizer(max_features=1000, lowercase=True, analyzer='word',
 stop_words= 'english',ngram_range=(1,1))
train_vect = tfidf.fit_transform(train['content'])

train_vect


<96x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 14953 stored elements in Compressed Sparse Row format>

In [52]:
#Bag of Words - representation of text which describes the presence of words within the text data
bow = CountVectorizer(max_features=1000, lowercase=True, ngram_range=(1,1),analyzer = "word")
train_bow = bow.fit_transform(train['content'])
train_bow


<96x1000 sparse matrix of type '<class 'numpy.int64'>'
	with 16960 stored elements in Compressed Sparse Row format>

In [53]:
# Word Embeddings
glove_input_file = 'glove.6B.100d.txt'
word2vec_output_file = 'glove.6B.100d.txt.word2vec'
# convert it into the word2vec format
glove2word2vec(glove_input_file, word2vec_output_file)

(400000, 100)

In [54]:
#load the above word2vec file as a model
filename = 'glove.6B.100d.txt.word2vec'
model = KeyedVectors.load_word2vec_format(filename, binary=False)

In [55]:
model['music']


array([ 1.2705e-01,  3.7253e-01,  2.3076e-01, -1.0727e-01,  1.4273e+00,
        7.7990e-01, -5.5009e-02, -1.8909e-01, -4.2828e-01,  2.8443e-01,
        1.1920e-01, -6.1978e-01, -3.9817e-01,  1.9127e-01, -3.2098e-01,
       -2.4116e-01,  6.0927e-01, -3.5765e-01,  2.0159e-01,  9.1107e-01,
       -4.8977e-01, -2.8970e-01,  1.0098e+00, -6.6149e-01,  7.0414e-02,
        1.3353e-01,  4.2234e-01, -8.0188e-02,  1.0722e+00,  2.6426e-01,
       -7.3982e-01,  6.4958e-01, -4.8367e-01,  8.5837e-01, -3.9008e-01,
       -1.3020e-01,  2.2331e-01,  2.0818e-01, -4.7260e-01, -1.7275e+00,
        3.7209e-01,  5.5544e-01, -8.9025e-01,  8.7442e-01, -2.9063e-01,
       -7.1175e-01, -3.3437e-01, -4.1141e-01,  9.9049e-01, -4.6207e-01,
       -2.5777e-01,  4.9513e-02, -2.2242e-01, -3.3757e-01, -1.5677e-01,
       -2.9576e+00, -5.8878e-01,  2.2432e-01,  1.6375e+00,  1.0263e+00,
        8.9573e-01,  1.3718e+00, -3.7709e-01, -8.2095e-02,  1.6339e-01,
       -2.5294e-01,  1.0987e+00, -2.0032e-03,  7.9235e-01, -2.94

In [56]:
model['family']


array([ 4.2179e-01, -9.6730e-02,  1.0657e-01, -2.1117e-01, -5.4202e-01,
        8.8692e-01, -1.4038e-01, -1.0424e-01, -1.6009e-01,  1.5360e-01,
       -3.7699e-01,  4.5063e-02,  4.4316e-01,  3.9670e-01, -5.7958e-01,
       -3.5208e-01,  5.2960e-01, -4.3271e-01, -2.1603e-01,  1.0731e+00,
       -3.3560e-01,  6.7252e-02,  5.7345e-01,  3.5972e-01,  5.6134e-01,
       -3.1222e-01, -5.8412e-01, -2.2302e-01,  8.1725e-02,  5.4772e-01,
        3.5482e-01,  8.9450e-01,  6.9674e-01,  6.2971e-02,  1.7604e-01,
        6.7448e-01,  5.9729e-01,  4.9058e-01,  5.2370e-01, -7.8386e-02,
       -5.2658e-01, -5.6530e-01,  4.5006e-01, -7.5849e-01, -5.2401e-02,
        5.1847e-02, -3.2363e-01,  7.1197e-01,  5.8238e-01, -7.0496e-01,
       -1.9225e-01, -1.0275e+00,  8.8209e-01,  6.8192e-01, -7.0748e-02,
       -1.7905e+00, -8.8179e-01, -8.5265e-01,  1.3588e+00,  1.0301e+00,
        2.6106e-01,  8.9355e-01,  4.3638e-01, -5.8021e-01,  1.3251e+00,
       -5.7793e-01, -1.4836e-01, -9.2882e-02,  3.8736e-01,  2.26

In [57]:
# take the average to represent the string ‘go away’ in the form of vectors having 100 dimensions
(model['music'] + model['family'])/2

array([ 0.27442   ,  0.13790001,  0.16866499, -0.15922001,  0.44263998,
        0.83341   , -0.0976945 , -0.146665  , -0.29418498,  0.219015  ,
       -0.128895  , -0.2873585 ,  0.022495  ,  0.293985  , -0.45028   ,
       -0.29662   ,  0.569435  , -0.39518   , -0.00722   ,  0.992085  ,
       -0.41268498, -0.111224  ,  0.791625  , -0.15088502,  0.315877  ,
       -0.089345  , -0.08088998, -0.151604  ,  0.5769625 ,  0.40599   ,
       -0.1925    ,  0.77204   ,  0.10653499,  0.4606705 , -0.10702001,
        0.27214003,  0.4103    ,  0.34938   ,  0.02554999, -0.90294296,
       -0.07724498, -0.00492999, -0.22009501,  0.05796498, -0.17151551,
       -0.3299515 , -0.329     ,  0.15027998,  0.786435  , -0.583515  ,
       -0.22501001, -0.48899353,  0.329835  ,  0.17217499, -0.113759  ,
       -2.3740501 , -0.735285  , -0.314165  ,  1.4981501 ,  1.0281999 ,
        0.578395  ,  1.1326749 ,  0.029645  , -0.3311525 ,  0.744245  ,
       -0.415435  ,  0.47517002, -0.0474426 ,  0.589855  , -0.03