In [None]:
import csv
import os
from collections import defaultdict
import pandas as pd
from nltk.corpus import stopwords
from textblob import TextBlob, Word
from gensim.scripts.glove2word2vec import glove2word2vec
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from gensim.models import KeyedVectors # load the Stanford GloVe model
#nltk.download()

In [None]:
# Switch the working directory to the project's data folder so that the
# relative file names used by later cells resolve.
# The folder can be overridden through the NEWS_DATA_DIR environment variable;
# the original hard-coded path stays as the default so existing setups keep
# working unchanged.
DATA_DIR = os.environ.get(
    "NEWS_DATA_DIR",
    "C:\\Users\\Naini\\final-project\\News-Headline-Generation\\data",
)
os.chdir(DATA_DIR)

### 1. Basic Feature Extraction

In [None]:
# Load the article data set into the `train` DataFrame (path is relative to
# the current working directory).
train = pd.read_csv("articles_small.csv")

In [None]:
# Number of words in each title.
# split() without an argument collapses runs of whitespace, so repeated,
# leading or trailing spaces no longer inflate the count and an empty title
# counts as 0 words. str() keeps non-string values from raising.
train['word_count'] = train['title'].apply(lambda title: len(str(title).split()))
# Show a preview only instead of rendering the whole frame.
train[['title', 'word_count']].head()

In [None]:
# Character count of each title (spaces are counted as characters too)
train['char_count'] = train.loc[:, 'title'].str.len()
train.loc[:, ['title', 'char_count']].head()

In [None]:
# Average word length
def avg_word(sentence):
    """Return the mean length of the whitespace-separated words in `sentence`.

    Parameters
    ----------
    sentence : str
        Text to measure; words are obtained with ``str.split()``.

    Returns
    -------
    float
        Sum of the word lengths divided by the number of words, or ``0.0``
        when there are no words to average (empty or whitespace-only text,
        or a non-string value such as NaN).
    """
    # Non-string input has no words to measure; previously this raised
    # AttributeError on `.split`.
    if not isinstance(sentence, str):
        return 0.0
    words = sentence.split()
    # Guard against division by zero for empty / whitespace-only text.
    if not words:
        return 0.0
    return sum(len(word) for word in words) / len(words)

In [None]:
# Mean word length of every title, computed with the avg_word helper
train['avg_word'] = train['title'].apply(avg_word)
train.loc[:, ['title', 'avg_word']].head()

In [None]:
# Number of stopwords: fetch NLTK's English stop-word list, which the
# following cells use for counting and for removal.
stop = stopwords.words("english")

In [None]:
# For each title, count the tokens that appear in the stop-word list
train['stopwords'] = train['title'].apply(
    lambda title: sum(1 for token in title.split() if token in stop)
)
train.loc[:, ['title', 'stopwords']].head()

In [None]:
# Count the tokens of each title that consist of digits only
train['numerics'] = train['title'].apply(
    lambda title: sum(1 for token in title.split() if token.isdigit())
)
train.loc[:, ['title', 'numerics']].head()

In [None]:
# Count the fully upper-case tokens in each title
train['upper'] = train['title'].apply(
    lambda title: sum(1 for token in title.split() if token.isupper())
)
train.loc[:, ['title', 'upper']].head()

### 2. Basic Pre-processing

In [None]:
# Lower-case every token of each title; tokens are re-joined with single spaces
train['title'] = train['title'].apply(
    lambda title: " ".join(map(str.lower, title.split()))
)
train['title'].head()

In [None]:
# Removing punctuation: delete every character that is neither a word
# character nor whitespace.
# regex=True is required: from pandas 2.0 on, str.replace treats the pattern
# as a literal string by default, which would leave the titles untouched.
# The raw string avoids the invalid "\w" / "\s" escape-sequence warnings.
train['title'] = train['title'].str.replace(r'[^\w\s]', '', regex=True)
train['title'].head()

In [None]:
# Drop every token that appears in the stop-word list from each title
train['title'] = train['title'].apply(
    lambda title: " ".join([token for token in title.split() if token not in stop])
)
train['title'].head()

In [None]:
# Common word removal, step 1: the ten most frequent tokens over all titles
freq = (
    pd.Series(' '.join(train['title']).split())
    .value_counts()
    .iloc[:10]
)
freq

In [None]:
# Remove the most common words (computed in the previous cell) from every title.
# `freq` arrives as the value_counts Series and leaves as a plain list of
# words. The isinstance check makes the cell safe to re-run: the original
# unconditional `list(freq.index)` fails once `freq` is already a list,
# because a list's `.index` is a method, not an iterable.
if isinstance(freq, pd.Series):
    freq = list(freq.index)
common_words = set(freq)  # set membership instead of scanning the list per token
train['title'] = train['title'].apply(
    lambda title: " ".join(word for word in title.split() if word not in common_words)
)
train['title'].head()

In [None]:
# Rare word removal, step 1: the last ten entries of the token frequency table
rare = (
    pd.Series(' '.join(train['title']).split())
    .value_counts()
    .iloc[-10:]
)
rare

In [None]:
# Remove the rare words (computed in the previous cell) from every title.
# `rare` arrives as the value_counts Series and leaves as a plain list of
# words. The isinstance check makes the cell safe to re-run: the original
# unconditional `list(rare.index)` fails once `rare` is already a list,
# because a list's `.index` is a method, not an iterable.
if isinstance(rare, pd.Series):
    rare = list(rare.index)
rare_words = set(rare)  # set membership instead of scanning the list per token
train['title'] = train['title'].apply(
    lambda title: " ".join(word for word in title.split() if word not in rare_words)
)
train['title'].head()

In [None]:
# Spelling correction (demonstration only: the corrected text is displayed,
# not written back to train['title']).
# TextBlob's correct() is slow, so only the first few titles are corrected
# rather than the whole column, whose result would be discarded anyway.
train['title'].head().apply(lambda title: str(TextBlob(title).correct()))

In [None]:
# Tokenization: splitting text into a sequence of words or sentences.
# The title at index 1 is wrapped in a TextBlob, whose `words` attribute
# gives the title as a series of words.
sample_title = train['title'][1]
TextBlob(sample_title).words


In [None]:
# Stemming: removal of suffixes such as "ing", "ly", "s".
# Demonstration only: the stemmed text is displayed, not written back to
# train['title'], so only a preview of the first titles is stemmed.
# PorterStemmer is already imported in the notebook's first cell.
st = PorterStemmer()
train['title'].head().apply(lambda title: " ".join(st.stem(word) for word in title.split()))

In [None]:
# Lemmatization: replace every word of each title by its root word,
# using TextBlob's Word.lemmatize()
from textblob import Word
train['title'] = train['title'].apply(
    lambda title: " ".join(Word(token).lemmatize() for token in title.split())
)
train['title'].head()

### 3. Advanced Text Processing
 

In [None]:
# N-grams: combinations of multiple words used together.
# Here: the 2-grams of the title at index 0.
TextBlob(train['title'][0]).ngrams(n=2)


In [None]:
# Term frequency: how often each word occurs in one title (the row selected
# by the [1:2] slice). Note that `tf` holds the raw count; it is not divided
# by the title length.
# Series.value_counts() replaces the top-level pd.value_counts(), which is
# deprecated in recent pandas releases.
tf1 = (
    train['title'][1:2]
    .apply(lambda title: pd.Series(title.split(" ")).value_counts())
    .sum(axis=0)
    .reset_index()
)
tf1.columns = ['words', 'tf']
tf1

In [None]:
# Inverse document frequency: log of (total number of rows / number of rows
# in which the word is present).
import numpy as np

# Tokenise every title once, with the same split(" ") that produced the words
# in tf1, and match on whole tokens. The previous str.contains(word) treated
# each word as a regular expression and as a substring (e.g. "art" also
# matched "party"), which inflated the document count; it also rescanned the
# whole column for every word.
title_token_sets = train['title'].str.split(" ").apply(set)
n_titles = train.shape[0]
for i, word in enumerate(tf1['words']):
    doc_count = sum(word in tokens for tokens in title_token_sets)
    tf1.loc[i, 'idf'] = np.log(n_titles / doc_count)

tf1

In [None]:
# TF-IDF: the element-wise product of the `tf` and `idf` columns
tf1['tfidf'] = tf1['tf'].mul(tf1['idf'])
tf1


In [None]:
# TF-IDF matrix for all titles with scikit-learn: word-level unigrams,
# English stop words removed, at most 1000 features.
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 1),
    lowercase=True,
    stop_words='english',
    max_features=1000,
)
train_vect = tfidf.fit_transform(train['title'])

train_vect


In [None]:
# Bag of Words: a representation of text that describes the presence of words
# within the text data. Word-level unigram counts, at most 1000 features.
from sklearn.feature_extraction.text import CountVectorizer
bow = CountVectorizer(
    analyzer="word",
    ngram_range=(1, 1),
    lowercase=True,
    max_features=1000,
)
train_bow = bow.fit_transform(train['title'])
train_bow


In [None]:
# Word embeddings: convert the GloVe vector file into word2vec format.
# glove2word2vec and os are already imported in the notebook's first cell.
glove_input_file = 'glove.6B.100d.txt'
word2vec_output_file = 'glove.6B.100d.txt.word2vec'
# The conversion rewrites the whole vector file, so it is skipped when the
# converted file already exists. Delete that file to force a fresh conversion.
if not os.path.exists(word2vec_output_file):
    glove2word2vec(glove_input_file, word2vec_output_file)

In [None]:
# Load the converted word2vec-format text file (binary=False) as `model`
from gensim.models import KeyedVectors  # loader for the Stanford GloVe model
filename = "glove.6B.100d.txt.word2vec"
model = KeyedVectors.load_word2vec_format(
    filename,
    binary=False,
)

In [None]:
# Look up the embedding vector stored for the word "go"
model["go"]


In [None]:
# Look up the embedding vector stored for the word "away"
model["away"]


In [None]:
# Represent the string "go away" as one vector: the element-wise mean of the
# two word vectors (100 dimensions for this GloVe file)
(model["go"] + model["away"]) * 0.5