In [84]:
!unzip ../../usr/local/share/nltk_data/corpora/wordnet.zip -d ../../usr/local/share/nltk_data/corpora/
!ls -r ../../usr/local/share/nltk_data/corpora/

Archive:  ../../usr/local/share/nltk_data/corpora/wordnet.zip
   creating: ../../usr/local/share/nltk_data/corpora/wordnet/
  inflating: ../../usr/local/share/nltk_data/corpora/wordnet/lexnames  
  inflating: ../../usr/local/share/nltk_data/corpora/wordnet/data.verb  
  inflating: ../../usr/local/share/nltk_data/corpora/wordnet/index.adv  
  inflating: ../../usr/local/share/nltk_data/corpora/wordnet/adv.exc  
  inflating: ../../usr/local/share/nltk_data/corpora/wordnet/index.verb  
  inflating: ../../usr/local/share/nltk_data/corpora/wordnet/cntlist.rev  
  inflating: ../../usr/local/share/nltk_data/corpora/wordnet/data.adj  
  inflating: ../../usr/local/share/nltk_data/corpora/wordnet/index.adj  
  inflating: ../../usr/local/share/nltk_data/corpora/wordnet/LICENSE  
  inflating: ../../usr/local/share/nltk_data/corpora/wordnet/citation.bib  
  inflating: ../../usr/local/share/nltk_data/corpora/wordnet/noun.exc  
  inflating: ../../usr/local/share/nltk_data/corpora/wordnet/verb.exc  
  

In [85]:
import pandas as pd
import numpy as np
import nltk
nltk.download('punkt',download_dir='/usr/local/share/nltk_data')
nltk.download('wordnet',download_dir='/usr/local/share/nltk_data')
nltk.download('omw-1.4')
from nltk.corpus import stopwords
nltk.download('stopwords')
import re
from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm

[nltk_data] Downloading package punkt to /usr/local/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /usr/local/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [69]:
_wnl = nltk.WordNetLemmatizer()
def normalize_word(word):
    return _wnl.lemmatize(word).lower()

def get_tokenized_lemmas(s):
    list=nltk.word_tokenize(s)
    tokenized_list=[]
    for token in list:
        tokenized_list.append(normalize_word(token))
    return tokenized_list

def clean(s):
    # Cleans a string: Lowercasing, trimming, removing non-alphanumeric
    return " ".join(re.findall(r'\w+', s, flags=re.UNICODE)).lower()

def remove_stopwords(l):
    # Removes stopwords from a list of tokens
    list=[]
    for word in l:
        if word not in feature_extraction.text.ENGLISH_STOP_WORDS:
            list.append(word)
    return list

def preprocess(headlines,bodies):
  n_headlines, n_bodies =[],[]
  for i, (headline, body) in tqdm(enumerate(zip(headlines, bodies))):
    clean_headline = get_tokenized_lemmas(clean(headline))
    clean_body = get_tokenized_lemmas(clean(body))
    clean_headline = remove_stopwords(clean_headline)
    clean_body = remove_stopwords(clean_body)
    n_headlines.append(clean_headline)
    n_bodies.append(clean_body)
  n_headlines_df=pd.DataFrame({'Headline':n_headlines})
  n_bodies_df=pd.DataFrame({'Body':n_bodies})
  return n_headlines_df['Headline'].apply(lambda x:' '.join(x)), n_bodies_df['Body'].apply(lambda x:' '.join(x))


In [86]:
def statistical_features(dataset_loc):
  # we always train the model based on the train data
  df = pd.read_csv('/kaggle/working/train_Set.csv')
  dataset = pd.read_csv(dataset_loc)
  stop_words_l=stopwords.words('english')
  headlines = dataset['Headline']
  bodies = dataset['Body']
  headlines,bodies = preprocess(headlines,bodies)
  df['Headline'], df['Body'] = preprocess(df['Headline'],df['Body'])
  #1-gram TF for headlines
  headline_vectorizer = TfidfVectorizer(stop_words=stop_words_l)
  #training of model for headlines
  h1 = headline_vectorizer.fit(df['Headline'])
  h = h1.transform(headlines)
  # 1-gram TF for bodies
  body_vectorizer = TfidfVectorizer(stop_words=stop_words_l,max_features=10000-h.shape[1])
  #training of model for bodies
  b1 = body_vectorizer.fit(df['Body'])
  b = b1.transform(bodies)
  statistical_features = np.concatenate((np.array(h.toarray()),np.array(b.toarray())),axis = 1)
  return statistical_features


In [87]:
statistical_features_train = statistical_features('/kaggle/working/train_Set.csv')
statistical_features_test = statistical_features('/kaggle/working/test_Set.csv')

49972it [05:49, 142.84it/s]
49972it [05:45, 144.43it/s]
25413it [02:45, 153.10it/s]
49972it [05:48, 143.54it/s]


In [88]:
print(np.count_nonzero(statistical_features_train[500]))
print(np.count_nonzero(statistical_features_test[0]))

239
119


In [90]:
statistical_features_test.shape

(25413, 10000)

In [91]:
np.save(arr=statistical_features_test,file='/kaggle/working/test_statistical_features.npy')
np.save(arr=statistical_features_train,file='/kaggle/working/train_statistical_features.npy')