In [1]:
import os

import numpy as np
import pandas as pd
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer

from InputDataset import FramingArticleDataset

  from .autonotebook import tqdm as notebook_tqdm


# Test tokenizer, lemmatization, stopword and punctuation removal with Spacy

In [2]:
en_nlp = spacy.load('en_core_web_sm')

In [3]:
sem_eval_train = FramingArticleDataset(data_dir='data', language='en', subtask=2, split='train')
sem_eval_train.separate_title_content(remove_raw_data_col=False)

433it [00:00, 35075.39it/s]


## Generate TF-IDF features

In [4]:
lemmatize_rm_punct_and_stop = lambda str: [token.lemma_ for token in en_nlp(str) if not token.is_punct and not token.is_stop]

In [5]:
tf_idf_vectorizer = TfidfVectorizer(tokenizer=lemmatize_rm_punct_and_stop, analyzer='word', max_df=0.95, min_df=5, max_features=None, use_idf=True, smooth_idf=True, sublinear_tf=False)

In [6]:
%timeit tf_idf_vectorizer = TfidfVectorizer(tokenizer=lemmatize_rm_punct_and_stop, analyzer='word', max_df=1, min_df=1, max_features=None, use_idf=True, smooth_idf=True, sublinear_tf=False)

1.6 µs ± 37.5 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)


In [7]:
tf_idf_vectorizer.fit(sem_eval_train.df.raw_text.to_list())



In [8]:
len(tf_idf_vectorizer.get_feature_names())



4348

In [9]:
tf_idf_vectorizer.get_feature_names()

['\n\n\n',
 '\n\n\n\n',
 '\n\n\n\n\n',
 ' ',
 '$',
 '+',
 '1',
 '1,000',
 '1.2',
 '10',
 '10,000',
 '100',
 '100,000',
 '11',
 '11th',
 '12',
 '120',
 '13',
 '14',
 '15',
 '150',
 '16',
 '17',
 '18',
 '19',
 '1960',
 '1961',
 '1965',
 '1976',
 '1979',
 '1980',
 '1982',
 '1983',
 '1990',
 '1990s',
 '1993',
 '1994',
 '1995',
 '1997',
 '1998',
 '19th',
 '1st',
 '2',
 '2,000',
 '20',
 '200',
 '2000',
 '2001',
 '2002',
 '2003',
 '2004',
 '2005',
 '2006',
 '2007',
 '2008',
 '2009',
 '2010',
 '2011',
 '2012',
 '2013',
 '2014',
 '2015',
 '2016',
 '2017',
 '2018',
 '2019',
 '2020',
 '21',
 '22',
 '23',
 '24',
 '24/7',
 '25',
 '26',
 '27',
 '28',
 '29',
 '3',
 '30',
 '300',
 '31',
 '32',
 '32nd',
 '33',
 '34',
 '35',
 '36',
 '38',
 '39',
 '3d',
 '4',
 '40',
 '45',
 '46',
 '47',
 '48',
 '49',
 '5',
 '5,000',
 '50',
 '50,000',
 '500',
 '51',
 '53',
 '54',
 '55',
 '58',
 '5th',
 '6',
 '60',
 '600',
 '66',
 '7',
 '70',
 '700',
 '73',
 '75',
 '8',
 '80',
 '800',
 '84',
 '9',
 '9/11',
 '90',
 '91',
 '

In [10]:
tf_idf_vectorizer.transform(sem_eval_train.df.raw_text.iloc[10:12])

<2x4348 sparse matrix of type '<class 'numpy.float64'>'
	with 415 stored elements in Compressed Sparse Row format>