In [3]:
import pandas as pd
import re
from gensim.parsing.preprocessing import STOPWORDS
from gensim.parsing.preprocessing import strip_tags
from gensim.parsing.preprocessing import strip_numeric
from gensim.parsing.preprocessing import strip_punctuation
from gensim.parsing.preprocessing import strip_multiple_whitespaces
from gensim.parsing.preprocessing import remove_stopwords, strip_short, stem_text
import pickle
import spacy
from transformers import BertTokenizer
import nltk
from nltk.corpus import stopwords

In [4]:
# Path to the downloaded nltk file for stop words

nltk.data.path.append('/Users/muhammadraza/Documents/GitHub/BIPM/Text Analytics Lab/stopwords')


In [35]:
# Reading in the json file

df = pd.read_json('/Users/muhammadraza/Documents/GitHub/BIPM/Text Analytics Lab/newsgroups.json')

In [6]:
# Exploring df

df.head()

Unnamed: 0,content,target,target_names
0,From: lerxst@wam.umd.edu (where's my thing)\nS...,7,rec.autos
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,4,comp.sys.mac.hardware
2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...,4,comp.sys.mac.hardware
3,From: jgreen@amber (Joe Green)\nSubject: Re: W...,1,comp.graphics
4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...,14,sci.space


In [7]:
# Exploring sample content

print(df['content'][1])

From: guykuo@carson.u.washington.edu (Guy Kuo)
Subject: SI Clock Poll - Final Call
Summary: Final call for SI clock reports
Keywords: SI,acceleration,clock,upgrade
Article-I.D.: shelley.1qvfo9INNc3s
Organization: University of Washington
Lines: 11
NNTP-Posting-Host: carson.u.washington.edu

A fair number of brave souls who upgraded their SI clock oscillator have
shared their experiences for this poll. Please send a brief message detailing
your experiences with the procedure. Top speed attained, CPU rated speed,
add on cards and adapters, heat sinks, hour of usage per day, floppy disk
functionality with 800 and 1.4 m floppies are especially requested.

I will be summarizing in the next two days, so please add to the network
knowledge base if you have done the clock upgrade and haven't answered this
poll. Thanks.

Guy Kuo <guykuo@u.washington.edu>



In [8]:
# Regex function to remove the metadata lines such as from and subject

def preprocess_text(text):

    pattern = r'^(From:|Article-I.D.:|Organization:|Lines:|NNTP-Posting-Host:|Distribution:|Reply-To:|XNewsreader:|Expires:|\s+-{1,}|Subject:|Summary:|Keywords:).*$'
    clean_text = re.sub(pattern, '', text, flags=re.IGNORECASE | re.MULTILINE)
    return clean_text.strip()

In [9]:
# Making a new object and applying the function

data = df['content'].apply(preprocess_text)

In [10]:
# Stripping numbers

data = data.apply(strip_numeric, strip_punctuation)

  data = data.apply(strip_numeric, strip_punctuation)


In [11]:
# Stripping punctuations

data = data.apply(strip_punctuation)

In [12]:
# Stripping potential multiple whitespaces

data = data.apply(strip_multiple_whitespaces)

In [13]:
data = data.apply(lambda x: x.lower())

These functions applied remove the noise from the text as numbers, punctuations, and unnecassary whitespaces are not required for this exercise. We use lowercases for all words as they would then be consistently tokenized.

In [14]:
# Stopwords removal

# Gensim stopwords

print(sorted(STOPWORDS))

# nltk stopwords are stored as a document in the wd

file_path = '/Users/muhammadraza/Documents/GitHub/BIPM/Text Analytics Lab/stopwords/english'

with open(file_path, 'r') as file:
    # Read the content of the file
    nltk_stopwords = file.read().splitlines()

print(sorted(nltk_stopwords))

['a', 'about', 'above', 'across', 'after', 'afterwards', 'again', 'against', 'all', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always', 'am', 'among', 'amongst', 'amoungst', 'amount', 'an', 'and', 'another', 'any', 'anyhow', 'anyone', 'anything', 'anyway', 'anywhere', 'are', 'around', 'as', 'at', 'back', 'be', 'became', 'because', 'become', 'becomes', 'becoming', 'been', 'before', 'beforehand', 'behind', 'being', 'below', 'beside', 'besides', 'between', 'beyond', 'bill', 'both', 'bottom', 'but', 'by', 'call', 'can', 'cannot', 'cant', 'co', 'computer', 'con', 'could', 'couldnt', 'cry', 'de', 'describe', 'detail', 'did', 'didn', 'do', 'does', 'doesn', 'doing', 'don', 'done', 'down', 'due', 'during', 'each', 'eg', 'eight', 'either', 'eleven', 'else', 'elsewhere', 'empty', 'enough', 'etc', 'even', 'ever', 'every', 'everyone', 'everything', 'everywhere', 'except', 'few', 'fifteen', 'fifty', 'fill', 'find', 'fire', 'first', 'five', 'for', 'former', 'formerly', 'forty', 'foun

In [15]:
# Removing negations
# Including negations in stopwords is reasonable because they often do not provide semantic meaning in many text analysis tasks

removed_words = ["aren't",'isn', "isn't",'mightn', "mightn't",'mustn', "mustn't",'needn',
                                        "needn't",'no', 'nor', 'not',"shan't",'shouldn', "shouldn't",'wasn',
                                        "wasn't","weren't",'wouldn', "wouldn't"]

for word in removed_words:
    if word in nltk_stopwords:
        nltk_stopwords.remove(word)

print(sorted(nltk_stopwords))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', 'her', 'here', 'hers', 'herself', 'him', 'himself', 'his', 'how', 'i', 'if', 'in', 'into', 'is', 'it', "it's", 'its', 'itself', 'just', 'll', 'm', 'ma', 'me', 'more', 'most', 'my', 'myself', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', 'she', "she's", 'should', "should've", 'so', 'some', 'such', 't', 'than', 'that', "that'll", 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'there', 'these', 'they', 'this', 'those', 't

In [16]:
# Removing words from the main object 'data'

data = data.apply(lambda x: remove_stopwords(x, stopwords = nltk_stopwords))

In [17]:
# There could still be words in there less than 3 letters which are not covered in the stopwords list i.e. typos

data = data.apply(strip_short)

In [18]:
# I apply stemming to get words in their root form:

data_stem = data.apply(stem_text)

In [19]:
# I apply lemmatization with Spacy as well to get words in their linguistic roots for easier processing

import spacy # I have downloaded the en_core_web_sm already

nlp = spacy.load('en_core_web_sm')

data_lem = []

for doc in data:

    # Applying en_core_web_sm to each document in data
    nlp_doc = nlp(doc)

    # Generating lemma for each token and joining them all together with a space
    lemmas = [token.lemma_ for token in nlp_doc]
    lemmas_joined = " ".join(lemmas)

    # Appending the joined ser
    data_lem.append(lemmas_joined)


In [25]:
# BERT Tokenizer: we want each word to become a token to enable processing

# Loading the tokenizer

bert_uncased = BertTokenizer.from_pretrained('bert-base-uncased')

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [31]:
# This will break down each document in data to tokens

data_BERT = data.apply(bert_uncased.tokenize)

In [34]:
# Preview

print(data_BERT[0])

['wondering', 'anyone', 'could', 'en', '##light', '##en', 'car', 'saw', 'day', 'door', 'sports', 'car', 'looked', 'late', 'early', 'called', 'brick', '##lin', 'doors', 'really', 'small', 'addition', 'front', 'bumper', 'separate', 'rest', 'body', 'know', 'anyone', 'tell', '##me', 'model', 'name', 'engine', 'spec', '##s', 'years', 'production', 'car', 'made', 'history', 'whatever', 'info', 'funky', 'looking', 'car', 'please', 'mail', 'thanks']


In [39]:
with open('data_clean.pkl', 'wb') as handle:
    pickle.dump(data, handle)

with open('Stemmed.pkl', 'wb') as handle:
    pickle.dump(data_stem, handle)

with open('Lemma.pkl', 'wb') as handle:
    pickle.dump(data_lem, handle)