In [44]:
import pandas as pd
import re
import emoji
import nltk
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from textblob import TextBlob

In [45]:
# download the necessary resources
nltk.download('stopwords')

# set up the necessary resources
stop_words = set(stopwords.words('english'))
stemmer = SnowballStemmer('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dylan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [46]:
# clean the text
def clean_punctuation(text):
    text = re.sub(r'[^\w\s]', '', text)
    return text

In [47]:
# correct the spelling
def correct_spelling(text):
    return str(TextBlob(text).correct())

# remove the stop words and stem the words
def preprocess_text(text):
    words = text.split()
    words = [stemmer.stem(word) for word in words if word not in stop_words]
    return ' '.join(words)

In [48]:
# load the data
data = pd.read_csv('../dataset/process/en-2020-01-merged-cleaned-without-emoji.tsv', sep='\t')

data['text'] = data['text'].astype(str)

print('Cleaning the text...')
tqdm.pandas(desc="Cleaning the text")
data['text'] = data['text'].progress_apply(clean_punctuation)

# print('Correcting the spelling...')
# tqdm.pandas(desc="Correcting the spelling")
# data['text'] = data['text'].progress_apply(correct_spelling)

print('Preprocessing the text...')
tqdm.pandas(desc="Preprocessing the text")
data['text'] = data['text'].progress_apply(preprocess_text)


Cleaning the text...


Cleaning the text: 100%|██████████| 129670/129670 [00:01<00:00, 90956.20it/s]


Preprocessing the text...


Preprocessing the text: 100%|██████████| 129670/129670 [00:14<00:00, 9038.96it/s] 


In [49]:
# save the data
data.to_csv('../dataset/process/en-2020-01-merged-cleaned-without-emoji-tfidf.tsv', sep='\t', index=False)

In [50]:
import scipy.sparse

In [51]:
# initialize the vectorizer
vectorizer = TfidfVectorizer(max_features=10000)

# fit the vectorizer
print('Fitting the vectorizer...')
vectorizer.fit(data['text'])

# transform the text
print('Transforming the text...')
X = vectorizer.transform(data['text'])

# save the data
scipy.sparse.save_npz('../dataset/process/tfidf_sparse.npz', X)

print(X)

Fitting the vectorizer...
Transforming the text...
  (0, 895)	0.39324661930528254
  (0, 3437)	0.28855800864442493
  (0, 3699)	0.27147908555651307
  (0, 4994)	0.3154779615420935
  (0, 5899)	0.27224176192334815
  (0, 7934)	0.3899710457588365
  (0, 8677)	0.41145225670211
  (0, 9769)	0.3529729164331398
  (0, 9810)	0.2622752393818948
  (1, 3699)	0.2964903619249672
  (1, 5698)	0.6739142048964195
  (1, 5899)	0.2973233034076269
  (1, 8738)	0.35210313961603085
  (1, 9130)	0.40436158140381434
  (1, 9810)	0.28643856851399285
  (2, 2128)	0.8304695197580235
  (2, 3699)	0.3249432459255961
  (2, 5899)	0.3258561211613517
  (2, 9810)	0.3139268258398662
  (3, 5038)	0.38797867140990316
  (3, 5724)	0.4796125374965737
  (3, 9894)	0.7870478793612902
  (4, 305)	0.734544170740822
  (4, 5937)	0.6785608751104636
  (5, 450)	0.5357856068527261
  :	:
  (129662, 9268)	0.3397720138639089
  (129662, 9673)	0.21179485404751677
  (129663, 8088)	1.0
  (129664, 5298)	0.6429656458583077
  (129664, 6686)	0.7658950177707184
