Data cleaning for tweets whose emojis and emoticons have been converted to text

In [8]:
import pandas as pd
import re
import emoji
import nltk
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from collections import Counter
from textblob import TextBlob

In [9]:
# English Snowball stemmer, shared by the preprocessing helpers below
stemmer = SnowballStemmer('english')

# Make sure the NLTK stop-word corpus is available locally, then keep the
# English list as a set so membership checks are fast
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dylan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
def clean_punctuation(text):
    """Strip punctuation, underscores and digits from ``text``.

    Steps, in order:
      1. delete every character that is neither a word character nor
         whitespace;
      2. turn underscores (which count as word characters, so step 1 keeps
         them) into spaces;
      3. delete runs of digits;
      4. collapse each run of whitespace into a single space.

    Digits are removed *before* whitespace is collapsed so that a standalone
    number does not leave a double space behind (``"a 12 b"`` -> ``"a b"``).
    Leading and trailing whitespace is collapsed to one space but not stripped.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # clean punctuation / symbols
    text = re.sub(r'[^\w\s]', '', text)

    # clean _
    text = re.sub(r'_', ' ', text)

    # clean numbers (must happen before whitespace is normalised)
    text = re.sub(r'\d+', '', text)

    # clean multiple spaces
    text = re.sub(r'\s+', ' ', text)
    return text

In [11]:
# spelling correction
def correct_spelling(text):
    """Run ``text`` through TextBlob's ``correct()`` and return the result as ``str``."""
    blob = TextBlob(text)
    corrected = blob.correct()
    return str(corrected)

# remove the stop words and stem the words
def preprocess_text(text):
    """Drop English stop words from ``text`` and stem the remaining words.

    The text is split on whitespace. Each word is compared against the
    module-level ``stop_words`` set in lowercase, so capitalised stop words
    such as "The" are filtered out as well (the NLTK English list is
    lowercase -- see the NLTK stopwords documentation). Surviving words are
    passed through the module-level ``stemmer``.

    Args:
        text: Whitespace-separated text, normally already stripped of
            punctuation.

    Returns:
        The stemmed, stop-word-free words joined by single spaces; an empty
        string when nothing survives.
    """
    kept = [
        stemmer.stem(word)
        for word in text.split()
        # case-insensitive stop-word check
        if word.lower() not in stop_words
    ]
    return ' '.join(kept)

In [12]:
def calculate_term_frequency(text):
    """Count how often each whitespace-separated token occurs in ``text``.

    Args:
        text: The string to tokenise with ``str.split()``.

    Returns:
        A ``Counter`` mapping each token to its number of occurrences.
    """
    return Counter(text.split())

In [14]:
# load the data
data = pd.read_csv('../dataset/process/tweets_convert_cleaned_emoticons_emojis.tsv', sep='\t')

# Missing values come back from read_csv as NaN; a bare astype(str) would turn
# them into the literal word 'nan', which would then be cleaned, stemmed and
# counted as a real token. Replace them with empty strings first.
data['text'] = data['text'].fillna('').astype(str)

# step 1: strip punctuation, underscores, digits and extra whitespace
print('Cleaning the text...')
tqdm.pandas(desc="Cleaning the text")
data['text'] = data['text'].progress_apply(clean_punctuation)

# save the data (checkpoint taken before stop-word removal / stemming)
data.to_csv('../dataset/process/tweets_convert_cleaned.tsv', sep='\t', index=False)

# step 2: remove stop words and stem; 'text' is overwritten in place, so the
# cells below see the stemmed version
print('Preprocessing the text...')
tqdm.pandas(desc="Preprocessing the text")
data['text'] = data['text'].progress_apply(preprocess_text)


Cleaning the text...


Cleaning the text: 100%|██████████| 124498/124498 [00:05<00:00, 23416.53it/s]


Preprocessing the text...


Preprocessing the text: 100%|██████████| 124498/124498 [00:36<00:00, 3389.69it/s]


In [15]:
# Preview the 'text' column (at this point: cleaned, stop words removed, stemmed)
print(data['text'])

0                                              hell snowbal
1                                           bus rock n roll
2                                                game chang
3                         yo vine hes fuckifn cute hate kfe
4         hate wear cloth like wanna walk around shirt wear
                                ...                        
124493    sentinel editori fbis comey one middl class fa...
124494             perfect pussi clip hudgen zac efron nake
124495                                protest rise altright
124496    tri convers dad vegetarian pointless infuri th...
124497                 stand guy gentleman vice presid penc
Name: text, Length: 124498, dtype: object


In [16]:
def remove_redundant_words(text):
    """Collapse repeated words so every distinct word appears exactly once.

    When a word occurs several times, only its *last* occurrence is kept;
    the surviving words stay in their original relative order, e.g.
    ``"a b a"`` -> ``"b a"``.

    Args:
        text: Whitespace-separated words.

    Returns:
        The de-duplicated words joined by single spaces.
    """
    seen = set()
    survivors = []
    # Scanning right-to-left, the first sighting of a word is its final
    # occurrence in the original text; anything seen again is an earlier copy.
    for token in reversed(text.split()):
        if token in seen:
            continue
        seen.add(token)
        survivors.append(token)
    survivors.reverse()
    return ' '.join(survivors)

In [17]:
# one copy of every distinct word per row
data['reduced_clean_text'] = data['text'].apply(remove_redundant_words)

# per-row token counts (one Counter object per row)
data['term_frequency'] = data['text'].apply(calculate_term_frequency)

# largest number of whitespace-separated tokens found in any single row
max_length_text = data['text'].map(str.split).map(len).max()
print(f"The maximum length text is {max_length_text}")


The maximum length text is 29


In [18]:
# save the data: writes the whole frame, including the 'reduced_clean_text'
# and 'term_frequency' columns, as a tab-separated file without the index.
# NOTE(review): 'term_frequency' holds Counter objects, so it is presumably
# written as their text representation -- confirm that is what readers expect.
data.to_csv('../dataset/process/tweets_convert_cleaned_reduced.tsv', sep='\t', index=False)

In [21]:
import scipy.sparse

In [22]:
# Build the TF-IDF vectorizer (at most 10000 features) and fit it on the
# de-duplicated text in one step; fit() returns the vectorizer itself.
print('Fitting the vectorizer...')
vectorizer = TfidfVectorizer(max_features=10000).fit(data['reduced_clean_text'])

# Project the full (non-de-duplicated) text onto the fitted vocabulary.
# NOTE(review): fitting uses 'reduced_clean_text' while the matrix is built
# from 'text' -- confirm this asymmetry is intended.
print('Transforming the text...')
X = vectorizer.transform(data['text'])

# Persist the sparse matrix, then show it
scipy.sparse.save_npz('../dataset/process_data/tfidf_convert_sparse.npz', X)

print(X)

Fitting the vectorizer...
Transforming the text...
  (0, 3994)	0.5160875055107653
  (0, 8132)	0.856535864196985
  (1, 1248)	0.638266447424805
  (1, 7453)	0.5406237909824336
  (1, 7472)	0.5480345415350208
  (2, 1477)	0.7829713516878458
  (2, 3474)	0.6220577645493284
  (3, 2090)	0.44278689382597475
  (3, 3917)	0.3869059261478409
  (3, 4017)	0.3825876931210054
  (3, 9443)	0.5417048760413472
  (3, 9930)	0.4630616107096361
  (4, 431)	0.2953073795144761
  (4, 1669)	0.37555085847053143
  (4, 3917)	0.27602557009293915
  (4, 5114)	0.1864097313364237
  (4, 7912)	0.3366639879070339
  (4, 9526)	0.3017828784900291
  (4, 9543)	0.27942729649560083
  (4, 9600)	0.6150782393275305
  (5, 1030)	0.7208531114618266
  (5, 5408)	0.6930878672259408
  (6, 2940)	1.0
  (7, 2271)	0.5927784732873538
  (7, 5114)	0.2679292098632014
  :	:
  (124493, 5577)	0.4300348744252286
  (124493, 6222)	0.24143918888678192
  (124494, 1660)	0.393336640101101
  (124494, 2724)	0.37240878016285267
  (124494, 4190)	0.45525383723220303
