# Text mining & Search Project

### Università degli Studi di Milano-Bicocca  2020/2021

**Luzzi Federico** 816753 **Peracchi Marco** 800578

# Text Processing & Representation

In questa parte del progetto vengono applicate le tecniche di text processing (pulizia del testo, tokenization, rimozione delle stopword, stemming e lemmatization), necessarie per la successiva fase di text representation.

In [1]:
# Librerie base
import nltk
import pandas as pd
import numpy as np
import re
import os
import string
import matplotlib.pyplot as plt
import sklearn
from wordcloud import WordCloud

In [2]:
# Librerie per la text tokenization
from nltk.tokenize import TweetTokenizer
from nltk.tokenize import  WordPunctTokenizer
from nltk.tokenize import  BlanklineTokenizer

In [3]:
# Librerie per stemming e lemmatization
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [4]:
# Librerie per text representation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import save_npz, load_npz # save and load spase matrix
from gensim.models.doc2vec import Doc2Vec, TaggedDocument # doc2vec

In [5]:
# Download the NLTK resources required by the notebook
nltk.download("stopwords")
# English stop-word list; remove_stopwords reads this module-level name
stop_words = nltk.corpus.stopwords.words("english")
# Presumably needed by nltk.pos_tag and WordNetLemmatizer in the lemmatization section
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\fede9\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\fede9\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\fede9\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [6]:
# Load the labelled tweets and discard the unnamed column exported with the CSV
df = pd.read_csv("data/labeled_data.csv", sep=",").drop(columns="Unnamed: 0")
df.head()

Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [7]:
# Sample tweet used to demonstrate each processing step below
text = df["tweet"][19999]
text

"RT @shakiraevanss: Criticize Amanda for saying the n word, sure, but don't make jokes about her sexual assault, don't be trash."

### Preprocessing

In [8]:
def preprocessing(text):
    """Normalise a raw tweet into lower-case text without noise.

    The steps run in this order: lower-casing, URL removal, removal of
    ``@mentions`` and ``&entity`` fragments, replacement of punctuation runs
    that sit directly between two word characters, collapsing of letters
    repeated four or more times, stripping of the remaining punctuation,
    replacement of digit runs, and whitespace collapsing.

    Parameters
    ----------
    text : str
        Raw tweet.

    Returns
    -------
    str
        Cleaned text. Leading/trailing single spaces are not stripped.
    """
    # (pattern, replacement) pairs; order matters, so they are applied in sequence
    substitutions = (
        # URLs
        (r'(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,})', ' '),
        # @mentions
        (r"@\w+", " "),
        # HTML entities such as &amp (the trailing ';' goes away with punctuation)
        (r"&\w+", " "),
        # Punctuation glued between two words ("cold...tyga") becomes a space
        (r"\b([!#\$%&\\\(\)\*\+,-\./:;<=>\?@\[\]\^_`\{|\}\"~]+)\b", " "),
        # A letter repeated 4+ times is reduced to a single occurrence
        (r"([a-z])\1{3,}", r"\1"),
    )

    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    # Drop every remaining punctuation character
    cleaned = cleaned.translate(str.maketrans('', '', string.punctuation))

    # Digit runs first, then whitespace runs, each replaced by one space
    for pattern in (r'\d+', r'\s+'):
        cleaned = re.sub(pattern, ' ', cleaned)
    return cleaned

In [9]:
# Clean the sample tweet
text_prep = preprocessing(text)
text_prep

'rt criticize amanda for saying the n word sure but dont make jokes about her sexual assault dont be trash'

### Tokenization

In [10]:
def tokenization(text_clean, tok = "tweet"):
    """Split a cleaned text into a list of tokens.

    Parameters
    ----------
    text_clean : str
        Text to tokenize (typically the output of ``preprocessing``).
    tok : {"tweet", "wordpunct"}, default "tweet"
        Tokenizer to use: NLTK ``TweetTokenizer`` or ``WordPunctTokenizer``.

    Returns
    -------
    list of str
        Tokens produced by the selected tokenizer.

    Raises
    ------
    ValueError
        If ``tok`` is not one of the supported names.
    """
    if tok == "tweet":
        tokenizer = TweetTokenizer()
    elif tok == "wordpunct":
        tokenizer = WordPunctTokenizer()
    else:
        # An unknown name used to fall through both branches and fail with an
        # opaque UnboundLocalError on the return statement.
        raise ValueError(
            f"Unknown tokenizer {tok!r}: expected 'tweet' or 'wordpunct'"
        )
    return tokenizer.tokenize(text_clean)

In [11]:
# Tokenize the cleaned sample with the default tokenizer
text_tok = tokenization(text_prep)
print(text_tok)

['rt', 'criticize', 'amanda', 'for', 'saying', 'the', 'n', 'word', 'sure', 'but', 'dont', 'make', 'jokes', 'about', 'her', 'sexual', 'assault', 'dont', 'be', 'trash']


### Removing stopwords

In [12]:
def remove_stopwords(tokenized_text, stopwords = None):
    """Drop stop words (and the retweet marker "rt") from a token list.

    Parameters
    ----------
    tokenized_text : iterable of str
        Tokens of one document.
    stopwords : iterable of str, optional
        Stop words to remove. Defaults to the module-level ``stop_words`` list.

    Returns
    -------
    list of str
        Tokens whose lower-cased form is not a stop word, in their original
        order and original casing.
    """
    base = stop_words if stopwords is None else stopwords
    # Build the lookup once per call as a set. The previous version appended
    # "rt" to the shared list for every token, so the list grew without bound
    # and each membership test became slower as the dataset was processed.
    blocked = set(base)
    blocked.add("rt")  # "RT" marks a retweet and carries no content
    return [token for token in tokenized_text if token.lower() not in blocked]

In [13]:
# Remove stop words from the sample tokens
text_sw = remove_stopwords(text_tok)
print(text_sw)

['criticize', 'amanda', 'saying', 'n', 'word', 'sure', 'dont', 'make', 'jokes', 'sexual', 'assault', 'dont', 'trash']


### Stemming

In [14]:
def stemmer(tokenized_text):
    """Return the Porter stem of every token, preserving order.

    Parameters
    ----------
    tokenized_text : iterable of str
        Tokens of one document.

    Returns
    -------
    list of str
    """
    stem = PorterStemmer().stem
    return list(map(stem, tokenized_text))

In [15]:
# Stem the sample tokens
print(stemmer(text_sw))

['critic', 'amanda', 'say', 'n', 'word', 'sure', 'dont', 'make', 'joke', 'sexual', 'assault', 'dont', 'trash']


### Lemmatization

In [16]:
# POS tagging of one tokenized document
def pos_tagging(doc_token):
    """Tag the tokens of ``doc_token`` by delegating to ``nltk.pos_tag``."""
    return nltk.pos_tag(doc_token)

# Conversion of a POS tag to the single-letter code passed to the lemmatizer
def get_wordnet_pos(word_tag):
    """Map a POS tag to "a", "v", "r" or "n" based on its first letter.

    Tags starting with "J" map to "a", "V" to "v", "R" to "r"; anything else
    (including the empty string) maps to "n".
    """
    prefix_to_pos = (("J", "a"), ("V", "v"), ("R", "r"))
    for prefix, wordnet_pos in prefix_to_pos:
        if word_tag.startswith(prefix):
            return wordnet_pos
    return "n"
    
# Lemmatize a single (token, POS tag) pair
def lemmatizer(word):
    """Return the WordNet lemma of one tagged token.

    Parameters
    ----------
    word : sequence
        ``word[0]`` is the token and ``word[1]`` its POS tag.
    """
    token, tag = word[0], word[1]
    return WordNetLemmatizer().lemmatize(token, pos = get_wordnet_pos(tag))

# Lemmatize a whole tokenized document
def lemmatizer_doc(doc_token):
    """POS-tag ``doc_token`` and return the lemma of each token, in order."""
    return [lemmatizer(tagged_token) for tagged_token in pos_tagging(doc_token)]

In [17]:
# Lemmatize the sample tokens
print(lemmatizer_doc(text_sw))

['criticize', 'amanda', 'say', 'n', 'word', 'sure', 'dont', 'make', 'joke', 'sexual', 'assault', 'dont', 'trash']


### Processing everything on dataset
The following cell applies:

1. Preprocessing
2. Tokenization
3. Stopwords removal
4. Stemming (Porter stemmer; the lemmatization step is defined above but currently disabled)
5. Save the result

In [18]:
# End-to-end pipeline for one tweet
def processing(text):
    """Clean one raw tweet and return it as a space-separated string of stems.

    Steps: ``preprocessing`` -> ``tokenization`` -> ``remove_stopwords`` ->
    ``stemmer``. ``lemmatizer_doc`` is not used here; stemming is applied
    instead.
    """
    tokens = tokenization(preprocessing(text))
    stems = stemmer(remove_stopwords(tokens))
    return " ".join(stems)

In [19]:
# Run the pipeline on every tweet and store the result in a new column
df["tweet_clean"] = df["tweet"].apply(processing)

KeyboardInterrupt: 

In [None]:
# Preview the frame with the new tweet_clean column
df.head()

In [None]:
# Save the processed dataset (read back by the Text Representation section)
df.to_csv("data/processed_data.csv", index = False)

# Text Representation

In [6]:
# Load the processed dataset written by the text processing section
processed_path = "data/processed_data.csv"
if not os.path.isfile(processed_path):
    # Fail loudly: printing a message and carrying on left `df` stale or
    # undefined, so the following cells broke with unrelated errors.
    raise FileNotFoundError(
        f"{processed_path} not found: run the text processing cells above first"
    )
df = pd.read_csv(processed_path, sep = ",")
print("Load & preprocessing --- DONE")

Load & preprocessing --- DONE


In [7]:
# Token list per tweet, then drop every row containing a missing value
# (original note: 2 rows are NA because the tweet is empty)
df["tweet_list"] = df["tweet_clean"].str.split(" ").tolist()
df = df.dropna().reset_index(drop = True)

In [8]:
# Documents fed to the vectorizers below
corpus = df["tweet_clean"]

### Bag of words

In [23]:
# Binary bag of words built with CountVectorizer (binary = True)
vectorizer = CountVectorizer(binary = True)
X = vectorizer.fit_transform(corpus)

# Shape (documents, vocabulary terms), read straight from the sparse matrix:
# calling toarray() first would allocate the full dense matrix for nothing
X.shape

(24781, 15163)

In [28]:
# Example of presence of a word: entries equal to 1 in document 1
row = X[1].toarray().ravel()  # densify only this row, not the whole matrix (twice)
row[row == 1]

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int64)

In [30]:
# Save the bag-of-words sparse matrix to disk
save_npz('data/representations/bag_of_words.npz', X)

In [31]:
# Load the bag-of-words matrix back from disk
bag_of_words = load_npz('data/representations/bag_of_words.npz')

###  Count Vector

In [32]:
# Term counts with CountVectorizer (default, non-binary)
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)

# Entries equal to 2 in document 1; only that row is converted to dense
row = X[1].toarray().ravel()
row[row == 2]

array([2], dtype=int64)

In [33]:
# Entries equal to 2 in document 1; only that row is converted to dense
row = X[1].toarray().ravel()
row[row == 2]

array([2], dtype=int64)

In [34]:
# Save the count-vector sparse matrix to disk
save_npz('data/representations/count_vector.npz', X)

### Tf-idf

In [9]:
# TF-IDF representation of the corpus
vectorizer = TfidfVectorizer(
    use_idf=True,
    max_features=10000,
    min_df=5,
    max_df=0.501,
    # Original note: unigrams remain the best choice, bigrams scored very poorly.
    # NOTE(review): the range below is (1, 3), not unigram-only; confirm this is intended.
    ngram_range=(1, 3)
    )
# Converted to a dense array; this is the object saved with np.save later on
tfidf = vectorizer.fit_transform(corpus).toarray()

In [10]:
# Shape of the dense TF-IDF array
tfidf.shape

(24781, 7123)

In [27]:
# Save the dense TF-IDF array to disk
np.save('data/representations/tf-idf.npy', tfidf)

### Doc2Vec

In [38]:
# Train the Doc2Vec model once and cache it on disk; later runs reload it
model_path = "models/doc2vec/d2v.model"
if not os.path.isfile(model_path):
    # One TaggedDocument per tweet, tagged with its row position
    documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(df["tweet_list"])]
    model = Doc2Vec(documents, vector_size=100, window=10, min_count=1)
    # Create the target folder if needed: listing a missing folder used to
    # raise before the model could even be trained
    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    model.save(model_path)
else:
    model = Doc2Vec.load(model_path)
print("Doc2Vec model --- DONE")

Doc2Vec model --- DONE


In [39]:
# Infer one vector per document from its token list
doc2vec = df["tweet_list"].apply(model.infer_vector)
print("Vectorization --- DONE")

Vectorization --- DONE


In [40]:
# Inferred vector of the first document
doc2vec[0]

array([-4.9004420e-03, -1.7347162e-03,  6.9215866e-03,  4.6263197e-03,
       -6.7830500e-03,  1.3655841e-03, -5.8941529e-03, -8.1435153e-03,
       -5.3917747e-03,  1.5887000e-03, -2.2773696e-04,  1.3206439e-03,
        5.8713875e-04, -1.7054564e-03,  8.7381812e-04, -3.1240631e-03,
       -3.8415045e-03,  2.9191163e-03, -1.5092418e-03, -7.8285839e-03,
        3.4736660e-03,  6.6849748e-03, -1.9909844e-03, -1.4624494e-03,
        3.3446348e-03, -7.5983605e-03,  9.5724489e-04,  2.1171691e-03,
       -7.5575634e-04,  1.3996924e-03, -7.6049664e-03, -7.0691262e-03,
        4.9065002e-03,  4.3359231e-03, -2.2506027e-03, -8.1379776e-04,
       -3.0043148e-03, -3.5546024e-04, -5.4995599e-04,  1.5977654e-03,
       -9.8587375e-04, -2.0502885e-03, -2.7268273e-03,  1.0535977e-03,
        6.1529409e-03, -1.3617886e-03, -6.3393370e-04,  2.1928351e-03,
       -5.2184942e-03,  5.7538841e-03,  6.3594310e-03, -2.9295406e-03,
        1.0502904e-03, -5.6401920e-03, -1.7953069e-03, -6.1973687e-03,
      

In [41]:
# Save the inferred document vectors to disk.
# NOTE(review): doc2vec is the Series returned by Series.apply, holding one array
# per row; np.save will presumably store it as a pickled object array. Confirm
# the loader uses allow_pickle=True, or stack the vectors into a 2-D array first.
np.save('data/representations/doc2vec.npy', doc2vec)