# Text mining & Search Project

### Università degli Studi di Milano-Bicocca  2020/2021

**Luzzi Federico** (matricola) **Peracchi Marco** 800578

# Text Processing & Representation

In questa fase del progetto vengono applicate le fasi del text processing, come la tokenization, e molte altre, al fine di permettere la fase successiva di text representation.

In [1]:
# Librerie base
import nltk
import pandas as pd
import re
import string
import matplotlib.pyplot as plt
import sklearn
from wordcloud import WordCloud

In [2]:
# Librerie per la text tokenization
from nltk.tokenize import TweetTokenizer
from nltk.tokenize import  WordPunctTokenizer
from nltk.tokenize import  BlanklineTokenizer

In [3]:
# Librerie per stemming e lemmatization
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [134]:
# Librerie per text representation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
# Download dei contenuti necessari
nltk.download("stopwords")
stop_words = nltk.corpus.stopwords.words("english")
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Marco\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Marco\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Marco\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [130]:
# Lettura del dataset
df = pd.read_csv("data/labeled_data.csv", sep = ',').drop("Unnamed: 0", axis=1)
df.head()

Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [97]:
# Testo esempio
text = df["tweet"][19999]
text

"RT @shakiraevanss: Criticize Amanda for saying the n word, sure, but don't make jokes about her sexual assault, don't be trash."

### Preprocessing

In [110]:
def preprocessing(text):
    text = text.lower() # Lowering case
    remove_url = re.sub(r'(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,})', ' ', text) # Removing url
    remove_retweet = re.sub(r"@\w+", " ",remove_url) # Removing retweet
    remove_retweet = re.sub(r"&\w+", " ",remove_retweet) # Remove &amp
    remove_retweet = re.sub(r"\b([!#\$%&\\\(\)\*\+,-\./:;<=>\?@\[\]\^_`\{|\}\"~]+)\b", " ",remove_retweet) # Must check this one
    remove_retweet = re.sub(r"([a-z])\1{3,}", r"\1",remove_retweet)
    remove_punc = remove_retweet.translate(str.maketrans('', '', string.punctuation)) # Remove punctuation
    final_text = re.sub(r'\d+', ' ', remove_punc) # Remove number 
    final_text = re.sub(r'\s+', ' ', final_text) # Removing exceeding spaces
    return final_text

In [113]:
text_prep = preprocessing(text)
text_prep

'rt criticize amanda for saying the n word sure but dont make jokes about her sexual assault dont be trash'

### Tokenization

In [114]:
def tokenization(text_clean, tok = "tweet"):
    if tok == "tweet": # TweetTokenizer
        tt = TweetTokenizer()
        tokenized_text = tt.tokenize(text_clean)
    elif tok == "wordpunct": # WordPunctTokenizer
        wpt = WordPunctTokenizer()
        tokenized_text = wpt.tokenize(text_clean)
    return tokenized_text

In [118]:
text_tok = tokenization(text_prep)
print(text_tok)

['rt', 'criticize', 'amanda', 'for', 'saying', 'the', 'n', 'word', 'sure', 'but', 'dont', 'make', 'jokes', 'about', 'her', 'sexual', 'assault', 'dont', 'be', 'trash']


### Removing stopwords

In [117]:
def remove_stopwords(tokenized_text):
    remove_sw = []
    for token in tokenized_text:
        stop_words.append("rt") # Added a stop words, RT of Retweet
        if token.lower() not in stop_words:
             remove_sw.append(token)
    return remove_sw

In [120]:
text_sw = remove_stopwords(text_tok)
print(text_sw)

['criticize', 'amanda', 'saying', 'n', 'word', 'sure', 'dont', 'make', 'jokes', 'sexual', 'assault', 'dont', 'trash']


### Stemming

In [121]:
def stemmer(tokenized_text):
    porter = PorterStemmer()
    return [porter.stem(word) for word in tokenized_text]

In [123]:
print(stemmer(text_sw))

['critic', 'amanda', 'say', 'n', 'word', 'sure', 'dont', 'make', 'joke', 'sexual', 'assault', 'dont', 'trash']


### Lemmatization
## RICONTROLLARE

In [124]:
# pos-tagging (1 document)
def pos_tagging(doc_token):
    return nltk.pos_tag(doc_token)

# convertion of pos tagging
def get_wordnet_pos(word_tag):
    if word_tag.startswith('J'):
        return "a"
    elif word_tag.startswith('V'):
        return "v"
    elif word_tag.startswith('R'):
        return "r"
    else:
        return "n"
    
# lemmatizer one word 
def lemmatizer(word):
    pos = get_wordnet_pos(word[1])
    wnl = WordNetLemmatizer()
    return wnl.lemmatize(word[0], pos = pos)

# lemmatizer one document
def lemmatizer_doc(doc_token):
    lemmas = [] 
    
    pos_document = pos_tagging(doc_token) # pos tagging
    for token in pos_document:
        lemmas.append( lemmatizer(token) ) # lemmatization x word
    
    return lemmas

In [125]:
print(lemmatizer_doc(text_sw))

['criticize', 'amanda', 'say', 'n', 'word', 'sure', 'dont', 'make', 'joke', 'sexual', 'assault', 'dont', 'trash']


In [127]:
# Sum up function
def processing(text):
    text_prep = preprocessing(text)
    text_prep = tokenization(text_prep)
    text_prep = remove_stopwords(text_prep)
    text_prep = lemmatizer_doc(text_prep)
    #text_prep = stemmer(text_prep)
    text_prep = " ".join(text_prep)
    return text_prep

In [173]:
# Apply on all text
df_red = df[1:10000]
df_red["tweet_clean"] = df_red["tweet"].apply(lambda x : processing(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


### Bag of words

In [174]:
corpus = df_red["tweet_clean"]

In [175]:
# Using CountVectorizer
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)

# Shape
X.toarray().shape

(9999, 9980)

In [176]:
# Example of presence of a word
X.toarray()[1][X.toarray()[1] == 1]

array([1, 1, 1, 1, 1, 1, 1, 1], dtype=int64)

###  Count Vector

In [177]:
# Using CountVectorizer
vectorizer = CountVectorizer(binary=True)
X = vectorizer.fit_transform(corpus)

X.toarray()[1][X.toarray()[1] == 2]

array([], dtype=int64)