In [1]:
import pandas as pd
import re
import nltk
import emoji

In [2]:
# Load the raw tweets. The preview below shows an "Unnamed: 0" column, i.e. the
# CSV carries a previously saved index as an ordinary column.
RAW_DATASET_PATH = "../dataset/raw_dataset.csv"
df = pd.read_csv(RAW_DATASET_PATH)
df.head()

Unnamed: 0,screen_name,date,text,retweet_count,favourites_count,ratio
0,Orietta's Recipes,2020-01-23 18:13:47,La mia cena! Mozzarella in carrozza 🍷🍷☺️. Voi ...,19,95,0.014381
1,Orietta's Recipes,2020-01-23 11:42:55,Buon pranzo a tutti! 😋 Oggi spatzle con filett...,14,79,0.011959
2,Orietta's Recipes,2020-01-23 07:55:06,"Eccomi con la mia😍 NUOVA 😍 ricetta, per colazi...",25,98,0.014835
3,Orietta's Recipes,2020-01-22 17:59:47,È quasi ora di cena! Io ho un po' di fame ☺️☺...,24,106,0.016046
4,Orietta's Recipes,2020-01-22 11:30:14,Arriva l'ora di pranzo e viene voglia di un pi...,26,108,0.016349


# Simple feature extraction

In [3]:
chars, question_marks, esclamation_marks, emojis, hashtags, tags, urls, consecutive_chars, plain_text  = [], [], [], [], [], [], [], [], []

for index, row in df.iterrows():
    tweet = row.text
    tokens = nltk.word_tokenize(tweet)
    nr_question_marks, nr_esclamation_marks = 0, 0

    # question and esclamation marks
    for token in tokens:
        if token == '?':
            nr_question_marks += 1
        if token == '!':
            nr_esclamation_marks += 1

    # emoji
    allchars = [str for str in tweet]
    lista = [c for c in allchars if c in emoji.UNICODE_EMOJI]
    nr_emoji = len(lista)
    for c in tweet:
        if c in lista:
            tweet = tweet.replace(c, "")

    # hashtags
    try:
        nr_hashtags = len(re.findall(r"#(\w+)", tweet))
        tweet = re.sub(r"#(\w+)", "", tweet, count=nr_hashtags)
    except Exception:
        nr_hashtags = 0


    # tags
    try:
        nr_tags = len(re.findall(r" @(\w+)", tweet))
        tweet = re.sub(r" @(\w+)", "", tweet, count=nr_tags)
    except Exception:
        nr_tags = 0

    # urls
    try:
        nr_urls = len(re.findall(r"http[s]?://([a-zA-Z0-9/.]+)", tweet))
        tweet = re.sub(r"http[s]?://([a-zA-Z0-9/.]+)", "", tweet, count=nr_urls)
    except Exception:
        nr_urls = 0

    # consecutive chars
    try:
        nr_consecutive_chars = len(re.findall(r"(A-Za-z){3,}", tweet))
        tweet = re.sub(r"http[s]?://([a-zA-Z0-9/.]+)", "", tweet, count=nr_urls)
    except Exception:
        nr_consecutive_chars = 0
        
    # special characters
    tweet = re.sub('[^A-Za-z0-9 ]+', '', tweet)
    plain_text.append(tweet)    
    
    # no spaces
    try:
        tweet = re.sub(" ", "", tweet, len(re.findall(" ", tweet)))
    except Exception:
        "Error: spaces have not be deleted from the tweet."
    
    chars.append(len(tweet))
    question_marks.append(nr_question_marks)
    esclamation_marks.append(nr_esclamation_marks)
    emojis.append(nr_emoji)
    hashtags.append(nr_hashtags)
    tags.append(nr_tags)
    urls.append(nr_urls)
    consecutive_chars.append(nr_consecutive_chars)

df['len_plain_text'] = chars
df['question_marks'] = question_marks
df['esclamation_marks'] = esclamation_marks
df['emojis'] = emojis
df['hashtags'] = hashtags
df['tags'] = tags
df['urls'] = urls
df['consecutive_chars'] = consecutive_chars
df['plain_text'] = plain_text

## TF-IDF SUM FOR EACH TWEET 
- #### CORPUS = set of tweets of the current page
- #### tf-idf calculated on plain text

### create the corpus for each page of the dataset

In [4]:
names = df.screen_name.unique()

corpus_list = []

for name in names:
    temp = df[df.screen_name == name]
    corpus = []
    for index, row in temp.iterrows():
        corpus.append(row.plain_text)
    corpus_list.append(corpus)

### N-gram (1,2) tokenization and tf-idf calculation for each word in the corpus

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(use_idf=True, ngram_range=(1, 2))

tf_idf_sum = []

for corpus in corpus_list:
    tfidf_vectors_list = vectorizer.fit_transform(corpus)
    
    for element in tfidf_vectors_list:
        tf_idf_tweet = round(float(sum(element.T.todense())), 8)
        tf_idf_sum.append(tf_idf_tweet)

In [6]:
# Number of tf-idf scores computed. The list is assigned as a df column in the
# next cell, so this should equal the number of rows in df.
len(tf_idf_sum)

373

In [7]:
# Attach the per-tweet tf-idf score, drop the intermediate plain text and
# write the enriched dataset.
df['tf-idf_sum'] = tf_idf_sum
# errors="ignore" keeps this cell re-runnable: on a second execution
# "plain_text" is already gone and a plain drop would raise KeyError.
df = df.drop(columns="plain_text", errors="ignore")
df.to_csv("../dataset/dataset.csv")