In [1]:
import re
import emoji
import nltk
from nltk.tag import pos_tag, map_tag
import pandas as pd

In [2]:
# Load the raw tweet export; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="../dataset/raw_dataset.csv")
# Preview the first five rows to check that the columns were parsed as expected.
df.head(n=5)

Unnamed: 0,screen_name,date,text,retweet_count,ratio
0,CookGlobal,2020-01-23 21:15:09,"Creamy #vegan dressings are delicious, lower i...",6,0.001649
1,CookGlobal,2020-01-23 19:15:07,"It’s almost the weekend! So, it’s the perfect...",8,0.002199
2,CookGlobal,2020-01-23 17:15:02,My first experiment with homemade #vegan “chee...,0,0.0
3,CookGlobal,2020-01-23 15:15:04,"The weekend is almost here! 🙌🏼. So, it’s the ...",16,0.002932
4,CookGlobal,2020-01-23 14:15:10,Easy Peanut Butter Cookies make such a tasty t...,7,0.002566


# Simple feature extraction

In [3]:
# Per-tweet feature accumulators; each list receives one entry per row of `df`.
chars, question_marks, esclamation_marks, emojis, hashtags, tags, urls, pos_count = [], [], [], [], [], [], [], []

# Universal POS categories that are counted for every tweet. They are defined
# once, before the loop, so that `universal_tags_list` also exists when `df`
# has no rows (the next cell uses it for the column names).
_UNIVERSAL_TAGS = (
    "VERB",
    "NOUN",
    "ADJ",
    "ADV",
)
universal_tags_list = list(_UNIVERSAL_TAGS)

# Emoji lookup table (membership test on single characters).
# NOTE(review): recent releases of the `emoji` package expose `EMOJI_DATA`
# instead of `UNICODE_EMOJI`, and intermediate releases nest `UNICODE_EMOJI`
# per language -- verify against the installed version. The fallback keeps the
# original behaviour on releases where `UNICODE_EMOJI` is a flat table.
if hasattr(emoji, "EMOJI_DATA"):
    emoji_table = emoji.EMOJI_DATA
else:
    emoji_table = emoji.UNICODE_EMOJI
    # A flat table has no "en" key, so it is returned unchanged.
    emoji_table = emoji_table.get("en", emoji_table)

# Patterns are compiled once instead of on every row.
hashtag_re = re.compile(r"#(\w+)")
# A mention must start the tweet or follow whitespace. The previous pattern
# required a literal leading space and therefore missed mentions at the very
# beginning of a tweet (replies) or after a newline.
mention_re = re.compile(r"(?<!\S)@(\w+)")
url_re = re.compile(r"http[s]?://([a-zA-Z0-9/.]+)")
non_alnum_re = re.compile('[^A-Za-z0-9 ]+')

for tweet in df["text"]:
    # Question and exclamation marks are counted on the tokenised raw text.
    tokens = nltk.word_tokenize(tweet)
    nr_question_marks = tokens.count('?')
    nr_esclamation_marks = tokens.count('!')

    # Emoji: count the emoji characters, then strip them from the text.
    nr_emoji = sum(1 for char in tweet if char in emoji_table)
    tweet = "".join(char for char in tweet if char not in emoji_table)

    # Hashtags, mentions and URLs: `subn` removes every match and reports how
    # many were removed, so counting and stripping cannot disagree.
    tweet, nr_hashtags = hashtag_re.subn("", tweet)
    tweet, nr_tags = mention_re.subn("", tweet)
    tweet, nr_urls = url_re.subn("", tweet)

    # Special characters: keep only ASCII letters, digits and spaces.
    tweet = non_alnum_re.sub('', tweet)

    # POS tagging on the cleaned text, mapped from Penn Treebank tags to the
    # universal tagset; one count per entry of `universal_tags_list`.
    tagged_words = nltk.pos_tag(tweet.split())
    universal_tags = [map_tag('en-ptb', 'universal', tag) for _, tag in tagged_words]
    pos_count.append([universal_tags.count(target) for target in universal_tags_list])

    # Plain-text length is measured without spaces.
    tweet = tweet.replace(" ", "")

    chars.append(len(tweet))
    question_marks.append(nr_question_marks)
    esclamation_marks.append(nr_esclamation_marks)
    emojis.append(nr_emoji)
    hashtags.append(nr_hashtags)
    tags.append(nr_tags)
    urls.append(nr_urls)

# Attach the per-tweet features to the frame as new columns.
df['plain_text_len'] = chars
df['question_marks'] = question_marks
df['esclamation_marks'] = esclamation_marks
df['emojis'] = emojis
df['hashtags'] = hashtags
df['tags'] = tags
df['urls'] = urls

## Feature refactoring

In [4]:
# POS counts as a frame: one row per entry of `pos_count`, one column per
# universal tag.
df1 = pd.DataFrame(pos_count, columns=universal_tags_list)

# Drop the columns that are not exported. `errors="ignore"` keeps this cell
# re-runnable: on a second run the columns are already gone, and the previous
# in-place drop raised a KeyError in that case.
df = df.drop(columns=["ratio", "date", "text"], errors="ignore")

# Put the simple features and the POS counts side by side and give the POS
# columns their final names.
# NOTE(review): concat with axis=1 aligns rows on the index; this presumably
# relies on both frames carrying the default 0..n-1 index -- confirm if `df`
# is ever filtered or re-indexed upstream.
result = pd.concat([df, df1], axis=1, sort=False).rename(
    columns={'VERB': 'verbs', 'NOUN': 'nouns', 'ADJ': 'adjs', 'ADV': 'advs'}
)

# Save results.
result.to_csv("../dataset/dataset.csv", index=False)