# Librerías

In [1]:
import numpy as np
import pandas as pd

# Lectura de datos

In [2]:
# Load the raw tweet dump: ';'-separated fields, '\n' as the only record
# terminator, and rows that cannot be parsed are skipped.
TWEETS_CSV = './datasets/tweets.csv'
data = pd.read_csv(
    TWEETS_CSV,
    sep=';',
    lineterminator='\n',
    on_bad_lines='skip',
    low_memory=False,
)
data

Unnamed: 0,id,user,fullname,url,timestamp,replies,likes,retweets,text\r
0,1.132977e+18,KamdemAbdiel,Abdiel kamdem,,2019-05-27 11:49:14+00,0,0,0,È appena uscito un nuovo video! LES CRYPTOMONN...
1,1.132977e+18,bitcointe,Bitcointe,,2019-05-27 11:49:18+00,0,0,0,Cardano: Digitize Currencies; EOS https://t.co...
2,1.132977e+18,3eyedbran,Bran - 3 Eyed Raven,,2019-05-27 11:49:06+00,0,2,1,Another Test tweet that wasn't caught in the s...
3,1.132977e+18,DetroitCrypto,J. Scardina,,2019-05-27 11:49:22+00,0,0,0,Current Crypto Prices! \n\nBTC: $8721.99 USD\n...
4,1.132977e+18,mmursaleen72,Muhammad Mursaleen,,2019-05-27 11:49:23+00,0,0,0,Spiv (Nosar Baz): BITCOIN Is An Asset &amp; NO...
...,...,...,...,...,...,...,...,...,...
16889760,1.198262e+18,JacobCanfield,Jacob Canfield,,2019-11-23 15:28:50+00,2,16,3,Happy #FibonacciDay \n\nA while back I created...
16889761,1.198266e+18,Vizique,Vizique,,2019-11-23 15:45:55+00,0,0,0,Bitcoin Suisse Certificates :) https://t.co/nd...
16889762,1.198266e+18,torusJKL,Gal Buki ($torusJKL),,2019-11-23 15:45:56+00,0,0,0,Register now for the early access of the Codug...
16889763,1.198266e+18,Adekunl95628158,Adekunle Daniel,,2019-11-23 15:45:57+00,0,0,0,@btc \n@btc \nDo you know that BTC Baskets isn...


In [3]:
# Keep only the timestamp and the tweet body, under shorter names.
# The header of the text column carries a trailing '\r', hence the 'text\r' key.
COLUMN_NAMES = {'timestamp': 'date', 'text\r': 'text'}
data = data[list(COLUMN_NAMES)].rename(columns=COLUMN_NAMES)
data

Unnamed: 0,date,text
0,2019-05-27 11:49:14+00,È appena uscito un nuovo video! LES CRYPTOMONN...
1,2019-05-27 11:49:18+00,Cardano: Digitize Currencies; EOS https://t.co...
2,2019-05-27 11:49:06+00,Another Test tweet that wasn't caught in the s...
3,2019-05-27 11:49:22+00,Current Crypto Prices! \n\nBTC: $8721.99 USD\n...
4,2019-05-27 11:49:23+00,Spiv (Nosar Baz): BITCOIN Is An Asset &amp; NO...
...,...,...
16889760,2019-11-23 15:28:50+00,Happy #FibonacciDay \n\nA while back I created...
16889761,2019-11-23 15:45:55+00,Bitcoin Suisse Certificates :) https://t.co/nd...
16889762,2019-11-23 15:45:56+00,Register now for the early access of the Codug...
16889763,2019-11-23 15:45:57+00,@btc \n@btc \nDo you know that BTC Baskets isn...


# Limpieza de datos

In [4]:
# Number of missing values in each column.
data.isna().sum()

date    0
text    0
dtype: int64

In [5]:
from datetime import datetime

def str_to_datetime(d):
    """Parse the calendar day at the start of a timestamp string.

    Args:
        d: text whose first space-separated token is a ``YYYY-MM-DD`` date,
            e.g. ``'2019-05-27 11:49:14+00'``.

    Returns:
        datetime: that day at midnight; whatever follows the first space is
        ignored.

    Raises:
        ValueError: if the first token is not in ``%Y-%m-%d`` format.
    """
    day_token, *_ = d.split(' ')
    return datetime.strptime(day_token, '%Y-%m-%d')

# Parse the calendar day of every timestamp with one vectorized call instead
# of pushing each row through strptime via np.vectorize (a Python-level loop).
# Everything after the first space (time of day, UTC offset) is discarded.
day_strings = data.date.str.split(' ', n=1).str[0]

# Build a new, chronologically ordered frame with a fresh 0..n-1 index.
data = (
    data
    .assign(date=pd.to_datetime(day_strings, format='%Y-%m-%d'))
    .sort_values(by='date')
    .reset_index(drop=True)
)

In [6]:
from datetime import timedelta

# Require at least 1500 tweets per day: locate the last day that falls short
# (or that is followed by a hole in the calendar) and keep only later days.
MIN_TWEETS_PER_DAY = 1500

tweets_per_day = data.groupby('date').size()  # one entry per distinct day, ascending
days = list(tweets_per_day.index)
day_sizes = tweets_per_day.tolist()

# First day of the stretch that is kept; stays at the earliest day when no
# earlier day has to be discarded.
first_kept_day = days[0] if days else None

# A day is judged when its successor is reached, so the final day of the
# dataset is never tested here.
for day, n_tweets, next_day in zip(days, day_sizes, days[1:]):
    too_few = n_tweets < MIN_TWEETS_PER_DAY
    gap_after = next_day - day > timedelta(days=1)
    if too_few or gap_after:
        first_kept_day = next_day

if first_kept_day is not None:
    # Filtering on the date keeps every tweet of `first_kept_day`, including
    # its first row (the old `index + 1` arithmetic dropped that row).
    data = data[data.date >= first_kept_day]
data = data.reset_index(drop=True)

In [7]:
from cmath import nan

# Keep only n tweets per day
def keep_n_tweets_per_day(n):
    """Trim the global ``data`` frame to at most ``n`` rows per distinct date.

    Rows are counted within each value of the ``date`` column in their current
    order; the first ``n`` of every date are kept. Counting per group (rather
    than stepping a day counter forward one day at a time) gives the same cap
    for a date that follows a gap in the calendar.

    Args:
        n: maximum number of rows to keep for each date.

    Returns:
        None. The module-level ``data`` is rebound to the trimmed frame with a
        fresh ``0..len-1`` index.
    """
    global data

    # 0, 1, 2, ... position of every row inside its own date group.
    position_in_day = data.groupby('date').cumcount()

    # dropna() is retained from the previous implementation, which also
    # removed any row that had a missing value in another column.
    data = data[position_in_day < n].dropna().reset_index(drop=True)

In [12]:
# First pass: cap every day at 1500 tweets.
keep_n_tweets_per_day(1500)

In [15]:
import re
from langdetect import detect

from langdetect import DetectorFactory

# langdetect is non-deterministic unless seeded; pin the seed so a re-run
# classifies every tweet the same way.
DetectorFactory.seed = 0

# Patterns are compiled once instead of on every loop iteration.
URL_RE = re.compile(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', flags=re.MULTILINE)
# "$word," cashtags, "@word" mentions and ",@word" mentions.
CASHTAG_MENTION_RE = re.compile(r'\$\w+[,]|\@\w+|[,]\@\w+')
# NOTE(review): the ranges U+24C2-U+1F251 and U+10000-U+10FFFF cover far more
# than emoji (CJK ideographs, for instance) - confirm that this is intended.
EMOJI_RE = re.compile("["
                  u"\U0001F600-\U0001F64F"
                  u"\U0001F300-\U0001F5FF"
                  u"\U0001F680-\U0001F6FF"
                  u"\U0001F1E0-\U0001F1FF"
                  u"\U00002500-\U00002BEF"
                  u"\U00002702-\U000027B0"
                  u"\U00002702-\U000027B0"
                  u"\U000024C2-\U0001F251"
                  u"\U0001f926-\U0001f937"
                  u"\U00010000-\U0010ffff"
                  u"\u2640-\u2642"
                  u"\u2600-\u2B55"
                  u"\u200d"
                  u"\u23cf"
                  u"\u23e9"
                  u"\u231a"
                  u"\ufe0f"
                  u"\u3030"
                  "]+", re.UNICODE)
MULTI_SPACE_RE = re.compile(r'  +')


def clean_tweet(text):
    """Return ``text`` without URLs, mentions, markup characters and emoji.

    Steps, in order: remove URLs; remove cashtags that are followed by a comma
    and @mentions; delete '#', newline, carriage return, backspace and tab;
    turn backslash, '/' and '|' into a space; remove the characters matched by
    ``EMOJI_RE``; collapse runs of two or more spaces into one.
    """
    text = URL_RE.sub('', text)
    text = CASHTAG_MENTION_RE.sub('', text)
    text = text.replace('#', '').replace('\n', '').replace('\r', '').replace('\b', '').replace(
        '\t', '').replace('\\', ' ').replace('/', ' ').replace('|', ' ')
    text = EMOJI_RE.sub('', text)
    return MULTI_SPACE_RE.sub(' ', text)


# One entry per row of `data`: the cleaned tweet when langdetect labels it
# 'en', otherwise None so that the row can be dropped afterwards.
english_texts = []

for raw_text in data.text:
    try:
        cleaned = clean_tweet(raw_text)
        english_texts.append(cleaned if detect(cleaned) == 'en' else None)
    except Exception:
        # Best effort: a tweet that cannot be cleaned or classified is treated
        # as non-English. `Exception` (not a bare except) lets Ctrl-C through.
        english_texts.append(None)

In [25]:
# Swap in the cleaned texts, discard every row that has a missing value
# (this includes the tweets marked None), and renumber the rows from 0.
data = (
    data
    .assign(text=english_texts)
    .dropna()
    .reset_index(drop=True)
)

In [73]:
# Second pass: cap every day at 1000 tweets.
keep_n_tweets_per_day(1000)

In [74]:
# Keep index labels 0 .. len(data) - 806; a `.loc` slice includes its end
# label, so with a 0..n-1 index this discards the last 805 rows.
# NOTE(review): 806 is a hard-coded count tied to one particular run -
# presumably it trims an incomplete final day; confirm, and derive it from
# the data instead.
data = data.loc[:len(data.date) - 806]

In [76]:
def check_skipped_days():
    """Report jumps of more than one day between consecutive rows of ``data``.

    Walks the global ``data`` frame in row order and compares the date of each
    row (its first column) with the date of the row before it.

    Returns:
        tuple: ``(gaps, start)`` where ``gaps`` is a NumPy array holding one
        ``'<previous index> => <index> : <n> days'`` string per jump, and
        ``start`` is one more than the index of the row that precedes the
        last jump (``1`` when no jump was found).
    """
    gaps = np.array([])
    last_gap_index = 0
    previous = None  # (index, date) of the row visited just before

    for row in data.itertuples():
        position, day = row[0], row[1]

        if previous is not None:
            prev_position, prev_day = previous
            elapsed = (day - prev_day).days

            if elapsed > 1:
                message = '{} => {} : {} days'.format(prev_position, position, elapsed)
                gaps = np.append(gaps, message)
                last_gap_index = prev_position

        previous = (position, day)

    return gaps, last_gap_index + 1

In [77]:
# Look for jumps of more than one day between consecutive rows, then show
# the findings followed by how many there are.
skipped_days, min_index = check_skipped_days()

print(skipped_days, len(skipped_days), sep='\n')

[]
0


In [78]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

def analyze_sentiments():
    """Score every text in the global ``data.text`` column with VADER.

    Returns:
        list: the ``'compound'`` entry of
        ``SentimentIntensityAnalyzer.polarity_scores`` for each text, in row
        order.
    """
    # One analyzer instance is reused for every tweet.
    sia = SentimentIntensityAnalyzer()
    return [sia.polarity_scores(text)['compound'] for text in data.text]

In [79]:
# Attach the scores and keep only the date and the score. A new frame is
# built with assign() rather than adding a column in place, because `data`
# may be a slice of an earlier frame at this point and an in-place column
# write on a slice can trigger pandas' SettingWithCopyWarning.
data = data.assign(sentiment=analyze_sentiments())[['date', 'sentiment']]

In [102]:
# Show the (date, sentiment) table.
data

Unnamed: 0,date,sentiment
0,2019-05-08,0.0000
1,2019-05-08,0.0000
2,2019-05-08,-0.4588
3,2019-05-08,0.5707
4,2019-05-08,0.5994
...,...,...
198995,2019-11-23,0.4574
198996,2019-11-23,0.0000
198997,2019-11-23,0.0000
198998,2019-11-23,0.0000


In [119]:
from statistics import mean


# Mean sentiment of each day, rounded to 7 decimals, in chronological order.
# Grouping by date (instead of stepping a day counter through the rows) also
# emits the mean of the final day, which the row loop never appended, and
# cannot drift out of step when a calendar day is missing.
r = [
    float("{:.7f}".format(mean(day_scores.tolist())))
    for _, day_scores in data.groupby('date')['sentiment']
]

In [120]:
# Show the list of daily mean sentiments.
r

[0.1361281,
 0.1617512,
 0.182476,
 0.1494986,
 0.1445289,
 0.1881597,
 0.1741632,
 0.1538138,
 0.1577226,
 0.1537385,
 0.2086018,
 0.1936458,
 0.1591729,
 0.1136141,
 0.2352376,
 0.1706465,
 0.1758958,
 0.1308762,
 0.205562,
 0.1763413,
 0.174298,
 0.2037124,
 0.1745199,
 0.1447803,
 0.2409776,
 0.154904,
 0.1637375,
 0.1345755,
 0.1586873,
 0.188222,
 0.0891062,
 0.1412558,
 0.1713654,
 0.1697036,
 0.1482675,
 0.1991924,
 0.1642135,
 0.1661859,
 0.1477734,
 0.2050108,
 0.1799282,
 0.1843109,
 0.1641379,
 0.130589,
 0.1565133,
 0.1600937,
 0.132519,
 0.1591704,
 0.1928406,
 0.1107853,
 0.1291252,
 0.1731792,
 0.2037547,
 0.1751833,
 0.1586709,
 0.1634739,
 0.1176576,
 0.2027865,
 0.2181054,
 0.1688147,
 0.1661405,
 0.1816076,
 0.1521992,
 0.1134756,
 0.1283268,
 0.1122794,
 0.1086885,
 0.1414786,
 0.0886737,
 0.1171355,
 0.1594303,
 0.136662,
 0.1216487,
 0.1075076,
 0.1352206,
 0.1260018,
 0.1036341,
 0.1305491,
 0.2150488,
 0.2110189,
 0.1658959,
 0.1776624,
 0.1605495,
 0.1214652,


In [81]:
# Write the current `data` frame to disk without the index column.
OUTPUT_CSV = './consolidated_dataset/tweets.csv'
data.to_csv(OUTPUT_CSV, index=False)