In [None]:
from time import time
import pandas as pd
import numpy as np
import re
import sys
import csv
csv.field_size_limit(sys.maxsize)

import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sns

## Data Loading and Cleaning
In this first step we will be loading the data and cleaning it as well 

In [None]:
df_raw = pd.read_csv('tweets.csv', delimiter=';', skiprows=0, lineterminator='\n' )

In [3]:
df_raw.columns = ["id", "user", "fullname", "url", "timestamp", "replies","likes","retweets","text"]
print(df_raw.head())

             id           user             fullname  url  \
0  1.132977e+18   KamdemAbdiel        Abdiel kamdem  NaN   
1  1.132977e+18      bitcointe            Bitcointe  NaN   
2  1.132977e+18      3eyedbran  Bran - 3 Eyed Raven  NaN   
3  1.132977e+18  DetroitCrypto          J. Scardina  NaN   
4  1.132977e+18   mmursaleen72   Muhammad Mursaleen  NaN   

                timestamp  replies  likes  retweets  \
0  2019-05-27 11:49:14+00        0      0         0   
1  2019-05-27 11:49:18+00        0      0         0   
2  2019-05-27 11:49:06+00        0      2         1   
3  2019-05-27 11:49:22+00        0      0         0   
4  2019-05-27 11:49:23+00        0      0         0   

                                                text  
0  È appena uscito un nuovo video! LES CRYPTOMONN...  
1  Cardano: Digitize Currencies; EOS https://t.co...  
2  Another Test tweet that wasn't caught in the s...  
3  Current Crypto Prices! \n\nBTC: $8721.99 USD\n...  
4  Spiv (Nosar Baz): BITCOIN Is A

In [4]:
# We only need the timestamp and the text from the dataset
# df = df_raw[['timestamp','text']]
df = df_raw
print(df.sample(5))

                    id             user             fullname  url  \
8291697   1.150480e+18     AndrewKiguel        Andrew Kiguel  NaN   
12600655  1.172777e+18     MrMichaelNye                  Nye  NaN   
14825758  1.185992e+18   coolLaTechJobs         Tech Jobs LA  NaN   
15583152  1.190200e+18       vochung333             仮想通貨★さゆみ  NaN   
8983943   1.153780e+18  DTradingAcademy  Day Trading Academy  NaN   

                       timestamp  replies  likes  retweets  \
8291697   2019-07-14 19:00:13+00        0      0         0   
12600655  2019-09-14 07:41:41+00        0      0         0   
14825758  2019-10-20 18:51:10+00        0      0         0   
15583152  2019-11-01 09:31:02+00        0      0         0   
8983943   2019-07-23 21:33:21+00        0      0         0   

                                                       text  
8291697   If you have an optimistic mindset about where ...  
12600655  I can’t even remember the last time $BTC had a...  
14825758  Insurance Repres

In [5]:
df.timestamp.dtypes

dtype('O')

In [6]:
# Transform the dates to a more friendly format (only care about the day)
df['date'] = pd.to_datetime(df['timestamp'],format= '%Y-%m-%d').dt.date
print(df.columns)
df['date']

Index(['id', 'user', 'fullname', 'url', 'timestamp', 'replies', 'likes',
       'retweets', 'text', 'date'],
      dtype='object')


0           2019-05-27
1           2019-05-27
2           2019-05-27
3           2019-05-27
4           2019-05-27
               ...    
16889760    2019-11-23
16889761    2019-11-23
16889762    2019-11-23
16889763    2019-11-23
16889764    2019-11-23
Name: date, Length: 16889765, dtype: object

In [7]:
# Sort the dataframe by the date
df = df.set_index('date')
df = df.sort_values(by='date')
print(df.head())

                      id          user   fullname  url  \
date                                                     
2007-04-19  3.286741e+07  chrispychong    chrispy  NaN   
2009-01-11  1.110303e+09        halfin     halfin  NaN   
2009-01-21  1.136750e+09        halfin     halfin  NaN   
2009-01-27  1.153097e+09        halfin     halfin  NaN   
2009-01-29  1.158417e+09   fafcffacfff  GoldLover  NaN   

                         timestamp  replies  likes  retweets  \
date                                                           
2007-04-19  2007-04-19 07:14:38+00        0      0         2   
2009-01-11  2009-01-11 03:33:52+00      790  14470      5542   
2009-01-21  2009-01-21 17:29:40+00       55   1544       392   
2009-01-27  2009-01-27 20:14:10+00       44   1042       277   
2009-01-29  2009-01-29 13:37:53+00        0     28        16   

                                                         text  
date                                                           
2007-04-19  is h

In [8]:
startdate = pd.to_datetime("2017-12-01").date()
df = df.loc[startdate:]
print(df.tail())
print(df.columns)

                      id             user          fullname  url  \
date                                                               
2019-11-23  1.198116e+18        ScrapeINT  AutomatedReports  NaN   
2019-11-23  1.198114e+18       holmesyoyo          あけみ☆仮想通貨  NaN   
2019-11-23  1.198114e+18  MasterminingNet     Master Mining  NaN   
2019-11-23  1.198114e+18      xrp36636596          エビマヨコロリン  NaN   
2019-11-23  1.198266e+18       HaraldoXRP             Harry  NaN   

                         timestamp  replies  likes  retweets  \
date                                                           
2019-11-23  2019-11-23 05:47:01+00        0      0         0   
2019-11-23  2019-11-23 05:40:05+00        0      0         0   
2019-11-23  2019-11-23 05:40:05+00        0      0         0   
2019-11-23  2019-11-23 05:39:55+00        1      8         3   
2019-11-23  2019-11-23 15:45:06+00        0      2         1   

                                                         text  
date      

In [9]:
#pip install whatthelang
#!pip install swifter

In [10]:
from langdetect import detect
lang = ''
print(len(df['text']))
i = 0
# df['text'].replace('', np.nan, inplace=True)
# df.dropna(subset=['text'], inplace=True)
re = '^((https?|ftp|smtp):\/\/)?(www.)?[a-z0-9]+\.[a-z]+(\/[a-zA-Z0-9#]+\/?)*$'
re = '/https?:\/\/w{0,3}\w*?\.(\w*?\.)?\w{2,3}\S*|www\.(\w*?\.)?\w*?\.\w{2,3}\S*|(\w*?\.)?\w*?\.\w{2,3}[\/\?]\S*/'
filter = df['text'].str.contains(re)
# mask = df['text'].str.match(re)
df_new = df[~filter]
print(len(df_new['text']))

15549700


  return func(self, *args, **kwargs)


15083594


In [None]:
# import swifter
import tqdm
from langdetect import detect
# df_new['lang'] = df_new['text'].apply(lambda x: detect(x))
i = 0
df_en = df
for index, row in tqdm.tqdm(df_en['text'].iteritems()):
  if i % 100 == 0:
    try:
      lang = detect(row) #detecting each row
    except:
      lang = 'no'
    df_en.loc[index, 'lang'] = lang
  i += 1

552201it [35:55, 253.29it/s]

In [None]:
print(df_en)

In [None]:
df_en_clean = df_en[df_en["lang"] == 'en']
print(df_en_clean)

In [None]:
# checking for null values, if any
df_en_clean.isnull().sum()

In [None]:
#ditching all row when text is null, as need text for analysis
df_en_clean.dropna(how='any', inplace=True)
df_en_clean.sample(3)

In [None]:
df_en_clean.info()

In [None]:
df_en_clean.to_csv('tweets_clean.csv')
