In [1]:
import sqlite3 
import pandas as pd
import numpy as np
from unidecode import unidecode #unidecode converts emojis into text. Avoids encoding problems.

### Load a pandas DataFrame from a SQLite database

In [2]:
# Open the local SQLite database that holds the collected tweets.
conn = sqlite3.connect(
    "nlp.sqlite",
    check_same_thread=False,  # let the connection be used from threads other than the creating one
)
# Cursor for issuing raw SQL statements on this connection.
c = conn.cursor()

In [3]:
# Query every column of the `trump` table, newest rows first.
sql = '''select * from trump 
                order by timestamp desc'''

# Materialise the query result as a DataFrame using the open connection.
df = pd.read_sql(sql=sql, con=conn)

Unnamed: 0,timestamp,retweet,quote,quoted_text,tweet
0,1.587906e+12,1.0,0.0,,RT @dwiebe99: @TVietor08 @IvankaTrump Let me g...
1,1.587906e+12,1.0,0.0,,RT @w_terrence: Happy Birthday @FLOTUS Melania...
2,1.587906e+12,1.0,1.0,N. Korea Dictator Kim Jong-un Reportedly Dead ...,RT @Sotnas92: Donald Trump a l’enterrement: ht...
3,1.587906e+12,1.0,0.0,,RT @kylegriffin1: Republicans were taken aback...
4,1.587906e+12,0.0,0.0,,@Mankind123456 @RomainBurrel Vous parlez de Tr...
...,...,...,...,...,...
149563,1.587870e+12,0.0,0.0,,@ViceroyChicken @vadimnewquist @AubinMagnus @V...
149564,1.587870e+12,1.0,0.0,,RT @COsweda: It's GENUINELY true:\n\nNo matter...
149565,1.587870e+12,0.0,0.0,,Opinion | Trump’s efforts to bring out the wor...
149566,1.587870e+12,0.0,1.0,There has been an instant spike in ER visits o...,MD?


### Clean data

In [4]:
# Keep only rows whose `retweet` flag is 0, i.e. discard retweets.
df = df.loc[df['retweet'] == 0]

In [6]:
# Remove the columns that are not needed from here on.
df = df.drop(columns=['timestamp', 'retweet', 'quote'])

I will create a second dataframe containing only the non-empty `quoted_text` values and append it to the `tweet` column, expanding the data pool.

In [8]:
# Independent (deep) copy of the frame, so changes to `qt` do not touch `df`.
qt = df.copy(deep=True)

In [9]:
# Convert blank quoted texts to NaN so the following dropna() can remove them.
# The result is assigned back to the column instead of calling
# `qt['quoted_text'].replace(..., inplace=True)`: an in-place call on a column
# selection is chained assignment, which warns on pandas 2.x and has no effect
# on `qt` once Copy-on-Write is enabled.
qt['quoted_text'] = qt['quoted_text'].replace('', np.nan)

In [10]:
# Discard the rows that have no quoted text.
qt = qt.dropna(subset=['quoted_text'])

In [12]:
# `qt` only needs the quoted text, so drop the original tweet column.
qt = qt.drop(columns='tweet')

In [15]:
# The quoted text now lives in `qt`; drop it from the main frame.
df = df.drop(columns='quoted_text')

In [16]:
# Give the quoted-text column the same name as the tweet column in `df`.
qt = qt.rename({"quoted_text": "tweet"}, axis="columns")

In [17]:
# Stack the tweets and the quoted texts into one frame with a fresh 0..n-1 index.
df = pd.concat(objs=[df, qt], axis=0, ignore_index=True)

In [18]:
# Display the combined frame (tweets followed by quoted texts) to inspect the result.
df

Unnamed: 0,tweet
0,@Mankind123456 @RomainBurrel Vous parlez de Tr...
1,@tedcruz Do you believe that trump telling peo...
2,@jjwasin @MikeRalph62 @Bitzy1221 @DonaldJTrump...
3,@Wct1964 @donnia68 @RetmarinemsgtM @BeingBekka...
4,"@washingtonpost CNN is Fake news,and Trump is ..."
...,...
49155,"WTP will be out in force on Nov. 3rd, 2020 and..."
49156,"Em quase todos os momentos, Donald Trump tomou..."
49157,IM NOT FUCKING KIDDING YOU WHEN I TELL YOU.......
49158,Donald Trump’s suggestion about disinfectant w...


The quoted texts add roughly 11k new tweets to the dataset.

Now we need to clean the tweets:

- Dropping mentions, since they don't add value to NLP tasks. Another option is to replace them with a tag. The regex for mentions is included below.

- Removing whitespace from the beginning

- Removing new line to avoid problem when converting to .csv

- Removing commas (`,`) before converting to .csv

- Converting tweets that are left empty after cleaning to null and dropping them

In [19]:
# Remove @mentions. The pattern only matches handles that start with a letter and
# are at least two characters long, preceded by start-of-string or a character
# outside [a-zA-Z0-9-_.].
df['tweet'] = df['tweet'].str.replace(r"(?<=^|(?<=[^a-zA-Z0-9-_\.]))@([A-Za-z]+[A-Za-z0-9-_]+)","", regex=True)
# Strip leading whitespace (this is what is left when a tweet starts with mentions).
df['tweet'] = df['tweet'].str.replace(r"^\s+","", regex=True)
# Newlines and commas are removed ahead of the CSV export.
df = df.replace(r'\n',' ', regex=True)
df = df.replace(r",","", regex=True)
# Mark tweets that ended up empty as NaN so they can be dropped in the next cell.
# The result is assigned back rather than using `inplace=True` on the column
# selection: that form is chained assignment, which warns on pandas 2.x and
# leaves `df` unchanged under Copy-on-Write.
df['tweet'] = df['tweet'].replace('', np.nan)
# Number of null values per column, as a sanity check.
df.isnull().sum()

tweet    183
dtype: int64

In [20]:
# Drop every row that contains a null value.
df = df.dropna(axis=0, how='any')

In [21]:
# Rename the text column from "tweet" to "reviewText".
df = df.rename({"tweet": "reviewText"}, axis="columns")

In [22]:
# Run unidecode over each tweet, element by element, and store the result back.
df['reviewText'] = df['reviewText'].map(unidecode)

In [23]:
#Again deleting new lines and commas, in case unidecode created new ones
df = df.replace(r'\n',' ', regex=True)
df = df.replace(r",","", regex=True)
#By default unidecode drops characters it cannot transliterate, so a tweet made up
#only of such characters is now empty (or whitespace-only). Discard those rows so
#they are not written to the CSV as blank lines.
df = df[df['reviewText'].str.strip() != '']

In [24]:
# Write the cleaned tweets to twitter.csv, leaving out the index column.
df.to_csv(path_or_buf='twitter.csv', index=False)