# Tweet Preprocessing
#### Basic text-preprocessing pipeline (in no particular order):
- Detect and translate tweets to English
- Tokenization
- Stopword removal & Lemmatization
- Remove URLs and reserved words (RTs)
- Lowercasing
- Remove the # and @ symbols but keep the hashtag/mention text
- Spell Checker
- Remove punctuation (optional: punctuation can be useful for splitting tweets into fragments)

In [142]:
import os
import pandas as pd
import numpy as np
import preprocessor as prep
from os.path import join
from sqlite3 import connect
import seaborn as sns; sns.set()
import matplotlib.pyplot as plt
from googletrans import Translator
from nltk.corpus import stopwords
from nltk.util import ngrams
from spellchecker import SpellChecker
import nltk
import string

# Lift pandas' display limits so DataFrames are shown with all rows/columns.
pd.options.display.max_rows = pd.options.display.max_columns = None

### Getting Data (tweets)

In [143]:
# Name of the SQLite file holding the sampled raw tweets.
db_name = 'data_pull_sample.db'

# The project root is taken to be the parent of the current working directory;
# the raw and interim data folders both live under <project>/data.
project_dir = join(os.getcwd(), os.pardir)
raw_dir, interim_dir = (
    join(project_dir, 'data', stage) for stage in ('raw', 'interim')
)

# IPython magic (only valid inside a notebook/IPython session):
# sets the inline backend's figure format to SVG.
%config InlineBackend.figure_format = 'svg'

In [144]:
# Read the whole `tweets` table, then release the SQLite handle: the
# connection is not needed once the DataFrame is in memory.
conn = connect(join(interim_dir, db_name))
try:
    df_tweets = pd.read_sql('SELECT * FROM tweets', conn)
finally:
    conn.close()

# Keep a single row per tweet id.
df_tweets.drop_duplicates(subset='id_str', inplace=True)

# A tweet is "original" when none of the reply / retweet / quote flags is set
# (the row-wise sum of the three flags is zero).
df_tweets.loc[:,'is_original'] = ~df_tweets[['is_reply', 'is_retweet', 'is_quote_status']].sum(axis=1).astype(bool)

In [145]:
# Display the (rows, columns) dimensions of the de-duplicated frame.
df_tweets.shape

(20500, 19)

In [146]:
# Preview the first rows to sanity-check the loaded columns.
df_tweets.head()

Unnamed: 0,created_at,id_str,source,in_reply_to_status_id_str,in_reply_to_user_id_str,is_quote_status,quote_count,reply_count,retweet_count,favorite_count,lang,possibly_sensitive,quoted_status_id_str,user_id_str,full_text,is_retweet,original_tweet_id_str,is_reply,is_original
0,2020-06-14 23:57:21+00:00,1272317232626888704,"<a href=""http://twitter.com/download/android"" ...",1.272317e+18,4844328000.0,0,0,1,0,0,ja,,,1065957356079476736,@Ampan_C そのコメント欄に必ず私が居た‼️🤔🙄🙄🙄\n自分で質問して自分が最初にコメ...,0,,1,False
1,2020-06-14 23:51:31+00:00,1272315765975183360,"<a href=""http://twitter.com/download/android"" ...",1.272312e+18,4844328000.0,0,0,1,0,0,ja,,,1065957356079476736,@Ampan_C X軸とY軸が有るグラフで虚数を掛けると90度回転する‼️🤔,0,,1,False
2,2020-06-14 23:51:02+00:00,1272315644495581184,"<a href=""http://twitter.com/download/android"" ...",,,0,0,0,0,0,en,,,901828520,RT @IYCWestBengal: Today #WBPYC President @Sha...,1,1.272197734913544e+18,0,False
3,2020-06-14 23:50:39+00:00,1272315545027637248,"<a href=""http://twitter.com/download/android"" ...",,,0,0,0,0,0,en,,,921596113949417472,RT @IYCWestBengal: Today #WBPYC President @Sha...,1,1.272197734913544e+18,0,False
4,2020-06-14 23:49:53+00:00,1272315355700981760,"<a href=""http://twitter.com/download/iphone"" r...",1.272315e+18,4844328000.0,0,0,1,0,0,ja,,,906562306401755136,@Ampan_C あんぱんおぱよー！,0,,1,False


### Translate Tweets

In [147]:
def translate_tweet(text, lang):
    """Translate a tweet to English using googletrans.

    Parameters
    ----------
    text : str
        The tweet text to translate.
    lang : str
        Language code attached to the tweet. It is passed to the translator
        as the source-language hint; when it is missing or is a code that
        googletrans does not recognise, the source language is auto-detected.

    Returns
    -------
    str
        The English translation of ``text``.
    """
    trans = Translator()
    if isinstance(lang, str) and lang:
        try:
            return trans.translate(text, src=lang, dest='en').text
        except ValueError:
            # googletrans raises ValueError for source codes it does not know;
            # fall through to auto-detection instead of failing.
            pass
    return trans.translate(text, dest='en').text

In [149]:
# Translate, one row at a time, every tweet whose language tag is not 'en',
# then re-tag the row as English.
for row_label in df_tweets.index[df_tweets['lang'] != 'en']:
    df_tweets.loc[row_label, 'full_text'] = translate_tweet(
        df_tweets.loc[row_label, 'full_text'],
        df_tweets.loc[row_label, 'lang'],
    )
    df_tweets.loc[row_label, 'lang'] = 'en'

### Removing URLs and Reserved Words (RTs)

In [150]:
# Configure the tweet cleaner with the URL and reserved-word (e.g. RT) options
# before prep.clean() is applied.
prep.set_options(prep.OPT.URL, prep.OPT.RESERVED)

In [151]:
# Run the configured tweet cleaner over every tweet.
df_tweets['full_text'] = df_tweets['full_text'].apply(prep.clean)

### Lowercasing & Punctuation Removal

In [152]:
# Lower-case every tweet.
df_tweets['full_text'] = df_tweets['full_text'].apply(str.lower)

In [153]:
# Deletion table for every character in string.punctuation. Built once here
# rather than on each call, since the function runs once per tweet.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def remove_punct(text):
    """Return ``text`` with all ``string.punctuation`` characters removed.

    Only the ASCII punctuation listed in ``string.punctuation`` is stripped
    (this includes ``#`` and ``@``); other characters, including non-ASCII
    punctuation, are left untouched.
    """
    return text.translate(_PUNCT_TABLE)

In [154]:
# Strip punctuation from every tweet.
df_tweets['full_text'] = df_tweets['full_text'].apply(remove_punct)

### Lemmatization & Stopword removal

In [155]:
lemmatizer = nltk.stem.WordNetLemmatizer()

# Stop words are deliberately left unlemmatized. With its default noun setting
# the WordNet lemmatizer rewrites several of them (e.g. "was" -> "wa",
# "has" -> "ha"), after which they no longer match the stop-word list and
# would survive the stop-word removal step that follows.
stop_words = set(stopwords.words('english'))

# Lemmatize each whitespace-separated token and re-join with single spaces.
df_tweets['full_text'] = df_tweets['full_text'].apply(
    lambda x: ' '.join(
        [w if w in stop_words else lemmatizer.lemmatize(w) for w in x.split()]
    )
)

In [156]:
# English stop words as a set for fast membership tests.
stop_words = set(stopwords.words('english'))

# Rebuild each tweet from the tokens that are not stop words.
df_tweets['full_text'] = df_tweets['full_text'].apply(
    lambda tweet: ' '.join(
        token for token in tweet.split() if token not in stop_words
    )
)

### Spell Checker

In [157]:
spell = SpellChecker()

# Corrections already looked up, keyed by the misspelled word. Looking up a
# correction is the expensive step, and tweets repeat the same words a lot.
_correction_cache = {}


def correct_spellings(text):
    """Return ``text`` with words unknown to the spell checker corrected.

    The text is split on whitespace; each word the checker reports as
    unknown is replaced by its suggested correction, and the words are
    re-joined with single spaces. A word for which the checker offers no
    correction is kept as-is.
    """
    words = text.split()
    misspelled_words = spell.unknown(words)
    corrected_text = []
    for word in words:
        if word in misspelled_words:
            if word not in _correction_cache:
                # correction() may return None when it has no candidate;
                # keep the original word so the join below cannot fail.
                _correction_cache[word] = spell.correction(word) or word
            word = _correction_cache[word]
        corrected_text.append(word)
    return " ".join(corrected_text)

In [158]:
## Skipped for now: spell correction takes too long to execute on the full dataset

##df_tweets['full_text'] = df_tweets['full_text'].apply(lambda x: correct_spellings(x))

### Loading into Database

In [161]:
f_name = 'processed_tweets.db'

# Write the processed tweets to their own SQLite file, replacing any existing
# `tweets` table. Used as a context manager, a sqlite3 connection only
# commits (or rolls back on error) and stays open, so it is closed
# explicitly afterwards.
connection = connect(join(interim_dir, f_name))
try:
    with connection:
        df_tweets.to_sql('tweets', connection, index=False, if_exists='replace')
finally:
    connection.close()