# Data Preprocessing

In [14]:
import re
from typing import List, Tuple

import gensim
import numpy as np
import pandas as pd
import spacy
from spacy.tokens import Doc
from tqdm import tqdm

In [94]:
# Read the raw climate-change tweets from CSV into a DataFrame
tweet_df = pd.read_csv(filepath_or_buffer='Data/tweets_climate_change.csv')

## Creating Preprocessing Methods for the spaCy Model

In [3]:
class SpacyPreprocessor:
    """Clean raw text with a spaCy pipeline.

    A document is tokenised by the spaCy model, filtered token by token,
    optionally lemmatised, stripped of URLs and (optionally) of every
    character that is not an ASCII letter or an apostrophe, and finally
    lower-cased.

    Args:
        spacy_model: An already loaded spaCy pipeline. When ``None`` the
            ``en_core_web_sm`` model is loaded.
        remove_nums: Drop number-like tokens and currency symbols.
        remove_special: Replace every character other than ``a-z``,
            ``A-Z`` and ``'`` with a space.
        remove_stopwords: Drop tokens spaCy flags as stop words.
        lemmatise: Join token lemmas instead of the raw token text.
    """

    def __init__(self, spacy_model=None, remove_nums=True,
                 remove_special=True, remove_stopwords=True,
                 lemmatise=True):
        self.remove_nums = remove_nums
        self.remove_special = remove_special
        self.remove_stopwords = remove_stopwords
        self.lemmatise = lemmatise

        # Compare against None explicitly so that only a missing model
        # triggers the default load.
        if spacy_model is None:
            self.model = spacy.load('en_core_web_sm')
        else:
            self.model = spacy_model

    @staticmethod
    def download_spacy_model(model='en_core_web_sm'):
        """Download the named spaCy model package via ``spacy.cli``."""
        print(f'Downloading spaCy model {model}')
        spacy.cli.download(model)
        print('Finished downloading model')

    @staticmethod
    def load_model(model='en_core_web_sm'):
        """Load a spaCy model with the ``ner`` and ``parser`` components disabled."""
        return spacy.load(model, disable=['ner', 'parser'])

    def tokenise(self, text) -> List[str]:
        """Run ``text`` through the model and return the token strings."""
        return [token.text for token in self.model(text)]

    def preprocess_text(self, text) -> Tuple[str, List[str]]:
        """Clean a single string.

        Returns:
            The ``(cleaned_text, cleaned_tokens)`` pair from ``clean_text``.
        """
        return self.clean_text(self.model(text))

    def preprocess_text_list(self, texts: List[str]) -> List[Tuple[str, List[str]]]:
        """Clean many strings, streaming them through ``model.pipe``.

        A tqdm progress bar is shown while the documents are processed.

        Returns:
            One ``(cleaned_text, cleaned_tokens)`` pair per input text, in
            input order.
        """
        return [self.clean_text(doc) for doc in tqdm(self.model.pipe(texts))]

    def clean_text(self, doc: "Doc") -> Tuple[str, List[str]]:
        """Filter and normalise one parsed document.

        Args:
            doc: A spaCy ``Doc`` (any iterable of spaCy tokens works).

        Returns:
            A tuple ``(text, tokens)``: the cleaned, lower-cased string and
            the non-empty tokens obtained by re-tokenising that string.
        """
        # Start from every token so that disabling ``remove_nums`` does not
        # leave the token list empty.
        tokens = list(doc)

        # Remove numbers and currency symbols
        if self.remove_nums:
            tokens = [
                token for token in tokens
                if not (token.like_num or token.is_currency)
            ]

        # Remove stopwords
        if self.remove_stopwords:
            tokens = [token for token in tokens if not token.is_stop]

        # Remove punctuation, whitespace, quotes and brackets
        tokens = [
            token for token in tokens
            if not (
                token.is_punct or token.is_space
                or token.is_quote or token.is_bracket
            )
        ]

        # Remove empty tokens and literal 'amp' tokens
        # (presumably residue of the HTML entity '&amp;' -- TODO confirm)
        tokens = [
            token for token in tokens
            if token.text.strip() != '' and token.text != 'amp'
        ]

        # Lemmatise
        if self.lemmatise:
            words = [token.lemma_ for token in tokens]
        else:
            words = [token.text for token in tokens]
        text = ' '.join(words)

        # Remove URLs
        text = re.sub(r'http\S+', '', text)

        # Replace everything except ASCII letters and apostrophes by a space
        if self.remove_special:
            text = re.sub(r'[^a-zA-Z\']', ' ', text)

        text = text.lower()

        # Re-tokenise the cleaned string and drop whitespace-only tokens
        cleaned_tokens = [
            token for token in self.tokenise(text) if token.strip() != ''
        ]

        return text, cleaned_tokens
         
    
#if __name__ == '__main__':
    #spacy_model = SpacyPreprocessor.load_model()
    #preprocessor = SpacyPreprocessor(spacy_model = spacy_model)
            

In [138]:
# Clean every tweet using the preprocessor's default settings
preprocessor = SpacyPreprocessor()
cleaned_tweets = preprocessor.preprocess_text_list(tweet_df['Tweet'])

67406it [10:34, 106.17it/s]


In [139]:
# Split the (text, tokens) pairs into two parallel lists
cleaned_tweets_text = [text for text, _ in cleaned_tweets]
cleaned_tweets_token = [tokens for _, tokens in cleaned_tweets]

In [140]:
# Store the cleaned string and its token list next to each original tweet
tweet_df['Processed_text'] = cleaned_tweets_text
tweet_df['Processed_token'] = cleaned_tweets_token

In [149]:
tweet_df

Unnamed: 0,ID,Name,Tweet,Processed_text,Processed_token
0,18257804,stateless,"we’re pretty lucky, all things considered, whe...",pretty lucky thing consider compare place clim...,"[pretty, lucky, thing, consider, compare, plac..."
1,1246325069841723392,TsaiJilly,#UN75 survey found that respondents in all reg...,un survey find respondent region identify cl...,"[un, survey, find, respondent, region, identif..."
2,1248988647812222978,Beatric54184322,"All hat, no policy #climatechange #insiders ht...",hat policy climatechange insider,"[hat, policy, climatechange, insider]"
3,240072798,LugubriousLarry,Two great stories on #Maine this weekend: firs...,great story maine weekend important piece jan...,"[great, story, maine, weekend, important, piec..."
4,1124447266205503488,All435Reps,The evidence is right in front of us. Temperat...,evidence right temperature get hot climatechan...,"[evidence, right, temperature, get, hot, clima..."
...,...,...,...,...,...
67401,1329260903527718913,Jillian18277886,"Once we get through #COVID, let's not forget w...",covid let forget go die climatechange,"[covid, let, forget, go, die, climatechange]"
67402,19435213,wildweatherdan,"At the end of the last Ice Age, people changed...",end ice age people change clothe literally cli...,"[end, ice, age, people, change, clothe, litera..."
67403,1401488848635502592,AnonWatchers,"Coral #reefs ,rainforest of the ocean\n🐠\n#cli...",coral reef rainforest ocean climatechange co...,"[coral, reef, rainforest, ocean, climatechange..."
67404,18027211,dennissweatt,"What is #climatechange? \n\nWets are wetter, d...",climatechange wet wetter dry dryer hot hot col...,"[climatechange, wet, wetter, dry, dryer, hot, ..."


## Phrase Modelling: Bigram and Trigram

In [65]:
def make_bigrams(texts):
    """Merge frequently co-occurring word pairs in each tokenised document.

    A gensim ``Phrases`` model is trained on ``texts`` (``min_count=10``,
    ``threshold=100``), wrapped in a ``Phraser`` and applied to every
    document.

    Args:
        texts: Tokenised documents (an iterable of token lists). It is
            iterated more than once.

    Returns:
        A list with one transformed token list per input document.
    """
    phrase_model = gensim.models.Phrases(texts, min_count = 10, threshold = 100)
    phraser = gensim.models.phrases.Phraser(phrase_model)

    merged_docs = []
    for tokens in texts:
        merged_docs.append(phraser[tokens])
    return merged_docs

def make_trigrams(texts):
    """Merge frequent word pairs, then frequent pair+word triples, per document.

    The bigram model is trained inside this function (``min_count=10``,
    ``threshold=100``) so that the function does not depend on variables
    that only exist inside another function. The trigram model is then
    trained on the bigram-transformed corpus (``threshold=100``).

    Args:
        texts: Tokenised documents (an iterable of token lists). It is
            iterated more than once.

    Returns:
        A list with one transformed token list per input document.
    """
    # First pass: learn and freeze the bigram model
    bigram = gensim.models.Phrases(texts, min_count = 10, threshold = 100)
    bigram_mod = gensim.models.phrases.Phraser(bigram)

    # Second pass: learn trigrams on top of the bigram-merged corpus
    trigram = gensim.models.Phrases(bigram[texts], threshold = 100)
    trigram_mod = gensim.models.phrases.Phraser(trigram)

    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatisation(texts, allowed_postags = ('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatise tokenised documents, keeping only selected parts of speech.

    Each document's tokens are joined with single spaces, parsed with
    ``en_core_web_sm`` (``parser`` and ``ner`` disabled) and reduced to the
    lemmas of the tokens whose coarse POS tag is in ``allowed_postags``.

    Args:
        texts: Tokenised documents (an iterable of token lists).
        allowed_postags: POS tags to keep. The default is an immutable
            tuple so it cannot be modified between calls.

    Returns:
        A list with one list of lemmas per input document.
    """
    nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])

    # Stream the joined documents through nlp.pipe so spaCy can batch them
    joined_docs = (" ".join(sent) for sent in texts)
    return [
        [token.lemma_ for token in doc if token.pos_ in allowed_postags]
        for doc in nlp.pipe(joined_docs)
    ]

In [6]:
# Clean the tweets again with lemmatisation switched off, so tokens keep
# their surface form
preprocessor = SpacyPreprocessor(lemmatise=False)
cleaned_tweets = preprocessor.preprocess_text_list(tweet_df['Tweet'])

67406it [10:12, 110.10it/s]


In [56]:
# Split the (text, tokens) pairs, then merge frequent word pairs into bigrams
cleaned_tweets_text = [text for text, _ in cleaned_tweets]
cleaned_tweets_token = [tokens for _, tokens in cleaned_tweets]
bigram_tweets = make_bigrams(texts=cleaned_tweets_token)

In [66]:
# Lemmatise the bigram-merged tokens, keeping nouns, adjectives, verbs and adverbs
bigram_tweets = lemmatisation(
    texts=bigram_tweets,
    allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'],
)

In [76]:
# Attach the lemmatised bigram token lists as a new column
tweet_df['Bigram'] = bigram_tweets

In [83]:
# Write the enriched DataFrame to CSV without the index column
tweet_df.to_csv(
    path_or_buf='Data/tweet_climate_change_processed.csv',
    index=False,
)