In [1]:
import pandas as pd
import ast, json
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.tag import pos_tag
from nltk.stem import WordNetLemmatizer 
# Download (not import) the NLTK 'punkt' resource; the recorded log below shows
# this is a no-op when the package is already up to date.
# NOTE(review): presumably required by word_tokenize / sent_tokenize imported above — confirm against the NLTK docs.
nltk.download('punkt')

[nltk_data] Downloading package punkt to C:\Users\My Probook
[nltk_data]     G2\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# Load the scraped tweets from disk and preview the first five rows.
# NOTE(review): the preview shows an 'Unnamed: 0' column that mirrors the row
# index — presumably an index saved into the CSV; consider index_col=0 if no
# other code relies on that column.
tweets_df = pd.read_csv('old_tweets.csv')
tweets_df.head(n=5)

Unnamed: 0,tweet_count,city,body,date,hashtags,link,geo
0,0,New York,@UberEats Promo Code never works for 50%. what...,2020-07-14 23:57:38+00:00,,https://twitter.com/FashionsWeek/status/128318...,
1,1,New York,Yea they’re terrible smh. @UberEats @Uber_Support,2020-07-14 23:36:33+00:00,,https://twitter.com/kensthetic_/status/1283183...,
2,2,New York,@UberEats when are you coming to upstate ny?,2020-07-14 23:33:33+00:00,,https://twitter.com/Thebobover/status/12831828...,
3,3,New York,@diginn why does the dig inn app and @UberEats...,2020-07-14 23:30:41+00:00,,https://twitter.com/Hot4TaterTots/status/12831...,
4,4,New York,For my brother to have his Uber job back! He h...,2020-07-14 23:04:47+00:00,,https://twitter.com/Rayofshine69/status/128317...,


# 1. Tokenizer + Lemmatizer

In [3]:
# Create function to tokenize the sentences
def tokenize_cleaner(x):
    """Split one tweet body into word-level tokens.

    Parameters
    ----------
    x : object
        Raw cell value taken from the ``body`` column. Non-string values are
        converted with ``str`` before being tokenized.

    Returns
    -------
    list of str
        The tokens produced by ``word_tokenize``. An empty list is returned
        when ``x`` is missing (``None``, ``NaN``, ``pd.NA``).
    """
    # A missing body would otherwise be stringified to "nan" / "None" and leak
    # into the token list as if it were a real word.
    if x is None or (pd.api.types.is_scalar(x) and pd.isna(x)):
        return []
    return word_tokenize(str(x))

In [4]:
# Tokenize every tweet body into a new 'tokenized' column, then preview two rows.
tweets_df['tokenized'] = tweets_df['body'].apply(tokenize_cleaner)
tweets_df.head(n=2)

Unnamed: 0,tweet_count,city,body,date,hashtags,link,geo,tokenized
0,0,New York,@UberEats Promo Code never works for 50%. what...,2020-07-14 23:57:38+00:00,,https://twitter.com/FashionsWeek/status/128318...,,"[@, UberEats, Promo, Code, never, works, for, ..."
1,1,New York,Yea they’re terrible smh. @UberEats @Uber_Support,2020-07-14 23:36:33+00:00,,https://twitter.com/kensthetic_/status/1283183...,,"[Yea, they, ’, re, terrible, smh, ., @, UberEa..."


In [5]:
# WordNet is a lexical database for the English language; it lets the lemmatizer
# reduce a word to its base form.
# The averaged_perceptron_tagger resource is needed to tag each word with its
# part of speech within the sentence, which the lemmatization step relies on.
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package wordnet to C:\Users\My Probook
[nltk_data]     G2\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\My Probook G2\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

# 2. Remove the noise

In [6]:
import re, string

# Patterns and lookup tables are built once rather than for every token.
# Raw strings avoid the invalid "\(" escape the non-raw literal relied on.
_URL_RE = re.compile(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'
    r'(?:%[0-9a-fA-F][0-9a-fA-F]))+'
)
_MENTION_RE = re.compile(r"(@[A-Za-z0-9_]+)")
_PUNCTUATION = frozenset(string.punctuation)


def remove_noise(tweet_tokens, stop_words = ()):
    """Normalize the tokens of one tweet.

    Each token is stripped of URLs and ``@mention`` handles, lemmatized
    according to its part-of-speech tag, and lower-cased. Tokens that end up
    empty, consist only of ASCII punctuation, or are listed in ``stop_words``
    are dropped.

    Parameters
    ----------
    tweet_tokens : list of str
        Tokens of a single tweet, in sentence order (the tagger uses context).
    stop_words : container of str, optional
        Lower-case words to discard. Defaults to an empty tuple (keep all).

    Returns
    -------
    list of str
        The cleaned, lower-cased tokens in their original order.

    Notes
    -----
    The mention pattern only fires when a token still carries its ``@``.
    In this notebook ``word_tokenize`` has already split ``@UberEats`` into
    ``@`` and ``UberEats``, so the handle text itself survives (see the
    ``'ubereats'`` entry in the sample output).
    """
    # One lemmatizer per call instead of one per token.
    lemmatizer = WordNetLemmatizer()
    cleaned_tokens = []

    for token, tag in pos_tag(tweet_tokens):
        token = _URL_RE.sub('', token)
        token = _MENTION_RE.sub('', token)
        if not token:
            # Nothing left after stripping; skip the lemmatizer lookup.
            continue

        # Map the tagger's tag onto the WordNet POS codes used for lemmatizing.
        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'
        token = lemmatizer.lemmatize(token, pos)

        # Character-wise check: a substring test against string.punctuation
        # would let multi-character tokens such as "..." or "''" through.
        if all(ch in _PUNCTUATION for ch in token):
            continue

        lowered = token.lower()
        if lowered in stop_words:
            continue
        cleaned_tokens.append(lowered)
    return cleaned_tokens

In [7]:
# Clean every tokenized tweet: strip URLs/mentions, lemmatize, lower-case and
# drop punctuation (no stop words are passed, so none are removed here).
tweets_df['cleaned'] = tweets_df['tokenized'].apply(remove_noise)

# Spot-check the first tweet. Only the last expression of a cell is rendered,
# so the former mid-cell `tweets_df.head(2)` was silently discarded; the
# positional .iloc[0] also works when the index does not start at label 0.
tweets_df['cleaned'].iloc[0]

['ubereats', 'promo', 'code', 'never', 'work', 'for', '50', 'what', 'do', 'i', 'do']


In [8]:
# # Create a lemmatize function to apply to the tokens

# # Obs: Before running a lemmatizer, we need to determine the context for each word in the text. 
# # This is achieved by a tagging algorithm, which assesses the relative position of a word in a sentence.

# def lemmatize_sentence(tokens):
#     lemmatizer = WordNetLemmatizer()
#     lemmatized_sentence = []
#     for word, tag in pos_tag(tokens):
#         if tag.startswith('NN'):
#             pos = 'n'
#         elif tag.startswith('VB'):
#             pos = 'v'
#         else:
#             pos = 'a'
#         lemmatized_sentence.append(lemmatizer.lemmatize(word, pos))
#     return lemmatized_sentence

In [9]:
# tweets_df['lematized'] = tweets_df['tokenized'].apply(lemmatize_sentence)
# tweets_df.head(2)

#### Stopwords

In [10]:
# from nltk.corpus import stopwords

In [11]:
# # Create a column containing only strings, so stop words can be removed
# tweets_df['tweets_str'] = [','.join(map(str, l)) for l in tweets_df['lematized']]
# # tweets_df.head(1)

In [12]:
# #Applying Conditions and removing stopwords
# stop = stopwords.words('english')
# pat = r'\b(?:{})\b'.format('|'.join(stop))

# tweets_df['cleaned'] = tweets_df['tweets_str'].str.replace(pat, '')
# tweets_df['cleaned'] = tweets_df['cleaned'].str.replace(r'\s+', ' ')

# print(tweets_df['tweets_str'][0])
# print(tweets_df['cleaned'][0])

#### Punctuation

In [13]:
# import re, string
# string.punctuation

In [14]:
# def remove_punct(text):
#     text  = ''.join([char for char in text if char not in string.punctuation])
#     text = re.sub('[0-9]+', '', text)
#     return text

# tweets_df['cleaned'] = tweets_df['cleaned'].apply(lambda x: remove_punct(x))
# tweets_df.head(2)

#### Links and entities

In [15]:
# # Converting into lists for modelling
# tweets_df['tweets_cleaned'] = tweets_df['cleaned'].apply(lambda x: x.strip('()').split(','))
# print(tweets_df['tweets_str'][0])
# print(tweets_df['cleaned'][0])
# print(tweets_df['tweets_cleaned'][0])

In [16]:
# tweets_df.to_csv('UberEats.csv', index = False)