In [1]:
import nltk
from nltk.corpus import twitter_samples
import random
import re
import matplotlib.pyplot as plt
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

In [2]:
# Fetch the NLTK data packages this notebook reads from. The recorded output
# below shows both packages being reported as already up to date.
# Sample tweets corpus, read through `twitter_samples` further down.
nltk.download('twitter_samples')
# Stopword lists, read through `stopwords.words('english')` further down.
nltk.download('stopwords')

[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Read the tweet texts from the positive and the negative corpus file, in that order.
all_positive_tweets, all_negative_tweets = (
    twitter_samples.strings(corpus_file)
    for corpus_file in ('positive_tweets.json', 'negative_tweets.json')
)

In [4]:
# One combined list: every positive tweet first, then every negative tweet
# (the original note gives the total as 10000 tweets).
tweets = [*all_positive_tweets, *all_negative_tweets]
# tweets[946]  # uncomment to inspect a single raw tweet

In [5]:
#Removing retweet markers, hyperlinks, hashtag symbols and email addresses
# Leading "RT" retweet marker followed by whitespace.
_RETWEET_PREFIX = re.compile(r'^RT[\s]+')
# \S+ stops at the first whitespace, so the words that follow a link are kept
# (a greedy `.*` would delete everything up to the end of the line).
_HYPERLINK = re.compile(r'https?://\S+')
# Only the '#' sign is dropped; the hashtag word itself stays in the text.
_HASH_SIGN = re.compile(r'#')
# Dots are allowed in the local part, and the domain must contain one or more
# labels introduced by a literal '.', so addresses such as first.last@mail.co.uk
# are removed whole while "word@word word" is left untouched.
_EMAIL_ADDRESS = re.compile(r'[A-Za-z0-9._%+-]+@[A-Za-z0-9_-]+(?:\.[A-Za-z0-9_-]+)+')


def remove_tweet_noise(tweet):
    """Return `tweet` without retweet marker, hyperlinks, '#' signs and email addresses.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        The text with each matched span replaced by the empty string. The
        surrounding whitespace is left as it is.
    """
    tweet = _RETWEET_PREFIX.sub('', tweet)
    tweet = _HYPERLINK.sub('', tweet)
    tweet = _HASH_SIGN.sub('', tweet)
    tweet = _EMAIL_ADDRESS.sub('', tweet)
    return tweet


preprocessed_tweets = [remove_tweet_noise(tweet) for tweet in tweets]
# preprocessed_tweets[946]  # uncomment to inspect a single cleaned tweet

In [6]:
# Split every cleaned tweet into tokens. The tokenizer is configured to
# lowercase the text (preserve_case=False), with the strip_handles and
# reduce_len options switched on.
tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
tokenized_tweets = [tokenizer.tokenize(text) for text in preprocessed_tweets]
# tokenized_tweets[946]  # uncomment to inspect a single tokenized tweet

In [7]:
# Build the stopword list used for filtering. The negation words listed in
# `wanted_stopwords` are taken out of it, so they survive filtering and the
# tweets keep their meaning.
english_stopwords = stopwords.words('english')
wanted_stopwords = [
    'don', "don't", 'aren', "aren't", 'couldn', "couldn't",
    'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't",
    'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't",
    'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't",
    'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't",
    'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't",
    'no', 'nor', 'not', 'ain',
]
for kept_word in wanted_stopwords:
    try:
        english_stopwords.remove(kept_word)
    except ValueError:
        # The word is not in the loaded stopword list, so there is nothing to remove.
        pass

In [8]:
# Remove stopwords from every token list. Only tokens found in the stopword
# list are dropped, so punctuation and emoticons stay in the tweet.
# The list is turned into a set once, so each membership test is a hash lookup
# instead of a scan over the whole stopword list for every token.
stopword_lookup = frozenset(english_stopwords)
clean_tweets = [
    [token for token in tokens if token not in stopword_lookup]
    for tokens in tokenized_tweets
]

# clean_tweets[946]  # uncomment to inspect a single filtered tweet

In [9]:
# Replace every remaining token by its Porter stem, keeping one token list per tweet.
stemmer = PorterStemmer()
stemmed_tweets = [
    [stemmer.stem(token) for token in tokens]
    for tokens in clean_tweets
]
# stemmed_tweets[946]  # uncomment to inspect a single stemmed tweet

**Resources:**         

Coursera Machine Learning Course by Andrew Ng

In this project, we use the NLTK library to preprocess the "twitter_samples" corpus so that the data is ready for downstream NLP tasks.



Following are the preprocessing steps with an example:

Ex: Any brands wanting me to review BEAUTY products on my blog? mail me at: ladolcevitainluxembourg@hotmail.com #prrequest #journorequest :)


1.   Removing email addresses, hyperlinks, hashtag symbols (`#`; the tag word itself is kept) and retweet markers (RT)

Ex: Any brands wanting me to review BEAUTY products on my blog? mail me at:  prrequest journorequest :)

2.   Tokenization

Ex: ['any', 'brands', 'wanting', 'me', 'to', 'review', 'beauty', 'products', 'on', 'my', 'blog', '?', 'mail', 'me', 'at', ':', 'prrequest', 'journorequest', ':)']

3.   Removing stopwords but keeping emoticons

Ex: ['brands', 'wanting', 'review', 'beauty', 'products', 'blog', '?', 'mail', ':', 'prrequest', 'journorequest', ':)']

4.   Stemming

Ex: ['brand', 'want', 'review', 'beauti', 'product', 'blog', '?', 'mail', ':', 'prrequest', 'journorequest', ':)']


