In [2]:
import nltk
from nltk.corpus import twitter_samples
import matplotlib.pyplot as plt
import random 
import re #regular expressions allow you to capture patterns in text 
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

In [3]:
# Download the NLTK "twitter_samples" corpus (a no-op if it is already up to date).
# The call is the cell's last expression, so its return value is displayed.
nltk.download('twitter_samples')

[nltk_data] Downloading package twitter_samples to
[nltk_data]     C:\Users\julia\AppData\Roaming\nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!


True

In [4]:
# Load the raw text of every positive and every negative sample tweet.
# Each call to twitter_samples.strings() returns the tweets of one corpus file;
# the generator is unpacked in order: positive first, negative second.
all_positive_tweets, all_negative_tweets = (
    twitter_samples.strings(fileid)
    for fileid in ('positive_tweets.json', 'negative_tweets.json')
)

In [5]:
# Summarise the two corpora: how many tweets each holds, the container type,
# and the type of a single entry. The lines are collected first and emitted
# with one print call, one line per fact.
summary_lines = [
    f"Number of positive tweets: {len(all_positive_tweets)}",
    f"Number of negative tweets: {len(all_negative_tweets)}",
    f"Type of all positive tweets: {type(all_positive_tweets)}",
    f"Type of all negative tweets: {type(all_negative_tweets)}",
    f"Tweet entry typeis : {type(all_negative_tweets[0])}",
]
print("\n".join(summary_lines))

Number of positive tweets: 5000
Number of negative tweets: 5000
Type of all positive tweets: <class 'list'>
Type of all negative tweets: <class 'list'>
Tweet entry typeis : <class 'str'>


In [6]:
# View one random positive and one random negative tweet.
# ANSI escape sequences colour the output: ESC[92m is bright green (positive)
# and ESC[91m is bright red (negative). ESC is written '\033' in Python.
#
# random.choice() always picks a valid element whatever the list length.
# random.randint(0, 5000) is inclusive at BOTH ends, so on a 5000-element list
# it could return index 5000 and raise IndexError.

print('\033[92m' + random.choice(all_positive_tweets))
print('\033[91m' + random.choice(all_negative_tweets))

[92mFriend's lunch... yummmm :)
#Nostalgia #TBS #KU.
[91mHas a poorly pup :(


In [7]:
# Select one fixed positive tweet (index 2277) as the running example for the
# preprocessing steps below. It contains an emoticon, hashtags and a URL, so it
# exercises each cleaning step.
tweet = all_positive_tweets[2277]
print(tweet)

My beautiful sunflowers on a sunny Friday morning off :) #sunflowers #favourites #happy #Friday off… https://t.co/3tfYom0N1i


In [8]:
# Download the NLTK stop-word lists; the English list is loaded later with
# stopwords.words('english').
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\julia\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
# Strip Twitter-specific markup from the sample tweet before tokenizing.

# Remove the old-style retweet marker "RT" (plus following whitespace) when it
# starts the tweet.
tweet2 = re.sub(r'^RT[\s]+', '', tweet)

# Remove hyperlinks. Match only the URL itself (scheme plus the run of
# non-whitespace characters after it), so that any words or emoticons that
# follow a link on the same line are kept. A greedy `.*` would delete
# everything from the URL to the end of the line.
tweet2 = re.sub(r'https?://\S+', '', tweet2)

# Remove only the '#' sign of each hashtag; the tag word itself is kept.
tweet2 = re.sub(r'#', '', tweet2)

print(tweet2)

My beautiful sunflowers on a sunny Friday morning off :) sunflowers favourites happy Friday off… 


In [10]:
# Build the tweet-aware tokenizer used for the rest of the notebook.
# Options (see the NLTK TweetTokenizer documentation for exact semantics):
#   preserve_case=False -> tokens are lowercased
#   strip_handles=True  -> @username handles are dropped
#   reduce_len=True     -> long runs of a repeated character are shortened
tokenizer = TweetTokenizer(
    reduce_len=True,
    strip_handles=True,
    preserve_case=False,
)

In [11]:
# Split the cleaned tweet text into a list of tokens and show the result.
tweet_tokens = tokenizer.tokenize(tweet2)
print(tweet_tokens)

['my', 'beautiful', 'sunflowers', 'on', 'a', 'sunny', 'friday', 'morning', 'off', ':)', 'sunflowers', 'favourites', 'happy', 'friday', 'off', '…']


In [12]:
# Load NLTK's English stop-word list and display it, followed by the
# punctuation characters from the standard library (string.punctuation).
# Each print call emits a heading and its value on separate lines.
stopwords_english = stopwords.words('english')

print("\nStop words\n-------------------", stopwords_english, sep="\n")
print("\nPunctuation List\n-------------------", string.punctuation, sep="\n")


Stop words
-------------------
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only'

In [13]:
# Show the token list before filtering (green), then switch the colour to blue
# for the filtered result.
print()
print('\033[92m')
print(tweet_tokens)
print('\033[94m')

# Keep a token only when it is neither an English stop word nor punctuation.
# string.punctuation is a str, so the `in` test is a substring check: any token
# that appears as a contiguous run inside it is dropped, while emoticons such
# as ':)' survive.
tweet_clean = [
    token
    for token in tweet_tokens
    if not (token in stopwords_english or token in string.punctuation)
]

print('removed stop words and punctuation')
print(tweet_clean)


[92m
['my', 'beautiful', 'sunflowers', 'on', 'a', 'sunny', 'friday', 'morning', 'off', ':)', 'sunflowers', 'favourites', 'happy', 'friday', 'off', '…']
[94m
removed stop words and punctuation
['beautiful', 'sunflowers', 'sunny', 'friday', 'morning', ':)', 'sunflowers', 'favourites', 'happy', 'friday', '…']


In [14]:
# Lemmatize the cleaned tokens.
# The WordNet corpus must be available locally before the lemmatizer is used.
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

# map() applies lemmatizer.lemmatize to every token in order. It is equivalent
# to starting with an empty list and appending lemmatizer.lemmatize(token) for
# each token in tweet_clean.
lemmatized = list(map(lemmatizer.lemmatize, tweet_clean))
print(lemmatized)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\julia\AppData\Roaming\nltk_data...


['beautiful', 'sunflower', 'sunny', 'friday', 'morning', ':)', 'sunflower', 'favourite', 'happy', 'friday', '…']


In [16]:
# Show the cleaned tokens (green), then switch the colour to blue for the
# stemmed result.
# NOTE(review): the list displayed here is tweet_clean, but the stemmer below
# consumes `lemmatized` -- confirm which list is meant to be the "before" view.
print('', '\033[92m', tweet_clean, '\033[94m', sep='\n')

# Porter stemmer: reduces each word to its stem.
stemmer = PorterStemmer()

# Stem every lemmatized token, preserving order.
tweets_stem = [stemmer.stem(token) for token in lemmatized]

print("Stemmed words", tweets_stem, sep="\n")


[92m
['beautiful', 'sunflowers', 'sunny', 'friday', 'morning', ':)', 'sunflowers', 'favourites', 'happy', 'friday', '…']
[94m
Stemmed words
['beauti', 'sunflow', 'sunni', 'friday', 'morn', ':)', 'sunflow', 'favourit', 'happi', 'friday', '…']
