In [1]:
import pandas as pd
import nltk.downloader
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from IPython.display import display
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk

In [3]:
data = pd.read_csv("./data/twitter_sentiment_data.csv")

In [5]:
#showing data format
data.head(10)

Unnamed: 0,sentiment,message,tweetid
0,-1,@tiniebeany climate change is an interesting h...,792927353886371840
1,1,RT @NatGeoChannel: Watch #BeforeTheFlood right...,793124211518832641
2,1,Fabulous! Leonardo #DiCaprio's film on #climat...,793124402388832256
3,1,RT @Mick_Fanning: Just watched this amazing do...,793124635873275904
4,2,"RT @cnalive: Pranita Biswasi, a Lutheran from ...",793125156185137153
5,0,Unamshow awache kujinga na iko global warming ...,793125429418815489
6,2,"RT @cnalive: Pranita Biswasi, a Lutheran from ...",793125430236684289
7,2,RT @CCIRiviera: Presidential Candidate #Donald...,793126558688878592
8,0,RT @AmericanIndian8: Leonardo DiCaprio's clima...,793127097854197761
9,1,#BeforeTheFlood Watch #BeforeTheFlood right he...,793127346106753028


In [6]:
dfTweets = data["message"]

In [7]:
def createTokenizedArray(sentences):

    # Initialize tokenizer and empty array to store modified sentences.
    tokenizer = RegexpTokenizer(r'\w+')
    tokenizedArray = []
    for i in range(0, len(sentences)):
        # Convert sentence to lower case.
        sentence = sentences[i].lower()

        # Split sentence into array of words with no punctuation.
        words = tokenizer.tokenize(sentence)

        # Append word array to list.
        tokenizedArray.append(words)

    # print(tokenizedArray)
    return tokenizedArray  # send modified contents back to calling function.

In [8]:
tokenizedLi = createTokenizedArray(dfTweets)

pd.Series(tokenizedLi[:5])

0    [tiniebeany, climate, change, is, an, interest...
1    [rt, natgeochannel, watch, beforetheflood, rig...
2    [fabulous, leonardo, dicaprio, s, film, on, cl...
3    [rt, mick_fanning, just, watched, this, amazin...
4    [rt, cnalive, pranita, biswasi, a, lutheran, f...
dtype: object

In [9]:
# To get stop words.
nltk.download('stopwords')

def removeStopWords(tokenList):

    stopWords = set(stopwords.words('english'))
    shorterSentences = []  # Declare empty array of sentences.

    for sentence in tokenList:
        shorterSentence = []  # Declare empty array of words in single sentence.
        for word in sentence:
            if word not in stopWords:

                # Remove leading and trailing spaces.
                word = word.strip()

                # Ignore single character words and digits.
                if (len(word) > 1 and word.isdigit() == False):
                    # Add remaining words to list.
                    shorterSentence.append(word)
        shorterSentences.append(shorterSentence)
    return shorterSentences

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Kristi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [33]:
tokenizedNoStopLi = removeStopWords(tokenizedLi)

print(f"Sample sentence BEFORE removing stop words:\n{tokenizedLi[0]}")
print(f"\n\nSample sentence AFTER removing stop words:\n{tokenizedNoStopLi[0]}")
display(tokenizedNoStopLi)

Sample sentence BEFORE removing stop words:
['tiniebeany', 'climate', 'change', 'is', 'an', 'interesting', 'hustle', 'as', 'it', 'was', 'global', 'warming', 'but', 'the', 'planet', 'stopped', 'warming', 'for', '15', 'yes', 'while', 'the', 'suv', 'boom']


Sample sentence AFTER removing stop words:
['tiniebeany', 'climate', 'change', 'interesting', 'hustle', 'global', 'warming', 'planet', 'stopped', 'warming', 'yes', 'suv', 'boom']


[['tiniebeany',
  'climate',
  'change',
  'interesting',
  'hustle',
  'global',
  'warming',
  'planet',
  'stopped',
  'warming',
  'yes',
  'suv',
  'boom'],
 ['rt',
  'natgeochannel',
  'watch',
  'beforetheflood',
  'right',
  'leodicaprio',
  'travels',
  'world',
  'tackle',
  'climate',
  'change',
  'https',
  'co',
  'lkdehj3tnn',
  'httã'],
 ['fabulous',
  'leonardo',
  'dicaprio',
  'film',
  'climate',
  'change',
  'brilliant',
  'watch',
  'https',
  'co',
  '7rv6brmxjw',
  'via',
  'youtube'],
 ['rt',
  'mick_fanning',
  'watched',
  'amazing',
  'documentary',
  'leonardodicaprio',
  'climate',
  'change',
  'think',
  'thisã',
  'https',
  'co',
  'knste8k8im'],
 ['rt',
  'cnalive',
  'pranita',
  'biswasi',
  'lutheran',
  'odisha',
  'gives',
  'testimony',
  'effects',
  'climate',
  'change',
  'amp',
  'natural',
  'disasters',
  'poã'],
 ['unamshow',
  'awache',
  'kujinga',
  'na',
  'iko',
  'global',
  'warming',
  'https',
  'co',
  'mhiflu7m1x'],
 ['rt',
 

In [40]:
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

def lemmatizelist(tokenList):

    lemmatizer = WordNetLemmatizer()
    lemmaSentences = []

    for sentence in tokenList:
        lemmaSentence = []  
        for word in sentence: 
                word = lemmatizer.lemmatize(word)
                     
                lemmaSentence.append(word)

        lemmaSentences.append(lemmaSentence)
    return lemmaSentences


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Kristi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [41]:
nltk.download('omw-1.4')
lemmaLi = lemmatizelist(tokenizedNoStopLi)
display(lemmaLi)

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Kristi\AppData\Roaming\nltk_data...


[['tiniebeany',
  'climate',
  'change',
  'interesting',
  'hustle',
  'global',
  'warming',
  'planet',
  'stopped',
  'warming',
  'yes',
  'suv',
  'boom'],
 ['rt',
  'natgeochannel',
  'watch',
  'beforetheflood',
  'right',
  'leodicaprio',
  'travel',
  'world',
  'tackle',
  'climate',
  'change',
  'http',
  'co',
  'lkdehj3tnn',
  'httã'],
 ['fabulous',
  'leonardo',
  'dicaprio',
  'film',
  'climate',
  'change',
  'brilliant',
  'watch',
  'http',
  'co',
  '7rv6brmxjw',
  'via',
  'youtube'],
 ['rt',
  'mick_fanning',
  'watched',
  'amazing',
  'documentary',
  'leonardodicaprio',
  'climate',
  'change',
  'think',
  'thisã',
  'http',
  'co',
  'knste8k8im'],
 ['rt',
  'cnalive',
  'pranita',
  'biswasi',
  'lutheran',
  'odisha',
  'give',
  'testimony',
  'effect',
  'climate',
  'change',
  'amp',
  'natural',
  'disaster',
  'poã'],
 ['unamshow',
  'awache',
  'kujinga',
  'na',
  'iko',
  'global',
  'warming',
  'http',
  'co',
  'mhiflu7m1x'],
 ['rt',
  'cnaliv

In [42]:
nltk.download('words')

def removeWords(sentenceArrays):
     engwords = set(nltk.corpus.words.words())
     engSentences = []

     for sentence in sentenceArrays:
         engSentence = []  
         for word in sentence:  
             if word in engwords:                 
                 engSentence.append(word)

         engSentences.append(engSentence)

     return engSentences

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\Kristi\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


In [43]:
removenonengLi = removeWords(lemmaLi)
display(removenonengLi)

[['climate',
  'change',
  'interesting',
  'hustle',
  'global',
  'warming',
  'planet',
  'stopped',
  'warming',
  'yes',
  'boom'],
 ['watch', 'right', 'travel', 'world', 'tackle', 'climate', 'change'],
 ['fabulous', 'film', 'climate', 'change', 'brilliant', 'watch', 'via'],
 ['watched', 'amazing', 'documentary', 'climate', 'change', 'think'],
 ['give', 'testimony', 'effect', 'climate', 'change', 'natural', 'disaster'],
 ['na', 'global', 'warming'],
 ['give', 'testimony', 'effect', 'climate', 'change', 'natural', 'disaster'],
 ['presidential', 'candidate', 'climate', 'change', 'say', 'prince'],
 ['climate', 'change', 'documentary', 'free', 'week', 'indigenous'],
 ['watch', 'right', 'travel', 'world', 'tackle', 'climate', 'change'],
 ['vital',
  'public',
  'health',
  'community',
  'address',
  'climate',
  'change',
  'via'],
 ['cause',
  'climate',
  'change',
  'country',
  'need',
  'instead',
  'need',
  'compensation'],
 ['watch', 'right', 'travel', 'world', 'tackle', 'clim

In [44]:
def stemWords(sentenceArrays):
    '''
    Removes suffixes and rebuilds the sentences.
    :param sentenceArrays: stentences list
    :return: array of sentences without suffixes
    '''
    ps = PorterStemmer()
    stemmedSentences = []
    for sentenceArray in sentenceArrays:
        stemmedArray = []  # Declare empty array of words.
        for word in sentenceArray:
            stemmedArray.append(ps.stem(word))  # Add stemmed word.

        # Convert array back to sentence of stemmed words.
        delimeter = ' '
        sentence = delimeter.join(stemmedArray)

        # Append stemmed sentence to list of sentences.
        stemmedSentences.append(sentence)
    return stemmedSentences

In [45]:
stemmedLi = stemWords(removenonengLi)

print(f"Sample sentence BEFORE stemming:\n{removenonengLi[0]}")
print(f"\nSample sentence AFTER stemming:\n{stemmedLi[0]}")

display(stemmedLi)

Sample sentence BEFORE stemming:
['climate', 'change', 'interesting', 'hustle', 'global', 'warming', 'planet', 'stopped', 'warming', 'yes', 'boom']

Sample sentence AFTER stemming:
climat chang interest hustl global warm planet stop warm ye boom


['climat chang interest hustl global warm planet stop warm ye boom',
 'watch right travel world tackl climat chang',
 'fabul film climat chang brilliant watch via',
 'watch amaz documentari climat chang think',
 'give testimoni effect climat chang natur disast',
 'na global warm',
 'give testimoni effect climat chang natur disast',
 'presidenti candid climat chang say princ',
 'climat chang documentari free week indigen',
 'watch right travel world tackl climat chang',
 'vital public health commun address climat chang via',
 'caus climat chang countri need instead need compens',
 'watch right travel world tackl climat chang',
 'time need strong work hit climat chang poverti',
 'reflect new climat chang film',
 'problem global reason resist illustr even street flood',
 'one easiest way help combat climat chang daili life stop eat beef',
 'esquir watch climat chang doc free world end',
 'climat chang affect sea level rise flood citi world',
 'watch right travel world tackl climat chang',

In [46]:
mod_tfidf  = TfidfVectorizer(min_df=0, max_features=3000, strip_accents='unicode',lowercase =True, 
                            analyzer='word', token_pattern=r'\w{3,}', ngram_range=(1,1), 
                            use_idf= True,smooth_idf=True, sublinear_tf=True)    
matrix = mod_tfidf.fit_transform(stemmedLi)

features = pd.DataFrame(matrix.toarray(), columns=mod_tfidf.get_feature_names())
display(features)



Unnamed: 0,abandon,abil,abl,abort,abrupt,abruptli,absolut,absorb,abstract,absurd,...,yep,yesterday,yet,yield,york,young,youth,zealot,zero,zone
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43938,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
43939,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
43940,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
43941,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [47]:
features.columns

Index(['abandon', 'abil', 'abl', 'abort', 'abrupt', 'abruptli', 'absolut',
       'absorb', 'abstract', 'absurd',
       ...
       'yep', 'yesterday', 'yet', 'yield', 'york', 'young', 'youth', 'zealot',
       'zero', 'zone'],
      dtype='object', length=3000)