In [None]:
#@Source AFO
#1 for supporting, 0 for rejecting, 2 for irrelevant
#300 key words, 77.5% accuracy
import pandas as pd
import numpy as np
import bs4 as bs
import nltk
from nltk.tokenize import sent_tokenize # tokenizes sentences
import re
from nltk.stem import PorterStemmer
from nltk.tag import pos_tag
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
import warnings
warnings.filterwarnings('ignore')

eng_stopwords = stopwords.words('english')

In [2]:
abstracts = pd.read_csv("GlobalWarming abstracts.csv")

In [3]:
abstracts.head()

Unnamed: 0,Title,Abstract,sentiment,FullContent
0,"1991,A 20-year Record Of Alpine Grasshopper Ab...",,4,
1,"1991,A Geological Perspective On Climatic-chan...",,4,
2,"1991,A Spatial Model For Studying The Effects ...",,4,
3,"1991,Abrupt Deep-sea Warming| Palaeoceanograph...",,4,
4,"1991,Advance Of East Antarctic Outlet Glaciers...",,4,


In [4]:
abstracts.drop(columns='FullContent', inplace=True)

In [5]:
abstracts.dropna(inplace=True)

In [6]:
abstracts.sentiment.value_counts()

4    511
3    169
2     52
5     50
7      8
6      8
1      3
Name: sentiment, dtype: int64

In [7]:
abstracts.head()

Unnamed: 0,Title,Abstract,sentiment
30,"1991,Comparisons Of Observed Northern-hemisphe...","GEOPHYSICAL RESEARCH LETTERS, VOL. 18, NO. 7, ...",5
55,"1991,Global Climate Change,Journal Of Engineer...",This chapter presents a bibliography of goal p...,6
58,"1991,Global Warming - What Does The Science Te...",A review of goal programming formulations and ...,6
64,"1991,Global Warming As A Manifestation Of A Ra...",Global and hemispheric series of surface tempe...,5
214,"1992,Global Warming - A Reduced Threat,Bulleti...",One popular and apocalyptic vision of the worl...,5


In [8]:
abstracts.sentiment.replace(to_replace=[1,2,3],value=1,inplace=True)
abstracts.sentiment.replace(to_replace=[5,6,7],value=0,inplace=True)
abstracts.sentiment.replace(to_replace=[4],value=2,inplace=True)
abstracts.sentiment.value_counts()
abstracts['index']=abstracts.index

In [9]:
balancedSet=abstracts[abstracts['sentiment']==0]

In [10]:
def sample(series, n):
    np.random.seed()
    return list(np.random.choice(series, size=n, replace=False))

In [11]:
irrelevantList = abstracts[abstracts.sentiment==2].agg(lambda x: sample(x, 66))['index']

In [12]:
balancedSet=balancedSet.append(abstracts[abstracts['index'].isin(irrelevantList)])

In [13]:
supportingList=abstracts[abstracts.sentiment==1].agg(lambda x: sample(x, 66))['index']

In [14]:
balancedSet=balancedSet.append(abstracts[abstracts['index'].isin(supportingList)])

In [15]:
balancedSet['sentiment'].value_counts()

2    66
1    66
0    66
Name: sentiment, dtype: int64

In [16]:
print(np.mean(balancedSet['Abstract'].apply(len)))

1443.8181818181818


In [17]:
balancedSet.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 198 entries, 30 to 10191
Data columns (total 4 columns):
Title        198 non-null object
Abstract     198 non-null object
sentiment    198 non-null int64
index        198 non-null int64
dtypes: int64(2), object(2)
memory usage: 7.7+ KB


In [18]:
balancedSet.set_index(np.arange(0,198,1),inplace=True)

In [19]:
from nltk.corpus import stopwords
def abstract_cleaner(abstract):
    #1. Remove HTML tags
    abstract = bs.BeautifulSoup(abstract).text
    #abstract = data['abstract'].apply(lambda text: bs.BeautifulSoup(text, 'html.parser').get_text())
    
    #2. Use regex to find emoticons
    #emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', review)
    
    #3. Remove punctuation
    abstract = re.sub("[^a-zA-Z0-9]", " ",abstract)
    
    #4. Tokenize into words (all lower case)
    #abstract = abstract.str.lower()
    abstract = abstract.lower().split()
   
    #5. Remove stopwords
    eng_stopwords = set(stopwords.words("english"))
    abstract = [w for w in abstract if not w in eng_stopwords]
    
    #6. Join the review to one sentence
    #review = ' '.join(review+emoticons)
    abstract = ' '.join(abstract)
    # add emoticons to the end
    return(abstract)

In [20]:
%%time

num_abstract = len(balancedSet)

abstract_clean_original = []

for i in range(0,num_abstract):
    abstract_clean_original.append(abstract_cleaner(balancedSet['Abstract'][i]))

CPU times: user 152 ms, sys: 43.3 ms, total: 196 ms
Wall time: 228 ms


In [33]:
%%time
# Porter stemming on the results in review_clean_original

abstract_clean_ps = []

ps = PorterStemmer()
for i in range(0,num_abstract):
    if( (i+1)%500 == 0 ):
        # print progress
        print("Done with %d reviews" %(i+1)) 
    ps_stems = []
    for w in abstract_clean_original[i].split():
        if w == 'oed':
            continue
        ps_stems.append(ps.stem(w))
    
    abstract_clean_ps.append(' '.join(ps_stems))

CPU times: user 542 ms, sys: 4.85 ms, total: 547 ms
Wall time: 545 ms


In [34]:
def get_wordnet_pos(treebank_tag):

    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return 'n'

In [35]:
%%time
# Lemmatizer

abstract_clean_wnl = []

wnl = WordNetLemmatizer()

for i in range(0,num_abstract):
    wnl_stems = []
    token_tag = pos_tag(abstract_clean_original[i].split())
    for pair in token_tag:
        res = wnl.lemmatize(pair[0],pos=get_wordnet_pos(pair[1]))
        wnl_stems.append(res)

    abstract_clean_wnl.append(' '.join(wnl_stems))

CPU times: user 1.66 s, sys: 11.7 ms, total: 1.67 s
Wall time: 1.68 s


In [36]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics # for confusion matrix, accuracy score etc
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(\
    abstract_clean_original, balancedSet['sentiment'], random_state=0, test_size=.2)


# CountVectorizer can actucally handle a lot of the preprocessing for us
vectorizer = CountVectorizer(analyzer = "word",   \
                             tokenizer = None,    \
                             preprocessor = None, \
                             stop_words = None,   \
                             max_features = 300)

In [88]:
%%time
vectorizer.fit(X_train)

CPU times: user 32.6 ms, sys: 1.61 ms, total: 34.2 ms
Wall time: 33.1 ms


CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=300, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [89]:
print(vectorizer.get_feature_names()[:])

['10', '100', '16', '20', '2000', '2004', '30', '40', 'accumulation', 'activity', 'addition', 'affect', 'air', 'al', 'also', 'although', 'among', 'amount', 'analysis', 'analyze', 'annual', 'anthropogenic', 'appear', 'application', 'approach', 'arctic', 'area', 'assessment', 'associate', 'atlantic', 'atmosphere', 'atmospheric', 'average', 'balance', 'base', 'basin', 'biomass', 'carbon', 'case', 'cause', 'century', 'ch4', 'change', 'china', 'circulation', 'climate', 'climatic', 'cloud', 'co2', 'compare', 'comparison', 'component', 'concentration', 'condition', 'consider', 'content', 'contrast', 'contribute', 'control', 'conventional', 'cool', 'coral', 'core', 'correlation', 'could', 'cover', 'crop', 'current', 'cycle', 'data', 'day', 'decrease', 'degree', 'derive', 'describe', 'determine', 'develop', 'development', 'difference', 'different', 'dioxide', 'distribution', 'dry', 'due', 'dynamic', 'earth', 'economic', 'ecosystem', 'effect', 'emission', 'energy', 'environmental', 'error', 'est

In [90]:
importances = forest.feature_importances_
# returns relative importance of all features.
# they are in the order of the columns
print(importances)

[1.07286552e-03 3.35070286e-04 3.38727582e-03 5.41238065e-04
 6.03306498e-04 1.26212575e-03 1.12532576e-03 5.92414634e-04
 5.84577829e-04 1.37921630e-03 9.24412461e-05 1.63382912e-03
 3.51293253e-03 2.77362585e-03 1.84497480e-03 0.00000000e+00
 4.73141862e-03 3.17997021e-04 1.48229955e-03 5.75246321e-04
 5.50302037e-03 1.77243504e-03 9.42777196e-03 1.02604524e-03
 5.76795997e-04 1.74629715e-03 0.00000000e+00 5.59027538e-03
 3.61769970e-03 0.00000000e+00 4.51572900e-03 9.57927130e-04
 6.92091750e-03 2.88358574e-03 2.60915120e-03 7.49701145e-04
 9.48247551e-04 2.22613278e-03 6.09361522e-04 0.00000000e+00
 1.27414217e-05 1.97423521e-02 0.00000000e+00 4.93603800e-03
 4.14690773e-03 1.06679407e-02 3.53294890e-03 3.28116903e-03
 9.63305230e-04 5.99769629e-03 3.76656004e-03 1.45817070e-02
 1.90109493e-04 3.36506986e-03 2.76668835e-03 4.72063575e-03
 3.17175940e-03 2.38506018e-03 1.26848703e-03 5.64677332e-04
 2.90763340e-03 7.06042510e-04 7.92153339e-03 2.53249936e-03
 2.43009089e-03 6.265309

In [39]:
train_bag = vectorizer.transform(X_train) #transform to a feature matrix
test_bag = vectorizer.transform(X_test)

In [40]:
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(n_estimators = 50) 

In [41]:
%%time
forest = forest.fit(train_bag, y_train)

CPU times: user 66.8 ms, sys: 1.92 ms, total: 68.7 ms
Wall time: 68.2 ms


In [42]:
train_predictions = forest.predict(train_bag)
valid_predictions = forest.predict(test_bag)

In [43]:
metrics.accuracy_score(y_train,train_predictions)

1.0

In [44]:
metrics.accuracy_score(y_test,valid_predictions)

0.775

In [78]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics # for confusion matrix, accuracy score etc
from sklearn.model_selection import train_test_split

X_train1, X_test1, y_train1, y_test1 = train_test_split(\
    abstract_clean_wnl, balancedSet['sentiment'], random_state=0, test_size=.2)
vectorizer = CountVectorizer(analyzer = "word",   \
                             tokenizer = None,    \
                             preprocessor = None, \
                             stop_words = None,   \
                             max_features = 300)

In [79]:
%%time
vectorizer.fit(X_train1)

CPU times: user 30.9 ms, sys: 1.7 ms, total: 32.6 ms
Wall time: 31.3 ms


CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=300, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [80]:
print(vectorizer.get_feature_names()[:])

['10', '100', '16', '20', '2000', '2004', '30', '40', 'accumulation', 'activity', 'addition', 'affect', 'air', 'al', 'also', 'although', 'among', 'amount', 'analysis', 'analyze', 'annual', 'anthropogenic', 'appear', 'application', 'approach', 'arctic', 'area', 'assessment', 'associate', 'atlantic', 'atmosphere', 'atmospheric', 'average', 'balance', 'base', 'basin', 'biomass', 'carbon', 'case', 'cause', 'century', 'ch4', 'change', 'china', 'circulation', 'climate', 'climatic', 'cloud', 'co2', 'compare', 'comparison', 'component', 'concentration', 'condition', 'consider', 'content', 'contrast', 'contribute', 'control', 'conventional', 'cool', 'coral', 'core', 'correlation', 'could', 'cover', 'crop', 'current', 'cycle', 'data', 'day', 'decrease', 'degree', 'derive', 'describe', 'determine', 'develop', 'development', 'difference', 'different', 'dioxide', 'distribution', 'dry', 'due', 'dynamic', 'earth', 'economic', 'ecosystem', 'effect', 'emission', 'energy', 'environmental', 'error', 'est

In [81]:
train_bag = vectorizer.transform(X_train1) #transform to a feature matrix
test_bag = vectorizer.transform(X_test1)

In [82]:
metrics.accuracy_score(y_train1,train_predictions)

1.0

In [83]:
metrics.accuracy_score(y_test1,valid_predictions)

0.775

In [84]:
from sklearn.ensemble import RandomForestClassifier
def predict_sentiment(cleaned_reviews, y=balancedSet["sentiment"]):

    print("Creating the bag of words model!\n")
    # CountVectorizer" is scikit-learn's bag of words tool, here we show more keywords 
    vectorizer = CountVectorizer(analyzer = "word",   \
                                 tokenizer = None,    \
                                 preprocessor = None, \
                                 stop_words = None,   \
                                 max_features = 300) 
    
    X_train, X_test, y_train, y_test = train_test_split(\
    cleaned_reviews, y, random_state=0, test_size=.2)
    
    train_bag = vectorizer.fit_transform(X_train).toarray()
    test_bag = vectorizer.transform(X_test).toarray()

    # You can extract the vocabulary created by CountVectorizer
    # by running print(vectorizer.get_feature_names())


    print("Training the random forest classifier!\n")
    # Initialize a Random Forest classifier with 75 trees
    forest = RandomForestClassifier(n_estimators = 50) 

    # Fit the forest to the training set, using the bag of words as 
    # features and the sentiment labels as the target variable
    forest = forest.fit(train_bag, y_train)


    train_predictions = forest.predict(train_bag)
    test_predictions = forest.predict(test_bag)
    
    train_acc = metrics.accuracy_score(y_train, train_predictions)
    valid_acc = metrics.accuracy_score(y_test, test_predictions)
    print("The training accuracy is: ", train_acc, "\n", "The validation accuracy is: ", valid_acc)
    
    return(forest,vectorizer)

In [85]:
print('Original Reviews')
forest1,vec1 = predict_sentiment(abstract_clean_original)
print('Porter Stemmer')
forest2,vec2 = predict_sentiment(abstract_clean_ps)
print('Lemmatizing')
forest3,vec3 = predict_sentiment(abstract_clean_wnl)

Original Reviews
Creating the bag of words model!

Training the random forest classifier!

The training accuracy is:  1.0 
 The validation accuracy is:  0.7
Porter Stemmer
Creating the bag of words model!

Training the random forest classifier!

The training accuracy is:  1.0 
 The validation accuracy is:  0.65
Lemmatizing
Creating the bag of words model!

Training the random forest classifier!

The training accuracy is:  1.0 
 The validation accuracy is:  0.625
