# Climate Change EDSA challenge

### Import required packages

In [1]:
import numpy as np
import pandas as pd
import re

import nltk
from nltk.tokenize import RegexpTokenizer
from nltk import TreebankWordTokenizer, SnowballStemmer, PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

from sklearn.model_selection import train_test_split
from sklearn import metrics 


import string
import urllib

import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

nltk.download('wordnet')
nltk.download('stopwords')

from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification


[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/mbongenimlotha/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mbongenimlotha/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Read in the data

In [2]:
train_df = pd.read_csv(r'train.csv')
test_df = pd.read_csv(r'test.csv')

In [3]:
train_df.head()

Unnamed: 0,sentiment,message,tweetid
0,1,PolySciMajor EPA chief doesn't think carbon di...,625221
1,1,It's not like we lack evidence of anthropogeni...,126103
2,2,RT @RawStory: Researchers say we have three ye...,698562
3,1,#TodayinMaker# WIRED : 2016 was a pivotal year...,573736
4,1,"RT @SoyNovioDeTodas: It's 2016, and a racist, ...",466954


In [4]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15819 entries, 0 to 15818
Data columns (total 3 columns):
sentiment    15819 non-null int64
message      15819 non-null object
tweetid      15819 non-null int64
dtypes: int64(2), object(1)
memory usage: 370.9+ KB


Looking at the label:

In [5]:
train_df.sentiment.value_counts()

 1    8530
 2    3640
 0    2353
-1    1296
Name: sentiment, dtype: int64

Since our dataset is quite imbalanced, we would like to do some sampling so that our model won't overfit:

In [28]:
max_size = train_df['sentiment'].value_counts().max()

In [30]:
lst = [train_df]
for class_index, group in train_df.groupby('sentiment'):
    lst.append(group.sample(max_size - len(group), replace=True))
train_df_new = pd.concat(lst)

Now we have a more balanced dataset:

In [33]:
train_df_new['sentiment'].value_counts()

-1    8530
 2    8530
 1    8530
 0    8530
Name: sentiment, dtype: int64

Looking at our test dataset:

In [6]:
test_df.head()

Unnamed: 0,message,tweetid
0,Europe will now be looking to China to make su...,169760
1,Combine this with the polling of staffers re c...,35326
2,"The scary, unimpeachable evidence that climate...",224985
3,@Karoli @morgfair @OsborneInk @dailykos \nPuti...,476263
4,RT @FakeWillMoore: 'Female orgasms cause globa...,872928


In [7]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10546 entries, 0 to 10545
Data columns (total 2 columns):
message    10546 non-null object
tweetid    10546 non-null int64
dtypes: int64(1), object(1)
memory usage: 164.9+ KB


We combine our train and test datasets for the purposes of perprocessing: 

In [34]:
full_df = train_df_new.append(test_df, ignore_index=True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,


In [51]:
n = train_df_new.shape[0]
full_df[n:]

Unnamed: 0,message,sentiment,tweetid,clean_message
34120,Europe will now be looking to China to make su...,,169760,europ will now look china make sure that not a...
34121,Combine this with the polling of staffers re c...,,35326,combin thi with the poll staffer climat chang ...
34122,"The scary, unimpeachable evidence that climate...",,224985,the scari unimpeach evid that climat chang alr...
34123,@Karoli @morgfair @OsborneInk @dailykos \nPuti...,,476263,putin got you too jill trump doesn believ clim...
34124,RT @FakeWillMoore: 'Female orgasms cause globa...,,872928,femal orgasm caus global warm sarcast republican
...,...,...,...,...
44661,"RT @BrittanyBohrer: Brb, writing a poem about ...",,895714,brb write poem about climat chang climatechang...
44662,2016: the year climate change came home: Durin...,,875167,the year climat chang came home dure the hotte...
44663,RT @loop_vanuatu: Pacific countries positive a...,,78329,pacif countri posit about fiji lead the global...
44664,"RT @xanria_00018: You’re so hot, you must be t...",,867455,you hot you must the caus for global warm aldu...


In [36]:
full_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44666 entries, 0 to 44665
Data columns (total 3 columns):
message      44666 non-null object
sentiment    34120 non-null float64
tweetid      44666 non-null int64
dtypes: float64(1), int64(1), object(1)
memory usage: 1.0+ MB


### Pre-Processing

Lets have a look at what a typical tweet looks like to get an idea of what we can expect to see from the other tweets:

In [37]:
full_df['message'][0]

"PolySciMajor EPA chief doesn't think carbon dioxide is main cause of global warming and.. wait, what!? https://t.co/yeLvcEFXkC via @mashable"

Looking at the tweet above, we see that it contains the following:
- Punctuation
- Capital letters
- Special characters
- web link/url

We can expect to find the listed items in all the tweets and we need to remove these to clean our tweets.

We start cleaning our data by removing patterns we expect to find in our tweets:

In [38]:
def remove_pattern(text, pattern):
    """This function removes patters within
    text"""
    r = re.findall(pattern, text)
    for i in r:
        text = re.sub(i, '', text)
    return text

In [39]:
# Remove twitter handles 
handle_pattern = '@[\w]*'
full_df['clean_message'] = np.vectorize(remove_pattern)(full_df['message'], handle_pattern)

#remove urls
url_pattern = 'https?://[A-Za-z./]*'
full_df['clean_message'] = np.vectorize(remove_pattern)(full_df['clean_message'], url_pattern)

In [40]:
# Remove special characters, numbers, punctuations
pattern = '[^a-zA-Z#]'
full_df['clean_message'] = full_df['clean_message'].str.replace(pattern, " ")

full_df['clean_message'] = full_df['clean_message'].apply(lambda x: ' '.join([word for word in x.split() if len(word)>2]))

In [41]:
full_df.head()

Unnamed: 0,message,sentiment,tweetid,clean_message
0,PolySciMajor EPA chief doesn't think carbon di...,1.0,625221,PolySciMajor EPA chief doesn think carbon diox...
1,It's not like we lack evidence of anthropogeni...,1.0,126103,not like lack evidence anthropogenic global wa...
2,RT @RawStory: Researchers say we have three ye...,2.0,698562,Researchers say have three years act climate c...
3,#TodayinMaker# WIRED : 2016 was a pivotal year...,1.0,573736,#TodayinMaker# WIRED was pivotal year the war ...
4,"RT @SoyNovioDeTodas: It's 2016, and a racist, ...",1.0,466954,and racist sexist climate change denying bigot...


Cool, so now we've removed everything we don't want in our tweets, but we see that some special characters have slipped through. Let's see how we can deal with that:

In [42]:
def remove_punctuation(post):
    return ''.join([l for l in post if l not in string.punctuation])

In [43]:
full_df['clean_message'] = full_df['clean_message'].apply(remove_punctuation)

In [44]:
full_df.head()

Unnamed: 0,message,sentiment,tweetid,clean_message
0,PolySciMajor EPA chief doesn't think carbon di...,1.0,625221,PolySciMajor EPA chief doesn think carbon diox...
1,It's not like we lack evidence of anthropogeni...,1.0,126103,not like lack evidence anthropogenic global wa...
2,RT @RawStory: Researchers say we have three ye...,2.0,698562,Researchers say have three years act climate c...
3,#TodayinMaker# WIRED : 2016 was a pivotal year...,1.0,573736,TodayinMaker WIRED was pivotal year the war cl...
4,"RT @SoyNovioDeTodas: It's 2016, and a racist, ...",1.0,466954,and racist sexist climate change denying bigot...


Nooiiiccee! Seems as if we've gotten some of those stubborn special characters.

#### Tokenization

Now we want to tokenise our cleaned tweets and make it ready for analysis:

In [45]:
tokeniser = TreebankWordTokenizer()
full_df['clean_message'] = full_df['clean_message'].apply(tokeniser.tokenize)

In [46]:
full_df['clean_message'].head()

0    [PolySciMajor, EPA, chief, doesn, think, carbo...
1    [not, like, lack, evidence, anthropogenic, glo...
2    [Researchers, say, have, three, years, act, cl...
3    [TodayinMaker, WIRED, was, pivotal, year, the,...
4    [and, racist, sexist, climate, change, denying...
Name: clean_message, dtype: object

#### Stemming

Through stemming, we basically transform the words we have in our tweets to their root words:

In [47]:
stemmer = PorterStemmer()

full_df['clean_message'] = full_df['clean_message'].apply(lambda x: [stemmer.stem(i) for i in x])
full_df['clean_message']

0        [polyscimajor, epa, chief, doesn, think, carbo...
1        [not, like, lack, evid, anthropogen, global, w...
2        [research, say, have, three, year, act, climat...
3        [todayinmak, wire, wa, pivot, year, the, war, ...
4        [and, racist, sexist, climat, chang, deni, big...
                               ...                        
44661    [brb, write, poem, about, climat, chang, clima...
44662    [the, year, climat, chang, came, home, dure, t...
44663    [pacif, countri, posit, about, fiji, lead, the...
44664    [you, hot, you, must, the, caus, for, global, ...
44665    [climat, chang, global, issu, that, onli, get,...
Name: clean_message, Length: 44666, dtype: object

#### Lemmatization

We use lemmatization to group words of similar meaning together:

In [48]:
lemmatizer = WordNetLemmatizer()

full_df['clean_message'] = full_df['clean_message'].apply(lambda x: [lemmatizer.lemmatize(i) for i in x])
full_df['clean_message']

0        [polyscimajor, epa, chief, doesn, think, carbo...
1        [not, like, lack, evid, anthropogen, global, w...
2        [research, say, have, three, year, act, climat...
3        [todayinmak, wire, wa, pivot, year, the, war, ...
4        [and, racist, sexist, climat, chang, deni, big...
                               ...                        
44661    [brb, write, poem, about, climat, chang, clima...
44662    [the, year, climat, chang, came, home, dure, t...
44663    [pacif, countri, posit, about, fiji, lead, the...
44664    [you, hot, you, must, the, caus, for, global, ...
44665    [climat, chang, global, issu, that, onli, get,...
Name: clean_message, Length: 44666, dtype: object

Joining the words of each tweet together again:

In [49]:
for i in range(len(full_df['clean_message'])):
    full_df['clean_message'][i] = ' '.join(full_df['clean_message'][i])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


## Text feature extraction

### Bag-of-words

So now we want to transform what is essentially a list of words into a feature set that is usable by a classifier. 

In [24]:
def bag_of_words_count(words, word_dict={}):
    """ this function takes in a list of words and returns a dictionary 
        with each word as a key, and the value represents the number of 
        times that word appeared"""
    for word in words:
        if word in word_dict.keys():
            word_dict[word] += 1
        else:
            word_dict[word] = 1
    return word_dict

In [229]:
tokens = full_df['clean_message'].apply(tokeniser.tokenize)

# remove stopwords
tokens_less_stopwords = [word for i in tokens for word in i if word not in stopwords.words('english')]

# create bag of words
bag_of_words = bag_of_words_count(tokens_less_stopwords)

In [23]:
bag_of_words[:5]

NameError: name 'bag_of_words' is not defined

### Train Test Split

Spliting our full dataframe back into their origional train and test dataframes:

In [52]:
df_train = full_df[:n]
df_test = full_df[n:]
X = df_train['clean_message']
y = df_train['sentiment'].astype(int)

#### Vectorization

In [53]:
vectorizer = TfidfVectorizer(ngram_range=(1,2), min_df=2, stop_words="english")
X_vectorized = vectorizer.fit_transform(X)

#### Train-test split:

In [54]:
X_train,X_test,y_train,y_test = train_test_split(X_vectorized.toarray(),y,test_size=.25, random_state=11)

### Model training and Predictions

In [58]:
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)
rfc_pred = rfc.predict(X_test)



In [59]:
metrics.f1_score(y_test, rfc_pred, average="macro")

0.9311042840373558

In [56]:
lsvc = LinearSVC(C=0.8, verbose=0)
lsvc.fit(X_train,  y_train)
lsvc_pred = lsvc.predict(X_test)

In [57]:
metrics.f1_score(y_test, lsvc_pred, average="macro")

0.9424828032792862

The Linear SVC model seems to produce the highest macro f1 score, lets see if we can't improve this:

In [60]:
from sklearn.metrics import classification_report, confusion_matrix 
print(classification_report(y_test, lsvc_pred)) 

              precision    recall  f1-score   support

          -1       0.97      0.99      0.98      2131
           0       0.94      0.97      0.96      2133
           1       0.94      0.85      0.89      2150
           2       0.92      0.96      0.94      2116

    accuracy                           0.94      8530
   macro avg       0.94      0.94      0.94      8530
weighted avg       0.94      0.94      0.94      8530



### Preparing our Test dataset 

In [61]:
x_test = df_test['clean_message']

In [62]:
X_test_vectorized = vectorizer.transform(x_test)

In [63]:
df_test['sentiment'] = lsvc.predict(X_test_vectorized)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [64]:
df_test[['tweetid', 'sentiment']].to_csv('test_predict.csv',index=False)