# Climate Change EDSA challenge

### Import required packages

In [1]:
import numpy as np
import pandas as pd

import nltk
from nltk import TreebankWordTokenizer, SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import string
import urllib

import matplotlib.pyplot as plt
import seaborn as sns
# Switch matplotlib figures to seaborn's default theme.
sns.set()

# Download the NLTK resources this notebook relies on: the WordNet corpus
# (WordNetLemmatizer is instantiated later) and the stopword lists
# (stopwords.words('english') is used when filtering tokens).
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/mbongenimlotha/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mbongenimlotha/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Read in the data

In [2]:
# Load the labelled training tweets and the unlabelled test tweets from the
# working directory.
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

In [3]:
# Preview the first five rows of the training data.
train_df.head()

Unnamed: 0,sentiment,message,tweetid
0,1,PolySciMajor EPA chief doesn't think carbon di...,625221
1,1,It's not like we lack evidence of anthropogeni...,126103
2,2,RT @RawStory: Researchers say we have three ye...,698562
3,1,#TodayinMaker# WIRED : 2016 was a pivotal year...,573736
4,1,"RT @SoyNovioDeTodas: It's 2016, and a racist, ...",466954


In [4]:
# Preview the first five rows of the test data (it has no 'sentiment' column).
test_df.head()

Unnamed: 0,message,tweetid
0,Europe will now be looking to China to make su...,169760
1,Combine this with the polling of staffers re c...,35326
2,"The scary, unimpeachable evidence that climate...",224985
3,@Karoli @morgfair @OsborneInk @dailykos \nPuti...,476263
4,RT @FakeWillMoore: 'Female orgasms cause globa...,872928


In [5]:
# Count how many training tweets carry each sentiment label.
train_df['sentiment'].value_counts()

 1    8530
 2    3640
 0    2353
-1    1296
Name: sentiment, dtype: int64

In [6]:
# Show the complete, untruncated text of the first training tweet.
train_df.loc[0, 'message']

"PolySciMajor EPA chief doesn't think carbon dioxide is main cause of global warming and.. wait, what!? https://t.co/yeLvcEFXkC via @mashable"

In [7]:
# Lower-case every tweet and delete all characters listed in
# string.punctuation, using a translation table that maps each of them to
# nothing.
# NOTE: string.punctuation only covers ASCII marks, so typographic characters
# such as '’' and '…' stay in the text, and URLs collapse into single words.
punctuation_table = str.maketrans('', '', string.punctuation)
punc_removed = [
    raw_text.lower().translate(punctuation_table)
    for raw_text in train_df['message']
]

In [10]:
# Replace the 'message' column with the lower-cased, punctuation-free text.
# NOTE: this overwrites the raw tweets in place; re-read train.csv to get the
# original text back.
train_df['message'] = punc_removed

In [11]:
# Check the cleaned text: messages should now be lower-case with no ASCII punctuation.
train_df.head()

Unnamed: 0,sentiment,message,tweetid
0,1,polyscimajor epa chief doesnt think carbon dio...,625221
1,1,its not like we lack evidence of anthropogenic...,126103
2,2,rt rawstory researchers say we have three year...,698562
3,1,todayinmaker wired 2016 was a pivotal year in...,573736
4,1,rt soynoviodetodas its 2016 and a racist sexis...,466954


In [16]:
# Create the normalisation tools: an English Snowball stemmer object and a
# WordNet lemmatiser. Neither is applied in this cell.
stemmer = SnowballStemmer('english')
lemmatizer = WordNetLemmatizer()

# Tokenise each cleaned tweet, producing one list of tokens per row.
tokeniser = TreebankWordTokenizer()
tokens = [tokeniser.tokenize(tweet_text) for tweet_text in train_df['message']]

In [19]:
# Store the per-tweet token lists in a new 'tokens' column; the lists were
# built in row order from train_df['message'], so they line up with the rows.
train_df['tokens'] = tokens

In [20]:
# Check that the new 'tokens' column holds a list of tokens for each tweet.
train_df.head()

Unnamed: 0,sentiment,message,tweetid,tokens
0,1,polyscimajor epa chief doesnt think carbon dio...,625221,"[polyscimajor, epa, chief, doesnt, think, carb..."
1,1,its not like we lack evidence of anthropogenic...,126103,"[its, not, like, we, lack, evidence, of, anthr..."
2,2,rt rawstory researchers say we have three year...,698562,"[rt, rawstory, researchers, say, we, have, thr..."
3,1,todayinmaker wired 2016 was a pivotal year in...,573736,"[todayinmaker, wired, 2016, was, a, pivotal, y..."
4,1,rt soynoviodetodas its 2016 and a racist sexis...,466954,"[rt, soynoviodetodas, its, 2016, and, a, racis..."


In [24]:
# remove stopwords
# Build the English stopword collection once, as a set. Previously
# stopwords.words('english') sat inside the comprehension's condition, so it
# was re-evaluated for every token and then searched as a list.
english_stopwords = set(stopwords.words('english'))

# Flatten the per-tweet token lists into one corpus-wide list, keeping only
# tokens that are not stopwords. Tweet order and token order are preserved.
# NOTE(review): punctuation was stripped before this step, so contractions
# arrive as 'doesnt' / 'dont' and are not filtered out here - confirm whether
# that is intended.
tokens_less_stopwords = [
    word
    for tweet_tokens in train_df['tokens']
    for word in tweet_tokens
    if word not in english_stopwords
]

In [26]:
# Display the stopword-filtered tokens.
# NOTE(review): this renders the entire corpus-wide list; a slice such as
# tokens_less_stopwords[:20] would keep the saved output short.
tokens_less_stopwords

['polyscimajor',
 'epa',
 'chief',
 'doesnt',
 'think',
 'carbon',
 'dioxide',
 'main',
 'cause',
 'global',
 'warming',
 'wait',
 'httpstcoyelvcefxkc',
 'via',
 'mashable',
 'like',
 'lack',
 'evidence',
 'anthropogenic',
 'global',
 'warming',
 'rt',
 'rawstory',
 'researchers',
 'say',
 'three',
 'years',
 'act',
 'climate',
 'change',
 '’',
 'late',
 'httpstcowdt0kdur2f',
 'httpstcoz0anpt…',
 'todayinmaker',
 'wired',
 '2016',
 'pivotal',
 'year',
 'war',
 'climate',
 'change',
 'httpstco44wotxtlcd',
 'rt',
 'soynoviodetodas',
 '2016',
 'racist',
 'sexist',
 'climate',
 'change',
 'denying',
 'bigot',
 'leading',
 'polls',
 'electionnight',
 'worth',
 'read',
 'whether',
 'dont',
 'believe',
 'climate',
 'change',
 'httpstcogglzvnyjun',
 'httpstco7afe2mah8j',
 'rt',
 'thenation',
 'mike',
 'pence',
 '’',
 'believe',
 'global',
 'warming',
 'smoking',
 'causes',
 'lung',
 'cancer',
 'httpstcogvwyaauu8r',
 'rt',
 'makeandmendlife',
 'six',
 'big',
 'things',
 'today',
 'fight',
 'cli