# Climate Change EDSA challenge

### Import required packages

In [75]:
import numpy as np
import pandas as pd
import re

import nltk
from nltk.tokenize import RegexpTokenizer
from nltk import TreebankWordTokenizer, SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score

import string
import urllib

import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/mbongenimlotha/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mbongenimlotha/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Read in the data

In [2]:
train_df = pd.read_csv(r'train.csv')
test_df = pd.read_csv(r'test.csv')

In [3]:
train_df.head()

Unnamed: 0,sentiment,message,tweetid
0,1,PolySciMajor EPA chief doesn't think carbon di...,625221
1,1,It's not like we lack evidence of anthropogeni...,126103
2,2,RT @RawStory: Researchers say we have three ye...,698562
3,1,#TodayinMaker# WIRED : 2016 was a pivotal year...,573736
4,1,"RT @SoyNovioDeTodas: It's 2016, and a racist, ...",466954


In [4]:
test_df.head()

Unnamed: 0,message,tweetid
0,Europe will now be looking to China to make su...,169760
1,Combine this with the polling of staffers re c...,35326
2,"The scary, unimpeachable evidence that climate...",224985
3,@Karoli @morgfair @OsborneInk @dailykos \nPuti...,476263
4,RT @FakeWillMoore: 'Female orgasms cause globa...,872928


In [5]:
train_df.sentiment.value_counts()

 1    8530
 2    3640
 0    2353
-1    1296
Name: sentiment, dtype: int64

In [35]:
train_df['message'][0]

'polyscimajor epa chief doesnt think carbon dioxide is main cause of global warming and wait what httpstcoyelvcefxkc via mashable'

In [62]:
punc_removed=[]
for message in train_df['message']:
    no_punc = ''.join([x for x in message.lower() if x not in string.punctuation])
    sentence=no_punc.replace('{html}',"")
    sentence=sentence.replace('rt',"") 
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', sentence)
    rem_special = re.sub(r'[^ \nA-Za-z0-9À-ÖØ-öø-ÿЀ-ӿ/]+', '', cleantext)
    rem_url=re.sub(r'http\S+', '',rem_special)
    rem_num = re.sub('[0-9]+', '', rem_url)
    rem_space = re.sub(r'@"\t|\n|\r', '', rem_num)
    rem_last = re.sub(r'ãââ', '', rem_space)
    punc_removed.append(rem_last)

In [63]:
punc_removed

['polyscimajor epa chief doesnt think carbon dioxide is main cause of global warming and wait what  via mashable',
 'its not like we lack evidence of anthropogenic global warming',
 ' rawstory researchers say we have three years to act on climate change before its too late  ',
 'todayinmaker wired   was a pivotal year in the war on climate change ',
 ' soynoviodetodas its  and a racist sexist climate change denying bigot is leading in the polls electionnight',
 'woh a read whether you do or dont believe in climate change  ',
 ' thenation mike pence doesnt believe in global warming or that smoking causes lung cancer ',
 ' makeandmendlife six big things we can all do today to fight climate change or how to be a climate activist  h',
 'aceofspadeshq my yo nephew is inconsolable he wants to die of old age like me but will perish in the fiery hellscape of climate change',
 ' paigetweedy no offense but like how do you just not believe in global warming',
 ' stephenschlegel shes thinking abou

In [64]:
train_df['message'] = punc_removed

In [65]:
train_df.head()

Unnamed: 0,sentiment,message,tweetid,tokens
0,1,polyscimajor epa chief doesnt think carbon dio...,625221,"[polyscimajor, epa, chief, doesnt, think, carb..."
1,1,its not like we lack evidence of anthropogenic...,126103,"[its, not, like, we, lack, evidence, of, anthr..."
2,2,rawstory researchers say we have three years ...,698562,"[rt, rawstory, researchers, say, we, have, thr..."
3,1,todayinmaker wired was a pivotal year in the...,573736,"[todayinmaker, wired, 2016, was, a, pivotal, y..."
4,1,soynoviodetodas its and a racist sexist clim...,466954,"[rt, soynoviodetodas, its, 2016, and, a, racis..."


In [66]:
# define stemmer function
stemmer = SnowballStemmer('english')

# tokenise data
tokeniser = TreebankWordTokenizer()
tokens=[]
for i in train_df['message']:
    tokens.append(tokeniser.tokenize(i))

# define lemmatiser
lemmatizer = WordNetLemmatizer()

In [67]:
train_df['tokens'] = tokens

In [68]:
train_df.head(100)

Unnamed: 0,sentiment,message,tweetid,tokens
0,1,polyscimajor epa chief doesnt think carbon dio...,625221,"[polyscimajor, epa, chief, doesnt, think, carb..."
1,1,its not like we lack evidence of anthropogenic...,126103,"[its, not, like, we, lack, evidence, of, anthr..."
2,2,rawstory researchers say we have three years ...,698562,"[rawstory, researchers, say, we, have, three, ..."
3,1,todayinmaker wired was a pivotal year in the...,573736,"[todayinmaker, wired, was, a, pivotal, year, i..."
4,1,soynoviodetodas its and a racist sexist clim...,466954,"[soynoviodetodas, its, and, a, racist, sexist,..."
...,...,...,...,...
95,1,ronaldklain as trump decides on paris briancd...,458845,"[ronaldklain, as, trump, decides, on, paris, b..."
96,1,savingoceans lack of climate change action ca...,695439,"[savingoceans, lack, of, climate, change, acti..."
97,1,yet another trump advisor is clueless on clima...,894382,"[yet, another, trump, advisor, is, clueless, o..."
98,1,stephenschlegel shes thinking about how shes ...,603318,"[stephenschlegel, shes, thinking, about, how, ..."


In [71]:
# remove stopwords
tokens_less_stopwords = [word for i in train_df['tokens'] for word in i if len(word) > 2 if word not in stopwords.words('english')]

In [72]:
tokens_less_stopwords

['polyscimajor',
 'epa',
 'chief',
 'doesnt',
 'think',
 'carbon',
 'dioxide',
 'main',
 'cause',
 'global',
 'warming',
 'wait',
 'via',
 'mashable',
 'like',
 'lack',
 'evidence',
 'anthropogenic',
 'global',
 'warming',
 'rawstory',
 'researchers',
 'say',
 'three',
 'years',
 'act',
 'climate',
 'change',
 'late',
 'todayinmaker',
 'wired',
 'pivotal',
 'year',
 'war',
 'climate',
 'change',
 'soynoviodetodas',
 'racist',
 'sexist',
 'climate',
 'change',
 'denying',
 'bigot',
 'leading',
 'polls',
 'electionnight',
 'woh',
 'read',
 'whether',
 'dont',
 'believe',
 'climate',
 'change',
 'thenation',
 'mike',
 'pence',
 'doesnt',
 'believe',
 'global',
 'warming',
 'smoking',
 'causes',
 'lung',
 'cancer',
 'makeandmendlife',
 'six',
 'big',
 'things',
 'today',
 'fight',
 'climate',
 'change',
 'climate',
 'activist',
 'aceofspadeshq',
 'nephew',
 'inconsolable',
 'wants',
 'die',
 'old',
 'age',
 'like',
 'perish',
 'fiery',
 'hellscape',
 'climate',
 'change',
 'paigetweedy',
 

Split x and y variable from the train dataset:

In [101]:
X = train_df['tokens']
y = train_df['sentiment']

In [105]:
from imblearn.over_sampling import SMOTE 
smt=SMOTE()
X_train_sm, y_train_sm = smt.fit_resample(X_train, y_train)

Make the text readable for the model:

In [106]:
vectorizer = TfidfVectorizer(ngram_range=(1,2), min_df=2, stop_words="english")
X_vectorized = vectorizer.fit_transform(X)

AttributeError: 'list' object has no attribute 'lower'

Train Test split

In [104]:
X_train,X_val,y_train,y_val = train_test_split(X_vectorized,y,test_size=.3, random_state=11)

In [107]:
rfc = RandomForestClassifier()
rfc.fit(X_train_sm, y_train_sm)
rfc_pred = rfc.predict(X_val)



In [108]:
f1_score(y_val, rfc_pred, average="macro")

0.5866274461509186