In [1]:
#import libraries
import pandas as pd
import numpy as np
import nltk
import string
import re
from sklearn.pipeline import Pipeline
import pickle

#Preprocessing
#import contractions
from collections import Counter
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.tokenize import TweetTokenizer
from sklearn.preprocessing import label_binarize
from sklearn.model_selection import train_test_split#

In [2]:
#Import data
test = pd.read_csv("test_with_no_labels.csv")
train = pd.read_csv("train.csv")
sample = pd.read_csv("sample_submission.csv")

In [3]:
train.head()

Unnamed: 0,sentiment,message,tweetid
0,1,PolySciMajor EPA chief doesn't think carbon di...,625221
1,1,It's not like we lack evidence of anthropogeni...,126103
2,2,RT @RawStory: Researchers say we have three ye...,698562
3,1,#TodayinMaker# WIRED : 2016 was a pivotal year...,573736
4,1,"RT @SoyNovioDeTodas: It's 2016, and a racist, ...",466954


In [4]:
test.head()

Unnamed: 0,message,tweetid
0,Europe will now be looking to China to make su...,169760
1,Combine this with the polling of staffers re c...,35326
2,"The scary, unimpeachable evidence that climate...",224985
3,@Karoli @morgfair @OsborneInk @dailykos \nPuti...,476263
4,RT @FakeWillMoore: 'Female orgasms cause globa...,872928


In [5]:
sample.head()

Unnamed: 0,tweetid,sentiment
0,169760,1
1,35326,1
2,224985,1
3,476263,1
4,872928,1


### Data Preprocessing

In [6]:
#Data Preprocessing
#Identifying missing values and data types
train.isna().info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15819 entries, 0 to 15818
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   sentiment  15819 non-null  bool 
 1   message    15819 non-null  bool 
 2   tweetid    15819 non-null  bool 
dtypes: bool(3)
memory usage: 46.5 KB


In [7]:
test.isna().info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10546 entries, 0 to 10545
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   message  10546 non-null  bool 
 1   tweetid  10546 non-null  bool 
dtypes: bool(2)
memory usage: 20.7 KB


In [15]:
# Part of Speech for modeling
def POS(word):
    pos_counts = Counter()
    probable_part_of_speech = wordnet.synsets(word)
    pos_counts["n"] = len([i for i in probable_part_of_speech if i.pos()=="n"])
    pos_counts["v"] = len([i for i in probable_part_of_speech if i.pos()=="v"])
    pos_counts["a"] = len([i for i in probable_part_of_speech if i.pos()=="a"])
    pos_counts["r"] = len([i for i in probable_part_of_speech if i.pos()=="r"])
    part_of_speech = pos_counts.most_common(1)[0][0]
    return part_of_speech

**Extract Useful Data**

In [16]:
def extractor(df):
    # Extract hashtags
    tweets = df['message']
    df['hashtags'] = df.message. str.lower().str.findall(r'#.*?(?=\s|$)')
    htags = df['hashtags']
    df['hashtags'] = htags.apply(lambda x: np.nan if len(x) == 0 else x)
    
    # Extract mentions
    df['mentions'] = df.message. str.lower().str.findall(r'@\w*')
    mtags = df['mentions']
    df['mentions'] = mtags.apply(lambda x: np.nan if len(x) == 0 else x)
    
    # Extract url
    df['url'] = df.message. str.lower().str.findall(r'http\S+|www.\S+')
    urltags = df['url']
    df['url'] = urltags.apply(lambda x: np.nan if len(x) == 0 else x)
    
    return df

In [17]:
extractor(train)

Unnamed: 0,sentiment,message,tweetid,hashtags,mentions,url
0,1,PolySciMajor EPA chief doesn't think carbon di...,625221,,[@mashable],[https://t.co/yelvcefxkc]
1,1,It's not like we lack evidence of anthropogeni...,126103,,,
2,2,RT @RawStory: Researchers say we have three ye...,698562,,[@rawstory],"[https://t.co/wdt0kdur2f, https://t.co/z0anpt‚Ä¶]"
3,1,#TodayinMaker# WIRED : 2016 was a pivotal year...,573736,[#todayinmaker#],,[https://t.co/44wotxtlcd]
4,1,"RT @SoyNovioDeTodas: It's 2016, and a racist, ...",466954,[#electionnight],[@soynoviodetodas],
...,...,...,...,...,...,...
15814,1,RT @ezlusztig: They took down the material on ...,22001,,[@ezlusztig],[https://‚Ä¶]
15815,2,RT @washingtonpost: How climate change could b...,17856,,[@washingtonpost],[https://t.co/rpfgvb2plq]
15816,0,notiven: RT: nytimesworld :What does Trump act...,384248,,,[https://t.co/0mp2]
15817,-1,RT @sara8smiles: Hey liberals the climate chan...,819732,[#agenda2030.],[@sara8smiles],


**Remove Unnecessary Information on Message**

In [27]:
# Remove URLs
train['message'] = train['message'].str.replace('http\S+|www.\S+', '', regex=True)

# Remove mentions
train['message'] = train['message'].str.replace('@\w*', '', regex=True)

# Removal hashtags
train['message'] = train['message'].str.replace('#.*?(?=\s|$)', '', regex=True)

# Remove 'RT'
train['message'] = train['message'].str.replace('RT', '', regex=True)

# Remove stopwords
train.head()

Unnamed: 0,sentiment,message,tweetid,hashtags,mentions,url
0,1,PolySciMajor EPA chief doesn't think carbon di...,625221,,[@mashable],[https://t.co/yelvcefxkc]
1,1,It's not like we lack evidence of anthropogeni...,126103,,,
2,2,: Researchers say we have three years to act ...,698562,,[@rawstory],"[https://t.co/wdt0kdur2f, https://t.co/z0anpt‚Ä¶]"
3,1,WIRED : 2016 was a pivotal year in the war on...,573736,[#todayinmaker#],,[https://t.co/44wotxtlcd]
4,1,": It's 2016, and a racist, sexist, climate ch...",466954,[#electionnight],[@soynoviodetodas],


In [28]:
# Clean tweets
def clean_tweets(df):
    # tokenizing the tweets
    clean_tweets = df['message'].apply(TweetTokenizer().tokenize)

    # remove punctuation
    clean_tweets = clean_tweets.apply(lambda x : [token for token in x if token not in string.punctuation])

    # removing digits from the tweets
    clean_tweets = clean_tweets.apply(lambda x: [token for token in x if token not in list(string.digits)])

    # lastly we remove all one character tokens
    clean_tweets = clean_tweets.apply(lambda x: [token for token in x if len(token) > 1])
    
    df['cleaned_tweets'] = clean_tweets
    
    return df['cleaned_tweets']

In [29]:
clean_tweets(train)

0        [PolySciMajor, EPA, chief, doesn't, think, car...
1        [It's, not, like, we, lack, evidence, of, anth...
2        [Researchers, say, we, have, three, years, to,...
3        [WIRED, 2016, was, pivotal, year, in, the, war...
4        [It's, 2016, and, racist, sexist, climate, cha...
                               ...                        
15814    [They, took, down, the, material, on, global, ...
15815    [How, climate, change, could, be, breaking, up...
15816    [notiven, nytimesworld, What, does, Trump, act...
15817    [Hey, liberals, the, climate, change, crap, is...
15818         [climate, change, equation, in, screenshots]
Name: cleaned_tweets, Length: 15819, dtype: object

In [30]:
#Removing Stopwords
from nltk.corpus import stopwords
nltk.download('stopwords')

stop = stopwords.words('english')

# Convert to lower case
train['cleaned_tweets'] = train['cleaned_tweets'].apply(lambda x: [word.lower() for word in x])

# Remove stopwords
train['no_stopwords'] = train['cleaned_tweets'].apply(lambda x: [item for item in x if item not in stop])

train.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lehut\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,sentiment,message,tweetid,hashtags,mentions,url,cleaned_tweets,no_stopwords
0,1,PolySciMajor EPA chief doesn't think carbon di...,625221,,[@mashable],[https://t.co/yelvcefxkc],"[polyscimajor, epa, chief, doesn't, think, car...","[polyscimajor, epa, chief, think, carbon, diox..."
1,1,It's not like we lack evidence of anthropogeni...,126103,,,,"[it's, not, like, we, lack, evidence, of, anth...","[like, lack, evidence, anthropogenic, global, ..."
2,2,: Researchers say we have three years to act ...,698562,,[@rawstory],"[https://t.co/wdt0kdur2f, https://t.co/z0anpt‚Ä¶]","[researchers, say, we, have, three, years, to,...","[researchers, say, three, years, act, climate,..."
3,1,WIRED : 2016 was a pivotal year in the war on...,573736,[#todayinmaker#],,[https://t.co/44wotxtlcd],"[wired, 2016, was, pivotal, year, in, the, war...","[wired, 2016, pivotal, year, war, climate, cha..."
4,1,": It's 2016, and a racist, sexist, climate ch...",466954,[#electionnight],[@soynoviodetodas],,"[it's, 2016, and, racist, sexist, climate, cha...","[2016, racist, sexist, climate, change, denyin..."


In [32]:
#Most Frequent Words
from collections import Counter
cnt = Counter()
for message in train['no_stopwords'].values:
    for word in message:
        cnt[word] += 1
        

In [35]:
#Map the sentiment class by name
train['sent_labels']  = train['sentiment'].map({-1: 'Anti',0:'Neutral', 1:'Pro', 2:'News'})

In [37]:
#The length of each raw tweet
train['text_length'] = train['message'].apply(lambda x: len(x))

**Separate Datframes of Tweets for each Sentiment**

In [38]:
# Dataframe for neutral tweets
neutral_df = train[train['sentiment'] == 0]
neutral_df.head()

Unnamed: 0,sentiment,message,tweetid,hashtags,mentions,url,cleaned_tweets,no_stopwords,sent_labels,text_length
19,0,Calum: *tweets abt reunitingish w the cast*\n-...,547924,,,,"[calum, tweets, abt, reunitingish, the, cast, ...","[calum, tweets, abt, reunitingish, cast, sees,...",Neutral,138
22,0,"we also met this guy, he let us in on some tru...",67545,,,[https://t.co/q7yomcmzaj],"[we, also, met, this, guy, he, let, us, in, on...","[also, met, guy, let, us, truth, climate, chan...",Neutral,98
30,0,are these the same scientists that denounce c...,365051,,[@jnp_ftw],,"[are, these, the, same, scientists, that, deno...","[scientists, denounce, climate, change, choice]",Neutral,78
39,0,We‚Äô ve dealt with simple issues like climate c...,403368,[#qanda],,,"[we, ve, dealt, with, simple, issues, like, cl...","[dealt, simple, issues, like, climate, change,...",Neutral,109
43,0,: Win probability is bullshit man. I saw the ...,326916,,[@andrewsharp],,"[win, probability, is, bullshit, man, saw, the...","[win, probability, bullshit, man, saw, nba, fi...",Neutral,106


In [39]:
# Dataframe for pro tweets
pro_df = train[train['sentiment'] == 1]
pro_df.head()

Unnamed: 0,sentiment,message,tweetid,hashtags,mentions,url,cleaned_tweets,no_stopwords,sent_labels,text_length
0,1,PolySciMajor EPA chief doesn't think carbon di...,625221,,[@mashable],[https://t.co/yelvcefxkc],"[polyscimajor, epa, chief, doesn't, think, car...","[polyscimajor, epa, chief, think, carbon, diox...",Pro,108
1,1,It's not like we lack evidence of anthropogeni...,126103,,,,"[it's, not, like, we, lack, evidence, of, anth...","[like, lack, evidence, anthropogenic, global, ...",Pro,62
3,1,WIRED : 2016 was a pivotal year in the war on...,573736,[#todayinmaker#],,[https://t.co/44wotxtlcd],"[wired, 2016, was, pivotal, year, in, the, war...","[wired, 2016, pivotal, year, war, climate, cha...",Pro,62
4,1,": It's 2016, and a racist, sexist, climate ch...",466954,[#electionnight],[@soynoviodetodas],,"[it's, 2016, and, racist, sexist, climate, cha...","[2016, racist, sexist, climate, change, denyin...",Pro,90
5,1,Woh a read whether you do or don't believe in ...,425577,,,"[https://t.co/gglzvnyjun, https://t.co/7afe2ma...","[woh, read, whether, you, do, or, don't, belie...","[woh, read, whether, believe, climate, change]",Pro,62


In [40]:
# Dataframe for anti tweets
anti_df = train[train['sentiment'] == -1]
anti_df.head()

Unnamed: 0,sentiment,message,tweetid,hashtags,mentions,url,cleaned_tweets,no_stopwords,sent_labels,text_length
28,-1,Sally Kohn‚Äôs latest evidence of climate change...,355491,,[@twitchyteam],[https://t.co/mhkzogl9vt],"[sally, kohn, latest, evidence, of, climate, c...","[sally, kohn, latest, evidence, climate, chang...",Anti,102
46,-1,Carbon Tax is a Globalist idea to enslave the...,61141,,[@realdonaldtrump],,"[carbon, tax, is, globalist, idea, to, enslave...","[carbon, tax, globalist, idea, enslave, world'...",Anti,121
48,-1,: We had winds close to 100 MPH in the area t...,719523,,[@stevesgoddard],,"[we, had, winds, close, to, 100, mph, in, the,...","[winds, close, 100, mph, area, afternoon, woul...",Anti,120
56,-1,lmao üòÇ snowflakes ‚ùÑÔ∏è complaining about snowfl...,911385,,[@misslizzynj],,"[lmao, snowflakes, complaining, about, snowfla...","[lmao, snowflakes, complaining, snowflakes, wi...",Anti,84
57,-1,: This is ONE of Arnold Schwarzenegger's vehi...,768263,,[@dawn2334dawn],[http‚Ä¶],"[this, is, one, of, arnold, schwarzenegger's, ...","[one, arnold, schwarzenegger's, vehicles, whin...",Anti,120


In [41]:
# Dataframe for news tweets
news_df = train[train['sentiment'] == 2]
news_df.head()

Unnamed: 0,sentiment,message,tweetid,hashtags,mentions,url,cleaned_tweets,no_stopwords,sent_labels,text_length
2,2,: Researchers say we have three years to act ...,698562,,[@rawstory],"[https://t.co/wdt0kdur2f, https://t.co/z0anpt‚Ä¶]","[researchers, say, we, have, three, years, to,...","[researchers, say, three, years, act, climate,...",News,86
12,2,: We only have a 5 percent chance of avoiding...,454673,,[@tveitdal],"[https://t.co/xubtqnxhkk, https://t.co/of‚Ä¶]","[we, only, have, percent, chance, of, avoiding...","[percent, chance, avoiding, dangerous, global,...",News,90
14,2,Fossil fuel giant ExxonMobil ‚Äòmisled‚Äô the publ...,658092,,,[https://t.co/ofc2wsu4ex],"[fossil, fuel, giant, exxonmobil, misled, the,...","[fossil, fuel, giant, exxonmobil, misled, publ...",News,98
26,2,Bangladesh confronting climate change head on,365291,,,"[https://t.co/mtqenbqdut, https://t.co/itgkuxg...","[bangladesh, confronting, climate, change, hea...","[bangladesh, confronting, climate, change, head]",News,47
32,2,: Atmospheric rivers fueled by climate change...,143471,,[@latimes],"[https://t.co/p0lzbhlu5k, https://t‚Ä¶]","[atmospheric, rivers, fueled, by, climate, cha...","[atmospheric, rivers, fueled, climate, change,...",News,97
