In [15]:
#importing the package
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk import TreebankWordTokenizer, SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Menzi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Menzi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
#importing the data
data = pd.read_csv('train.csv')
data.head()

Unnamed: 0,sentiment,message,tweetid
0,-1,RT @darreljorstad: Funny as hell! Canada deman...,897853122080407553
1,-1,All the biggest lies about climate change and ...,925046776553529344
2,-1,The Coming Revelation Of The $q$Global Warming...,696354236850786305
3,-1,RT @DineshDSouza: Let's see if the world ends ...,846806509732483072
4,-1,RT @SteveSGoddard: Obama has no control over t...,628085266293653504


In [5]:
#checking columns
data.columns

Index(['sentiment', 'message', 'tweetid'], dtype='object')

In [6]:
data.shape

(30759, 3)

In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30759 entries, 0 to 30758
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   sentiment  30759 non-null  int64 
 1   message    30759 non-null  object
 2   tweetid    30759 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 721.0+ KB


In [8]:
data.describe()

Unnamed: 0,sentiment,tweetid
count,30759.0,30759.0
mean,0.853929,8.36433e+17
std,0.853551,8.519986e+16
min,-1.0,5.926334e+17
25%,0.0,7.970301e+17
50%,1.0,8.401516e+17
75%,1.0,8.996975e+17
max,2.0,9.666876e+17


In [9]:
data[['message']].describe()

Unnamed: 0,message
count,30759
unique,28813
top,RT @StephenSchlegel: she's thinking about how ...
freq,361


In [10]:
data.nunique()

sentiment        4
message      28813
tweetid      30759
dtype: int64

### Data Cleaning

#### Removing Twitter Handles

In [13]:
def remove_twitter_handles(tweet, pattern):
    r = re.findall(pattern, tweet)
    for text in r:
        tweet = re.sub(text, '', tweet)
    return tweet

data['clean_tweet'] = np.vectorize(remove_twitter_handles)(data['message'], "@[\w]*") 

In [14]:
data.head()

Unnamed: 0,sentiment,message,tweetid,clean_tweet
0,-1,RT @darreljorstad: Funny as hell! Canada deman...,897853122080407553,RT : Funny as hell! Canada demands 'gender rig...
1,-1,All the biggest lies about climate change and ...,925046776553529344,All the biggest lies about climate change and ...
2,-1,The Coming Revelation Of The $q$Global Warming...,696354236850786305,The Coming Revelation Of The $q$Global Warming...
3,-1,RT @DineshDSouza: Let's see if the world ends ...,846806509732483072,RT : Let's see if the world ends when 's clim...
4,-1,RT @SteveSGoddard: Obama has no control over t...,628085266293653504,RT : Obama has no control over the climate. He...


Removing Stopwords

In [16]:
stop_words = nltk.corpus.stopwords.words('english')

In [17]:
data['tidy_tweet'] = data['clean_tweet'].apply(lambda x: ' '.join([w for w in x.split() if w not in stop_words]))

In [18]:
data.head()

Unnamed: 0,sentiment,message,tweetid,clean_tweet,tidy_tweet
0,-1,RT @darreljorstad: Funny as hell! Canada deman...,897853122080407553,RT : Funny as hell! Canada demands 'gender rig...,RT : Funny hell! Canada demands 'gender rights...
1,-1,All the biggest lies about climate change and ...,925046776553529344,All the biggest lies about climate change and ...,All biggest lies climate change global warming...
2,-1,The Coming Revelation Of The $q$Global Warming...,696354236850786305,The Coming Revelation Of The $q$Global Warming...,The Coming Revelation Of The $q$Global Warming...
3,-1,RT @DineshDSouza: Let's see if the world ends ...,846806509732483072,RT : Let's see if the world ends when 's clim...,RT : Let's see world ends 's climate change ro...
4,-1,RT @SteveSGoddard: Obama has no control over t...,628085266293653504,RT : Obama has no control over the climate. He...,RT : Obama control climate. He worst snake oil...


Text Normalization

In [19]:
def tokenizing(text):
    text = re.split('\W+', text)
    return text

data['tokenized_tweet'] = data['tidy_tweet'].apply(lambda x: tokenizing(x))

In [20]:
data.head()

Unnamed: 0,sentiment,message,tweetid,clean_tweet,tidy_tweet,tokenized_tweet
0,-1,RT @darreljorstad: Funny as hell! Canada deman...,897853122080407553,RT : Funny as hell! Canada demands 'gender rig...,RT : Funny hell! Canada demands 'gender rights...,"[RT, Funny, hell, Canada, demands, gender, rig..."
1,-1,All the biggest lies about climate change and ...,925046776553529344,All the biggest lies about climate change and ...,All biggest lies climate change global warming...,"[All, biggest, lies, climate, change, global, ..."
2,-1,The Coming Revelation Of The $q$Global Warming...,696354236850786305,The Coming Revelation Of The $q$Global Warming...,The Coming Revelation Of The $q$Global Warming...,"[The, Coming, Revelation, Of, The, q, Global, ..."
3,-1,RT @DineshDSouza: Let's see if the world ends ...,846806509732483072,RT : Let's see if the world ends when 's clim...,RT : Let's see world ends 's climate change ro...,"[RT, Let, s, see, world, ends, s, climate, cha..."
4,-1,RT @SteveSGoddard: Obama has no control over t...,628085266293653504,RT : Obama has no control over the climate. He...,RT : Obama control climate. He worst snake oil...,"[RT, Obama, control, climate, He, worst, snake..."


In [21]:
tokens = data['tokenized_tweet']

In [22]:
lemmatizer = WordNetLemmatizer()

In [23]:
tokens = tokens.apply(lambda x: [lemmatizer.lemmatize(i) for i in x])

In [24]:
data['lemmatized_tweet'] = tokens

In [25]:
data.head()

Unnamed: 0,sentiment,message,tweetid,clean_tweet,tidy_tweet,tokenized_tweet,lemmatized_tweet
0,-1,RT @darreljorstad: Funny as hell! Canada deman...,897853122080407553,RT : Funny as hell! Canada demands 'gender rig...,RT : Funny hell! Canada demands 'gender rights...,"[RT, Funny, hell, Canada, demands, gender, rig...","[RT, Funny, hell, Canada, demand, gender, righ..."
1,-1,All the biggest lies about climate change and ...,925046776553529344,All the biggest lies about climate change and ...,All biggest lies climate change global warming...,"[All, biggest, lies, climate, change, global, ...","[All, biggest, lie, climate, change, global, w..."
2,-1,The Coming Revelation Of The $q$Global Warming...,696354236850786305,The Coming Revelation Of The $q$Global Warming...,The Coming Revelation Of The $q$Global Warming...,"[The, Coming, Revelation, Of, The, q, Global, ...","[The, Coming, Revelation, Of, The, q, Global, ..."
3,-1,RT @DineshDSouza: Let's see if the world ends ...,846806509732483072,RT : Let's see if the world ends when 's clim...,RT : Let's see world ends 's climate change ro...,"[RT, Let, s, see, world, ends, s, climate, cha...","[RT, Let, s, see, world, end, s, climate, chan..."
4,-1,RT @SteveSGoddard: Obama has no control over t...,628085266293653504,RT : Obama has no control over the climate. He...,RT : Obama control climate. He worst snake oil...,"[RT, Obama, control, climate, He, worst, snake...","[RT, Obama, control, climate, He, worst, snake..."


In [26]:
data = data.drop(['tidy_tweet'],axis=1)
data = data.drop(['tokenized_tweet'], axis=1)

In [27]:
data.head()

Unnamed: 0,sentiment,message,tweetid,clean_tweet,lemmatized_tweet
0,-1,RT @darreljorstad: Funny as hell! Canada deman...,897853122080407553,RT : Funny as hell! Canada demands 'gender rig...,"[RT, Funny, hell, Canada, demand, gender, righ..."
1,-1,All the biggest lies about climate change and ...,925046776553529344,All the biggest lies about climate change and ...,"[All, biggest, lie, climate, change, global, w..."
2,-1,The Coming Revelation Of The $q$Global Warming...,696354236850786305,The Coming Revelation Of The $q$Global Warming...,"[The, Coming, Revelation, Of, The, q, Global, ..."
3,-1,RT @DineshDSouza: Let's see if the world ends ...,846806509732483072,RT : Let's see if the world ends when 's clim...,"[RT, Let, s, see, world, end, s, climate, chan..."
4,-1,RT @SteveSGoddard: Obama has no control over t...,628085266293653504,RT : Obama has no control over the climate. He...,"[RT, Obama, control, climate, He, worst, snake..."


In [29]:
clean_data = data[['sentiment','lemmatized_tweet','tweetid']]

In [32]:
clean_data.to_csv("clean_data.csv",index=False)