# 1. Tweet Preprocessing and Cleaning


In [1]:
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import string
import nltk
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

%matplotlib inline

In [2]:
# Load the tweet data set from a CSV file in the working directory into a DataFrame
train = pd.read_csv('Processed_data.csv')


In [6]:
# Preview the first five rows to sanity-check the loaded data
train.head()

Unnamed: 0,Tweets,cleaned_tweet
0,Anonymous man with COVID threatens to go to ga...,Anonymous man with COVID threatens to go to ga...
1,#homophobia #dontbemean #SuicidePrevention,#homophobia #dontbemean #SuicidePrevention
2,"I still have yet to try this and never will, s...","I still have yet to try this and never will, s..."
3,"Expecting homophobic, transphobic or otherwise...","Expecting homophobic, transphobic or otherwise..."
4,#PositiveSexualHealthTuesday #TransAwarenessWe...,#PositiveSexualHealthTuesday #TransAwarenessWe...


In [4]:
# Helper used to strip @user mentions (or any other regex pattern) from a tweet
def remove_pattern(input_txt, pattern):
    """Return ``input_txt`` with every match of the regex ``pattern`` removed.

    Parameters
    ----------
    input_txt : str
        The text to clean.
    pattern : str
        Regular expression describing the substrings to delete.

    Returns
    -------
    str
        ``input_txt`` with all non-overlapping matches of ``pattern`` replaced
        by the empty string.
    """
    # Substitute with the pattern itself rather than re-using each matched
    # substring as a new regex: matched text may contain regex metacharacters,
    # and a short match such as a bare "@" would otherwise also strip the "@"
    # from later mentions and leave their user names behind.
    return re.sub(pattern, '', input_txt)


In [5]:
# Create a new column holding each tweet with its @user mentions removed.
# The pattern is a raw string so that \w reaches the regex engine unchanged
# instead of being parsed as an invalid string escape sequence.
train['cleaned_tweet'] = np.vectorize(remove_pattern)(train['Tweets'], r'@[\w]*')

In [6]:
# Compare the original Tweets column with cleaned_tweet for the first ten rows
train.head(10)

Unnamed: 0,Tweets,cleaned_tweet
0,Anonymous man with COVID threatens to go to ga...,Anonymous man with COVID threatens to go to ga...
1,#homophobia #dontbemean #SuicidePrevention,#homophobia #dontbemean #SuicidePrevention
2,"I still have yet to try this and never will, s...","I still have yet to try this and never will, s..."
3,"Expecting homophobic, transphobic or otherwise...","Expecting homophobic, transphobic or otherwise..."
4,#PositiveSexualHealthTuesday #TransAwarenessWe...,#PositiveSexualHealthTuesday #TransAwarenessWe...
5,"@RealCandaceO Bitch, I have one word for you; ...","Bitch, I have one word for you; ..."
6,@jeff_kennett #MurdochGutterMedia is well repo...,#MurdochGutterMedia is well reported to influ...
7,Who's the more racist person? RT and feel free...,Who's the more racist person? RT and feel free...
8,I encourage my fellow parliamentarians to scru...,I encourage my fellow parliamentarians to scru...
9,@jeremycorbyn @jeremycorbyn did nothing to add...,did nothing to address #racism #homophobia i...


Every @username mention has been removed from the tweets in the `cleaned_tweet` column.

# Removing Punctuations, numbers and special characters

In [7]:
# Replace every run of characters other than ASCII letters and digits
# (punctuation, '#', emoji, whitespace, ...) with a single space.
# regex=True is passed explicitly: since pandas 2.0 Series.str.replace defaults
# to regex=False, which would treat the pattern as a literal string.
# NOTE(review): this pattern keeps digits although the section heading mentions
# removing numbers - confirm which behaviour is intended.
train['cleaned_tweet'] = train['cleaned_tweet'].str.replace(r'[^A-Za-z0-9]+', ' ', regex=True)

In [8]:
# Show the first ten rows after the character clean-up
train.head(10)

Unnamed: 0,Tweets,cleaned_tweet
0,Anonymous man with COVID threatens to go to ga...,Anonymous man with COVID threatens to go to ga...
1,#homophobia #dontbemean #SuicidePrevention,homophobia dontbemean SuicidePrevention
2,"I still have yet to try this and never will, s...",I still have yet to try this and never will so...
3,"Expecting homophobic, transphobic or otherwise...",Expecting homophobic transphobic or otherwise ...
4,#PositiveSexualHealthTuesday #TransAwarenessWe...,PositiveSexualHealthTuesday TransAwarenessWee...
5,"@RealCandaceO Bitch, I have one word for you; ...",Bitch I have one word for you The end homopho...
6,@jeff_kennett #MurdochGutterMedia is well repo...,MurdochGutterMedia is well reported to influe...
7,Who's the more racist person? RT and feel free...,Who s the more racist person RT and feel free ...
8,I encourage my fellow parliamentarians to scru...,I encourage my fellow parliamentarians to scru...
9,@jeremycorbyn @jeremycorbyn did nothing to add...,did nothing to address racism homophobia in w...


# Removing the short words.

In [9]:
# Keep only the words that are longer than three characters, then rebuild
# each tweet as a single space-separated string.
train['cleaned_tweet'] = train['cleaned_tweet'].map(
    lambda tweet: ' '.join(word for word in tweet.split() if len(word) > 3)
)

In [10]:
# Inspect the last ten rows after the short-word filter
train.tail(10)

Unnamed: 0,Tweets,cleaned_tweet
133313,#tetapnormal,tetapnormal
133314,We do not even reject those who are involved w...,even reject those involved with LGBT fact many...
133315,God has created you the best of creation in ac...,created best creation accordance with nature c...
133316,Why is it necessary to normalize things that a...,necessary normalize things that clearly forbid...
133317,#tetapnormal Surah Al-Baqarah: verse 216 ..may...,tetapnormal Surah Baqarah verse maybe hate som...
133318,Are you really proud of that elbibiti flag? # ...,really proud that elbibiti flag StayNormal Proud
133319,We can be kind and gentle to help friends who ...,kind gentle help friends LGBT valley those fig...
133320,# TetapnormalðŸ ‡ ²ðŸ ‡ ¾,Tetapnormal
133321,Want to be extraordinary do not go against nat...,Want extraordinary against nature tetapnormal
133322,#TetapNormal,TetapNormal


# Tokenization

In [11]:
# Tokenize: split every cleaned tweet on whitespace into a list of words
tokenized_tweet = train['cleaned_tweet'].map(str.split)

In [12]:
# Display the tokenized tweets (one list of words per row)
tokenized_tweet

0         [Anonymous, with, COVID, threatens, bars, spre...
1               [homophobia, dontbemean, SuicidePrevention]
2         [still, have, this, never, will, someone, desc...
3         [Expecting, homophobic, transphobic, otherwise...
4         [PositiveSexualHealthTuesday, TransAwarenessWe...
                                ...                        
133318    [really, proud, that, elbibiti, flag, StayNorm...
133319    [kind, gentle, help, friends, LGBT, valley, th...
133320                                        [Tetapnormal]
133321    [Want, extraordinary, against, nature, tetapno...
133322                                        [TetapNormal]
Name: cleaned_tweet, Length: 133323, dtype: object

## Removing Stop Words

In [13]:
from nltk.corpus import stopwords

# English stop-word list from the NLTK corpus
stop_words = stopwords.words('english')


def filter_words(tokenized_data):
    """Normalise, stop-word-filter and stem labelled token lists.

    Parameters
    ----------
    tokenized_data : iterable of (tokenized_words, sentiment) pairs
        ``tokenized_words`` is an iterable of string tokens; ``sentiment`` is
        carried through unchanged.

    Returns
    -------
    list of (list, sentiment) tuples
        For each input pair, the tokens are lower-cased and passed through
        ``remove_punctuation``; tokens that end up empty or that are in
        ``stop_words`` are dropped, and the rest are passed through
        ``stem_the_word``.

    NOTE(review): ``remove_punctuation`` and ``stem_the_word`` are not defined
    in the visible part of this notebook, and no visible cell calls this
    function, so stop words are not actually removed from
    ``train['cleaned_tweet']`` here - confirm where the helpers live and
    whether this step should be applied.
    """
    # Build the lookup set at call time: constant-time membership tests, and
    # any later change to the module-level ``stop_words`` list is still honoured.
    # Assumes remove_punctuation returns a (hashable) string - TODO confirm.
    stop_word_set = set(stop_words)

    filtered_data = []
    for tokenized_words, sentiment in tokenized_data:
        kept_words = []
        for word in tokenized_words:
            # Normalise once per token instead of up to three times
            cleaned = remove_punctuation(word.lower())
            if cleaned != '' and cleaned not in stop_word_set:
                kept_words.append(stem_the_word(cleaned))
        filtered_data.append((kept_words, sentiment))
    return filtered_data

In [14]:
# Show the first five rows of the current DataFrame
train.head()

Unnamed: 0,Tweets,cleaned_tweet
0,Anonymous man with COVID threatens to go to ga...,Anonymous with COVID threatens bars spread inf...
1,#homophobia #dontbemean #SuicidePrevention,homophobia dontbemean SuicidePrevention
2,"I still have yet to try this and never will, s...",still have this never will someone describe ta...
3,"Expecting homophobic, transphobic or otherwise...",Expecting homophobic transphobic otherwise que...
4,#PositiveSexualHealthTuesday #TransAwarenessWe...,PositiveSexualHealthTuesday TransAwarenessWeek...


# Stemming

In [15]:
# Import the one name that is needed instead of a wildcard import, so the
# notebook namespace is not silently filled with everything the module exposes.
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()

# Stem every token of every tweet; each row stays a list of (stemmed) tokens
tokenized_tweet = tokenized_tweet.apply(lambda x: [stemmer.stem(i) for i in x])

In [16]:
# Inspect the last ten rows of stemmed tokens
tokenized_tweet.tail(10)


133313                                          [tetapnorm]
133314    [even, reject, those, involv, with, lgbt, fact...
133315    [creat, best, creation, accord, with, natur, c...
133316    [necessari, normal, thing, that, clearli, forb...
133317    [tetapnorm, surah, baqarah, vers, mayb, hate, ...
133318    [realli, proud, that, elbib, flag, staynorm, p...
133319    [kind, gentl, help, friend, lgbt, valley, thos...
133320                                          [tetapnorm]
133321     [want, extraordinari, against, natur, tetapnorm]
133322                                          [tetapnorm]
Name: cleaned_tweet, dtype: object

In [17]:
# Join each tweet's tokens back into one space-separated string.
# Series.map handles every row in one pass and does not depend on the index
# labels being 0..n-1, unlike assigning element by element in a loop.
# Values that are already strings are left alone, so re-running this cell does
# not split them into single characters.
tokenized_tweet = tokenized_tweet.map(
    lambda tokens: tokens if isinstance(tokens, str) else ' '.join(tokens)
)

# Replace train['cleaned_tweet'] with the joined, stemmed text
train['cleaned_tweet'] = tokenized_tweet

In [18]:
# Check the last five rows now that cleaned_tweet holds the stemmed text
train.tail(5)

Unnamed: 0,Tweets,cleaned_tweet
133318,Are you really proud of that elbibiti flag? # ...,realli proud that elbib flag staynorm proud
133319,We can be kind and gentle to help friends who ...,kind gentl help friend lgbt valley those fight...
133320,# TetapnormalðŸ ‡ ²ðŸ ‡ ¾,tetapnorm
133321,Want to be extraordinary do not go against nat...,want extraordinari against natur tetapnorm
133322,#TetapNormal,tetapnorm


In [19]:
# De-duplicate on the cleaned text only. Comparing all columns would include
# the 'Unnamed: 0' row-number column shown in the outputs above, so no two rows
# would ever be considered duplicates.
# NOTE(review): keep='first' retains one copy of each repeated tweet; the
# previous keep=False would have discarded every copy - confirm which is wanted.
train = train.drop_duplicates(subset=['cleaned_tweet'], keep='first')

In [20]:
# Write only the cleaned text column to a new CSV file, without the index

submission = train.loc[:, ['cleaned_tweet']]
submission.to_csv('clean.csv', index=False)



The cleaned, processed data is saved to the file `clean.csv`.

The data will be visualized and used for sentiment analysis in the next part.

NameError: name 'tweets' is not defined