In [7]:
import re # regex
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
import string
import nltk
import warnings 
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [35]:
# Load the unlabelled test tweets and the labelled training tweets from CSV.
test = pd.read_csv('dataset/test.csv')
train = pd.read_csv('dataset/train.csv')

# Preview the first rows of the training data.
train.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [36]:
# preview the first rows of the test set
test.head()

Unnamed: 0,id,tweet
0,31963,#studiolife #aislife #requires #passion #dedic...
1,31964,@user #white #supremacists want everyone to s...
2,31965,safe ways to heal your #acne!! #altwaystohe...
3,31966,is the hp and the cursed child book up for res...
4,31967,"3rd #bihday to my amazing, hilarious #nephew..."


In [37]:
# combine train and test sets before cleaning up data
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
# With ignore_index=True the resulting axis is labeled 0, 1, …, n - 1.
combined_data = pd.concat([train, test], ignore_index=True)

In [11]:
# helper to strip unwanted text from tweets, e.g. twitter handles starting with '@'
def remove_pattern(input, pattern):
    """Return `input` with every match of the regex `pattern` removed.

    Parameters
    ----------
    input : str
        The text to clean. (The name shadows the built-in `input`; it is kept
        so existing keyword callers continue to work.)
    pattern : str
        A regular expression; each non-overlapping match is deleted.

    Returns
    -------
    str
        The text with all matches replaced by the empty string.
    """
    # Substitute with the pattern itself in a single pass. Re-using the matched
    # text as a new regex (as the previous loop did) misinterprets
    # metacharacters in the match, and a bare '@' match would strip the '@'
    # from later handles so that they were no longer removed.
    return re.sub(pattern, '', input)

NumPy's `np.vectorize` is provided primarily for convenience, not for performance: its implementation is essentially a Python for loop.

In [68]:
# remove twitter handles (@user) and create a new column for cleaned up tweets
# The pattern is a raw string so that `\w` reaches the regex engine unchanged
# instead of being an invalid string escape sequence.
combined_data['processed_tweet'] = np.vectorize(remove_pattern)(combined_data['tweet'], r'@[\w]*')

A function wrapped with `np.vectorize` takes a sequence of objects or NumPy arrays as input and evaluates the underlying Python function on each element of that input.

In [23]:
# example of using np.vectorize()
def myfunc(a, b):
    """Subtract b from a when a exceeds b; otherwise add the two."""
    return a - b if a > b else a + b


# wrap the scalar function so it is evaluated element-wise over the input list
vfunc = np.vectorize(myfunc)
vfunc([1, 2, 3, 4], 2)


array([3, 4, 1, 2])

In [70]:

# compare the new processed_tweet column with the original tweet text
combined_data.head()

Unnamed: 0,id,label,tweet,processed_tweet
0,1,0.0,@user when a father is dysfunctional and is s...,when a father is dysfunctional and is so sel...
1,2,0.0,@user @user thanks for #lyft credit i can't us...,thanks for #lyft credit i can't use cause th...
2,3,0.0,bihday your majesty,bihday your majesty
3,4,0.0,#model i love u take with u all the time in ...,#model i love u take with u all the time in ...
4,5,0.0,factsguide: society now #motivation,factsguide: society now #motivation


In [71]:
# replace everything except letters (a-z, A-Z) and hashtags with spaces
# regex=True must be passed explicitly: since pandas 2.0 Series.str.replace
# treats the pattern as a literal string by default, which would replace nothing here.
combined_data['processed_tweet'] = combined_data['processed_tweet'].str.replace("[^a-zA-Z#]", " ", regex=True)

  combined_data['processed_tweet'] = combined_data['processed_tweet'].str.replace("[^a-zA-Z#]", " ")


In [72]:
# check processed_tweet after replacing non-letter characters with spaces
combined_data.head()

Unnamed: 0,id,label,tweet,processed_tweet
0,1,0.0,@user when a father is dysfunctional and is s...,when a father is dysfunctional and is so sel...
1,2,0.0,@user @user thanks for #lyft credit i can't us...,thanks for #lyft credit i can t use cause th...
2,3,0.0,bihday your majesty,bihday your majesty
3,4,0.0,#model i love u take with u all the time in ...,#model i love u take with u all the time in ...
4,5,0.0,factsguide: society now #motivation,factsguide society now #motivation


In [73]:
# drop short words: keep only the words that are longer than 3 characters
combined_data['processed_tweet'] = combined_data['processed_tweet'].apply(
    lambda tweet: ' '.join(word for word in tweet.split() if len(word) > 3)
)

In [74]:
# check processed_tweet after dropping words of 3 or fewer characters
combined_data.head()

Unnamed: 0,id,label,tweet,processed_tweet
0,1,0.0,@user when a father is dysfunctional and is s...,when father dysfunctional selfish drags kids i...
1,2,0.0,@user @user thanks for #lyft credit i can't us...,thanks #lyft credit cause they offer wheelchai...
2,3,0.0,bihday your majesty,bihday your majesty
3,4,0.0,#model i love u take with u all the time in ...,#model love take with time
4,5,0.0,factsguide: society now #motivation,factsguide society #motivation


In [75]:
# tokenize the cleaned tweets: break each one on whitespace into a list of words (tokens)
tokenized_tweet = combined_data['processed_tweet'].apply(str.split)
tokenized_tweet.head()

0    [when, father, dysfunctional, selfish, drags, ...
1    [thanks, #lyft, credit, cause, they, offer, wh...
2                              [bihday, your, majesty]
3                     [#model, love, take, with, time]
4                   [factsguide, society, #motivation]
Name: processed_tweet, dtype: object

In [76]:
# stem each token, i.e. strip suffixes such as 'ing', 'es', 'ly', 'er'
# Import only the name that is used; a wildcard import hides where names come
# from and can silently shadow notebook variables.
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()
tokenized_tweet = tokenized_tweet.apply(lambda tokens: [stemmer.stem(token) for token in tokens])
tokenized_tweet.head()

0    [when, father, dysfunct, selfish, drag, kid, i...
1    [thank, #lyft, credit, caus, they, offer, whee...
2                              [bihday, your, majesti]
3                     [#model, love, take, with, time]
4                         [factsguid, societi, #motiv]
Name: processed_tweet, dtype: object

In [77]:
# join tokens back together as a full tweet
# One element-wise apply replaces the per-row Python loop: it does not assume
# the index is labeled 0..n-1 and avoids a label-based assignment per row.
# The name is rebound so tokenized_tweet still holds the joined strings afterwards.
tokenized_tweet = tokenized_tweet.apply(' '.join)

combined_data['processed_tweet'] = tokenized_tweet

In [78]:
# display the frame after stemming; the rendered output is truncated in the
# middle, showing the first and last rows
combined_data

Unnamed: 0,id,label,tweet,processed_tweet
0,1,0.0,@user when a father is dysfunctional and is s...,when father dysfunct selfish drag kid into dys...
1,2,0.0,@user @user thanks for #lyft credit i can't us...,thank #lyft credit caus they offer wheelchair ...
2,3,0.0,bihday your majesty,bihday your majesti
3,4,0.0,#model i love u take with u all the time in ...,#model love take with time
4,5,0.0,factsguide: society now #motivation,factsguid societi #motiv
...,...,...,...,...
49154,49155,,thought factory: left-right polarisation! #tru...,thought factori left right polaris #trump #use...
49155,49156,,feeling like a mermaid ð #hairflip #neverre...,feel like mermaid #hairflip #neverreadi #forma...
49156,49157,,#hillary #campaigned today in #ohio((omg)) &am...,#hillari #campaign today #ohio use word like a...
49157,49158,,"happy, at work conference: right mindset leads...",happi work confer right mindset lead cultur de...
