In [57]:
import re
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
import string
import nltk
import warnings 
warnings.filterwarnings("ignore", category=DeprecationWarning)

%matplotlib inline

In [58]:
train_samples=pd.read_csv("train_E6oV3lV.csv")
test_samples=pd.read_csv("test_tweets_anuFYb8.csv")

train_samples.head() # check first few columns of the train set.

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [59]:
'''
Preprocessing:

By looking at the data we can see that it contains lots of words like ('@user'), punctuation and special characters.

These are not gonna help us in analysis.So its better to remove them to get best analysis.

We are going to do 4 types of preprocessing on this data:

1) Removing @user 
2) Remove punctuations,special characters and numbers
3) Remove short words like on,at,is,him.. Typically words of length < 4
4) Stemming of words. Replacing a word with its root word is stemming. 
Words like singing, singer, sang are different forms of the word sing. Stemming replaces singing, singer, sang with a single word sing.

'''

# Combine the train and test to apply the cleansing to the data at once. 

combine = train_samples.append(test_samples,ignore_index=True,sort=False)

# Remove @user

def remove(input_text, pattern):
    
    x=re.findall(pattern,input_text)
    
    for p in x:
        input_text = re.sub(p,'',input_text)
    
    return input_text

combine['preprocessed_tweet'] = np.vectorize(remove)(combine['tweet'],"@[\w]*")


In [60]:
# Remove punctuations, numbers and special characters


combine['preprocessed_tweet'] = combine['preprocessed_tweet'].replace("[^a-zA-Z#]"," ")

In [61]:
# Remove words of length lesser than 4

combine['preprocessed_tweet'] = combine['preprocessed_tweet'].apply(lambda x : ' '.join([word for word in x.split() if len(word)>3]))

In [62]:
# Stemming

tweet_tokens = combine['preprocessed_tweet'].apply(lambda x: x.split()) 

tweet_tokens.head()

0    [when, father, dysfunctional, selfish, drags, ...
1    [thanks, #lyft, credit, can't, cause, they, do...
2                              [bihday, your, majesty]
3    [#model, love, take, with, time, urð±!!!, ð...
4                  [factsguide:, society, #motivation]
Name: preprocessed_tweet, dtype: object

In [63]:
from nltk.stem.porter import *
stemmer = PorterStemmer()

tweet_tokens = tweet_tokens.apply(lambda x : [stemmer.stem(i) for i in x])
tweet_tokens.head()

0    [when, father, dysfunct, selfish, drag, kid, i...
1    [thank, #lyft, credit, can't, caus, they, don'...
2                              [bihday, your, majesti]
3    [#model, love, take, with, time, urð±!!!, ð...
4                       [factsguide:, societi, #motiv]
Name: preprocessed_tweet, dtype: object

In [64]:
for i in range(len(tweet_tokens)):

    tweet_tokens[i] = ' '.join(tweet_tokens[i])

combine['preprocessed_tweet'] = tweet_tokens