In [1]:
import numpy as np
import pandas as pd
import re
import nltk
import spacy
import string
pd.options.mode.chained_assignment = None

full_df = pd.read_csv("sample.csv", nrows=5000)
df = full_df[["text"]]
df["text"] = df["text"].astype(str)
full_df.head()

Unnamed: 0,tweet_id,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id
0,119237,105834,True,Wed Oct 11 06:55:44 +0000 2017,@AppleSupport causing the reply to be disregar...,119236.0,
1,119238,ChaseSupport,False,Wed Oct 11 13:25:49 +0000 2017,@105835 Your business means a lot to us. Pleas...,,119239.0
2,119239,105835,True,Wed Oct 11 13:00:09 +0000 2017,@76328 I really hope you all change but I'm su...,119238.0,
3,119240,VirginTrains,False,Tue Oct 10 15:16:08 +0000 2017,@105836 LiveChat is online at the moment - htt...,119241.0,119242.0
4,119241,105836,True,Tue Oct 10 15:17:21 +0000 2017,@VirginTrains see attached error message. I've...,119243.0,119240.0


# Lower Casing

In [2]:
df["text_lower"] = df["text"].str.lower()
df.head()

Unnamed: 0,text,text_lower
0,@AppleSupport causing the reply to be disregar...,@applesupport causing the reply to be disregar...
1,@105835 Your business means a lot to us. Pleas...,@105835 your business means a lot to us. pleas...
2,@76328 I really hope you all change but I'm su...,@76328 i really hope you all change but i'm su...
3,@105836 LiveChat is online at the moment - htt...,@105836 livechat is online at the moment - htt...
4,@VirginTrains see attached error message. I've...,@virgintrains see attached error message. i've...


# Removal of Punctuations

In [3]:
df.drop(["text_lower"], axis=1, inplace=True)

PUNCT_TO_REMOVE = string.punctuation
def remove_punctuation(text):
    return text.translate(str.maketrans('','',PUNCT_TO_REMOVE))

df["text without punction"] = df["text"].apply(lambda text: remove_punctuation(text))
df.head()

Unnamed: 0,text,text without punction
0,@AppleSupport causing the reply to be disregar...,AppleSupport causing the reply to be disregard...
1,@105835 Your business means a lot to us. Pleas...,105835 Your business means a lot to us Please ...
2,@76328 I really hope you all change but I'm su...,76328 I really hope you all change but Im sure...
3,@105836 LiveChat is online at the moment - htt...,105836 LiveChat is online at the moment https...
4,@VirginTrains see attached error message. I've...,VirginTrains see attached error message Ive tr...


# Tokenization

In [4]:
df.drop(["text without punction"], axis=1, inplace=True)
from nltk.tokenize import sent_tokenize, word_tokenize

df["tokenized_text1"] = df["text"].apply(lambda text: word_tokenize(text))
df["tokenized_text2"] = df["text"].apply(lambda text: sent_tokenize(text))
df.head()

Unnamed: 0,text,tokenized_text1,tokenized_text2
0,@AppleSupport causing the reply to be disregar...,"[@, AppleSupport, causing, the, reply, to, be,...",[@AppleSupport causing the reply to be disrega...
1,@105835 Your business means a lot to us. Pleas...,"[@, 105835, Your, business, means, a, lot, to,...","[@105835 Your business means a lot to us., Ple..."
2,@76328 I really hope you all change but I'm su...,"[@, 76328, I, really, hope, you, all, change, ...",[@76328 I really hope you all change but I'm s...
3,@105836 LiveChat is online at the moment - htt...,"[@, 105836, LiveChat, is, online, at, the, mom...",[@105836 LiveChat is online at the moment - ht...
4,@VirginTrains see attached error message. I've...,"[@, VirginTrains, see, attached, error, messag...","[@VirginTrains see attached error message., I'..."


# Removal of Stopwords

In [5]:
#list of stopwords in english
from nltk.corpus import stopwords
",".join(stopwords.words('english'))

"i,me,my,myself,we,our,ours,ourselves,you,you're,you've,you'll,you'd,your,yours,yourself,yourselves,he,him,his,himself,she,she's,her,hers,herself,it,it's,its,itself,they,them,their,theirs,themselves,what,which,who,whom,this,that,that'll,these,those,am,is,are,was,were,be,been,being,have,has,had,having,do,does,did,doing,a,an,the,and,but,if,or,because,as,until,while,of,at,by,for,with,about,against,between,into,through,during,before,after,above,below,to,from,up,down,in,out,on,off,over,under,again,further,then,once,here,there,when,where,why,how,all,any,both,each,few,more,most,other,some,such,no,nor,not,only,own,same,so,than,too,very,s,t,can,will,just,don,don't,should,should've,now,d,ll,m,o,re,ve,y,ain,aren,aren't,couldn,couldn't,didn,didn't,doesn,doesn't,hadn,hadn't,hasn,hasn't,haven,haven't,isn,isn't,ma,mightn,mightn't,mustn,mustn't,needn,needn't,shan,shan't,shouldn,shouldn't,wasn,wasn't,weren,weren't,won,won't,wouldn,wouldn't"

In [6]:
df.drop(["tokenized_text1","tokenized_text2"], axis=1, inplace=True)

STOPWORDS = set(stopwords.words('english'))

def remove_stopwords(text):
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])

df["text_wo_stop"] = df["text"].apply(lambda text: remove_stopwords(text))
df.head()

Unnamed: 0,text,text_wo_stop
0,@AppleSupport causing the reply to be disregar...,@AppleSupport causing reply disregarded tapped...
1,@105835 Your business means a lot to us. Pleas...,@105835 Your business means lot us. Please DM ...
2,@76328 I really hope you all change but I'm su...,@76328 I really hope change I'm sure won't! Be...
3,@105836 LiveChat is online at the moment - htt...,@105836 LiveChat online moment - https://t.co/...
4,@VirginTrains see attached error message. I've...,@VirginTrains see attached error message. I've...


# Stemming

In [7]:
from nltk.stem.porter import PorterStemmer

# Drop the two columns 
df.drop(["text_wo_stop"], axis=1, inplace=True) 

stemmer = PorterStemmer()
def stem_words(text):
    return " ".join([stemmer.stem(word) for word in text.split()])

df["text_stemmed"] = df["text"].apply(lambda text: stem_words(text))
df.head()

Unnamed: 0,text,text_stemmed
0,@AppleSupport causing the reply to be disregar...,@applesupport caus the repli to be disregard a...
1,@105835 Your business means a lot to us. Pleas...,@105835 your busi mean a lot to us. pleas dm y...
2,@76328 I really hope you all change but I'm su...,@76328 i realli hope you all chang but i'm sur...
3,@105836 LiveChat is online at the moment - htt...,@105836 livechat is onlin at the moment - http...
4,@VirginTrains see attached error message. I've...,@virgintrain see attach error message. i'v tri...


# Lemmatization

In [8]:
df.drop(["text_stemmed"], axis=1, inplace=True)
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
def lemmatize_words(text):
    return " ".join([lemmatizer.lemmatize(word) for word in text.split()])

df["text_lemmatized"] = df["text"].apply(lambda text: lemmatize_words(text))
df.head()

Unnamed: 0,text,text_lemmatized
0,@AppleSupport causing the reply to be disregar...,@AppleSupport causing the reply to be disregar...
1,@105835 Your business means a lot to us. Pleas...,@105835 Your business mean a lot to us. Please...
2,@76328 I really hope you all change but I'm su...,@76328 I really hope you all change but I'm su...
3,@105836 LiveChat is online at the moment - htt...,@105836 LiveChat is online at the moment - htt...
4,@VirginTrains see attached error message. I've...,@VirginTrains see attached error message. I've...


In [9]:
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
wordnet_map = {"N":wordnet.NOUN, "V":wordnet.VERB, "J":wordnet.ADJ, "R":wordnet.ADV}
def lemmatize_words(text):
    pos_tagged_text = nltk.pos_tag(text.split())
    return " ".join([lemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN)) for word, pos in pos_tagged_text])

df["text_lemmatized"] = df["text"].apply(lambda text: lemmatize_words(text))
df.head()

Unnamed: 0,text,text_lemmatized
0,@AppleSupport causing the reply to be disregar...,@AppleSupport cause the reply to be disregard ...
1,@105835 Your business means a lot to us. Pleas...,@105835 Your business mean a lot to us. Please...
2,@76328 I really hope you all change but I'm su...,@76328 I really hope you all change but I'm su...
3,@105836 LiveChat is online at the moment - htt...,@105836 LiveChat be online at the moment - htt...
4,@VirginTrains see attached error message. I've...,@VirginTrains see attached error message. I've...


In [10]:
import pandas
import functools
import math
import re
pandas.set_option('display.max_rows', 500)
pandas.set_option('display.max_columns', 500)
pandas.set_option('display.width', 1000)
pandas.set_option("max_column", None)

In [12]:
corpus = df["text"]
corpus.head()

0    @AppleSupport causing the reply to be disregar...
1    @105835 Your business means a lot to us. Pleas...
2    @76328 I really hope you all change but I'm su...
3    @105836 LiveChat is online at the moment - htt...
4    @VirginTrains see attached error message. I've...
Name: text, dtype: object