In [1]:
import numpy as np
import pandas as pd
import os
from IPython.display import display

# os.listdir('/kaggle/')
for dirpath, _, filenames in os.walk('/kaggle/input/'):
    for filename in filenames:
        print(os.path.join(dirpath, filename))

/kaggle/input/customer-support-on-twitter/sample.csv
/kaggle/input/customer-support-on-twitter/twcs/twcs.csv


In [2]:
df = pd.read_csv('../input/customer-support-on-twitter/twcs/twcs.csv', nrows=5000)
display(df.info())
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 7 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   tweet_id                 5000 non-null   int64  
 1   author_id                5000 non-null   object 
 2   inbound                  5000 non-null   bool   
 3   created_at               5000 non-null   object 
 4   text                     5000 non-null   object 
 5   response_tweet_id        3411 non-null   object 
 6   in_response_to_tweet_id  3694 non-null   float64
dtypes: bool(1), float64(1), int64(1), object(4)
memory usage: 239.4+ KB


None

Unnamed: 0,tweet_id,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id
0,1,sprintcare,False,Tue Oct 31 22:10:47 +0000 2017,@115712 I understand. I would like to assist y...,2.0,3.0
1,2,115712,True,Tue Oct 31 22:11:45 +0000 2017,@sprintcare and how do you propose we do that,,1.0
2,3,115712,True,Tue Oct 31 22:08:27 +0000 2017,@sprintcare I have sent several private messag...,1.0,4.0
3,4,sprintcare,False,Tue Oct 31 21:54:49 +0000 2017,@115712 Please send us a Private Message so th...,3.0,5.0
4,5,115712,True,Tue Oct 31 21:49:35 +0000 2017,@sprintcare I did.,4.0,6.0


In [3]:
# Let's convert the text column to string to allow for easy string manipulation
text_df = df[['text']].astype(str)
text_df.head()

Unnamed: 0,text
0,@115712 I understand. I would like to assist y...
1,@sprintcare and how do you propose we do that
2,@sprintcare I have sent several private messag...
3,@115712 Please send us a Private Message so th...
4,@sprintcare I did.


## Lower Case
Let's make all text lower case characters

In [4]:
text_df['text_lower'] = text_df['text'].map(lambda text: text.lower())
text_df.head()

Unnamed: 0,text,text_lower
0,@115712 I understand. I would like to assist y...,@115712 i understand. i would like to assist y...
1,@sprintcare and how do you propose we do that,@sprintcare and how do you propose we do that
2,@sprintcare I have sent several private messag...,@sprintcare i have sent several private messag...
3,@115712 Please send us a Private Message so th...,@115712 please send us a private message so th...
4,@sprintcare I did.,@sprintcare i did.


## Removal of punctuations

I want to learn new stuff, so I'll be trying an adaptation of this [notebook](https://www.kaggle.com/code/sudalairajkumar/getting-started-with-text-preprocessing/notebook). I will be making use of its content with a few modifications here.

In thsis section, I'll try to remove punctuation from each text

In [5]:
import string

PUNCTUATIONS = string.punctuation

def remove_punctuations(text):
    return text.translate(str.maketrans('', '', PUNCTUATIONS))

In [6]:
text_df['text_no_punct'] = text_df['text'].apply(lambda text: remove_punctuations(text))
text_df.head()

Unnamed: 0,text,text_lower,text_no_punct
0,@115712 I understand. I would like to assist y...,@115712 i understand. i would like to assist y...,115712 I understand I would like to assist you...
1,@sprintcare and how do you propose we do that,@sprintcare and how do you propose we do that,sprintcare and how do you propose we do that
2,@sprintcare I have sent several private messag...,@sprintcare i have sent several private messag...,sprintcare I have sent several private message...
3,@115712 Please send us a Private Message so th...,@115712 please send us a private message so th...,115712 Please send us a Private Message so tha...
4,@sprintcare I did.,@sprintcare i did.,sprintcare I did


In [7]:
# text_df.loc[2, ['text', 'text_no_punct']]
display(text_df['text'][0])
text_df['text_no_punct'][0]

'@115712 I understand. I would like to assist you. We would need to get you into a private secured link to further assist.'

'115712 I understand I would like to assist you We would need to get you into a private secured link to further assist'

## Removal of stop words
Let's attempt to remove stop words here; words that do not have much effect exvept if we're considering Parts of Speech in our work

In [8]:
from nltk.corpus import stopwords
', '.join(stopwords.words('english'))

"i, me, my, myself, we, our, ours, ourselves, you, you're, you've, you'll, you'd, your, yours, yourself, yourselves, he, him, his, himself, she, she's, her, hers, herself, it, it's, its, itself, they, them, their, theirs, themselves, what, which, who, whom, this, that, that'll, these, those, am, is, are, was, were, be, been, being, have, has, had, having, do, does, did, doing, a, an, the, and, but, if, or, because, as, until, while, of, at, by, for, with, about, against, between, into, through, during, before, after, above, below, to, from, up, down, in, out, on, off, over, under, again, further, then, once, here, there, when, where, why, how, all, any, both, each, few, more, most, other, some, such, no, nor, not, only, own, same, so, than, too, very, s, t, can, will, just, don, don't, should, should've, now, d, ll, m, o, re, ve, y, ain, aren, aren't, couldn, couldn't, didn, didn't, doesn, doesn't, hadn, hadn't, hasn, hasn't, haven, haven't, isn, isn't, ma, mightn, mightn't, mustn, mus

In [9]:
STOPWORDS = set(stopwords.words('english'))

def remove_stopwords(text):
    """Removes stopwords from a section of text and returns text without stopwords"""
    return ' '.join([word for word in str(text).split() if word not in STOPWORDS])

# text_df.drop('text_lower', axis=1, inplace=True)
text_df['text_no_stopword'] = text_df['text_no_punct'].apply(lambda text: remove_stopwords(text))
text_df.head()

Unnamed: 0,text,text_lower,text_no_punct,text_no_stopword
0,@115712 I understand. I would like to assist y...,@115712 i understand. i would like to assist y...,115712 I understand I would like to assist you...,115712 I understand I would like assist We wou...
1,@sprintcare and how do you propose we do that,@sprintcare and how do you propose we do that,sprintcare and how do you propose we do that,sprintcare propose
2,@sprintcare I have sent several private messag...,@sprintcare i have sent several private messag...,sprintcare I have sent several private message...,sprintcare I sent several private messages one...
3,@115712 Please send us a Private Message so th...,@115712 please send us a private message so th...,115712 Please send us a Private Message so tha...,115712 Please send us Private Message assist J...
4,@sprintcare I did.,@sprintcare i did.,sprintcare I did,sprintcare I


## Removal of Frequent words
Let me try removing frequent words here

In [10]:
from collections import Counter

text_list = []
text_df['text_no_stopword'].apply(lambda text: text_list.extend(text.split(' ')))

count = Counter(text_list)
count.most_common(10)

[('I', 1437),
 ('us', 752),
 ('DM', 514),
 ('help', 479),
 ('Please', 376),
 ('We', 338),
 ('Hi', 293),
 ('Thanks', 287),
 ('get', 279),
 ('please', 247)]

In [11]:
# remove frequent words
FREQWORDS = set([w for (w, wc) in count.most_common(10)])

def remove_freqword(text):
    """Remove frequent words and return the text"""
    return ' '.join([word for word in str(text).split() if word not in FREQWORDS])

text_df['text_no_freq'] = text_df['text_no_stopword'].apply(lambda text: remove_freqword(text))
text_df.head()

Unnamed: 0,text,text_lower,text_no_punct,text_no_stopword,text_no_freq
0,@115712 I understand. I would like to assist y...,@115712 i understand. i would like to assist y...,115712 I understand I would like to assist you...,115712 I understand I would like assist We wou...,115712 understand would like assist would need...
1,@sprintcare and how do you propose we do that,@sprintcare and how do you propose we do that,sprintcare and how do you propose we do that,sprintcare propose,sprintcare propose
2,@sprintcare I have sent several private messag...,@sprintcare i have sent several private messag...,sprintcare I have sent several private message...,sprintcare I sent several private messages one...,sprintcare sent several private messages one r...
3,@115712 Please send us a Private Message so th...,@115712 please send us a private message so th...,115712 Please send us a Private Message so tha...,115712 Please send us Private Message assist J...,115712 send Private Message assist Just click ...
4,@sprintcare I did.,@sprintcare i did.,sprintcare I did,sprintcare I,sprintcare


In [12]:
text_df['text_no_stopword'][1]

'sprintcare propose'

## Removal of rare words
This is basically the same thing as removal of frequent words. We just access the end of the list provided by the counter and strip the rare words from the text

In [26]:
no_of_rare_words = 20

RAREWORDS = set([w for (w, wc) in count.most_common()[: -no_of_rare_words-1: -1]])

def remove_rare(text):
    return ' '.join([word for word in str(text).split() if word not in RAREWORDS])

# text without stop words, frequent words and rare words
text_df['text_no_stopfreqrare'] = text_df['text_no_freq'].apply(lambda text: remove_rare(text))

text_df.head()

Unnamed: 0,text,text_lower,text_no_punct,text_no_stopword,text_no_freq,text_no_stopfreqrare
0,@115712 I understand. I would like to assist y...,@115712 i understand. i would like to assist y...,115712 I understand I would like to assist you...,115712 I understand I would like assist We wou...,115712 understand would like assist would need...,115712 understand would like assist would need...
1,@sprintcare and how do you propose we do that,@sprintcare and how do you propose we do that,sprintcare and how do you propose we do that,sprintcare propose,sprintcare propose,sprintcare propose
2,@sprintcare I have sent several private messag...,@sprintcare i have sent several private messag...,sprintcare I have sent several private message...,sprintcare I sent several private messages one...,sprintcare sent several private messages one r...,sprintcare sent several private messages one r...
3,@115712 Please send us a Private Message so th...,@115712 please send us a private message so th...,115712 Please send us a Private Message so tha...,115712 Please send us Private Message assist J...,115712 send Private Message assist Just click ...,115712 send Private Message assist Just click ...
4,@sprintcare I did.,@sprintcare i did.,sprintcare I did,sprintcare I,sprintcare,sprintcare


In [28]:
text_df.to_csv('modified_text.csv')