# Import libraries

In [49]:
import pandas as pd
import string
import re
import nltk
from nltk.corpus import stopwords
import emoji
from nltk.tokenize import word_tokenize,sent_tokenize
import spacy
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from textblob import TextBlob
from spellchecker import SpellChecker
from sklearn.feature_extraction.text import CountVectorizer
from nltk import ngrams
from sklearn.feature_extraction.text import TfidfVectorizer

# Load Dataset

In [2]:
df = pd.read_csv('twcs.csv')
df

Unnamed: 0,tweet_id,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id
0,1,sprintcare,False,Tue Oct 31 22:10:47 +0000 2017,@115712 I understand. I would like to assist y...,2,3.0
1,2,115712,True,Tue Oct 31 22:11:45 +0000 2017,@sprintcare and how do you propose we do that,,1.0
2,3,115712,True,Tue Oct 31 22:08:27 +0000 2017,@sprintcare I have sent several private messag...,1,4.0
3,4,sprintcare,False,Tue Oct 31 21:54:49 +0000 2017,@115712 Please send us a Private Message so th...,3,5.0
4,5,115712,True,Tue Oct 31 21:49:35 +0000 2017,@sprintcare I did.,4,6.0
...,...,...,...,...,...,...,...
2811769,2987947,sprintcare,False,Wed Nov 22 08:43:51 +0000 2017,"@823869 Hey, we'd be happy to look into this f...",,2987948.0
2811770,2987948,823869,True,Wed Nov 22 08:35:16 +0000 2017,@115714 wtf!? I’ve been having really shitty s...,2987947,
2811771,2812240,121673,True,Thu Nov 23 04:13:07 +0000 2017,@143549 @sprintcare You have to go to https://...,,2812239.0
2811772,2987949,AldiUK,False,Wed Nov 22 08:31:24 +0000 2017,"@823870 Sounds delicious, Sarah! 😋 https://t.c...",,2987950.0


# Check the shape of dataset and datatype of columns

In [3]:
df.shape

(2811774, 7)

In [4]:
df.dtypes

tweet_id                     int64
author_id                   object
inbound                       bool
created_at                  object
text                        object
response_tweet_id           object
in_response_to_tweet_id    float64
dtype: object

We can see here that the datatype of "text" column is object, so we need to perform Type-Casting(changing data type of a variable to another to perform different operations) here by converting it into string.

In [5]:
df["text"] = df["text"].astype('string')

To perform all the activities in this assignment we know that only text column is requires. So, let's make another dataframe that will have only text column.

In [6]:
df_text = df[["text"]]

Again check the datatype of text column to make sure it has been converted into string.

In [7]:
df_text.dtypes['text']

string[python]

To see first 5 rows of dataframe: 

In [8]:
df_text.head()

Unnamed: 0,text
0,@115712 I understand. I would like to assist y...
1,@sprintcare and how do you propose we do that
2,@sprintcare I have sent several private messag...
3,@115712 Please send us a Private Message so th...
4,@sprintcare I did.


To see any random 7 rows of dataframe:

In [9]:
df_text.sample(7)

Unnamed: 0,text
1219649,@Safaricom_Care LJR0VZ46AO
619024,@285704 Fun was had by all!
1687837,"@115911 No matter where I go #Niagarafalls, #N..."
2414308,@115858 my best friend lives 257 miles away an...
2129594,@665387 Hi! We'd be happy to help. Please send...
2670778,So because your driver doesn't know how to tak...
1707454,"@555917 No problem, Will. :-) Thank you for y..."


To see last 5 rows of dataframe:

In [10]:
df_text.tail()

Unnamed: 0,text
2811769,"@823869 Hey, we'd be happy to look into this f..."
2811770,@115714 wtf!? I’ve been having really shitty s...
2811771,@143549 @sprintcare You have to go to https://...
2811772,"@823870 Sounds delicious, Sarah! 😋 https://t.c..."
2811773,@AldiUK warm sloe gin mince pies with ice cre...


# Lowercasing

Let's keep the text column as it is and create a new column "lower_text" to store lowercase text.

In [11]:
df_text["lower_text"] = df_text["text"].str.lower()
df_text.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_text["lower_text"] = df_text["text"].str.lower()


Unnamed: 0,text,lower_text
0,@115712 I understand. I would like to assist y...,@115712 i understand. i would like to assist y...
1,@sprintcare and how do you propose we do that,@sprintcare and how do you propose we do that
2,@sprintcare I have sent several private messag...,@sprintcare i have sent several private messag...
3,@115712 Please send us a Private Message so th...,@115712 please send us a private message so th...
4,@sprintcare I did.,@sprintcare i did.


# Punctutation

In [12]:
exclude=string.punctuation

In [13]:
exclude

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [14]:
def remove_punc(text):
    for char in exclude:
        text=text.replace(char,"")
    return text

In [15]:
df_text["lower_text"] = df_text["lower_text"].apply(remove_punc)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_text["lower_text"] = df_text["lower_text"].apply(remove_punc)


In [16]:
df_text.head()

Unnamed: 0,text,lower_text
0,@115712 I understand. I would like to assist y...,115712 i understand i would like to assist you...
1,@sprintcare and how do you propose we do that,sprintcare and how do you propose we do that
2,@sprintcare I have sent several private messag...,sprintcare i have sent several private message...
3,@115712 Please send us a Private Message so th...,115712 please send us a private message so tha...
4,@sprintcare I did.,sprintcare i did


# Numbers

In [17]:
# Removing numbers from text
df_text["lower_text"] = df_text["lower_text"].str.replace('\d+', '')

  df_text["lower_text"] = df_text["lower_text"].str.replace('\d+', '')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_text["lower_text"] = df_text["lower_text"].str.replace('\d+', '')


In [18]:
df_text.head()

Unnamed: 0,text,lower_text
0,@115712 I understand. I would like to assist y...,i understand i would like to assist you we wo...
1,@sprintcare and how do you propose we do that,sprintcare and how do you propose we do that
2,@sprintcare I have sent several private messag...,sprintcare i have sent several private message...
3,@115712 Please send us a Private Message so th...,please send us a private message so that we c...
4,@sprintcare I did.,sprintcare i did


# Stopwords

In [19]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kaushiki./nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [20]:
STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(text):
    """custom function to remove the stopwords"""
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])

df_text["lower_text"] = df_text["lower_text"].apply(lambda text: remove_stopwords(text))
df_text.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_text["lower_text"] = df_text["lower_text"].apply(lambda text: remove_stopwords(text))


Unnamed: 0,text,lower_text
0,@115712 I understand. I would like to assist y...,understand would like assist would need get pr...
1,@sprintcare and how do you propose we do that,sprintcare propose
2,@sprintcare I have sent several private messag...,sprintcare sent several private messages one r...
3,@115712 Please send us a Private Message so th...,please send us private message assist click ‘m...
4,@sprintcare I did.,sprintcare


# Removal of URL's

In [21]:
def remove_url(text):
    pattern=re.compile(r'https?://\S+|www\.\S+')
    return pattern.sub("",text)
df_text["lower_text"] = df_text["lower_text"].apply(lambda text: remove_url(text))
df_text.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_text["lower_text"] = df_text["lower_text"].apply(lambda text: remove_url(text))


Unnamed: 0,text,lower_text
0,@115712 I understand. I would like to assist y...,understand would like assist would need get pr...
1,@sprintcare and how do you propose we do that,sprintcare propose
2,@sprintcare I have sent several private messag...,sprintcare sent several private messages one r...
3,@115712 Please send us a Private Message so th...,please send us private message assist click ‘m...
4,@sprintcare I did.,sprintcare


# Removal of HTML Tags

In [22]:
def remove_html_tags(text):
    pattern=re.compile('<.*?>')
    return pattern.sub("",text)
df_text["lower_text"] = df_text["lower_text"].apply(lambda text: remove_html_tags(text))
df_text.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_text["lower_text"] = df_text["lower_text"].apply(lambda text: remove_html_tags(text))


Unnamed: 0,text,lower_text
0,@115712 I understand. I would like to assist y...,understand would like assist would need get pr...
1,@sprintcare and how do you propose we do that,sprintcare propose
2,@sprintcare I have sent several private messag...,sprintcare sent several private messages one r...
3,@115712 Please send us a Private Message so th...,please send us private message assist click ‘m...
4,@sprintcare I did.,sprintcare


# Chat Conversation Removal

In [23]:
# Replace 'your_file.txt' with the actual path to your text file
file_path = 'slang.txt'

# Assuming your text file is tab-separated
df_slang = pd.read_csv(file_path, sep='=')

df_slang.columns = ['Slang','Full_Form']
df_slang.head()
# Convert DataFrame to dictionary
chat_words = dict(zip(df_slang['Slang'], df_slang['Full_Form']))

# Display the resulting dictionary
print(chat_words)

{'AFK': 'Away From Keyboard', 'ASAP': 'As Soon As Possible', 'ATK': 'At The Keyboard', 'ATM': 'At The Moment', 'A3': 'Anytime, Anywhere, Anyplace', 'BAK': 'Back At Keyboard', 'BBL': 'Be Back Later', 'BBS': 'Be Back Soon', 'BFN': 'Bye For Now', 'B4N': 'Bye For Now', 'BRB': 'Be Right Back', 'BRT': 'Be Right There', 'BTW': 'By The Way', 'B4': 'Before', 'CU': 'See You', 'CUL8R': 'See You Later', 'CYA': 'See You', 'FAQ': 'Frequently Asked Questions', 'FC': 'Fingers Crossed', 'FWIW': "For What It's Worth", 'FYI': 'For Your Information', 'GAL': 'Get A Life', 'GG': 'Good Game', 'GN': 'Good Night', 'GMTA': 'Great Minds Think Alike', 'GR8': 'Great!', 'G9': 'Genius', 'IC': 'I See', 'ICQ': 'I Seek you (also a chat program)', 'ILU': 'ILU: I Love You', 'IMHO': 'In My Honest/Humble Opinion', 'IMO': 'In My Opinion', 'IOW': 'In Other Words', 'IRL': 'In Real Life', 'KISS': 'Keep It Simple, Stupid', 'LDR': 'Long Distance Relationship', 'LMAO': 'Laugh My A.. Off', 'LOL': 'Laughing Out Loud', 'LTNS': 'Long

In [24]:
def chat_conversion(text):
    new_text=[]
    for w in text.split():
        if w.upper() in chat_words:
            new_text.append(chat_words[w.upper()])
        else:
            new_text.append(w)
    return " ".join(new_text)

df_text["lower_text"] = df_text["lower_text"].apply(lambda text: chat_conversion(text))
df_text.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_text["lower_text"] = df_text["lower_text"].apply(lambda text: chat_conversion(text))


Unnamed: 0,text,lower_text
0,@115712 I understand. I would like to assist y...,understand would like assist would need get pr...
1,@sprintcare and how do you propose we do that,sprintcare propose
2,@sprintcare I have sent several private messag...,sprintcare sent several private messages one r...
3,@115712 Please send us a Private Message so th...,please send us private message assist click ‘m...
4,@sprintcare I did.,sprintcare


# Emoji Removal

In [25]:
def remove_emoji(text):
    clean_text=emoji.demojize(text)
    return clean_text
df_text["lower_text"] = df_text["lower_text"].apply(lambda text: remove_emoji(text))
df_text.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_text["lower_text"] = df_text["lower_text"].apply(lambda text: remove_emoji(text))


Unnamed: 0,text,lower_text
0,@115712 I understand. I would like to assist y...,understand would like assist would need get pr...
1,@sprintcare and how do you propose we do that,sprintcare propose
2,@sprintcare I have sent several private messag...,sprintcare sent several private messages one r...
3,@115712 Please send us a Private Message so th...,please send us private message assist click ‘m...
4,@sprintcare I did.,sprintcare


# Tokenization

Using split method

In [26]:
# Tokenization function using split method
def tokenize(text):
    return text.split()

# Apply the tokenization function to the DataFrame
df_text['tokens'] = df_text['lower_text'].apply(tokenize)
df_text.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_text['tokens'] = df_text['lower_text'].apply(tokenize)


Unnamed: 0,text,lower_text,tokens
0,@115712 I understand. I would like to assist y...,understand would like assist would need get pr...,"[understand, would, like, assist, would, need,..."
1,@sprintcare and how do you propose we do that,sprintcare propose,"[sprintcare, propose]"
2,@sprintcare I have sent several private messag...,sprintcare sent several private messages one r...,"[sprintcare, sent, several, private, messages,..."
3,@115712 Please send us a Private Message so th...,please send us private message assist click ‘m...,"[please, send, us, private, message, assist, c..."
4,@sprintcare I did.,sprintcare,[sprintcare]


Using NLTK

In [27]:
nltk.download("punkt")

[nltk_data] Downloading package punkt to /Users/kaushiki./nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [28]:
def tokenize_nltk(text):
    return nltk.word_tokenize(text)
df_text['NLTK_tokens'] = df_text['lower_text'].apply(tokenize_nltk)
df_text.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_text['NLTK_tokens'] = df_text['lower_text'].apply(tokenize_nltk)


Unnamed: 0,text,lower_text,tokens,NLTK_tokens
0,@115712 I understand. I would like to assist y...,understand would like assist would need get pr...,"[understand, would, like, assist, would, need,...","[understand, would, like, assist, would, need,..."
1,@sprintcare and how do you propose we do that,sprintcare propose,"[sprintcare, propose]","[sprintcare, propose]"
2,@sprintcare I have sent several private messag...,sprintcare sent several private messages one r...,"[sprintcare, sent, several, private, messages,...","[sprintcare, sent, several, private, messages,..."
3,@115712 Please send us a Private Message so th...,please send us private message assist click ‘m...,"[please, send, us, private, message, assist, c...","[please, send, us, private, message, assist, c..."
4,@sprintcare I did.,sprintcare,[sprintcare],[sprintcare]


Sentence tokenization using NLTK

In [29]:
def sent_tokenize_nltk(text):
    return sent_tokenize(text)
df_text['Sent_tokens'] = df_text['lower_text'].apply(sent_tokenize_nltk)
df_text.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_text['Sent_tokens'] = df_text['lower_text'].apply(sent_tokenize_nltk)


Unnamed: 0,text,lower_text,tokens,NLTK_tokens,Sent_tokens
0,@115712 I understand. I would like to assist y...,understand would like assist would need get pr...,"[understand, would, like, assist, would, need,...","[understand, would, like, assist, would, need,...",[understand would like assist would need get p...
1,@sprintcare and how do you propose we do that,sprintcare propose,"[sprintcare, propose]","[sprintcare, propose]",[sprintcare propose]
2,@sprintcare I have sent several private messag...,sprintcare sent several private messages one r...,"[sprintcare, sent, several, private, messages,...","[sprintcare, sent, several, private, messages,...",[sprintcare sent several private messages one ...
3,@115712 Please send us a Private Message so th...,please send us private message assist click ‘m...,"[please, send, us, private, message, assist, c...","[please, send, us, private, message, assist, c...",[please send us private message assist click ‘...
4,@sprintcare I did.,sprintcare,[sprintcare],[sprintcare],[sprintcare]


# Spell Check

In [None]:
# Function to correct spelling using TextBlob
def correct_spelling(words):
    #words = word_tokenize(text)
    corrected_words = [str(TextBlob(word).correct()) for word in words]
    corrected_text = ' '.join(corrected_words)
    return corrected_text

# Function to apply spelling correction to a DataFrame column
def apply_spelling_correction(dataframe, column):
    dataframe[column] = dataframe[column].apply(correct_spelling)
    return dataframe

# Applying spelling correction to the DataFrame
df_text['lower_text'] = apply_spelling_correction(df_text, 'NLTK_tokens')

# Stemming

In [30]:
stemmer = PorterStemmer()
def stem_words(text):
    return " ".join([stemmer.stem(word) for word in text.split()])

df_text["stemmed_text"] = df_text["lower_text"].apply(lambda text: stem_words(text))
df_text.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_text["stemmed_text"] = df_text["lower_text"].apply(lambda text: stem_words(text))


Unnamed: 0,text,lower_text,tokens,NLTK_tokens,Sent_tokens,stemmed_text
0,@115712 I understand. I would like to assist y...,understand would like assist would need get pr...,"[understand, would, like, assist, would, need,...","[understand, would, like, assist, would, need,...",[understand would like assist would need get p...,understand would like assist would need get pr...
1,@sprintcare and how do you propose we do that,sprintcare propose,"[sprintcare, propose]","[sprintcare, propose]",[sprintcare propose],sprintcar propos
2,@sprintcare I have sent several private messag...,sprintcare sent several private messages one r...,"[sprintcare, sent, several, private, messages,...","[sprintcare, sent, several, private, messages,...",[sprintcare sent several private messages one ...,sprintcar sent sever privat messag one respond...
3,@115712 Please send us a Private Message so th...,please send us private message assist click ‘m...,"[please, send, us, private, message, assist, c...","[please, send, us, private, message, assist, c...",[please send us private message assist click ‘...,pleas send us privat messag assist click ‘mess...
4,@sprintcare I did.,sprintcare,[sprintcare],[sprintcare],[sprintcare],sprintcar


# Lemmatization

In [31]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/kaushiki./nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [32]:
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/kaushiki./nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [33]:
lemmatizer = WordNetLemmatizer()
def lemmatize_words(text):
    return " ".join([lemmatizer.lemmatize(word) for word in text.split()])
df_text["lemmatized_text"] = df_text["lower_text"].apply(lambda text: lemmatize_words(text))
df_text.head()

Unnamed: 0,text,lower_text,tokens,NLTK_tokens,Sent_tokens,stemmed_text,lemmatized_text
0,@115712 I understand. I would like to assist y...,understand would like assist would need get pr...,"[understand, would, like, assist, would, need,...","[understand, would, like, assist, would, need,...",[understand would like assist would need get p...,understand would like assist would need get pr...,understand would like assist would need get pr...
1,@sprintcare and how do you propose we do that,sprintcare propose,"[sprintcare, propose]","[sprintcare, propose]",[sprintcare propose],sprintcar propos,sprintcare propose
2,@sprintcare I have sent several private messag...,sprintcare sent several private messages one r...,"[sprintcare, sent, several, private, messages,...","[sprintcare, sent, several, private, messages,...",[sprintcare sent several private messages one ...,sprintcar sent sever privat messag one respond...,sprintcare sent several private message one re...
3,@115712 Please send us a Private Message so th...,please send us private message assist click ‘m...,"[please, send, us, private, message, assist, c...","[please, send, us, private, message, assist, c...",[please send us private message assist click ‘...,pleas send us privat messag assist click ‘mess...,please send u private message assist click ‘me...
4,@sprintcare I did.,sprintcare,[sprintcare],[sprintcare],[sprintcare],sprintcar,sprintcare


# POS Tagging

In [34]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/kaushiki./nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [35]:
def pos_tagging(tokens):
    # Perform POS tagging
    tagged_tokens = pos_tag(tokens)
    
    return tagged_tokens

# Apply the POS tagging function to the DataFrame
df_text['pos_tags'] = df_text['NLTK_tokens'].apply(pos_tagging)
df_text.head()

Unnamed: 0,text,lower_text,tokens,NLTK_tokens,Sent_tokens,stemmed_text,lemmatized_text,pos_tags
0,@115712 I understand. I would like to assist y...,understand would like assist would need get pr...,"[understand, would, like, assist, would, need,...","[understand, would, like, assist, would, need,...",[understand would like assist would need get p...,understand would like assist would need get pr...,understand would like assist would need get pr...,"[(understand, RB), (would, MD), (like, VB), (a..."
1,@sprintcare and how do you propose we do that,sprintcare propose,"[sprintcare, propose]","[sprintcare, propose]",[sprintcare propose],sprintcar propos,sprintcare propose,"[(sprintcare, NN), (propose, NN)]"
2,@sprintcare I have sent several private messag...,sprintcare sent several private messages one r...,"[sprintcare, sent, several, private, messages,...","[sprintcare, sent, several, private, messages,...",[sprintcare sent several private messages one ...,sprintcar sent sever privat messag one respond...,sprintcare sent several private message one re...,"[(sprintcare, NN), (sent, VBD), (several, JJ),..."
3,@115712 Please send us a Private Message so th...,please send us private message assist click ‘m...,"[please, send, us, private, message, assist, c...","[please, send, us, private, message, assist, c...",[please send us private message assist click ‘...,pleas send us privat messag assist click ‘mess...,please send u private message assist click ‘me...,"[(please, VB), (send, VB), (us, PRP), (private..."
4,@sprintcare I did.,sprintcare,[sprintcare],[sprintcare],[sprintcare],sprintcar,sprintcare,"[(sprintcare, NN)]"


# Sentiment Analysis

In [36]:
# Function to perform sentiment analysis using TextBlob
def analyze_sentiment(text):
    analysis = TextBlob(str(text))
    # Classify the polarity as positive, negative, or neutral
    if analysis.sentiment.polarity > 0:
        return 'Positive'
    elif analysis.sentiment.polarity < 0:
        return 'Negative'
    else:
        return 'Neutral'

# Apply the sentiment analysis function to the 'text' column and create a new column 'sentiment'
df_text['sentiment'] = df_text['lower_text'].apply(analyze_sentiment)
df_text.head()

Unnamed: 0,text,lower_text,tokens,NLTK_tokens,Sent_tokens,stemmed_text,lemmatized_text,pos_tags,sentiment
0,@115712 I understand. I would like to assist y...,understand would like assist would need get pr...,"[understand, would, like, assist, would, need,...","[understand, would, like, assist, would, need,...",[understand would like assist would need get p...,understand would like assist would need get pr...,understand would like assist would need get pr...,"[(understand, RB), (would, MD), (like, VB), (a...",Neutral
1,@sprintcare and how do you propose we do that,sprintcare propose,"[sprintcare, propose]","[sprintcare, propose]",[sprintcare propose],sprintcar propos,sprintcare propose,"[(sprintcare, NN), (propose, NN)]",Neutral
2,@sprintcare I have sent several private messag...,sprintcare sent several private messages one r...,"[sprintcare, sent, several, private, messages,...","[sprintcare, sent, several, private, messages,...",[sprintcare sent several private messages one ...,sprintcar sent sever privat messag one respond...,sprintcare sent several private message one re...,"[(sprintcare, NN), (sent, VBD), (several, JJ),...",Negative
3,@115712 Please send us a Private Message so th...,please send us private message assist click ‘m...,"[please, send, us, private, message, assist, c...","[please, send, us, private, message, assist, c...",[please send us private message assist click ‘...,pleas send us privat messag assist click ‘mess...,please send u private message assist click ‘me...,"[(please, VB), (send, VB), (us, PRP), (private...",Positive
4,@sprintcare I did.,sprintcare,[sprintcare],[sprintcare],[sprintcare],sprintcar,sprintcare,"[(sprintcare, NN)]",Neutral


# One-Hot Encoding

Since the dataset is very large, it is not feasible to perform one-hot encoding, bow, bi-grams, tri-grams and tf-idf  for whole dataset. So, we will do it for first 10 rows only.

In [52]:
df_ten = df_text.head(10)

In [53]:
df_ten['lower_text'].nunique()

10

In [54]:
df_ten

Unnamed: 0,text,lower_text
0,@115712 I understand. I would like to assist y...,understand would like assist would need get pr...
1,@sprintcare and how do you propose we do that,sprintcare propose
2,@sprintcare I have sent several private messag...,sprintcare sent several private messages one r...
3,@115712 Please send us a Private Message so th...,please send us private message assist click ‘m...
4,@sprintcare I did.,sprintcare
5,@115712 Can you please send us a private messa...,please send us private message gain details ac...
6,@sprintcare is the worst customer service,sprintcare worst customer service
7,@115713 This is saddening to hear. Please shoo...,saddening hear please shoot us dm look kc
8,@sprintcare You gonna magically change your co...,sprintcare gonna magically change connectivity...
9,@115713 We understand your concerns and we'd l...,understand concerns wed like please send us di...


In [55]:
df_ohe = pd.get_dummies(df_ten, columns = ['lower_text']) 

In [56]:
df_ohe

Unnamed: 0,text,lower_text_please send us private message assist click ‘message’ top profile,lower_text_please send us private message gain details account,lower_text_saddening hear please shoot us dm look kc,lower_text_sprintcare,lower_text_sprintcare gonna magically change connectivity whole family :lying_face: :hundred_points:,lower_text_sprintcare propose,lower_text_sprintcare sent several private messages one responding usual,lower_text_sprintcare worst customer service,lower_text_understand concerns wed like please send us direct message assist aa,lower_text_understand would like assist would need get private secured link assist
0,@115712 I understand. I would like to assist y...,0,0,0,0,0,0,0,0,0,1
1,@sprintcare and how do you propose we do that,0,0,0,0,0,1,0,0,0,0
2,@sprintcare I have sent several private messag...,0,0,0,0,0,0,1,0,0,0
3,@115712 Please send us a Private Message so th...,1,0,0,0,0,0,0,0,0,0
4,@sprintcare I did.,0,0,0,1,0,0,0,0,0,0
5,@115712 Can you please send us a private messa...,0,1,0,0,0,0,0,0,0,0
6,@sprintcare is the worst customer service,0,0,0,0,0,0,0,1,0,0
7,@115713 This is saddening to hear. Please shoo...,0,0,1,0,0,0,0,0,0,0
8,@sprintcare You gonna magically change your co...,0,0,0,0,1,0,0,0,0,0
9,@115713 We understand your concerns and we'd l...,0,0,0,0,0,0,0,0,1,0


# Bag-of-Words(BOW)

In [57]:
# Initialize CountVectorizer
vectorizer = CountVectorizer()

# Fit and transform the 'Text' column to obtain the BOW representation
bow_matrix = vectorizer.fit_transform(df_ten['lower_text'])

# Create a DataFrame from the BOW matrix
bow_df = pd.DataFrame(bow_matrix.toarray(), columns=vectorizer.get_feature_names_out())

# Concatenate the original DataFrame with the BOW DataFrame
df_bow = pd.concat([df_ten, bow_df], axis=1)

In [58]:
df_bow

Unnamed: 0,text,lower_text,aa,account,assist,change,click,concerns,connectivity,customer,...,shoot,sprintcare,top,understand,us,usual,wed,whole,worst,would
0,@115712 I understand. I would like to assist y...,understand would like assist would need get pr...,0,0,2,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,2
1,@sprintcare and how do you propose we do that,sprintcare propose,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,@sprintcare I have sent several private messag...,sprintcare sent several private messages one r...,0,0,0,0,0,0,0,0,...,0,1,0,0,0,1,0,0,0,0
3,@115712 Please send us a Private Message so th...,please send us private message assist click ‘m...,0,0,1,0,1,0,0,0,...,0,0,1,0,1,0,0,0,0,0
4,@sprintcare I did.,sprintcare,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
5,@115712 Can you please send us a private messa...,please send us private message gain details ac...,0,1,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
6,@sprintcare is the worst customer service,sprintcare worst customer service,0,0,0,0,0,0,0,1,...,0,1,0,0,0,0,0,0,1,0
7,@115713 This is saddening to hear. Please shoo...,saddening hear please shoot us dm look kc,0,0,0,0,0,0,0,0,...,1,0,0,0,1,0,0,0,0,0
8,@sprintcare You gonna magically change your co...,sprintcare gonna magically change connectivity...,0,0,0,1,0,0,1,0,...,0,1,0,0,0,0,0,1,0,0
9,@115713 We understand your concerns and we'd l...,understand concerns wed like please send us di...,1,0,1,0,0,1,0,0,...,0,0,0,1,1,0,1,0,0,0


# Bi-Grams

In [59]:
# Function to tokenize and create bigrams
def create_bigrams(text):
    tokens = word_tokenize(text.lower())  # You can customize tokenization based on your needs
    bigrams = list(ngrams(tokens, 2))  # Adjust the number '2' to create n-grams of different sizes
    return bigrams

# Apply the function to the 'text' column and create a new column 'bigrams'
df_ten['bigrams'] = df_ten['lower_text'].apply(create_bigrams)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ten['bigrams'] = df_ten['lower_text'].apply(create_bigrams)


# Tri-Grams

In [60]:
# Function to tokenize and create trigrams
def create_trigrams(text):
    tokens = word_tokenize(text.lower()) 
    trigrams = list(ngrams(tokens, 3))  
    return trigrams

# Apply the function to the 'text' column and create a new column 'bigrams'
df_ten['trigrams'] = df_ten['lower_text'].apply(create_trigrams)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ten['trigrams'] = df_ten['lower_text'].apply(create_trigrams)


# TF-IDF

In [61]:
# Create a TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the 'text' column to get the TF-IDF matrix
tfidf_matrix = tfidf_vectorizer.fit_transform(df_ten['lower_text'])

# Convert the TF-IDF matrix to a DataFrame
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

# Concatenate the original DataFrame with the TF-IDF DataFrame
result_df = pd.concat([df_ten, tfidf_df], axis=1)

In [62]:
result_df

Unnamed: 0,text,lower_text,bigrams,trigrams,aa,account,assist,change,click,concerns,...,shoot,sprintcare,top,understand,us,usual,wed,whole,worst,would
0,@115712 I understand. I would like to assist y...,understand would like assist would need get pr...,"[(understand, would), (would, like), (like, as...","[(understand, would, like), (would, like, assi...",0.0,0.0,0.427702,0.0,0.0,0.0,...,0.0,0.0,0.0,0.244434,0.0,0.0,0.0,0.0,0.0,0.575077
1,@sprintcare and how do you propose we do that,sprintcare propose,"[(sprintcare, propose)]",[],0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.510584,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,@sprintcare I have sent several private messag...,sprintcare sent several private messages one r...,"[(sprintcare, sent), (sent, several), (several...","[(sprintcare, sent, several), (sent, several, ...",0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.22789,0.0,0.0,0.0,0.383769,0.0,0.0,0.0,0.0
3,@115712 Please send us a Private Message so th...,please send us private message assist click ‘m...,"[(please, send), (send, us), (us, private), (p...","[(please, send, us), (send, us, private), (us,...",0.0,0.0,0.26924,0.0,0.362013,0.0,...,0.0,0.0,0.362013,0.0,0.239373,0.0,0.0,0.0,0.0,0.0
4,@sprintcare I did.,sprintcare,[],[],0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,@115712 Can you please send us a private messa...,please send us private message gain details ac...,"[(please, send), (send, us), (us, private), (p...","[(please, send, us), (send, us, private), (us,...",0.0,0.429619,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.284076,0.0,0.0,0.0,0.0,0.0
6,@sprintcare is the worst customer service,sprintcare worst customer service,"[(sprintcare, worst), (worst, customer), (cust...","[(sprintcare, worst, customer), (worst, custom...",0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.324312,0.0,0.0,0.0,0.0,0.0,0.0,0.546145,0.0
7,@115713 This is saddening to hear. Please shoo...,saddening hear please shoot us dm look kc,"[(saddening, hear), (hear, please), (please, s...","[(saddening, hear, please), (hear, please, sho...",0.0,0.0,0.0,0.0,0.0,0.0,...,0.3814,0.0,0.0,0.0,0.252193,0.0,0.0,0.0,0.0,0.0
8,@sprintcare You gonna magically change your co...,sprintcare gonna magically change connectivity...,"[(sprintcare, gon), (gon, na), (na, magically)...","[(sprintcare, gon, na), (gon, na, magically), ...",0.0,0.0,0.0,0.34601,0.0,0.0,...,0.0,0.205468,0.0,0.0,0.0,0.0,0.0,0.34601,0.0,0.0
9,@115713 We understand your concerns and we'd l...,understand concerns wed like please send us di...,"[(understand, concerns), (concerns, wed), (wed...","[(understand, concerns, wed), (concerns, wed, ...",0.354015,0.0,0.263291,0.0,0.0,0.354015,...,0.0,0.0,0.0,0.300945,0.234085,0.0,0.354015,0.0,0.0,0.0


# Five Advantages and Disadvantages 

# OHE

Advantages:

1. Intuitive Representation: One-hot encoding provides a straightforward and intuitive representation of categorical variables, where each category is uniquely represented by a binary value (0 or 1).
2. Compatibility with Algorithms: Many machine learning algorithms, especially those based on linear algebra and numerical computations, work well with one-hot encoded data, making it compatible with a wide range of models.
3. Prevention of Implicit Ordinal Relationships: One-hot encoding avoids introducing implicit ordinal relationships among categories, which can be a concern when using label encoding for categorical variables.
4. Handling of Missing Data: One-hot encoding handles missing data gracefully. If a value is missing for a particular category, all corresponding one-hot encoded values for that category will be zero, indicating the absence of that category.
5. Improved Model Performance: In certain scenarios, using one-hot encoding can lead to improved model performance, especially when dealing with categorical variables that do not have a natural ordinal relationship.

Disadvantages:

1. Increased Dimensionality: One-hot encoding can significantly increase the dimensionality of the dataset, leading to a sparse matrix with many zero values, which might impact the efficiency and storage requirements of the model.
2. Curse of Dimensionality: The increased dimensionality can contribute to the "curse of dimensionality," leading to increased computational complexity and potentially requiring more data to train models effectively.
3. Correlated Features: One-hot encoding introduces perfect correlation among the encoded features, which can be problematic for certain algorithms that assume feature independence.
4. Loss of Semantics: One-hot encoding discards any inherent semantic information in the order or proximity of categories, treating all categories as equally dissimilar.
5. Ineffective for High Cardinality: When dealing with categorical variables with high cardinality (a large number of unique categories), one-hot encoding can become impractical and lead to excessive memory usage and computational overhead.

# N-Grams

Advantages:

1. Capture Contextual Information: N-grams, especially bigrams and trigrams, help capture contextual information by considering the relationships between adjacent words, providing a more nuanced understanding of language.

2. Feature Extraction for Text Analysis: N-grams are widely used in natural language processing tasks such as text classification, sentiment analysis, and machine translation, serving as valuable features for these applications.

3. Improved Language Models: N-grams are foundational in building language models, contributing to better predictions and more accurate representations of language structures, which is essential for various NLP tasks.

4. Flexible Representation: N-grams can be customized to different sizes (unigrams, bigrams, trigrams, etc.), allowing flexibility in adapting to specific linguistic patterns or contextual nuances within a given dataset or language.

5. Efficient Computation: In some cases, the computation of n-grams can be more efficient than considering all possible combinations of words, especially when dealing with large datasets, making them suitable for scalable language processing tasks.

Disadvantages:

1. Data Sparsity: For larger n-grams (e.g., 4-grams or higher), data sparsity becomes a challenge as the occurrences of specific combinations of words decrease, leading to less reliable frequency estimates and potentially compromising the model's effectiveness.

2. Loss of Word Order: N-grams may lose the exact word order in longer sequences, potentially overlooking the nuanced meaning conveyed by the specific arrangement of words.

3. Increased Dimensionality: As the size of n-grams increases, the dimensionality of the feature space grows, which can lead to increased computational complexity and memory requirements, especially in high-dimensional spaces.

4. Overfitting Issues: Including higher-order n-grams may result in overfitting, especially when the dataset is limited. Models might capture noise or specific patterns that are not generalizable to unseen data.

5. Difficulty Handling Out-of-Vocabulary Words: N-grams struggle to handle out-of-vocabulary words or unseen combinations, which can be problematic when dealing with evolving vocabularies or domain-specific terminology.

# TF-IDF

Advantages:

1. Term Importance: Highlights important terms by assigning higher weights to terms that are frequent in a document but rare across the entire corpus.
2. Simple and Efficient: Easy to implement and computationally efficient for large datasets.
3. Versatility: Applicable to various natural language processing tasks such as document retrieval, clustering, and classification.
4. Reduced Sensitivity to Document Length: Normalizes term frequencies based on document length, making it less sensitive to the length of documents.
5. Interpretability: Provides a numerical representation of document content, aiding in the interpretability of the underlying textual data.

Disadvantages:

1. Bag-of-Words Limitation: Ignores word order and semantics, treating documents as unordered collections of words.
2. Vocabulary Size: Can result in large and sparse matrices for extensive vocabularies, requiring significant memory resources.
3. Ignores Document Structure: Fails to capture syntactic or structural information within documents.
4. Sensitivity to Stop Words: May be sensitive to the presence of common stop words, impacting the effectiveness of the method.
5. Lack of Semantic Understanding: Lacks true semantic understanding of language, making it less effective for tasks requiring deeper language comprehension.






# BOW

Advantages:

1. Simplicity and Efficiency
2. Versatility
3. Independence of Word Order
4. Feature Representation
5. Vocabulary Control

Disadvantages:

1. Loss of Sequence Information
2. Lack of Semantic Understanding
3. Fixed-Length Representation
4. Vocabulary Size
5. Sensitivity to Stop Words
