# 1. Preprocessing

### Reading Data

In [71]:
import nltk
import pandas as pd
import re
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

In [72]:
data=pd.read_csv("twcs.csv")

In [73]:
data.head(5)

Unnamed: 0,tweet_id,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id
0,1,sprintcare,False,Tue Oct 31 22:10:47 +0000 2017,@115712 I understand. I would like to assist y...,2.0,3.0
1,2,115712,True,Tue Oct 31 22:11:45 +0000 2017,@sprintcare and how do you propose we do that,,1.0
2,3,115712,True,Tue Oct 31 22:08:27 +0000 2017,@sprintcare I have sent several private messag...,1.0,4.0
3,4,sprintcare,False,Tue Oct 31 21:54:49 +0000 2017,@115712 Please send us a Private Message so th...,3.0,5.0
4,5,115712,True,Tue Oct 31 21:49:35 +0000 2017,@sprintcare I did.,4.0,6.0


In [74]:
data.shape

(2811774, 7)

In [75]:
data.columns

Index(['tweet_id', 'author_id', 'inbound', 'created_at', 'text',
       'response_tweet_id', 'in_response_to_tweet_id'],
      dtype='object')

### Lower Case

In [76]:
def lowerCase(text):
    text = text.lower()  # Convert to lowercase
    return text

In [77]:
data['text'] = data['text'].apply(lowerCase)

In [78]:
data.head(5)

Unnamed: 0,tweet_id,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id
0,1,sprintcare,False,Tue Oct 31 22:10:47 +0000 2017,@115712 i understand. i would like to assist y...,2.0,3.0
1,2,115712,True,Tue Oct 31 22:11:45 +0000 2017,@sprintcare and how do you propose we do that,,1.0
2,3,115712,True,Tue Oct 31 22:08:27 +0000 2017,@sprintcare i have sent several private messag...,1.0,4.0
3,4,sprintcare,False,Tue Oct 31 21:54:49 +0000 2017,@115712 please send us a private message so th...,3.0,5.0
4,5,115712,True,Tue Oct 31 21:49:35 +0000 2017,@sprintcare i did.,4.0,6.0


### Remove HTML Tags

In [79]:
def remove_html_tags(text):
    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()
    return text

In [80]:
data['text'] = data['text'].apply(remove_html_tags)



### Remove URLs

In [81]:
def remove_urls(text):
    # Remove URLs, mentions, and hashtags
    text = re.sub(r'https?://\S+|www\.\S+|@\w+|#\w+', '', text)
    # Convert to lowercase
    return text

In [82]:
data['text'] = data['text'].apply(remove_urls)

### Remove punctuations

In [83]:
import string

In [84]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [85]:
exclude=string.punctuation

In [86]:
print(exclude)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [87]:
def remove_punctuation(text):
    for char in exclude:
        text=text.replace(char,"")
    return text

In [88]:
data['text'] = data['text'].apply(remove_punctuation)

### Remove Stopwords

In [89]:
import nltk

In [90]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jatpradh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jatpradh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [91]:
stopwords.words("english")

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [92]:
def remove_stopwords_from_column(column):
    stop_words = set(stopwords.words('english'))
    return column.apply(lambda text: ' '.join([word for word in word_tokenize(text) if word.lower() not in stop_words]))

In [93]:
data['text'] = remove_stopwords_from_column(data['text'])

### Remove Whitespaces

In [94]:
def remove_white_spaces_from_column(column):
    return column.apply(lambda text: ' '.join(text.split()))

In [95]:
data['text'] = remove_white_spaces_from_column(data['text'])

### Spell Checking 

In [96]:
pip install pyspellchecker

Note: you may need to restart the kernel to use updated packages.


In [97]:
from spellchecker import SpellChecker

In [98]:
def spell_check_column(column):
    spell = SpellChecker()
    return column.apply(lambda text: ' '.join([spell.correction(word) if spell.correction(word) else word for word in text.split()]))

In [99]:
# data['text'] = spell_check_column(data['text'])

### Remove emojis

In [100]:
pip install emoji==1.7.0

Note: you may need to restart the kernel to use updated packages.


In [101]:
import emoji

In [102]:
def remove_emojis_from_column(column):
    return column.apply(lambda text: ''.join(char for char in text if not char in emoji.UNICODE_EMOJI))

In [103]:
# data['text'] = remove_emojis_from_column(data['text'])

In [104]:
import re
def remove_emojis_manually(text):
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F700-\U0001F77F"  # alchemical symbols
                               u"\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
                               u"\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
                               u"\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
                               u"\U0001FA00-\U0001FA6F"  # Chess Symbols
                               u"\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
                               u"\U00002702-\U000027B0"  # Dingbats
                               u"\U000024C2-\U0001F251" 
                               "]+", flags=re.UNICODE)
    clean_text = emoji_pattern.sub(r'', text)
    return clean_text

In [105]:
# data['text'] = remove_emojis_from_column(data['text'])

### Handle Chat Words

In [106]:
def replace_abbreviations(text, abbreviation_dict):
    for abbreviation, full_form in abbreviation_dict.items():
        text = text.replace(abbreviation, full_form)
    return text

def load_abbreviation_dict(file_path):
    abbreviation_dict = {}
    with open(file_path, 'r') as file:
        for line in file:
            # Check if the line contains the '=' character
            if '=' in line:
                abbreviation, full_form = line.strip().split('=')
                abbreviation_dict[abbreviation] = full_form
            else:
                print(f"Skipping line without '=': {line.strip()}")
    return abbreviation_dict

In [107]:
# Load abbreviation dictionary from the file
abbreviation_dict = load_abbreviation_dict('slang.txt')

Skipping line without '=': QPSA?	Que Pasa?
Skipping line without '=': TFW â€“ That feeling when. TFW internet slang often goes in a caption to an image.
Skipping line without '=': MFW â€“ My face when
Skipping line without '=': MRW â€“ My reaction when
Skipping line without '=': IFYP â€“ I feel your pain
Skipping line without '=': LOL â€“ Laughing out loud
Skipping line without '=': TNTL â€“ Trying not to laugh
Skipping line without '=': JK â€“ Just kidding
Skipping line without '=': IDC â€“ I donâ€™t care
Skipping line without '=': ILY â€“ I love you
Skipping line without '=': IMU â€“ I miss you
Skipping line without '=': ADIH â€“ Another day in hell
Skipping line without '=': IDC â€“ I donâ€™t care
Skipping line without '=': ZZZ â€“ Sleeping, bored, tired
Skipping line without '=': WYWH â€“ Wish you were here
Skipping line without '=': TIME â€“ Tears in my eyes
Skipping line without '=': BAE â€“ Before anyone else
Skipping line without '=': FIMH â€“ Forever in my heart
Skipping line 

In [108]:
# Apply the function to replace abbreviations in the 'text' column
data['text'] = data['text'].apply(lambda x: replace_abbreviations(x, abbreviation_dict))

### Lemmatization Handling

In [None]:
# Lemmatization function
def lemmatize_text(text):
    lemmatizer = WordNetLemmatizer()
    tokens = word_tokenize(text)
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(lemmatized_tokens)

In [None]:
# Assuming 'data' is your DataFrame
data['text'] = data['text'].apply(lemmatize_text)

### Stemming Handling

In [None]:
# Stemming
def stem_text(text):
    stemmer = PorterStemmer()
    tokens = word_tokenize(text)
    stemmed_tokens = [stemmer.stem(word) for word in tokens]
    return ' '.join(stemmed_tokens)

In [None]:
# Apply stemming to the 'text' column
data['stemmed_text'] = data['text'].apply(stem_text)

# 2.                       Encoding Techniques

### One hot Encoding

##### Array size is : (2444611, 2444611) . So One Hot Encoding is not possible

In [113]:
# 1. Perform One-Hot Encoding on the 'text' column
one_hot_encoding_df = pd.get_dummies(data['text'], drop_first=True)

MemoryError: Unable to allocate 5.44 TiB for an array with shape (2444611, 2444611) and data type uint8

In [None]:
# 2 . Display the DataFrame after one-hot encoding
print("\nDataFrame after One-Hot Encoding:")
print(one_hot_encoding_df)

### BOW ( Bag Of Words )

##### Memory Error due to huge size

In [115]:
from sklearn.feature_extraction.text import CountVectorizer

In [116]:
# 1. Perform Bag-of-Words (BOW) from the 'text' column
vectorizer = CountVectorizer()
bow_matrix = vectorizer.fit_transform(data['text'])

In [117]:
# 2 . Create a DataFrame from the BOW matrix
bow_df = pd.DataFrame(bow_matrix.toarray(), columns=vectorizer.get_feature_names_out())

MemoryError: Unable to allocate 8.87 TiB for an array with shape (2811774, 433677) and data type int64

In [None]:
# 3 . Display the DataFrame after BOW
print("\nDataFrame after Bag-of-Words (BOW):")
print(bow_df)

### N-Grams

#### Gives memory error

In [118]:
# 1. Perform N-grams (2-gram, 3-gram, 4-gram) using BOW
ngram_vectorizer = CountVectorizer(ngram_range=(2, 4))
ngram_matrix = ngram_vectorizer.fit_transform(data['text'])

In [120]:
# 2 . Get the vocabulary.
# ngram_matrix.vocabulary_
vocabulary = ngram_vectorizer.get_feature_names_out()

In [121]:
# 3 . Create a DataFrame from the N-gram matrix
ngram_df = pd.DataFrame(ngram_matrix.toarray(), columns=ngram_vectorizer.get_feature_names_out())

MemoryError: Unable to allocate 682. TiB for an array with shape (2811774, 33355942) and data type int64

In [122]:
# 4 . Display the DataFrame after N-grams
print("\nDataFrame after N-grams:")
print(ngram_df)


DataFrame after N-grams:


NameError: name 'ngram_df' is not defined

### tf-idf

#### Shows memory error

In [124]:
# 1. Perform TF-IDF on the 'text' column
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(data['text'])

# 2. Create a DataFrame from the TF-IDF matrix
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

# 3 . Display the DataFrame after TF-IDF
print("\nDataFrame after TF-IDF:")
print(tfidf_df)

MemoryError: Unable to allocate 8.87 TiB for an array with shape (2811774, 433677) and data type float64