# **Preprocessing**

## **Mounting Google Drive and importing required libraries**

In [42]:
# Mount Google Drive at /content/drive so the CSV paths used below resolve.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [43]:
import numpy as np
import pandas as pd
import re
import nltk
import spacy
import string
# Setting this option to None switches off pandas' chained-assignment
# (SettingWithCopy) warning for the whole session.
pd.options.mode.chained_assignment = None


## **Reading the dataset**

In [44]:
# Load the comments CSV from the mounted Drive folder into a DataFrame.
df= pd.read_csv("/content/drive/MyDrive/all_comments.csv")
# Bare last expression: the notebook renders the frame as this cell's output.
df

Unnamed: 0,Comment
0,"Shut up plebs, dont argue with the $cience, no..."
1,"Sea level rise is already baked in, water fron..."
2,"And remember, it's your fault,with your soggy ..."
3,I never heard so much bullshit in my life…a re...
4,Now what?
...,...
12349,"In 2022, soylent green will be available. So I..."
12350,"The Earth is Warming at an alarming rate, Glob..."
12351,an idiot show to proof their climate change pr...
12352,kids got this👏👏. 6 years old Triplets who pick...


## **Converting the comment column to string**

In [45]:
# Drop rows whose comment is missing BEFORE casting to str: astype(str) turns
# NaN into the literal text "nan", which later null checks (dropna / notna)
# cannot recognise, so missing comments would survive as the word "nan".
df = df.dropna(subset=["Comment"])
df["Comment"] = df["Comment"].astype(str)    ##convert to string

## **Filter out comments that are not primarily in English**

In [46]:
# Install the library if not already installed
!pip install langdetect

# Import the language detection function
from langdetect import detect

def filter_english_comments(df, english_threshold=0.7, detector=None):
    """
    Keep the rows of ``df`` whose 'Comment' text is judged to be English.

    A comment is kept when the language detector labels it 'en'. Otherwise it
    is still kept when the share of ASCII characters in it is at least
    ``english_threshold``. Note that this fallback measures ASCII content, not
    English content, so text in other Latin-script languages usually passes.

    The decision is made row by row with a boolean mask, so it does not depend
    on whether some other row happens to carry the same text.

    Args:
        df (pd.DataFrame): Frame with a 'Comment' column.
        english_threshold (float): Minimum ASCII-character ratio for comments
            the detector does not label as English.
        detector (callable, optional): Function mapping a text to a language
            code. Defaults to the ``detect`` function imported by this notebook.

    Returns:
        pd.DataFrame: The rows of ``df`` that passed, with their original index.
    """
    if detector is None:
        detector = detect

    def is_english(comment):
        try:
            if detector(comment) == 'en':
                return True
            # Fallback for mixed-language comments: proportion of ASCII characters.
            ascii_count = sum(1 for char in comment if char.isascii())
            return ascii_count / len(comment) >= english_threshold
        except Exception:
            # Detection failures (e.g. empty or non-text input) drop the row.
            # Unlike a bare ``except``, KeyboardInterrupt still propagates.
            return False

    keep_mask = df['Comment'].map(is_english).astype(bool)
    return df[keep_mask]

# Run the language filter with its default threshold; the result is a new
# frame selected from df, which keeps the original row labels.
english_comments_df = filter_english_comments(df)




In [47]:
# Display the filtered frame as the cell output.
english_comments_df

Unnamed: 0,Comment
0,"Shut up plebs, dont argue with the $cience, no..."
1,"Sea level rise is already baked in, water fron..."
2,"And remember, it's your fault,with your soggy ..."
3,I never heard so much bullshit in my life…a re...
4,Now what?
...,...
12349,"In 2022, soylent green will be available. So I..."
12350,"The Earth is Warming at an alarming rate, Glob..."
12351,an idiot show to proof their climate change pr...
12352,kids got this👏👏. 6 years old Triplets who pick...


## **Drop the empty comments**

In [48]:
# Discard rows that contain missing values, then renumber the index from 0
english_comments_df = english_comments_df.dropna().reset_index(drop=True)

english_comments_df


Unnamed: 0,Comment
0,"Shut up plebs, dont argue with the $cience, no..."
1,"Sea level rise is already baked in, water fron..."
2,"And remember, it's your fault,with your soggy ..."
3,I never heard so much bullshit in my life…a re...
4,Now what?
...,...
12086,"In 2022, soylent green will be available. So I..."
12087,"The Earth is Warming at an alarming rate, Glob..."
12088,an idiot show to proof their climate change pr...
12089,kids got this👏👏. 6 years old Triplets who pick...


In [49]:
# Work on an independent copy. A plain assignment would only alias
# english_comments_df, so every column update applied to df1 afterwards would
# also rewrite the filtered frame that earlier cells displayed.
df1 = english_comments_df.copy()
df1

Unnamed: 0,Comment
0,"Shut up plebs, dont argue with the $cience, no..."
1,"Sea level rise is already baked in, water fron..."
2,"And remember, it's your fault,with your soggy ..."
3,I never heard so much bullshit in my life…a re...
4,Now what?
...,...
12086,"In 2022, soylent green will be available. So I..."
12087,"The Earth is Warming at an alarming rate, Glob..."
12088,an idiot show to proof their climate change pr...
12089,kids got this👏👏. 6 years old Triplets who pick...


## **Lowercasing**

In [50]:
#Lowercasing
# Uses the vectorised .str accessor; the 'Comment' column is overwritten with
# its lowercase form.
df1["Comment"] = df1["Comment"].str.lower()
df1.head()


Unnamed: 0,Comment
0,"shut up plebs, dont argue with the $cience, no..."
1,"sea level rise is already baked in, water fron..."
2,"and remember, it's your fault,with your soggy ..."
3,i never heard so much bullshit in my life…a re...
4,now what?


In [51]:
# NOTE: punctuation removal is switched off. Everything below sits inside a
# bare triple-quoted string, so none of it runs; the cell only echoes the string.
"""
#removal of punctuation
PUNCT_TO_REMOVE = string.punctuation
def remove_punctuation(text):
    custom function to remove the punctuation
    return text.translate(str.maketrans('', '', PUNCT_TO_REMOVE))

df1["Comment"] = df1["Comment"].apply(lambda text: remove_punctuation(text))
df1.head()
"""


'\n#removal of punctuation\nPUNCT_TO_REMOVE = string.punctuation\ndef remove_punctuation(text):\n    custom function to remove the punctuation\n    return text.translate(str.maketrans(\'\', \'\', PUNCT_TO_REMOVE))\n\ndf1["Comment"] = df1["Comment"].apply(lambda text: remove_punctuation(text))\ndf1.head()\n'

## **Lemmatization**

In [52]:
##lemmatization
# Fetch the WordNet corpus; the recorded log shows it reports "already
# up-to-date" when the package is present.
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

# One shared lemmatizer instance, reused by lemmatize_words for every comment.
lemmatizer = WordNetLemmatizer()
def lemmatize_words(text):
    """Lemmatize each whitespace-separated token of ``text`` and rejoin the tokens with single spaces."""
    tokens = text.split()
    return " ".join(map(lemmatizer.lemmatize, tokens))

# Lemmatize every comment; the function is passed directly, no wrapper lambda needed
df1["Comment"] = df1["Comment"].apply(lemmatize_words)
df1.head()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,Comment
0,"shut up plebs, dont argue with the $cience, no..."
1,"sea level rise is already baked in, water fron..."
2,"and remember, it's your fault,with your soggy ..."
3,i never heard so much bullshit in my life…a re...
4,now what?


## **Convert emojis and emoticons into text**

In [53]:
import pandas as pd
import re
import unicodedata

# Emoticons and the word each one is replaced with.
_EMOTICON_WORDS = {
    ":)": "Happy",
    ":-)": "Happy",
    ":D": "Happy",
    ":(": "Sad",
    ":'(": "Sad",
    ":/": "Confused",
    ";)": "Wink",
    "<3": "Heart",
    ":P": "Tongue",
    ":p": "Tongue",
    ":|": "Neutral",
    ":O": "Surprise",
    ":o": "Surprise",
    ":*": "Kiss",
}

# Lookup keyed by the lowercased emoticon, so ":D" and ":d" resolve alike.
# This matters because the comments are lowercased before this step runs.
_EMOTICON_LOOKUP = {emoticon.lower(): word for emoticon, word in _EMOTICON_WORDS.items()}


def _emoticon_regex(emoticon):
    """Return the regex fragment that matches one emoticon without false hits."""
    fragment = re.escape(emoticon)
    if emoticon == ":/":
        # Never fire on the "://" of a URL scheme; doing so turns
        # "https://..." into "httpsConfused/..." and hides the URL from
        # the later URL-removal step.
        fragment += r"(?!/)"
    elif emoticon[-1].isalpha():
        # Letter-ending emoticons must not run into a word ("5:pm", "re:order").
        fragment += r"(?!\w)"
    return fragment


# Built once at definition time instead of on every call.
_EMOTICON_PATTERN = re.compile(
    "|".join(_emoticon_regex(emoticon) for emoticon in _EMOTICON_LOOKUP),
    re.IGNORECASE,
)


def convert_emoticons_to_text(text):
    """
    Convert emoticons to their textual representation.

    Matching is case-insensitive. The ":/" emoticon is ignored when it is the
    start of "://", and emoticons that end in a letter (":d", ":p", ":o") are
    ignored when a word character follows them.

    Args:
        text (str): Input text with emoticons.

    Returns:
        str: Text with each recognised emoticon replaced by its word
        (e.g. ":)" -> "Happy"); everything else is left unchanged.
    """
    return _EMOTICON_PATTERN.sub(
        lambda match: _EMOTICON_LOOKUP[match.group().lower()], text
    )


In [54]:
!pip install emoji
import emoji

def convert_emojis_to_text(text):
    """
    Replace the emojis in ``text`` with their textual names.

    Args:
        text (str): Input text that may contain emojis.

    Returns:
        str: The text exactly as returned by ``emoji.demojize``.
    """
    demojized = emoji.demojize(text)
    return demojized





In [55]:
# Run both converters over the 'Comment' column in df1: emoticons first, then emojis
for converter in (convert_emoticons_to_text, convert_emojis_to_text):
    df1['Comment'] = df1['Comment'].apply(converter)

# Print the DataFrame after conversion
print(df1)

                                                 Comment
0      shut up plebs, dont argue with the $cience, no...
1      sea level rise is already baked in, water fron...
2      and remember, it's your fault,with your soggy ...
3      i never heard so much bullshit in my life…a re...
4                                              now what?
...                                                  ...
12086  in 2022, soylent green will be available. so i...
12087  the earth is warming at an alarming rate, glob...
12088  an idiot show to proof their climate change pr...
12089  kid got this:clapping_hands::clapping_hands:. ...
12090  no! your fucking geoengineering program are ~ ...

[12091 rows x 1 columns]


In [56]:
# URL shape protected from the replacements below (the same shape the
# notebook's remove_urls step strips afterwards). The capturing group makes
# split() return the URLs alongside the surrounding text.
_URL_SPAN = re.compile(r'(https?://\S+|www\.\S+)')


def preprocess_text(text):
    """
    Replace "::", "_" and ":" with spaces, e.g. to turn demojized shortcodes
    such as ":clapping_hands:" into plain words.

    URLs are left untouched. Rewriting their ":" and "_" characters would
    split them into fragments ("https //...") that URL removal can no longer
    recognise as a single URL.

    Args:
        text (str): Input text.

    Returns:
        str: Text with the replacements applied outside of URLs.
    """
    pieces = _URL_SPAN.split(text)
    # With one capturing group, split() alternates plain text (even positions)
    # and matched URLs (odd positions); only the plain text is rewritten.
    for position in range(0, len(pieces), 2):
        pieces[position] = (
            pieces[position]
            .replace("::", " ")
            .replace("_", " ")
            .replace(":", " ")
        )
    return "".join(pieces)

# Apply preprocessing to the 'Comment' column in your DataFrame
# (element-wise via Series.apply; the result overwrites the column).
df1['Comment'] = df1['Comment'].apply(preprocess_text)
# Bare expression so the notebook displays the updated frame.
df1


Unnamed: 0,Comment
0,"shut up plebs, dont argue with the $cience, no..."
1,"sea level rise is already baked in, water fron..."
2,"and remember, it's your fault,with your soggy ..."
3,i never heard so much bullshit in my life…a re...
4,now what?
...,...
12086,"in 2022, soylent green will be available. so i..."
12087,"the earth is warming at an alarming rate, glob..."
12088,an idiot show to proof their climate change pr...
12089,kid got this clapping hands clapping hands . 6...


## **Remove HTML tags and URLs**

In [57]:
import re

def remove_html_tags(text):
    """
    Strip HTML tags from a string.

    Every span from a '<' up to the nearest following '>' on the same line is
    deleted (non-greedy match; '.' does not cross newlines).

    Args:
        text (str): Input text that may contain HTML tags.

    Returns:
        str: The text with those spans removed.
    """
    return re.sub(r'<.*?>', '', text)

def remove_urls(text):
    """
    Strip URLs from a string.

    A URL is any run of non-whitespace characters that starts with
    "http://", "https://" or "www.".

    Args:
        text (str): Input text that may contain URLs.

    Returns:
        str: The text with every such run deleted.
    """
    url_matcher = re.compile(r'https?://\S+|www\.\S+')
    return url_matcher.sub('', text)

# Strip HTML tags first, then URLs, from every entry of the 'Comment' column
df1['Comment'] = df1['Comment'].apply(remove_html_tags).apply(remove_urls)
df1

Unnamed: 0,Comment
0,"shut up plebs, dont argue with the $cience, no..."
1,"sea level rise is already baked in, water fron..."
2,"and remember, it's your fault,with your soggy ..."
3,i never heard so much bullshit in my life…a re...
4,now what?
...,...
12086,"in 2022, soylent green will be available. so i..."
12087,"the earth is warming at an alarming rate, glob..."
12088,an idiot show to proof their climate change pr...
12089,kid got this clapping hands clapping hands . 6...


## **Chat word conversion**

In [58]:
import pandas as pd

# Custom chat words dictionary
# Keys are lowercase abbreviations; expand_chat_words looks up each lowercased,
# whitespace-separated token here and substitutes the value when it is present.
chat_words_dict = {
    "lol": "laugh out loud",
    "brb": "be right back",
    "btw": "by the way",
    "idk": "I don't know",
    "imo": "in my opinion",
    "imho": "in my humble opinion",
    "omg": "oh my god",
    "rofl": "rolling on the floor laughing",
    "ttyl": "talk to you later",
    "wtf": "what the heck",
    "afk": "away from keyboard",
    "bfn": "bye for now",
    "gtg": "got to go",
    "lmao": "laughing my ass off",
    "np": "no problem",
    "ty": "thank you",
    "yw": "you're welcome",
    "bs": "bullshit",
    # NOTE(review): a lone "b" token also expands to "bullshit", so phrases such
    # as "plan b" get rewritten - confirm this entry is intentional.
    "b" : "bullshit"
}

# Function to expand chat words
def expand_chat_words(text):
    """
    Replace chat abbreviations found in ``chat_words_dict`` with their expansion.

    The text is split on whitespace and each token is looked up by its
    lowercase form. A token with punctuation attached (e.g. "lol,") is not a
    dictionary key and is therefore left as it is. Tokens are rejoined with
    single spaces.
    """
    return ' '.join(
        chat_words_dict.get(token.lower(), token) for token in text.split()
    )


# Apply the expand_chat_words function to the 'Comment' column in df1
df1['Comment'] = df1['Comment'].apply(expand_chat_words)

# Print the DataFrame after conversion
# (print gives the plain-text repr, which truncates long frames and long strings)
print(df1)



                                                 Comment
0      shut up plebs, dont argue with the $cience, no...
1      sea level rise is already baked in, water fron...
2      and remember, it's your fault,with your soggy ...
3      i never heard so much bullshit in my life…a re...
4                                              now what?
...                                                  ...
12086  in 2022, soylent green will be available. so i...
12087  the earth is warming at an alarming rate, glob...
12088  an idiot show to proof their climate change pr...
12089  kid got this clapping hands clapping hands . 6...
12090  no! your fucking geoengineering program are ~ ...

[12091 rows x 1 columns]


## **Spelling Correction**

In [None]:
!pip install autocorrect
from concurrent.futures import ProcessPoolExecutor
import pandas as pd
from autocorrect import Speller

# Initialize the spell checker
# (created once at module level; correct_spellings reuses this instance)
# NOTE(review): the recorded output shows ordinary words being altered
# ("soylent" -> "silent", "clapping" -> "clipping", "plebs" -> "pleas");
# confirm this step improves the data before relying on it.
spell = Speller(lang='en')

# Define the function to correct spellings
def correct_spellings(text):
    """Return ``text`` after passing it through the shared ``spell`` checker."""
    return spell(text)

# Define a function to process each batch
# Kept as a named top-level function: it is handed to ProcessPoolExecutor.map,
# which has to pickle the callable for the worker processes.
def process_batch(batch):
    """Apply correct_spellings to the 'Comment' column of one DataFrame slice and return the resulting Series."""
    return batch['Comment'].apply(correct_spellings)

# Define the number of processes to use
num_processes = 4  # You can adjust this based on your system's resources

# Split the DataFrame into smaller batches.
# Ceiling division floored at 1: with plain len(df1) // num_processes a frame
# shorter than num_processes yields batch_size 0 and range() raises ValueError.
# Rounding up also gives at most num_processes batches instead of an extra
# small remainder batch.
batch_size = max(1, -(-len(df1) // num_processes))
batches = [df1.iloc[i:i+batch_size] for i in range(0, len(df1), batch_size)]

# Process batches in parallel, collecting the results while the pool is open
# (executor.map returns a lazy iterator).
results = []
if batches:
    with ProcessPoolExecutor(max_workers=num_processes) as executor:
        results = list(executor.map(process_batch, batches))

# Combine the results; an empty frame has nothing to concatenate, and
# pd.concat raises on an empty list, so fall back to the existing column.
df_result = pd.concat(results) if results else df1['Comment']

# Update the 'Comment' column in df1 with the corrected values
# (assignment aligns on the index, which the iloc slices preserve)
df1['Comment'] = df_result

# Print the DataFrame after correction
print(df1)




                                                 Comment
0      shut up pleas, dont argue with the $science, n...
1      sea level rise is already baked in, water fron...
2      and remember, it's your fault,with your foggy ...
3      i never heard so much bullshit in my life…a re...
4                                              now what?
...                                                  ...
12086  in 2022, silent green will be available. so i ...
12087  the earth is warming at an alarming rate, glob...
12088  an idiot show to proof their climate change pr...
12089  kid got this clipping hands clipping hands . 6...
12090  no! your fucking geoengineering program are ~ ...

[12091 rows x 1 columns]


In [None]:
# Keep only the rows whose comment is not null and renumber the index from 0
df1 = df1.loc[df1['Comment'].notna()].reset_index(drop=True)

# Print the DataFrame after modification
df1


Unnamed: 0,Comment
0,"shut up pleas, dont argue with the $science, n..."
1,"sea level rise is already baked in, water fron..."
2,"and remember, it's your fault,with your foggy ..."
3,i never heard so much bullshit in my life…a re...
4,now what?
...,...
12086,"in 2022, silent green will be available. so i ..."
12087,"the earth is warming at an alarming rate, glob..."
12088,an idiot show to proof their climate change pr...
12089,kid got this clipping hands clipping hands . 6...


## **Saving the preprocessed dataset to Drive**

In [None]:

# Destination path on the mounted Drive for the processed comments.
file_path = "/content/drive/MyDrive/Pre_all_comments.csv"

# Save DataFrame as CSV
# index=False leaves the row index out of the written file.
df1.to_csv(file_path, index=False)