# Imports 

In [1]:
import pandas as pd 

from sklearn.base import BaseEstimator, TransformerMixin
import string # punct
import emoji # for emoji
import csv # for slang
import re # regex
from nltk.corpus import stopwords # stopwords
from nltk.stem import PorterStemmer # stemming

# Load Data

In [2]:
# Read the pilot dataset and keep only the two columns used downstream:
# the raw tweet text and its label.
raw_path = r"C:\Users\HP\Documents\Year I Semester II\SL\Project\Group 7 Pilot Dataset.csv"
df = pd.read_csv(raw_path).loc[:, ['text', 'label']]

# Clean Text Class

In [3]:
class CleanText(BaseEstimator, TransformerMixin):
    '''
    Scikit-learn style transformer that cleans raw tweet text.

    Every public method takes one string and returns the processed string, so
    each step can also be used on its own. transform() applies all of them,
    in a fixed order, to every element of a pandas Series.

    Parameters
    ----------
    slang_path : str
        Path of the slang dictionary read by translator(). The file is parsed
        as "="-delimited rows: abbreviation in the first field, replacement
        phrase in the second.
    '''

    # Default location of the slang dictionary (kept for backward compatibility).
    DEFAULT_SLANG_PATH = r"C:\Users\HP\Documents\Year I Semester II\SL\Project\StatLearnProj-master\Iason\slang.txt"

    def __init__(self, slang_path=DEFAULT_SLANG_PATH):
        # scikit-learn convention: constructor arguments are stored unchanged.
        self.slang_path = slang_path

    def remove_mentions(self, input_text):
        '''
        Strip the mention marker "@" from the text.
        Only the "@" characters are removed; the handle itself is kept
        (Example: "@Mplamplampla" --> "Mplamplampla").
        NOTE(review): if the whole mention should disappear the pattern would
        need to be r'@\\w+'; left unchanged because later steps and saved
        results were produced with the current behaviour.
        '''
        return re.sub(r'@+', '', input_text)

    def remove_urls(self, input_text):
        '''
        Remove the urls mentioned in a tweet.
        First every space-separated token containing ".com" is dropped, then
        anything starting with "http://" / "https://" is removed up to (and
        including) the next whitespace character.
        '''
        input_text = ' '.join([w for w in input_text.split(' ') if '.com' not in w])
        return re.sub(r'http.?://[^\s]+[\s]?', '', input_text)

    def emoji_oneword(self, input_text):
        '''
        Replace every emoji by its textual name written as a single word
        (Example: a red heart emoji --> "redheart").
        Note that ALL underscores and colons of the text are removed, not
        only the ones produced by emoji.demojize.
        '''
        input_text = emoji.demojize(input_text)
        # Compressing the underscores keeps the emoji name as one word
        input_text = input_text.replace('_', '')
        input_text = input_text.replace(':', '')
        return input_text

    def possessive_pronouns(self, input_text):
        '''
        Remove the possessive "'s", because otherwise after removing the
        punctuation we would end up with a word and a stray "s"
        Example: government's --> government
        '''
        return input_text.replace("'s", "")

    def characters(self, input_text):
        '''
        Remove special and redundant characters that may appear in a tweet and
        that don't really help in our analysis: carriage returns, newlines,
        repeated whitespace and double quotes
        '''
        input_text = input_text.replace("\r", " ")  # Carriage Return
        input_text = input_text.replace("\n", " ")  # Newline
        input_text = " ".join(input_text.split())  # Collapse runs of whitespace
        input_text = input_text.replace('"', '')  # Quotes
        return input_text

    def remove_punctuation(self, input_text):
        '''
        Replace every punctuation symbol of string.punctuation,
        i.e. !"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ , by a single space
        '''
        punct = string.punctuation
        # One space per punctuation symbol, so word boundaries are preserved
        trantab = str.maketrans(punct, len(punct) * ' ')
        return input_text.translate(trantab)

    def remove_digits(self, input_text):
        '''
        Remove numbers (every run of digits is deleted)
        '''
        # Raw string: '\d' in a normal string literal is an invalid escape sequence
        return re.sub(r'\d+', '', input_text)

    def to_lower(self, input_text):
        '''
        Convert all the sentences(words) to lowercase
        '''
        return input_text.lower()

    def _stopwords_to_drop(self):
        '''
        Return the set of stopwords to remove, building it on first use.
        Words which might indicate a certain sentiment ("n't", "not", "no")
        are whitelisted, i.e. never part of the returned set.
        '''
        cached = getattr(self, "_stopword_cache", None)
        if cached is None:
            whitelist = {"n't", "not", "no"}
            cached = set(stopwords.words('english')) - whitelist
            self._stopword_cache = cached
        return cached

    def remove_stopwords(self, input_text):
        '''
        Remove stopwords (refers to the most common words in a language)
        Tokens of a single character are removed as well.
        '''
        blocked = self._stopwords_to_drop()
        clean_words = [word for word in input_text.split()
                       if word not in blocked and len(word) > 1]
        return " ".join(clean_words)

    def stemming(self, input_text):
        '''
        Reduce the words to their stem (Porter stemmer)
        '''
        porter = PorterStemmer()
        return " ".join(porter.stem(word) for word in input_text.split())

    def encode_decode(self, input_text):
        '''
        Remove weird characters that are result of encoding problems
        (every non-ASCII character is dropped)
        '''
        # Spaces are ASCII, so encoding the whole string gives the same result
        # as encoding each space-separated piece and joining them again.
        return input_text.encode("ascii", "ignore").decode()

    def _load_slang(self):
        '''
        Return the slang dictionary as {abbreviation: phrase}.
        The file is read only once per path and then cached on the instance.
        If an abbreviation appears several times the last entry wins; rows
        without a second field (e.g. blank lines) are skipped.
        '''
        path = self.slang_path
        cached = getattr(self, "_slang_cache", None)
        if cached is not None and cached[0] == path:
            return cached[1]
        mapping = {}
        with open(path, "r") as slang_file:
            # "=" delimiter: abbreviation in row[0], phrase in row[1]
            for row in csv.reader(slang_file, delimiter="="):
                if len(row) >= 2:
                    mapping[row[0]] = row[1]
        self._slang_cache = (path, mapping)
        return mapping

    def translator(self, input_text):
        '''
        Transform abbreviations to normal words
        Example: asap --> as soon as possible

        The text is split on single spaces. Each token is stripped of special
        characters and upper-cased before being looked up in the slang file;
        on a match the whole original token is replaced by the phrase.
        '''
        slang = self._load_slang()
        tokens = input_text.split(" ")
        for position, token in enumerate(tokens):
            # Removing special characters before the lookup
            key = re.sub('[^a-zA-Z0-9-_.]', '', token).upper()
            if key in slang:
                tokens[position] = slang[key]
        return ' '.join(tokens)

    def fit(self, X, y=None, **fit_params):
        '''
        Nothing is learned from the data; returns the transformer itself.
        '''
        return self

    def transform(self, X, **transform_params):
        '''
        Apply every cleaning step, in order, to each element of X.
        X must provide .apply (a pandas Series of strings); a new Series with
        the cleaned texts is returned.
        '''
        # The order matters: e.g. slang is expanded before lower-casing and
        # punctuation is removed only after urls / possessives were handled.
        steps = (
            self.translator,
            self.remove_mentions,
            self.remove_urls,
            self.emoji_oneword,
            self.possessive_pronouns,
            self.remove_punctuation,
            self.remove_digits,
            self.encode_decode,
            self.characters,
            self.to_lower,
            self.remove_stopwords,
            self.stemming,
        )
        clean_X = X
        for step in steps:
            clean_X = clean_X.apply(step)
        return clean_X

# Example

In [4]:
# Sample tweet containing one instance of everything the cleaner handles:
# a mention, an emoji, a URL, a newline, slang ("asap"), mis-encoded characters,
# punctuation, digits and a possessive 's.
text = "I just wanted to say @Aladdin, you're so cute ❤️. I would suggest you go to www.diamondintherough.com and fill an \napplication there asap â€ !!! You are 100% the best guy for Jasmine's heart "
print(text)
# Cleaner instance used by the step-by-step cells that follow.
yo = CleanText()

I just wanted to say @Aladdin, you're so cute ❤️. I would suggest you go to www.diamondintherough.com and fill an 
application there asap â€ !!! You are 100% the best guy for Jasmine's heart 


#### First let's expand abbreviations (here "asap")

In [5]:
# Step 1: replace slang abbreviations found in the slang file by their phrase.
# `text` is overwritten, so re-running this cell re-processes the already processed text.
text = yo.translator(text)
print(text)

I just wanted to say @Aladdin, you're so cute ❤️. I would suggest you go to www.diamondintherough.com and fill an 
application there As Soon As Possible â€ !!! You are 100% the best guy for Jasmine's heart 


#### Let's strip the mention marker "@" (here "@Aladdin" becomes "Aladdin")

In [6]:
# Step 2: strip "@" characters; the handle text itself stays in the tweet.
# `text` is overwritten, so run the demo cells once and in order.
text = yo.remove_mentions(text)
print(text)

I just wanted to say Aladdin, you're so cute ❤️. I would suggest you go to www.diamondintherough.com and fill an 
application there As Soon As Possible â€ !!! You are 100% the best guy for Jasmine's heart 


#### Let's eliminate urls (here "www.diamondintherough.com")

In [7]:
# Step 3: drop tokens containing ".com" and anything starting with http(s)://.
# `text` is overwritten, so run the demo cells once and in order.
text = yo.remove_urls(text)
print(text)

I just wanted to say Aladdin, you're so cute ❤️. I would suggest you go to and fill an 
application there As Soon As Possible â€ !!! You are 100% the best guy for Jasmine's heart 


#### Let's convert emojis into a single word (here "❤️" becomes "redheart")

In [8]:
# Step 4: replace each emoji by its textual name, with underscores and colons removed.
# `text` is overwritten, so run the demo cells once and in order.
text = yo.emoji_oneword(text)
print(text)

I just wanted to say Aladdin, you're so cute redheart. I would suggest you go to and fill an 
application there As Soon As Possible â€ !!! You are 100% the best guy for Jasmine's heart 


#### Let's strip the possessive "'s" (here "Jasmine's" becomes "Jasmine")

In [9]:
# Step 5: delete every "'s" so no stray "s" is left once punctuation is removed.
# `text` is overwritten, so run the demo cells once and in order.
text = yo.possessive_pronouns(text)
print(text)

I just wanted to say Aladdin, you're so cute redheart. I would suggest you go to and fill an 
application there As Soon As Possible â€ !!! You are 100% the best guy for Jasmine heart 


#### Let's eliminate punctuation (here ",", "'", ".", "!!!", "%")

In [10]:
# Step 6: replace every string.punctuation character by a space.
# `text` is overwritten, so run the demo cells once and in order.
text = yo.remove_punctuation(text)
print(text)

I just wanted to say Aladdin  you re so cute redheart  I would suggest you go to and fill an 
application there As Soon As Possible â€     You are 100  the best guy for Jasmine heart 


#### Let's eliminate digits (here "100")

In [11]:
# Step 7: delete every run of digits.
# `text` is overwritten, so run the demo cells once and in order.
text = yo.remove_digits(text)
print(text)

I just wanted to say Aladdin  you re so cute redheart  I would suggest you go to and fill an 
application there As Soon As Possible â€     You are   the best guy for Jasmine heart 


#### Let's eliminate non-ASCII characters left by encoding problems (here "â€")

In [12]:
# Step 8: drop every character that cannot be encoded as ASCII.
# `text` is overwritten, so run the demo cells once and in order.
text = yo.encode_decode(text)
print(text)

I just wanted to say Aladdin  you re so cute redheart  I would suggest you go to and fill an 
application there As Soon As Possible      You are   the best guy for Jasmine heart 


#### Let's eliminate special characters (here "\n": after this step the output no longer breaks before "application") and collapse excess whitespace

In [13]:
# Step 9: turn carriage returns/newlines into spaces, collapse repeated whitespace
# and remove double quotes.
# `text` is overwritten, so run the demo cells once and in order.
text = yo.characters(text)
print(text)

I just wanted to say Aladdin you re so cute redheart I would suggest you go to and fill an application there As Soon As Possible You are the best guy for Jasmine heart


#### Let's transform all the words to lowercase

In [14]:
# Step 10: lowercase the whole text.
# `text` is overwritten, so run the demo cells once and in order.
text = yo.to_lower(text)
print(text)

i just wanted to say aladdin you re so cute redheart i would suggest you go to and fill an application there as soon as possible you are the best guy for jasmine heart


#### Let's eliminate stopwords (here "i", "just", "to", "you", "re","so", "you", "and", "an", "there", "as", "are", "the", "for" )

In [15]:
# Step 11: drop English stopwords (the whitelisted "not"/"no" are kept) and
# every single-character token.
# `text` is overwritten, so run the demo cells once and in order.
text = yo.remove_stopwords(text)
print(text)

wanted say aladdin cute redheart would suggest go fill application soon possible best guy jasmine heart


#### Let's keep just the stem 

In [16]:
# Step 12: reduce every remaining word to its Porter stem.
# `text` is overwritten, so run the demo cells once and in order.
text = yo.stemming(text)
print(text)

want say aladdin cute redheart would suggest go fill applic soon possibl best guy jasmin heart


# Apply the class on the data

In [None]:
# Clean every tweet of the dataset. CleanText.fit learns nothing, so
# fit_transform amounts to running the cleaning steps on the text column.
ct = CleanText()
sr_clean = ct.fit_transform(df["text"])

# Our final DataFrame

In [None]:
# Pair each cleaned text with its original label (both share df's index)
# and preview the first rows.
dopo_eda = pd.DataFrame({"text": sr_clean, "label": df["label"]})
dopo_eda.head()

#### A check, just in case the pre-processing eliminated everything in some tweets :p

In [None]:
# Boolean mask of the rows whose cleaned text ended up as an empty string.
empty_clean = dopo_eda["text"].eq('')
n_empty = sr_clean[empty_clean].count()
print('{} records have no words left after text cleaning'.format(n_empty))
# Show the affected rows before (raw frame) and after (cleaned series) cleaning.
for caption, source in (("Before", df), ("After", sr_clean)):
    print(caption)
    print(source[empty_clean])

#### Since there's nothing left we'll drop them and reindex

In [None]:
# Drop every row whose cleaned text is empty and renumber the index from 0.
# Rows are selected by condition instead of by hard-coded index labels, so the
# cell keeps working when the dataset or the cleaning steps change, and it can
# be re-run safely (a second run simply finds nothing left to drop).
dopo_eda = dopo_eda[dopo_eda["text"] != ''].reset_index(drop=True)

In [None]:
# Preview the first rows of the final frame.
dopo_eda.head()

# And now we save

In [None]:
# Write the cleaned dataset next to the notebook. The row index is written too
# (as the first, unnamed column), so readers of this file must account for it.
dopo_eda.to_csv("dopo_eda.csv", index=True)