In [109]:
import re

import numpy as np
import pandas as pd
import nltk.corpus
from nltk.corpus import stopwords

# Fetch the NLTK resources this notebook reads at runtime.
nltk.download('stopwords')
# WordNetLemmatizer (used by PreProcessor.lemmatizer) looks words up in the
# WordNet corpus, which is a separate download from the stop-word list.
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ghckd\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [169]:
class PreProcessor:
    """Load ``train.csv`` / ``test.csv`` and reduce them to a cleaned text column.

    ``data_preprocessor`` is the entry point; the other public methods are the
    individual steps it chains together and can also be called on their own.
    """

    # Characters / tokens removed from the lower-cased text, in order of
    # precedence: @mentions, anything that is not alphanumeric/space/tab,
    # scheme://... URLs, a leading "rt", and leftover "http" fragments.
    _NOISE_PATTERN = re.compile(
        r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?"
    )

    # Columns consumed while building "text&keyword" and dropped afterwards.
    _SOURCE_COLUMNS = ["id", "keyword", "location", "text"]

    # Lazily created, per-instance caches (class-level defaults so they exist
    # even when __init__ is bypassed).
    _stop_words = None
    _wordnet_lemmatizer = None

    def __init__(self, folder_path:str="../data"):
        """Read ``train.csv`` and ``test.csv`` from ``folder_path``."""
        self.train = pd.read_csv(f"{folder_path}/train.csv")
        self.test = pd.read_csv(f"{folder_path}/test.csv")

    def data_preprocessor(self):
        """Return ``(train, test)`` with a single combined ``"text&keyword"`` column.

        Steps: clean the ``text`` column of both frames, reconcile/drop
        duplicated training rows, then append the split + lemmatized keyword
        to the cleaned text. ``id``, ``keyword``, ``location`` and ``text`` are
        dropped from the result. ``self.train`` / ``self.test`` are left
        untouched, so the method can be called repeatedly.
        """
        # assign() builds new frames, so the loaded raw data is not modified.
        train = self.train.assign(text=self.train["text"].apply(self.text_cleaner))
        test = self.test.assign(text=self.test["text"].apply(self.text_cleaner))

        train = self.text_tr_processor(train)

        return self._merge_text_and_keyword(train), self._merge_text_and_keyword(test)

    def _merge_text_and_keyword(self, frame):
        """Add ``"text&keyword"`` (text + " " + keyword words) and drop the source columns."""
        keyword_words = (
            frame["keyword"]
            .fillna("")
            .apply(self.keyword_preprocessor)
            .apply(self.lemmatizer)
        )
        # An empty keyword yields [""] and therefore a trailing space.
        combined = frame["text"] + " " + keyword_words.map(" ".join)
        return frame.assign(**{"text&keyword": combined}).drop(columns=self._SOURCE_COLUMNS)

    def text_cleaner(self, text:str):
        """Lower-case ``text``, strip mentions/URLs/punctuation and remove English stop words.

        Non-string values (e.g. NaN for a missing cell) are returned unchanged.
        """
        if not isinstance(text, str):
            return text

        if self._stop_words is None:
            # Load once and keep as a set for constant-time membership tests.
            self._stop_words = frozenset(stopwords.words("english"))

        text = self._NOISE_PATTERN.sub("", text.lower())
        return " ".join(word for word in text.split() if word not in self._stop_words)

    def text_tr_processor(self, train_set):
        """Reconcile conflicting labels on repeated texts and drop the repeats.

        For every text that occurs in a duplicated ``(keyword, text)`` pair,
        all rows sharing that text are inspected: if their ``target`` values
        disagree, the majority label is written to all of them, and an exact
        50/50 split causes all of them to be dropped. Finally, rows repeating
        an earlier ``(keyword, text, target)`` combination are removed (first
        occurrence kept). The input frame is not modified.
        """
        train_set = train_set.copy()
        repeated = train_set.duplicated(["keyword", "text"], keep="first")
        drop_indexs = []
        for t in train_set.loc[repeated, "text"].unique():
            same_text = train_set["text"] == t
            targets = train_set.loc[same_text, "target"]
            if targets.nunique() <= 1:
                continue  # labels already agree (or nothing matched, e.g. NaN text)
            rate_1 = targets.sum() / len(targets)
            if rate_1 == 0.5:
                drop_indexs.extend(targets.index)
            else:
                train_set.loc[same_text, "target"] = 1 if rate_1 > 0.5 else 0

        train_set = train_set.drop(index=drop_indexs)
        # Compare only the content columns: a plain drop_duplicates() would also
        # compare the per-row "id" and therefore never remove anything.
        return train_set.drop_duplicates(subset=["keyword", "text", "target"])

    def keyword_preprocessor(self, x:str):
        """Split a ``%20``-separated keyword into its words.

        A keyword without ``%20`` comes back as a one-element list; a missing
        (non-string) keyword is treated like the empty string.
        """
        if not isinstance(x, str):
            return [""]
        return x.split("%20")

    def lemmatizer(self, word_list):
        """Return a new list with each word lemmatized as noun, then verb, then adjective.

        Relies on NLTK's WordNet data being available. ``word_list`` itself is
        not modified.
        """
        if self._wordnet_lemmatizer is None:
            self._wordnet_lemmatizer = nltk.stem.WordNetLemmatizer()
        lemmatize = self._wordnet_lemmatizer.lemmatize
        return [
            lemmatize(lemmatize(lemmatize(word), "v"), pos="a")
            for word in word_list
        ]

In [170]:
# Build the preprocessor with its default folder ("../data"), which reads
# train.csv and test.csv from there.
pp = PreProcessor()
# Run the full pipeline; the returned frames carry the combined "text&keyword"
# column in place of the id / keyword / location / text columns.
train, test = pp.data_preprocessor()