In [11]:
from gensim.models import KeyedVectors
import pandas as pd
import numpy as np
import time
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)
from tqdm import tqdm
import operator
import re
import string
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

In [12]:
def text_preprocessing(x):
    """Delete a fixed set of symbols and all digits from ``x``; hyphens become spaces.

    The removals are applied one after another, in the listed order, so a
    two-character token such as '=>' is also removed when it only appears
    after an earlier single-character removal (e.g. '=.>' -> '=>' -> '').

    Args:
        x: the string to clean.

    Returns:
        The cleaned string.
    """
    cleaned = x
    # single characters first, the two-character arrow last (order matters)
    for token in (*'#@_.?!*~%0123456789', '=>'):
        cleaned = cleaned.replace(token, '')
    return cleaned.replace('-', ' ')

# keep only the words that can be found in the model's vocabulary
def text_remove_unknown_words(x, model):
    """Filter a token sequence down to the tokens that ``model`` knows.

    Args:
        x: iterable of tokens (already split; a plain string would be
            iterated character by character).
        model: object supporting ``model[word]`` that raises ``KeyError`` for
            an unknown word (e.g. a dict; presumably a gensim ``KeyedVectors``
            in this notebook -- verify against the caller).

    Returns:
        A new list with the known tokens, in their original order.

    Raises:
        Any exception other than ``KeyError`` raised by the lookup is
        propagated instead of being silently treated as "unknown word".
    """
    known_words = []
    for word in x:
        try:
            model[word]  # lookup only; the vector itself is not needed here
        except KeyError:
            # out-of-vocabulary token: drop it
            continue
        known_words.append(word)
    return known_words

def remove_URL(text):
    """Return ``text`` with every ``http://``, ``https://`` or ``www.`` link deleted.

    A link extends up to the next whitespace character; surrounding
    whitespace is left untouched.
    """
    return re.sub(r'https?://\S+|www\.\S+', r'', text)

def remove_html(text):
    """Return ``text`` with every ``<...>`` tag removed.

    The match is non-greedy, so each tag is removed on its own and the text
    between tags is kept. A lone '<' with no closing '>' is left as is.
    """
    return re.sub(r'<.*?>', r'', text)

def n_punctuations(text):
    """Count the ASCII punctuation characters contained in ``text``.

    The previous implementation iterated over a one-element list holding the
    whole punctuation string, so it only counted occurrences of that entire
    32-character sequence (i.e. practically always returned 0). Each
    character of ``string.punctuation`` is now counted individually.

    Args:
        text: the string to inspect.

    Returns:
        Total number of characters of ``text`` that are in
        ``string.punctuation``.
    """
    punctuation = set(string.punctuation)
    return sum(1 for char in text if char in punctuation)

def remove_emoji(text):
    """Return ``text`` with all characters from the listed code-point ranges removed.

    NOTE: the last range (U+24C2 .. U+1F251) is very wide; besides emoji it
    also covers many other code points (for example CJK ideographs), which
    are therefore removed as well.
    """
    code_point_ranges = (
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
    )
    # one character class matching a run of any of the ranges above
    return re.sub("[" + code_point_ranges + "]+", r'', text, flags=re.UNICODE)

def remove_punctuation(text):
    """Return ``text`` with every character of ``string.punctuation`` deleted."""
    # translate() drops any code point that is mapped to None
    deletions = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(deletions)

from spellchecker import SpellChecker

spell = SpellChecker()
def correct_spellings(text):
    """Replace the words of ``text`` that the spell checker does not know.

    The text is split on whitespace; every word reported by
    ``spell.unknown`` is replaced by ``spell.correction(word)`` and the
    words are joined back with single spaces.

    Args:
        text: the string to correct.

    Returns:
        The corrected string (whitespace runs are normalised to one space
        as a consequence of split/join).
    """
    words = text.split()
    misspelled_words = spell.unknown(words)
    corrected_text = []
    for word in words:
        if word in misspelled_words:
            # correction() may yield None when the checker has no candidate
            # (documented for recent pyspellchecker releases); keep the
            # original word then, otherwise " ".join() would raise TypeError.
            suggestion = spell.correction(word)
            corrected_text.append(suggestion or word)
        else:
            corrected_text.append(word)
    return " ".join(corrected_text)

# Ordered (pattern, replacement) pairs used by clean_text. The order matters:
# specific contractions must be expanded before the generic suffix rules
# ("'ll", "'ve", "'re", "'d", "n't") get a chance to match them.
_CONTRACTION_RULES = (
    (r"i'm", "i am"),
    (r"you'll", "you will"),
    (r"i'll", "i will"),
    (r"she'll", "she will"),
    (r"he'll", "he will"),
    (r"he's", "he is"),
    (r"she's", "she is"),
    (r"that's", "that is"),
    (r"what's", "what is"),
    (r"where's", "where is"),
    (r"there's", "there is"),
    (r"here's", "here is"),
    (r"who's", "who is"),
    (r"how's", "how is"),
    (r"\'ll", " will"),
    (r"\'ve", " have"),
    (r"\'re", " are"),
    (r"\'d", " would"),
    (r"can't", "cannot"),
    (r"won't", "will not"),
    (r"don't", "do not"),
    (r"shouldn't", "should not"),
    (r"n't", " not"),
)


def clean_text(text):
    """Lower-case ``text``, expand English contractions and keep only a-z and spaces.

    Steps, in order:
      1. lower-case the text;
      2. apply the contraction rules above one after another;
      3. replace every character that is not a lower-case ASCII letter with a
         space;
      4. collapse every run of two or more spaces into a single space.

    Step 4 previously replaced only runs of exactly three spaces, leaving
    double spaces (and longer runs) in the result despite the stated intent
    of removing any extra spaces.

    Leading/trailing single spaces are not stripped.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string, made of lower-case letters and single spaces.
    """
    text = text.lower()
    for pattern, replacement in _CONTRACTION_RULES:
        text = re.sub(pattern, replacement, text)
    # anything other than lower-case letters becomes a space
    text = re.sub(r"[^a-z]", " ", text)
    # collapse runs of spaces of any length
    text = re.sub(r" {2,}", " ", text)
    return text


lemmatizer = WordNetLemmatizer()
# first letter of a POS tag -> WordNet part-of-speech constant
wordnet_map = {"N":wordnet.NOUN, "V":wordnet.VERB, "J":wordnet.ADJ, "R":wordnet.ADV}
def lemmatize_words(text):
    """Lemmatize every whitespace-separated word of ``text`` using its POS tag.

    Each word is tagged with ``nltk.pos_tag``; the first letter of the tag
    selects the WordNet part of speech through ``wordnet_map`` (noun when the
    tag is not listed), and the word is replaced by its lemma. Words are
    re-joined with single spaces.

    Args:
        text: the string to lemmatize.

    Returns:
        The lemmatized string. ``text`` is returned unchanged when it is not
        a string (no ``split`` attribute) or when a required NLTK resource
        cannot be found (``LookupError``).
    """
    # The imports visible in this notebook bind only `wordnet` and
    # `WordNetLemmatizer`, not the name `nltk`; without this import the call
    # below raised NameError, which the former bare `except:` swallowed, so
    # the text always came back unlemmatized.
    import nltk
    try:
        pos_tagged_text = nltk.pos_tag(text.split())
        return " ".join(
            lemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN))
            for word, pos in pos_tagged_text
        )
    except (LookupError, AttributeError):
        # best effort: missing NLTK data or non-string input -> leave as is
        return text

In [14]:
def data_preprocessing(df, model,create_embedding = True, remove_punct = True, min_len = 3, remove_unknown_words = True, small_word = 3,
                      replace_nan = True):
    """Clean the 'text' column of ``df`` and derive text / embedding features.

    ``df`` is modified in place: the columns 'copy_text', 'is_question',
    'n_punctuation', 'n_hashtag', 'n_small_words', 'n_words', 'mean_len',
    'std_len' (and, with ``create_embedding``, 'vector_text' and
    'mean_vector_text') are added to it.

    Args:
        df: DataFrame with a string column 'text'. When ``create_embedding``
            is true it must also provide 'id', 'keyword', 'location',
            'target' and 'split'.
        model: word -> vector lookup supporting ``model[word]``. Its
            ``vector_size`` attribute is used as embedding width when
            present, otherwise 300 is assumed.
        create_embedding: build the averaged word-vector features and return
            a separate feature frame; otherwise return ``df`` itself.
        remove_punct: additionally run ``text_preprocessing`` on the text.
        min_len: tokens shorter than this are dropped.
        remove_unknown_words: drop tokens that ``model`` does not know. When
            false and ``create_embedding`` is true, an unknown token makes
            ``model[word]`` fail.
        small_word: tokens with at most this many characters are counted in
            'n_small_words'.
        replace_nan: fill missing 'main_keyword', 'keyword' and 'location'
            with 'Not given'.

    Returns:
        The feature DataFrame (a copy, independent of ``df``) when
        ``create_embedding`` is true, else the mutated ``df``.
    """
    df['copy_text'] = df['text'].str.lower()
    # features that need the raw (un-cleaned) text
    df['is_question'] = df['copy_text'].apply(lambda x: 1 if '?' in x else 0)
    df['n_punctuation'] = df['copy_text'].apply(n_punctuations)
    df['n_hashtag'] = df['copy_text'].apply(lambda x: x.count('#') if type(x) == str else 0)

    # URLs, HTML tags and emoji must be removed BEFORE clean_text: clean_text
    # turns every non-letter into a space, after which these patterns can no
    # longer match and the link/tag fragments would stay in the text.
    cleaning_steps = (remove_URL, remove_html, remove_emoji,
                      lemmatize_words, clean_text, remove_punctuation)
    for step in cleaning_steps:
        df['copy_text'] = df['copy_text'].apply(step)
    # Optional step, disabled in the original notebook:
    #     df['copy_text'] = df['copy_text'].apply(correct_spellings)
    if remove_punct:
        df['copy_text'] = df['copy_text'].apply(text_preprocessing)

    # split on any whitespace run so that no empty tokens are produced
    # (split(' ') yielded '' for consecutive spaces, inflating n_small_words)
    df['copy_text'] = df['copy_text'].str.split()

    # count, then remove, small words
    df['n_small_words'] = df['copy_text'].apply(lambda words: sum(len(w) <= small_word for w in words))
    df['copy_text'] = df['copy_text'].apply(lambda words: [w for w in words if len(w) >= min_len])
    # remove unknown words
    if remove_unknown_words:
        df['copy_text'] = df['copy_text'].apply(text_remove_unknown_words, args = (model,))

    # create additional features (NaN for rows without any remaining token)
    df['n_words'] = df['copy_text'].apply(len)
    df['mean_len'] = df['copy_text'].apply(lambda words: np.mean([len(w) for w in words]) if words else np.nan)
    df['std_len'] = df['copy_text'].apply(lambda words: np.std([len(w) for w in words]) if words else np.nan)

    if not create_embedding:
        return df

    # embedding width: taken from the model when it exposes it
    dim = getattr(model, 'vector_size', 300)

    # create word embedding and average it per row
    df['vector_text'] = df['copy_text'].apply(lambda words: [model[word] for word in words])
    df['mean_vector_text'] = df['vector_text'].apply(lambda vecs: sum(vecs) / len(vecs) if len(vecs) != 0 else np.nan)

    # work on an explicit copy so the assignments below never write into a
    # view of df
    features_df = df[['id','mean_vector_text', 'keyword','n_words','n_small_words', 'mean_len',
                      'std_len', 'location','n_hashtag','is_question','n_punctuation', 'target', 'split', 'text']].copy()
    # rows without tokens get an all-zero vector
    features_df['mean_vector_text'] = features_df['mean_vector_text'].apply(
        lambda vec: vec if isinstance(vec, np.ndarray) else np.zeros(dim))

    # expand the mean vector into one column per component: v_0, v_1, ...
    f_name = 'v_'
    mean_vectors = list(features_df['mean_vector_text'])
    matrix = np.vstack(mean_vectors) if mean_vectors else np.empty((0, dim))
    vector_df = pd.DataFrame(matrix, index = features_df.index,
                             columns = [f_name + str(i) for i in range(matrix.shape[1])])
    features_df = pd.concat([features_df, vector_df], axis = 1)

    # preprocessing on 'keyword' column ('%20' is the separator between words)
    features_df['main_keyword'] = features_df['keyword'].str.split('%20').str[0]

    def _additional_keyword(keyword):
        # first word following a '%20' separator, if any
        if not isinstance(keyword, str):
            return 'Not_given'
        extra = re.findall(r'%20(\w+)', keyword)
        return extra[0] if extra else 'Not_given'

    features_df['additional_keyword'] = features_df['keyword'].apply(_additional_keyword)

    if replace_nan:
        features_df['main_keyword'] = features_df['main_keyword'].fillna('Not given')
        features_df['keyword'] = features_df['keyword'].fillna('Not given')
        features_df['location'] = features_df['location'].fillna('Not given')

    # is main_keyword in text (a missing keyword counts as "not in text"
    # instead of raising TypeError when replace_nan is False)
    features_df['is_keywords_in_text'] = [
        1 if isinstance(keyword, str) and keyword in text.lower() else 0
        for keyword, text in zip(features_df['main_keyword'], features_df['text'])
    ]
    return features_df

In [6]:
# prepr_df = data_preprocessing(df)

NameError: name 'df' is not defined