In [None]:
import pandas as pd
import string 
import re 

from nltk.tokenize import TweetTokenizer
from nltk.probability import FreqDist
from nltk.stem import PorterStemmer 
import nltk
# Fetch the NLTK data this notebook actually uses: the Punkt models for
# nltk.word_tokenize, the perceptron tagger for nltk.pos_tag and WordNet for
# WordNetLemmatizer. 'english' is not a downloadable package id, so the previous
# call fetched nothing. Resource names changed between NLTK releases, so both the
# older and the newer ids are requested; an id that is unavailable is only
# reported by the downloader, it does not raise.
for _nltk_resource in (
    "punkt",
    "punkt_tab",
    "averaged_perceptron_tagger",
    "averaged_perceptron_tagger_eng",
    "wordnet",
    "omw-1.4",
):
    nltk.download(_nltk_resource, quiet=True)
from nltk.stem import WordNetLemmatizer
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Path of the raw tweet dataset, relative to the notebook's working directory.
file = 'Data/data.csv'

# The file is semicolon-separated. `error_bad_lines=False` was deprecated in
# pandas 1.3 and removed in 2.0; `on_bad_lines="warn"` is its direct equivalent:
# malformed rows are skipped and reported instead of aborting the load.
# Requires pandas >= 1.3.
dataframe = pd.read_csv(file, on_bad_lines="warn", sep=";")

# Data cleaning 

In [None]:
# Patterns and translation table used by `data_cleaning`, built once instead of
# once per tweet.
_MENTION_RE = re.compile(r'@\S+')
_URL_RE = re.compile(r'http\S+')
# Stray symbols to strip (presumably leftovers of mis-decoded emoji and quotes).
_STRAY_CHARS_RE = re.compile("ð|ÿ|‘|œ|¦|€|˜|™|¸|¤|‚|©|¡|…|”|“|‹|š|±|³|iâ|§|„")
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def _clean_tweet(text):
    """Return one tweet without mentions, punctuation, URLs or stray symbols, lowercased."""
    text = _MENTION_RE.sub('', text)
    text = text.translate(_PUNCTUATION_TABLE)
    text = text.lower()
    # Punctuation is already gone at this point, so a link such as
    # "http://t.co/x" has become "httptcox"; it still starts with "http" and is
    # removed as a whole by this pattern.
    text = _URL_RE.sub('', text)
    return _STRAY_CHARS_RE.sub('', text)


def data_cleaning(df):
    '''
    Clean the raw tweet frame in place and return it.

    Steps:
      * drop the 'ID' column (skipped when it is already gone, so the cell can
        be re-run on an already cleaned frame),
      * rename the labels 'none' -> 'not racist' and 'racism' -> 'racist'
        (any other label is left untouched),
      * for every tweet: remove @mentions, remove ASCII punctuation, lowercase,
        remove URLs, remove a fixed set of stray symbols.

    Parameters
    ----------
    df : DataFrame
        Must contain a 'Label' column and a 'Tweets' column of strings.
        The frame is modified in place.

    Returns
    -------
    DataFrame
        The same object that was passed in.
    '''
    df.drop(columns='ID', inplace=True, errors='ignore')

    # Assign whole columns back to the frame. Writing through chained indexing
    # (df['col'][i] = ... or df['col'].replace(..., inplace=True)) targets a
    # temporary object and is not guaranteed to reach `df`.
    df['Label'] = df['Label'].replace({'none': 'not racist', 'racism': 'racist'})

    # Element-wise map works for any index, not only the default 0..n-1 one.
    df['Tweets'] = df['Tweets'].map(_clean_tweet)

    return df
        
        

In [None]:
# data_cleaning modifies the frame it receives and returns that same object,
# so `df_cleaned` and `dataframe` refer to one and the same DataFrame from here on.
df_cleaned = data_cleaning(dataframe)

# Tokenization

In [None]:
def tokenization(df):
    """Split every tweet in ``df['Tweets']`` into a list of tokens.

    Parameters
    ----------
    df : DataFrame
        Must contain a 'Tweets' column of strings. The column is overwritten
        in place.

    Returns
    -------
    DataFrame
        The same object that was passed in; each 'Tweets' entry is now the
        result of ``nltk.word_tokenize`` for that tweet.
    """
    # Map over the one column that is needed rather than calling
    # df.apply(axis=1), which builds a full row Series for every tweet.
    df['Tweets'] = df['Tweets'].map(nltk.word_tokenize)
    return df

In [None]:
# tokenization overwrites the 'Tweets' column of the frame it receives with
# token lists and returns that same frame.
dataframe = tokenization(df_cleaned)

# Lemmatization with POS tags

In [None]:
lemmatizer = WordNetLemmatizer()


def lemma(df):
    """Lemmatize every token of every tweet, using a POS tag per token.

    Parameters
    ----------
    df : DataFrame
        Must contain a 'Tweets' column whose entries are lists of tokens
        (the output of `tokenization`). The column is overwritten in place.

    Returns
    -------
    DataFrame
        The same object that was passed in; each 'Tweets' entry is the list of
        lemmas of its tokens, in the original order.
    """
    # lemmatize() handles one word with one POS tag per call, so it has to be
    # applied token by token inside each tweet's token list.
    df['Tweets'] = df['Tweets'].map(
        lambda tokens: [
            lemmatizer.lemmatize(token, get_wordnet_pos(token)) for token in tokens
        ]
    )
    return df

def get_wordnet_pos(word):
    """Map the POS tag of `word` to the constant that lemmatize() accepts.

    The word is tagged on its own (a one-element list is passed to
    ``nltk.pos_tag``), and the first letter of the resulting tag selects the
    WordNet category: J -> adjective, N -> noun, V -> verb, R -> adverb.
    Any other tag falls back to noun.

    Parameters
    ----------
    word : str
        A single token.

    Returns
    -------
    One of ``wordnet.ADJ``, ``wordnet.NOUN``, ``wordnet.VERB``, ``wordnet.ADV``.
    """
    # Imported here so the function works no matter which notebook cells have
    # already been executed.
    from nltk.corpus import wordnet

    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

In [25]:
# Display the frame after tokenization: 'Tweets' now holds lists of tokens.
dataframe

Unnamed: 0,Tweets,Label
0,"[this, video, of, the, peshmerga, decimating, ...",not racist
1,"[oh, really, no, more, instant, restaurants, t...",not racist
2,"[rt, it, hasnt, been, a, good, few, weeks, for...",not racist
3,"[rt, i, donât, need, femisnsn, because, men, c...",not racist
4,"[19, is, not, the, vast, majority]",not racist
...,...,...
16044,"[rt, i, want, equal, rights, but, i, still, wa...",sexism
16045,"[rt, go, ahead, and, call, me, sexist, but, sc...",sexism
16046,"[ive, had, the, epic, but, i, always, kept, it...",not racist
16047,"[so, do, you, think, that, the, daesh, are, pl...",not racist


In [26]:
from nltk.corpus import wordnet

In [32]:
# Spot-check on a single tweet (the row labelled 16048): lemmatize its tokens
# one by one, each with the POS tag returned by get_wordnet_pos.
print([lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in dataframe['Tweets'][16048]])

['rt', 'my', 'skin', 'green', 'no', 'color', 'suit', 'only', 'wear', 'ripped', 'pant', 'because', 'hulk', 'need', 'no', 'clothes']
