In [1]:
import pandas as pd
import string 
import re 

from nltk.tokenize import TweetTokenizer
from nltk.probability import FreqDist
from nltk.stem import PorterStemmer 
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import warnings
# Fetch the WordNet data; this notebook later uses WordNetLemmatizer and nltk.corpus.wordnet.
# NOTE(review): nltk.word_tokenize and nltk.pos_tag are also called further down and
# presumably need their own NLTK resources, which are not downloaded here -- confirm
# they are available on a fresh machine.
nltk.download('wordnet')
# Suppress every Python warning from this point on (this also hides warnings raised by pandas).
warnings.filterwarnings("ignore")

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\zakof\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Path of the semicolon-separated input file, relative to the notebook's working directory.
file = 'Data/data.csv'
# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in pandas 2.0.
# `on_bad_lines="warn"` is its documented replacement: malformed rows are skipped
# and reported through a warning instead of aborting the load.
dataframe = pd.read_csv(file, sep=";", on_bad_lines="warn")

# Data cleaning 

In [3]:
def data_cleaning(df):
    '''
    Clean the tweet table in place and return it.

    Steps, applied to every value of the 'Tweets' column in this order:
      1. remove @mentions (an '@' followed by non-whitespace characters),
      2. remove ASCII punctuation (string.punctuation),
      3. lowercase,
      4. remove anything starting with 'http' up to the next whitespace
         (URLs have already lost their punctuation at this point),
      5. remove a fixed list of stray mis-encoded characters.

    The 'ID' column is dropped when present, and the 'Label' values 'none' and
    'racism' are renamed to 'not racist' and 'racist'; other labels are kept.

    df : DataFrame with at least the columns 'Tweets' (text) and 'Label'.
         It is modified in place.

    Returns the same DataFrame object.
    '''
    # errors='ignore' keeps the function re-runnable on a frame whose ID column
    # has already been removed.
    df.drop(columns='ID', inplace=True, errors='ignore')

    # Assign the result back to the column: calling replace(..., inplace=True) on
    # df['Label'] may operate on a temporary copy and leave df unchanged.
    df['Label'] = df['Label'].replace({'none': 'not racist', 'racism': 'racist'})

    punctuation_table = str.maketrans('', '', string.punctuation)
    stray_characters = "ð|ÿ|‘|œ|¦|€|˜|™|¸|¤|‚|©|¡|…|”|“|‹|š|±|³|iâ|§|„"

    # Vectorised string operations work for any index (not only 0..n-1) and avoid
    # element-wise chained assignment (df['Tweets'][i] = ...).
    tweets = df['Tweets']
    tweets = tweets.str.replace(r'@\S+', '', regex=True)
    tweets = tweets.str.translate(punctuation_table)
    tweets = tweets.str.lower()
    tweets = tweets.str.replace(r'http\S+', '', regex=True)
    tweets = tweets.str.replace(stray_characters, '', regex=True)
    df['Tweets'] = tweets

    return df
        
        

# Tokenization

In [4]:
def tokenization(df):
    """
    Replace each text in the 'Tweets' column with its list of tokens.

    df : DataFrame with a 'Tweets' column of strings. It is modified in place.

    Returns the same DataFrame object, with 'Tweets' holding the lists produced
    by nltk.word_tokenize.
    """
    # Apply on the column itself rather than row-wise over the whole frame:
    # DataFrame.apply(axis=1) builds a Series for every row and gives back a
    # DataFrame (not a Series) for an empty frame, which cannot be assigned
    # to a single column.
    df['Tweets'] = df['Tweets'].apply(nltk.word_tokenize)
    return df

# Lemmatization with POS

In [5]:
# Shared lemmatizer instance used by the preprocessing step.
lemmatizer = WordNetLemmatizer()


# Tag a single token with the WordNet part of speech that matches it.
def get_wordnet_pos(word):
    """Return the WordNet POS constant for `word`.

    The word is tagged on its own with nltk.pos_tag; the first letter of the
    resulting tag selects adjective (J), noun (N), verb (V) or adverb (R).
    Any other tag falls back to noun.
    """
    _token, tag = nltk.pos_tag([word])[0]
    initial = tag[0].upper()

    pos_by_initial = dict(
        J=wordnet.ADJ,
        N=wordnet.NOUN,
        V=wordnet.VERB,
        R=wordnet.ADV,
    )
    if initial in pos_by_initial:
        return pos_by_initial[initial]
    return wordnet.NOUN

# Preprocessing

In [6]:
def Preprocessing(df):
    """
    Run the full text pipeline: cleaning, tokenization, then lemmatization.

    df : DataFrame accepted by data_cleaning (it is passed to it unchanged).

    Returns the DataFrame produced by tokenization, with every token of the
    'Tweets' column replaced by its lemma. Each token is lemmatized with the
    part of speech returned by get_wordnet_pos for that token alone.

    Stop words are NOT removed: the original code only carried a placeholder
    comment for that step.
    """
    # Data cleaning
    cleaned_data = data_cleaning(df)

    # Tokenization
    tokenized_data = tokenization(cleaned_data)

    def lemmatize_tokens(tokens):
        """Lemmatize every token of one tweet."""
        return [lemmatizer.lemmatize(token, get_wordnet_pos(token)) for token in tokens]

    # Lemmatization. Assign the whole column at once: writing element by element
    # through tokenized_data['Tweets'][i] is chained assignment, which may modify
    # a temporary copy, and it also requires the index to be exactly 0..n-1.
    tokenized_data['Tweets'] = tokenized_data['Tweets'].apply(lemmatize_tokens)

    # TODO: remove stop words (not implemented).

    return tokenized_data

In [7]:
# Run the whole pipeline on the loaded frame; the returned DataFrame is displayed as the cell output.
# The pipeline modifies `dataframe` in place (data_cleaning drops its ID column),
# so `dataframe` no longer holds the raw data afterwards.
Preprocessing(dataframe)

Unnamed: 0,Tweets,Label
0,"[this, video, of, the, peshmerga, decimate, is...",not racist
1,"[oh, really, no, more, instant, restaurant, th...",not racist
2,"[rt, it, hasnt, be, a, good, few, week, for, i...",not racist
3,"[rt, i, donât, need, femisnsn, because, men, c...",not racist
4,"[19, be, not, the, vast, majority]",not racist
...,...,...
16044,"[rt, i, want, equal, right, but, i, still, wan...",sexism
16045,"[rt, go, ahead, and, call, me, sexist, but, sc...",sexism
16046,"[ive, have, the, epic, but, i, always, kept, i...",not racist
16047,"[so, do, you, think, that, the, daesh, be, pla...",not racist
