# Evaluation of suggested Models 

## Libraries 

In [3]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

## Data set & Preprocessing 

In [4]:
# Semicolon-separated tweet corpus, relative to the notebook's working directory.
# (An alternative corpus used earlier lives at 'Data/data.csv'.)
file = 'Data/discrimant_tweet_47368_dataset.csv'

# Malformed rows are skipped instead of aborting the load. `error_bad_lines`
# was deprecated in pandas 1.3 and removed in 2.0 in favour of `on_bad_lines`,
# so try the current keyword first and fall back for older installations.
# 'warn' mirrors the old behaviour (skip the row, but report it).
try:
    dataframe = pd.read_csv(file, sep=";", on_bad_lines="warn")
except TypeError:
    dataframe = pd.read_csv(file, sep=";", error_bad_lines=False)

# Only the tweet text and its label are used in the rest of the notebook.
dataframe = dataframe[['Tweets','Label']]
dataframe.head()

Unnamed: 0,Tweets,Label
0,@azzamalirhabi @JihadiA8 This video of the Pes...,0.0
1,Oh really? No more instant restaurants? THAT'S...,0.0
2,RT @Benfrancisallen: It hasn't been a good few...,0.0
3,RT @NoToFeminism: I donâ€™t need femisnsn beca...,0.0
4,@MariachiMacabre 19% is not the vast majority,0.0


In [5]:
# Size of the full corpus, then the number of tweets that carry no label
print("Data set's shape = ", dataframe.shape)
print("There is %d missing values in the target" % dataframe['Label'].isna().sum())

Data set's shape =  (44123, 2)
There is 24783 missing values in the target


In [31]:
# Class balance among the labelled tweets (value_counts leaves missing labels out by default)
dataframe['Label'].value_counts()

0.0    13401
1.0     5939
Name: Label, dtype: int64

In [6]:
# Data set for supervised learning: keep only the tweets that have a label.
# The index is rebuilt as 0..n-1 so that positional and label-based row access
# agree downstream, and so that `df` is an independent frame rather than a
# view-like slice of `dataframe`.
df = dataframe[dataframe['Label'].notna()].reset_index(drop=True)

In [7]:
# Size of the labelled subset, and a check that no unlabelled tweet is left in it
print("Data set's shape = ", df.shape)
print("There is %d missing values in the target" % df['Label'].isna().sum())

Data set's shape =  (19340, 2)
There is 0 missing values in the target


In [19]:
def encoding_target(y):
    '''
        Encode raw class names as a binary target.

        y : Series - class names; 'sexism' and 'racism' become 1, anything else 0

        Returns a new integer Series aligned on the index of y.
        y itself is not modified, and any index (not only 0..n-1) is supported.
    '''
    # isin() works row by row regardless of the index labels, unlike y[i]
    return y.isin(['sexism', 'racism']).astype('int')

def data_cleaning(df):
    '''
        Normalise the text of every tweet, in place.

        df : DataFrame - must contain a 'Tweets' column of strings

        Each tweet goes through, in this order: removal of @mentions, removal of
        ASCII punctuation, lowercasing, removal of URLs, removal of a fixed list
        of left-over mis-decoded characters.

        Returns the same DataFrame object, with its 'Tweets' column replaced.
    '''

    import re
    import string

    # Compile once instead of once per tweet
    mention_re = re.compile(r'@\S+')
    url_re = re.compile(r'http\S+')
    junk_re = re.compile("ð|ÿ|‘|œ|¦|€|˜|™|¸|¤|‚|©|¡|…|”|“|‹|š|±|³|iâ|§|„|")
    punctuation = set(string.punctuation)

    def clean(tweet):
        # Remove ids @
        tweet = mention_re.sub('', tweet)

        # Remove punctuation
        tweet = "".join(char for char in tweet if char not in punctuation)

        # Uppercase -> Lowercase
        tweet = tweet.lower()

        # Delete Url (punctuation is already gone, so a link is by now a single
        # 'http...' run of characters, which this pattern still catches)
        tweet = url_re.sub('', tweet)

        # Delete characters
        return junk_re.sub('', tweet)

    # Replace the whole column at once: this avoids chained assignment
    # (df['Tweets'][i] = ...) and does not depend on the index being 0..n-1.
    df['Tweets'] = df['Tweets'].map(clean)

    return df

def tokenization(df):
    '''
        Split every tweet into tokens and pool them into one flat list.

        df : DataFrame - must contain a 'Tweets' column of strings

        Returns a list holding the tokens of all tweets, in row order.
    '''

    # Generate tokens
    from nltk.tokenize import TweetTokenizer
    tknz = TweetTokenizer()
    tokens = []

    # Iterate over the values themselves: no dependency on the index labels
    # and no need for a 'Label' column just to know how many rows there are.
    for tweet in df['Tweets']:
        tokens.extend(tknz.tokenize(tweet))

    return tokens

def stemming(tokens):
    '''
        Reduce every token to its Porter stem.

        tokens : list - the tokens to stem

        Returns a new list with one stem per input token, in the same order.
        The input list is left unchanged.
    '''

    from nltk.stem import PorterStemmer
    stemmer = PorterStemmer()

    # Build a new list: rebinding the loop variable would discard the stems
    return [stemmer.stem(token) for token in tokens]

def tokens_frequencies(tokens):
    '''
        Count how often each token occurs.

        tokens : list - the tokens to count

        Returns a DataFrame with the columns "Tokens" and "Frequencies", sorted
        from the most to the least frequent token. Tokens with the same count
        keep their order of first appearance, and the index runs 0..n-1 in that
        sorted order, so row i is the (i+1)-th most frequent token whether it is
        reached by position or by label.
    '''
    from collections import Counter

    # most_common() returns (token, count) pairs already sorted by decreasing count
    ranked = Counter(tokens).most_common()

    return pd.DataFrame(ranked, columns=["Tokens", "Frequencies"])

def stop_words(df):
    '''
        Drop the English stop words from a token/frequency table.

        df : DataFrame - must contain the columns 'Tokens' and 'Frequencies'

        Returns a new DataFrame with the columns "Tokens" and "Frequencies",
        holding the remaining rows in the same order as in df, re-indexed 0..n-1.
    '''
    from nltk.corpus import stopwords

    # Load the list once (not once per row) and use a set for fast lookups
    english_stop_words = set(stopwords.words('english'))

    # A boolean mask keeps the rows in their current order, so a table sorted by
    # frequency stays sorted whatever its index labels are.
    is_kept = ~df['Tokens'].isin(english_stop_words)
    return df.loc[is_kept, ['Tokens', 'Frequencies']].reset_index(drop=True)

def bag_of_words(df, nbr_tokens, token_frequency):
    '''
        Build a binary bag-of-words matrix for the tweets.

        df : DataFrame - must contain a 'Tweets' column of strings
        nbr_tokens : int - how many leading rows of token_frequency to use as vocabulary
        token_frequency : DataFrame - must contain a 'Tokens' column

        Returns a DataFrame with one row per tweet and one column per vocabulary
        token; a cell is 1 when the token occurs in the tweet, 0 otherwise.
    '''
    from nltk.tokenize import TweetTokenizer

    # Vocabulary: the first nbr_tokens entries of the token table
    most_freq = token_frequency['Tokens'][:nbr_tokens]

    # One tokenizer serves every tweet
    tknz = TweetTokenizer()

    # Vectorization
    matrix = []
    for tweet in df['Tweets']:
        # A set makes each membership test constant-time
        present = set(tknz.tokenize(tweet))
        matrix.append([1 if token in present else 0 for token in most_freq])

    # Convert the matrix into a dataframe
    return pd.DataFrame(matrix, columns=most_freq)

def tfidf(df, nbr_tokens, token_frequency, ngram):
    '''
        Vectorise the tweets with TF-IDF weights.

        df : DataFrame - must contain a 'Tweets' column of strings
        nbr_tokens : int - maximum number of features kept by the vectorizer
        token_frequency : DataFrame - not used here; accepted so that every
                          vectorizer shares the same call signature
        ngram : tuple (min_n, max_n) - n-gram range handed to the vectorizer;
                None falls back to unigrams, (1, 1)

        Returns a dense 2-D numpy array with one row per tweet.
    '''
    from sklearn.feature_extraction.text import TfidfVectorizer

    # Terms present in fewer than 5 tweets or in more than 70 % of them are dropped
    tf_idf = TfidfVectorizer(ngram_range=(1, 1) if ngram is None else tuple(ngram),
                             max_features=nbr_tokens,
                             binary=True,
                             smooth_idf=False,
                             min_df=5,
                             max_df=0.7,
                             )

    return tf_idf.fit_transform(np.array(df['Tweets'])).toarray()

def hashing(df, nbr_tokens):
    '''
        Vectorise the tweets with a HashingVectorizer.

        df : DataFrame - must contain a 'Tweets' column
        nbr_tokens : int - number of features (columns) of the hashed representation

        Returns the vectorizer output converted to a dense array.
    '''
    from sklearn.feature_extraction.text import HashingVectorizer

    hashed = HashingVectorizer(n_features=nbr_tokens).transform(df['Tweets'])
    return hashed.toarray()

def word2vec(df, nbr_tokens, token_frequency):
    '''
        Placeholder for a word2vec vectorizer: not implemented yet.

        The arguments are accepted but ignored, and the function returns None.
    '''
    return None


def vectorization(df, nbr_tokens, token_frequency, method):
    '''
        Turn the tweets into a numeric feature matrix with the chosen method.

        df : DataFrame - must contain a 'Tweets' column
        nbr_tokens : int - the number of tokens from the token-frequency DataFrame
        token_frequency : DataFrame - the array that contains the frequency of each token
        method : str - one of "bow", "tfidf" or "hashing"

        Returns the feature matrix produced by the selected vectorizer.

        Raises NotImplementedError for "word2vec" (no implementation yet) and
        ValueError for any other unknown method, instead of silently returning
        None and failing later with an unrelated error.
    '''

    if method == "bow":
        return bag_of_words(df, nbr_tokens, token_frequency)

    if method == "tfidf":
        return tfidf(df, nbr_tokens, token_frequency, ngram=(1,1))

    if method == "hashing":
        return hashing(df, nbr_tokens)

    if method == "word2vec":
        raise NotImplementedError("the 'word2vec' vectorization method is not implemented yet")

    raise ValueError(
        "unknown vectorization method {!r}; expected 'bow', 'tfidf' or 'hashing'".format(method)
    )

In [20]:
def preprocessing_1(dataset, nbr_tokens, vectorizer):
    '''
        Run the whole preprocessing pipeline and split the result for training.

        dataset : DataFrame - the raw data set, with 'Tweets' and 'Label' columns
        nbr_tokens : int - the number of tokens from the token-frequency DataFrame
        vectorizer : str - vectorization method, forwarded to vectorization()

        Side effects: writes the token/frequency table to "Word-Frenquency.csv"
        in the working directory and prints the class counts of the target.

        Returns X_train, X_test, y_train, y_test as DataFrames (70 % / 30 % split).
    '''
    from sklearn.model_selection import train_test_split

    # Work on a private copy with a fresh 0..n-1 index, so the caller's frame is
    # never modified and a slice of a filtered frame is handled correctly.
    df = dataset.copy().reset_index(drop=True)
    y = df['Label']

    # manipulations
    df_cleaned = data_cleaning(df)

    # tokenization
    tokens = tokenization(df_cleaned)

    # stemming
    tokens_stemmed = stemming(tokens)

    # tokens_frequencies
    tokfreq = tokens_frequencies(tokens_stemmed)

    # Stop words
    tokfreq = stop_words(tokfreq)

    # Generate a CSV file for Tokens-Frequencies
    tokfreq.to_csv("Word-Frenquency.csv")

    # vectorization - on the frame returned by data_cleaning, so the cleaned
    # text is used whether or not the cleaning step works in place
    X = vectorization(df_cleaned, nbr_tokens, tokfreq, vectorizer)
    if X is None:
        raise ValueError("vectorizer {!r} did not produce a feature matrix".format(vectorizer))

    # The labels are used as they are (no call to encoding_target)
    print(y.value_counts())

    # Split the data : Train set & Test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42, shuffle=True)

    return pd.DataFrame(X_train), pd.DataFrame(X_test), pd.DataFrame(y_train), pd.DataFrame(y_test)

In [21]:
%%time 
# Preprocess the first 10,000 labelled tweets with the "tfidf" method, nbr_tokens = 100
X_train, X_test, y_train, y_test = preprocessing_1(df[:10000], 100, "tfidf")

0.0    6842
1.0    3158
Name: Label, dtype: int64
CPU times: user 10.7 s, sys: 593 ms, total: 11.3 s
Wall time: 11.3 s


In [22]:
# Peek at the first three training rows of the feature matrix
X_train.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.0,0.0,0.490747,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.411156,0.0,0.0


## Accuracy of Models 

**Support Vector Machine (SVM)** 

In [2]:
def support_vector_machine(X_train, X_test, y_train, y_test):
    '''
        Train a linear-kernel SVM and return its accuracy on the test set.

        X_train, X_test : array-like - feature matrices
        y_train, y_test : array-like - targets; a one-column DataFrame is accepted
                          and flattened to one dimension

        Returns the accuracy score (float) on (X_test, y_test).
    '''
    from sklearn.svm import SVC
    from sklearn.metrics import accuracy_score

    # training - the classifier expects a 1-D target, not an (n, 1) column
    model = SVC(kernel='linear', random_state=0)
    model.fit(X_train, np.ravel(y_train))

    # prediction
    y_pred = model.predict(X_test)

    # evaluation
    return accuracy_score(np.ravel(y_test), y_pred)

In [28]:
%%time
# Train the SVM on the training split and report its accuracy on the test split
print("Accuracy = ", support_vector_machine(X_train, X_test, y_train, y_test))

Accuracy =  0.7846666666666666
CPU times: user 4.78 s, sys: 60.6 ms, total: 4.84 s
Wall time: 4.86 s


**Convolutional Neural Network (CNN)**

In [25]:
def convolution_neural_network(X_train, X_test, y_train, y_test):
    '''
        Train a small neural network and return its accuracy on the test set.

        NOTE: despite its name, the model built here has no convolutional layer.
        It is a fully connected network with 8, 4 and 1 units. The name is kept
        so that existing calls continue to work.

        X_train, X_test : 2-D feature matrices (samples x features)
        y_train, y_test : binary targets
        The test set is also passed as validation data, so the per-epoch
        validation metrics are computed on the same data as the returned score.

        Returns the accuracy score (float) of the rounded predictions on the test set.
    '''
    from keras.models import Sequential
    from keras.layers import Dense
    from sklearn.metrics import accuracy_score

    model = Sequential()
    model.add(Dense(8, input_dim=X_train.shape[1], activation='relu'))
    model.add(Dense(4, activation='relu'))
    # Only the first layer declares the input size; later layers infer theirs
    # from the layer before, so no input_dim is passed here.
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='rmsprop',metrics=['accuracy'])

    model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=4, verbose=1)
    y_pred = model.predict(X_test)

    # evaluation - round because y_pred is a vector of probabilities
    return accuracy_score(y_test, y_pred.round())

In [29]:
%%time 
# Train the neural network on the training split and report its accuracy on the test split
print("Accuracy = ", convolution_neural_network(X_train, X_test, y_train, y_test))

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Accuracy =  0.7716666666666666
CPU times: user 2.02 s, sys: 220 ms, total: 2.24 s
Wall time: 1.64 s


**Recurrent Neural Network (RNN) : Long Short-Term Memory (LSTM)**

In [153]:
def long_short_term_memory(X_train, X_test, y_train, y_test):
    '''
        Train a single-layer LSTM classifier and return its accuracy on the test set.

        X_train, X_test : 2-D feature matrices (samples x features); each row is
                          fed to the LSTM as a sequence of length 1
        y_train, y_test : binary targets
        The test set is also passed as validation data during training.

        Returns the accuracy score (float) of the rounded predictions on the test set.
    '''
    from keras.models import Sequential
    from keras.layers import Dense, LSTM
    from sklearn.metrics import accuracy_score

    n_features = X_train.shape[1]

    def as_sequences(X):
        # Recurrent layers take (samples, timesteps, features); the vectorized
        # tweets are flat rows, so add a timesteps axis of size 1.
        return np.asarray(X, dtype='float32').reshape(-1, 1, n_features)

    X_train_seq = as_sequences(X_train)
    X_test_seq = as_sequences(X_test)

    model = Sequential()
    # input_shape leaves the batch size free; a fixed batch of 32 would reject
    # any data set whose size is not a multiple of 32.
    model.add(LSTM(32, input_shape=(1, n_features), return_sequences=False))
    # Sigmoid output: binary_crossentropy and the rounding below both expect
    # probabilities in [0, 1].
    model.add(Dense(1, activation='sigmoid'))
    model.summary()  # prints the summary itself and returns None
    model.compile(loss='binary_crossentropy', optimizer='rmsprop',metrics=['accuracy'])

    model.fit(X_train_seq, y_train, validation_data=(X_test_seq, y_test), epochs=4)
    y_pred = model.predict(X_test_seq)

    # evaluation
    return accuracy_score(y_test, y_pred.round())

In [30]:
#long_short_term_memory(X_train, X_test, y_train, y_test)