# Vectorization Methods 

## Libraries 

In [82]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

## Data set and Preprocessing 

In [83]:
#file = 'Data/data.csv'
file = 'Data/discrimant_tweet_47368_dataset.csv'
dataframe = pd.read_csv(file, error_bad_lines=False, sep=";")
dataframe = dataframe[['Tweets','Label']]
dataframe.head()

Unnamed: 0,Tweets,Label
0,@azzamalirhabi @JihadiA8 This video of the Peshmerga decimating ISIS is far more interesting. ht...,0.0
1,Oh really? No more instant restaurants? THAT'S SHOCKING. #MKR #MKR2015,0.0
2,RT @Benfrancisallen: It hasn't been a good few weeks for #ISIS. A new front has opened up in #Si...,0.0
3,RT @NoToFeminism: I donâ€™t need femisnsn because men carry heavy things that i cannot!!! like s...,0.0
4,@MariachiMacabre 19% is not the vast majority,0.0


In [84]:
# Data set for a supervised learning 
df = dataframe[dataframe['Label'].isnull()==False]

print("Data set's shape = ", df.shape) 
print("There is {} missing values in the target".format(df['Label'].isnull().sum()))

Data set's shape =  (19340, 2)
There is 0 missing values in the target


In [85]:
def encoding_target(y):
    '''
        y : Series 
    '''
    
    for i in range(y.shape[0]):
        if y[i] == 'sexism' or y[i] == 'racism':
            y[i] = 1
        else:
            y[i] = 0
    return y.astype('int')

def data_cleaning(df):
    '''
        df : DataFrame 
    '''
    
    import re 
    import string
    
    
    i = 0 
    for i in range(df['Tweets'].shape[0]):
        # Remove ids @ 
        df['Tweets'][i] = re.sub(r'@\S+', '', df['Tweets'][i])
        
        # Remove punctuation
        df['Tweets'][i] = "".join([char for char in df['Tweets'][i] if char not in string.punctuation])
        
        # Uppercase -> Lowercase 
        df['Tweets'][i] = df['Tweets'][i].lower()
        
        # Delete Url 
        df['Tweets'][i] = re.sub(r'http\S+', '', df['Tweets'][i])
        
        # Delete characters 
        df['Tweets'][i] = re.sub("ð|ÿ|‘|œ|¦|€|˜|™|¸|¤|‚|©|¡|…|”|“|‹|š|±|³|iâ|§|„|", '', df['Tweets'][i]) 
        
    return df

def tokenization(df):
    '''
        df : DataFrame 
    '''
    
    # Generate tokens
    from nltk.tokenize import TweetTokenizer
    tknz = TweetTokenizer()
    tokens = []
    
    i = 0
    for i in range(df['Label'].shape[0]):
        tokens.extend(tknz.tokenize(df['Tweets'][i]))
    
    return tokens

def stemming(tokens):
    '''
        tokens : list 
    '''
    
    from nltk.stem import PorterStemmer
    stemming = PorterStemmer()
    
    for token in tokens:
        token = stemming.stem(token)
    return tokens

def tokens_frequencies(tokens):
    '''
        tokens : list 
    ''' 
    # Creation of a dataframe Tokens-Frequencies
    from nltk.probability import FreqDist
    fdist = FreqDist()
    
    for token in tokens:
        fdist[token] += 1 
    tokens_freq = pd.DataFrame(list(fdist.items()), columns = ["Tokens","Frequencies"])
    
    # Sort the dataframe according to frequency of words
    tokens_freq.sort_values(by='Frequencies',ascending=False, inplace=True)
    
    return tokens_freq

def stop_words(df):
    '''
        df : DataFrame
    '''
    from nltk.corpus import stopwords
    
    liste = []
    i = 0 
    for i in range(df.shape[0]):
        if df['Tokens'][i] not in stopwords.words('english'):
            liste.append([df['Tokens'][i],df['Frequencies'][i]])
    return pd.DataFrame(liste,columns=["Tokens","Frequencies"])

### Vectorization Methods 

**Bag of Words (BOW)**

In [86]:
def bag_of_words(df, nbr_tokens, token_frequency):
    from nltk.tokenize import TweetTokenizer 

    # Most frequent tokens
    most_freq = token_frequency['Tokens'][:nbr_tokens]

    # Vectorization 
    matrix = []
    for tweet in df['Tweets']:
        vector = []
        tknz = TweetTokenizer()
        tweet = tknz.tokenize(tweet)
        for token in most_freq:
            if token in tweet:
                vector.append(1)
            else:
                vector.append(0)
        matrix.append(vector)

    # Convert the matrix into a dataframe
    matrix = pd.DataFrame(matrix, columns=most_freq)

    return matrix

**TF-IDF**

In [87]:
def tfidf(df, nbr_tokens, token_frequency, ngram):
    from tqdm import tqdm
    from sklearn.feature_extraction.text import TfidfVectorizer


    tf_idf = TfidfVectorizer(max_features=nbr_tokens,
                             binary=True,
                             smooth_idf=False,
                             min_df=5, 
                             max_df=0.7)

    return tf_idf.fit_transform(np.array(df['Tweets'])).toarray()

**Hashing**

In [88]:
def hashing(df, nbr_tokens):

    from sklearn.feature_extraction.text import HashingVectorizer
    vectorizer = HashingVectorizer(n_features=nbr_tokens)
    
    return vectorizer.transform(df['Tweets']).toarray()

**Word2Vec** is a method to vectorize words and create new models (combinations) of words to add to features.<br> 
This method is used over bow, tf-idf or hashing. It has two main advantages : 
* Dimensionality reduction (features) 
* It capture meanings of the words, semantic relationships and context 

Word2Vec is a combination of two shallow neural networks : **CBOW** & **Skip-gram**. <br> 
&emsp;**CBOW** tends to predict the probability of a word given a context. <br> 
&emsp;**Skip-gram** model tries to predict the context for a given word (reverse manner). <br> 

Word2Vec needs a pre-trained word vector and here are the most known : <br> 
&emsp;Google News Word Vectors <br> 
&emsp;Freebase names <br> 
&emsp;DBPedia vectors (wiki2vec) <br> 

In [148]:
def word2vec(df, nbr_tokens, token_frequency):
    
    from gensim.models import Word2Vec
    
    model = Word2Vec(
            token_frequency['Tokens'],
            size = 200, 
            #window = 5,                   # context window size
            #min_count = 2,                # Ignores all words with total frequency lower than 2.                                  
            #sg = 1,                       # 1 for skip-gram model
            #hs = 0,
            #negative = 10,                # for negative sampling
            #workers= 32,                  # no.of cores
            seed = 34) 

    model.train(token_frequency['Tokens'], total_examples= len(df['Tweets']), epochs=10)
    return model 

**Vectorization function**

In [218]:
def vectorization(df, nbr_tokens, token_frequency, method):
    '''
        
    '''
    
    if method == "bow":
        return bag_of_words(df, nbr_tokens, token_frequency)  

    elif method == "tfidf":
        return tfidf(df, nbr_tokens, token_frequency, ngram=(1,1))
        
    elif method == "hashing":
        return hashing(df, nbr_tokens) 
    
    elif method == "word2vec":
        return word2vec(df, nbr_tokens, token_frequency) 

### Preprocessing 

In [117]:
def preprocessing(dataset, nbr_tokens, vectorizer):
    '''
        
    '''

    # Copy the dataset
    df = dataset.copy()
    y = df['Label']
    
    
    # manipulations
    df_cleaned = data_cleaning(df)
    
    # tokenization
    tokens = tokenization(df_cleaned)
    
    # stemming
    tokens_stemmed = stemming(tokens)
    
    # tokens_frequencies 
    tokfreq = tokens_frequencies(tokens_stemmed)
    
    # Stop words 
    tokfreq = stop_words(tokfreq)
    
    print(tokfreq)
    
    # Generate a CSV file for Tokens-Frequencies
    #tokfreq.to_csv("Word-Frenquency.csv")
    
    # vectorization
    X = vectorization(df, nbr_tokens, tokfreq, vectorizer)
    
    # Encoding target 
    #y = encoding_target(y)
    print(y.value_counts())

    # Split the data : Train set & Test set 
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42, shuffle=True)
    
    return pd.DataFrame(X_train), pd.DataFrame(X_test), pd.DataFrame(y_train), pd.DataFrame(y_test)

In [118]:
%%time 
X_train, X_test, y_train, y_test = preprocessing(df[:150], 10, "hashing")

             Tokens  Frequencies
0             video            3
1         peshmerga            1
2        decimating            1
3              isis            7
4               far            3
..              ...          ...
841    hillaryemail            1
842  hillaryclinton            1
843            cmon            1
844          shelli            1
845          emilie            1

[846 rows x 2 columns]
0.0    93
1.0    57
Name: Label, dtype: int64
CPU times: user 253 ms, sys: 30.6 ms, total: 284 ms
Wall time: 278 ms


### Vectorizers comparison 

**Chosen Model : SVM** 

In [107]:
def support_vector_machine(X_train, X_test, y_train, y_test):
    # training 
    from sklearn.svm import SVC 
    model = SVC(kernel='linear', random_state=0)
    model.fit(X_train, y_train)
    
    # prediction 
    y_pred = model.predict(X_test)
    
    # evaluation 
    from sklearn.metrics import accuracy_score #, classification_report
    return accuracy_score(y_test, y_pred)

<u>**First comparison :**</u>
* Number of Features = 20 
* Number of Tweets = 15000
* Same data cleaning 
* Stemming 

**Bag Of Words**

In [109]:
%%time 
X_train, X_test, y_train, y_test = preprocessing(df[:15000], 20, "bow")
print("Accuracy = ", support_vector_machine(X_train, X_test, y_train, y_test))

0.0    10270
1.0     4730
Name: Label, dtype: int64
Accuracy =  0.6922222222222222
CPU times: user 20.3 s, sys: 835 ms, total: 21.1 s
Wall time: 21.1 s


**TF-IDF**

In [110]:
%%time
X_train, X_test, y_train, y_test = preprocessing(df[:15000], 20, "tfidf")
print("Accuracy = ", support_vector_machine(X_train, X_test, y_train, y_test))

0.0    10270
1.0     4730
Name: Label, dtype: int64
Accuracy =  0.72
CPU times: user 19.5 s, sys: 929 ms, total: 20.4 s
Wall time: 20.2 s


**Hashing**

In [111]:
%%time
X_train, X_test, y_train, y_test = preprocessing(df[:15000], 20, "hashing")
print("Accuracy = ", support_vector_machine(X_train, X_test, y_train, y_test))

0.0    10270
1.0     4730
Name: Label, dtype: int64
Accuracy =  0.6922222222222222
CPU times: user 18.1 s, sys: 752 ms, total: 18.8 s
Wall time: 18.6 s


<u>**Some vectors using Word2Vec**</u>

**Using a pre-trained word vector** (GoogleNews)

In [220]:
from gensim.models import KeyedVectors
model = KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300.bin", binary=True, limit=10000)

In [222]:
# vocabulary
#list(model.wv.vocab)  # len(vocabulary) = limit 

In [223]:
# Examples 
model.wv.most_similar('man')

[('woman', 0.7664012908935547),
 ('boy', 0.6824870109558105),
 ('teenager', 0.6586930155754089),
 ('girl', 0.5921714305877686),
 ('men', 0.5489763021469116),
 ('guy', 0.5420035123825073),
 ('person', 0.5342026352882385),
 ('Man', 0.5316052436828613),
 ('suspect', 0.5247484445571899),
 ('victim', 0.523030161857605)]

In [224]:
# Build a vector from others (model)
vect = model['United_States'] + model['military']  # Try to guess the result ? 
model.wv.most_similar([vect])

[('military', 0.838738203048706),
 ('United_States', 0.7528706192970276),
 ('armed_forces', 0.6437433362007141),
 ('U.S.', 0.6348716020584106),
 ('civilian', 0.5731287002563477),
 ('Pentagon', 0.5701889991760254),
 ('Military', 0.5427780151367188),
 ('Afghanistan', 0.5392686128616333),
 ('Iraq', 0.526528000831604),
 ('Army', 0.520842432975769)]

**Train on our data set** 

In [210]:
%%time 

from gensim.models import Word2Vec

def word2vec(df, nbr_tokens, token_frequency):
    
    from gensim.models import Word2Vec
    
    tokens = token_frequency['Tokens'].tolist()
    
    model = Word2Vec(
            tokens,
            size = 200, 
            window = 10,                   # context window size
            min_count = 2,                # Ignores all words with total frequency lower than 2.                                  
            #sg = 1,                       # 1 for skip-gram model
            #hs = 0,
            #negative = 10,                # for negative sampling
            workers= 4,                  # no.of cores
            seed = 34) 

    model.train(tokens, total_examples= len(tokens), epochs=10)
    return model 


# Import Data - Data Cleaning - Preprocessing 
data = df[:1000].copy()
y = data['Label']
df_cleaned = data_cleaning(data)
tokens = tokenization(df_cleaned)
tokens_stemmed = stemming(tokens)
tokfreq = tokens_frequencies(tokens_stemmed)
tokfreq = stop_words(tokfreq)
print(tokfreq['Tokens'].head())

# Word2Vec Training 
model = word2vec(data, 1000, tokfreq)

0         video
1     peshmerga
2    decimating
3          isis
4           far
Name: Tokens, dtype: object
CPU times: user 1.47 s, sys: 151 ms, total: 1.62 s
Wall time: 1.56 s


In [217]:
#list(model.wv.vocab)
#model.wv.most_similar(positive="video")

maliste = tokfreq['Tokens'][:31].tolist()
model = Word2Vec(maliste, size = 200, window = 10, min_count = 2, workers= 4) 
model.train(maliste, total_examples= len(maliste), epochs=10)
list(model.wv.vocab)

['i',
 'd',
 'e',
 'o',
 'p',
 's',
 'h',
 'm',
 'r',
 'g',
 'a',
 'c',
 't',
 'n',
 'f',
 'l',
 'y',
 'k',
 'w',
 'â']