# Notes 

October 19th 2020 

Status : 
* Remove IDs column Remove Twitter Ids, Remove punctuation, Convert uppercase into lowercase, Remove special characters, Delete URLs, Tokenization, Stemming, Generate token's frequencies, Stop words, First simple vectorization (bag of words), Create a "delete emojis" fonction, First modelisation with SVM, First Evaluation with f1 score.  

Further : 
* Develop a modular programming 
* Develop an evaluation function (include an evolution graphic) 
* Create a sub-branch & Conduct unit tests of each function in it 
* Explore others methods of vectorization - Implement the BERT method 
* Figure out the most import tokens in tweets 

# Libraries 

In [462]:
import pandas as pd
import string 
import re 

from nltk.tokenize import TweetTokenizer
from nltk.probability import FreqDist
from nltk.stem import PorterStemmer 

import warnings
warnings.filterwarnings("ignore")

# Import the dataset

In [463]:
file = 'Data/data.csv'
dataframe = pd.read_csv(file, error_bad_lines=False, sep=";")

# Preprocessing

### Target encoding 

In [464]:
def encoding_target(y):
    for i in range(y.shape[0]):
        if y[i] == 'none':
            y[i] = 0 
        else:
            y[i] = 1 
    return y.astype('int')

### Data cleaning 

In [465]:
def data_cleaning(df):
    
    # Delete IDs
    df.drop('ID', axis=1, inplace=True)
    
    # First encoding 
    df['Label'].replace('none', 'not racist', inplace=True)
    df['Label'].replace('racism', 'racist', inplace=True)
    
    i = 0 
    for i in range(df['Tweets'].shape[0]):
        # Remove ids @ 
        df['Tweets'][i] = re.sub(r'@\S+', '', df['Tweets'][i])
        
        # Remove punctuation
        df['Tweets'][i] = "".join([char for char in df['Tweets'][i] if char not in string.punctuation])
        
        # Uppercase -> Lowercase 
        df['Tweets'][i] = df['Tweets'][i].lower()
        
        # Delete Url 
        df['Tweets'][i] = re.sub(r'http\S+', '', df['Tweets'][i])
        
        # Delete characters 
        df['Tweets'][i] = re.sub("ð|ÿ|‘|œ|¦|€|˜|™|¸|¤|‚|©|¡|…|”|“|‹|š|±|³|iâ|§|„|", '', df['Tweets'][i]) 
        
    return df

### Convert Emojis 

In [466]:
def convert_emojis(text): 
    import emoji
    text = emoji.demojize(text)
    text = text.replace('_','')
    return text.replace(':','')

In [467]:
convert_emojis("😉 😉 😅 😅")

'winkingface winkingface grinningfacewithsweat grinningfacewithsweat'

### Tokenization

In [468]:
def tokenization(df):
    # Generate tokens
    tknz = TweetTokenizer()
    tokens = []
    
    i = 0
    for i in range(df['Label'].shape[0]):
        tokens.extend(tknz.tokenize(df['Tweets'][i]))
    
    return tokens

### Stemming 

In [469]:
def stemming(tokens):
    stemming = PorterStemmer()
    for token in tokens:
        token = stemming.stem(token)
    return tokens

### Tokens_Frequencies 

In [470]:
def tokens_frequencies(tokens):
    
    # Creation of a dataframe Tokens-Frequencies
    fdist = FreqDist()
    for token in tokens:
        fdist[token] += 1 
    tokens_freq = pd.DataFrame(list(fdist.items()), columns = ["Tokens","Frequencies"])
    
    # Sort the dataframe according to frequency of words
    tokens_freq.sort_values(by='Frequencies',ascending=False, inplace=True)
    
    return tokens_freq

### Stop Words 

In [471]:
# Is not used yet 
def stop_words(df):
    from nltk.corpus import stopwords
    liste = []
    i = 0 
    for i in range(df.shape[0]):
        if df['Tokens'][i] not in stopwords.words('english'):
            liste.append([df['Tokens'][i],df['Frequencies'][i]])
    return pd.DataFrame(liste,columns=["Tokens","Frequencies"])

### Vectorization

In [472]:
def vectorization(df, nbr_tokens, nbr_tweets, token_frequency):
    # Most frequent tokens
    most_freq = token_frequency['Tokens'][:nbr_tokens]

    # Vectorization 
    matrix = []
    for tweet in df['Tweets'][:nbr_tweets]:
        vector = []
        tknz = TweetTokenizer()
        tweet = tknz.tokenize(tweet)
        for token in most_freq:
            if token in tweet:
                vector.append(1)
            else:
                vector.append(0)
        matrix.append(vector)
    
    # Convert the matrix into a dataframe
    bag_of_words = pd.DataFrame(matrix, columns=most_freq)
    
    return bag_of_words

### Preprocessing 

In [473]:
def preprocessing(dataset, nbr_tokens, nbr_tweets):
    
    # Copy the dataset
    df = dataset.copy()
    
    # manipulations
    df_cleaned = data_cleaning(df)
    
    # tokenization
    tokens = tokenization(df_cleaned)
    
    # stemming
    tokens_stemmed = stemming(tokens)
    
    # tokens_frequencies 
    tokfreq = tokens_frequencies(tokens_stemmed)
    
    # Stop words 
    tokfreq = stop_words(tokfreq)
    
    # Generate a CSV file for Tokens-Frequencies
    tokfreq.to_csv("Word-Frenquency.csv")
    
    # vectorization
    X = vectorization(df, nbr_tokens, nbr_tweets, tokfreq)
    
    # Encoding target 
    y = dataframe.iloc[:,-1]
    y = encoding_target(y)
    
    # Split the data : Train set & Test set 
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
    
    return X_train, X_test, y_train, y_test

### Test preprocessing 

In [474]:
X_train, X_test, y_train, y_test = preprocessing(dataframe, nbr_tokens=100, nbr_tweets=dataframe.shape[0])

# Modeling 

In [475]:
from sklearn.svm import SVC 
model = SVC(kernel='linear')
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Evaluation 

In [476]:
from sklearn.metrics import f1_score
print("F1 Score = ", f1_score(y_test, y_pred))

F1 Score =  0.39191564147627417
