## Importing libraries and dataset

In [1]:
import numpy as np
import pickle
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer
import wordninja as wn
import matplotlib as plt
import seaborn as sns
import re
from ekphrasis.classes.spellcorrect import SpellCorrector
from gingerit.gingerit import GingerIt #pip3 install gingerit

In [2]:
PATH_TRAIN_NEG = '../Resources/train_neg.txt'
PATH_TRAIN_POS = '../Resources/train_pos.txt'

with open(PATH_TRAIN_POS) as f:
    train_pos = f.read().splitlines()
with open(PATH_TRAIN_NEG) as f:
    train_neg = f.read().splitlines()

## Preprocessing the dataset

In [3]:
def replace_ponctuation(tweet):
    
    #replace multiple stops by the word 'consecutivestop'
    tweet = re.sub(r"(\.)\1+", ' consecutiveStop ', tweet)
    #replace multiple exclamation by the word 'consecutivequestion'
    tweet = re.sub(r"(\?)\1+", ' consecutiveQuestion ', tweet)
    #replace multiple exclamation by the word 'consecutiveexclamation'
    tweet = re.sub(r"(\!)\1+", ' consecutiveExclamation ', tweet)
    #delete all ponctuaction
    tweet = re.sub(r"[,.;@?!#&$\"]+\ *", ' ', tweet)
    #deleting consecutive spaces
    tweet = re.sub(r"\s+", ' ',tweet)
    
    return tweet

In [4]:
def letter_repetition_treatment(tweet) : 
    
    word = re.sub(r'(.)\1+', r'\1\1', tweet)
    
    return word

In [5]:
def emoji_treatment(tweet):
    
    # Sad -- :-(, : (, :(, ):, )-:
    tweet = re.sub(r'(:\s?\(|:-\(|\)\s?:|\)-:)', ' negative ', tweet)
    # Cry -- :,(, :'(, :"(
    tweet = re.sub(r'(:,\(|:\'\(|:"\()', ' negative ', tweet)
    # Smile -- :), : ), :-), (:, ( :, (-:, :')
    tweet = re.sub(r'(:\s?\)|:-\)|\(\s?:|\(-:|:\'\))', ' positive ', tweet)
    # Laugh -- :D, : D, :-D, xD, x-D, XD, X-D
    tweet = re.sub(r'(:\s?D|:-D|x-?D|X-?D)', ' positive ', tweet)
    # Love -- <3, :*
    tweet = re.sub(r'(<3|:\*)', ' love ', tweet)
    # Wink -- ;-), ;), ;-D, ;D, (;,  (-;
    tweet = re.sub(r'(;-?\)|;-?D|\(-?;)', ' positive ', tweet)

    return tweet


In [6]:
# TO DO : correct behaviour 
def hashtag_treatment(tweet_set):
    
    for line in range(len(tweet_set)) :
        tweet = tweet_set[line]
        tweet = np.array(tweet.split())
        for word in tweet :
            if '#' in word :
                index = np.where(tweet ==word)
                word = " ".join(wn.split(word))
                if (isinstance(tweet, str)):
                    tweet.replace('#', '')
                else :
                    tweet[index] = word
                tweet = " ".join(tweet)
                tweet_set[line]=tweet
            
    return tweet_set

In [7]:
hashtag_treatment(['#ivegotnews','wtf is going on #MachineLearningDrivingMeCrazy'])

['ive got new', 'wtf is going on Machine Learning Driving Me Cr']

In [20]:
#we introduce error with "'s" ex : my sister's best friend is ... 
def apostrophe_contraction(tweet_list) :
    
    contractions = {
        '\'m' : ' am',
        'im' : ' I am',
        'ive' : 'I have',
        '\'re' : ' are', 
        '\'ve' : ' have',
        '\'s' : ' is', 
        '\'ll' : ' will',
        '\'d' : ' would', 
        '\'t' : ' not',
        'ain\'t' : 'not',
        'aint' : 'not',
        'can\'t' : 'can not',
        'cant' : 'can not',
        'don\'t' : 'do not',
        'dont' : 'do not',
        'isn\'t' : 'is not',
        'isnt' : 'is not',
        'won\'t' : 'will not',
        'wont' : 'will not',
        'shouldn\'t' : 'should not',
        'shouldnt' : 'should not',
        'couldn\'t' : 'could not',
        'wouldn\'t' : 'would not', 
        'aren\'t' : 'are not', 
        'arent' : 'are not', 
        'doesn\'t' : 'does not',
        'doesnt' : 'does not',
        'wasn\'t' : 'was not',
        'wasnt' : 'was not',
        'weren\'t' : 'were not',
        'werent' : 'were not',
        'hasn\'t' : 'has not', 
        'haven\'t' : 'have not',
        'havent' : 'have not',
        'hadn\'t' : 'had not', 
        'mustn\'t' : 'must not', 
        'didn\'t' : 'did not', 
        'mightn\'t' : 'might not', 
        'needn\'t' : 'need not',
        'imma' : 'I am going to',
        'wanna' : 'want to',
        'gonna' : 'going to',
        'thats' : 'that is',
    }
    pat = re.compile(r"\b(%s)\b" % "|".join(contractions))

    return [pat.sub(lambda m: contractions.get(m.group()), tweet.lower()) for tweet in tweet_list]


In [21]:
def correct_slang(tweet_list) : 
    slang = {
        '2nite' : 'tonight',
        '2night' : 'tonight',
        '2' : 'to',
        '4' : 'for',
        '' : '',
        'ab' : 'about',
        'ace' : 'success',
        'ad' : 'awesome person',
        'aka' : 'meaning',
        'asap' : 'soon',
        'aww' : 'cute',
        'bc' : 'because',
        'bf' : 'boyfriend',
        'bff' : 'best friend',
        'brb' : 'I come',
        'btw' : 'by the way',
        'cus' : 'because',
        'cuz' : 'because',
        'cya' : 'see you',
        'dammit' : 'damn it',
        'der' : 'there',
        'dm' : 'message me',
        'dunno' : 'do not know',
        'dw' : 'okay',
        'ew' : 'gross',
        'ftw' : 'win',
        'fyi' : 'for information',
        'gf' : 'girlfriend',
        'gotta' : 'has',
        'gurl' : 'girl',
        'haha' : 'laught',
        'hahah' : 'laught',
        'hahaha' : 'laught',
        'hahahah' : 'laught',
        'hahahaha' : 'laught',
        'hmu' : 'message me',
        'idk' : 'do not know',
        'idc' : 'do not care',
        'ily' : 'love',
        'imo' : 'think',
        'irl' : 'real life',
        'jk' : 'laught',
        'lmao' : 'laught',
        'lmk' : 'let me know',
        'lil' : 'little',
        'lol' : 'laught',
        'luv' : 'love',
        'ppl' : 'people',
        'n' : 'and',
        'nbd' : 'okay', #no big deal
        'np' : 'okay', #no problem
        'nvm' : 'okay', #never mind
        'omg' : 'amazing', #oh my god
        'omw' : "come",
        'r' : 'are',
        'rofl' : 'laught',
        'roflmao' : 'laught',
        'rn' : 'now',
        'rt' : 'retweet',
        'sch' : 'school',
        'tbh' : 'honestly',
        'til' : 'until',
        'thx' : 'thanks',
        'ttyl' : 'talk later',
        'u' : 'you',
        'ur' : 'your',
        'w' : 'with',
        'wan' : 'want',
        'waz' : 'what is',
        'wtf' : 'seriously',
        'x' : 'kiss',
        'xx' : 'kiss',
        'xo' : 'kiss',
        'xoxo' : 'kiss',
        'xd' : 'laught',
        'ya' : 'you',
        'yolo' : 'enjoy',
        'yuck' : 'gross',
    }
    pat = re.compile(r"\b(%s)\b" % "|".join(slang))

    return [pat.sub(lambda m: slang.get(m.group()), tweet.lower()) for tweet in tweet_list]  

In [22]:
def correct_slang2(tweet_list) : 
    
    parser = GingerIt()
    for index in range(len(tweet_list)) : 
        tweet = tweet_list[index]
        t = parser.parse(tweet)
        tweet_list[index] = t.get('result')
        
    return tweet_list

In [23]:
def short_word_treatment(tweet):
    
    return " ".join([word for word in tweet.split() if len(word) > 1])

In [24]:
def numbers_treatment(tweet):
    
    new_tweet = []
    for word in tweet.split():
        try:
            word = re.sub('[,\.:%_\-\+\*\/\%\_]', '', word)
            float(word)
            new_tweet.append("")
        except:
            new_tweet.append(word)
            
    return " ".join(new_tweet)

In [25]:
def correct_spelling(tweet_list):
    
    sp = SpellCorrector(corpus="english")
    
    return [sp.correct_text(tweet) for tweet in tweet_list]
     

In [26]:
def non_alphabetic_treatment(tweet) : 

    return " ".join([word for word in tweet.split() if word.isalpha()])

In [27]:
train_pos = [replace_ponctuation(tweet) for tweet in train_pos]
train_neg = [replace_ponctuation(tweet) for tweet in train_neg]

In [28]:
train_pos = [letter_repetition_treatment(tweet) for tweet in train_pos]
train_neg = [letter_repetition_treatment(tweet) for tweet in train_neg]

In [29]:
train_pos = [emoji_treatment(tweet) for tweet in train_pos]
train_neg = [emoji_treatment(tweet) for tweet in train_neg]

In [30]:
train_pos = hashtag_treatment(train_pos)
train_neg = hashtag_treatment(train_neg)

In [31]:
train_pos = apostrophe_contraction(train_pos)
train_neg = apostrophe_contraction(train_neg)

In [32]:
train_pos = correct_slang(train_pos)
train_neg = correct_slang(train_neg)

In [None]:
train_pos = correct_slang2(train_pos)
train_neg = correct_slang2(train_neg)

In [None]:
train_pos = [short_word_treatment(tweet) for tweet in train_pos]
train_neg = [short_word_treatment(tweet) for tweet in train_neg]

In [None]:
train_pos = [numbers_treatment(tweet) for tweet in train_pos]
train_neg = [numbers_treatment(tweet) for tweet in train_neg]

In [None]:
#train_pos = correct_spelling(train_pos)
#train_neg = correct_spelling(train_neg)

In [None]:
sp = SpellCorrector(corpus="english")
sp.correct_text("this is a text, withh some mistakds!!!")

In [None]:
train_pos = [non_alphabetic_treatment(tweet) for tweet in train_pos]
train_neg = [non_alphabetic_treatment(tweet) for tweet in train_neg]

In [None]:
def label_data(train_pos,train_neg):
    
    train_pos = np.array(train_pos).reshape(-1,1)
    ones = np.ones(shape=(train_pos.shape[0],1))
    train_pos = np.concatenate((train_pos,ones),axis = 1)

    train_neg = np.array(train_neg).reshape(-1,1)
    neg_ones = np.zeros(shape=(train_neg.shape[0],1))-1
    train_neg = np.concatenate((train_neg,neg_ones),axis = 1)
    
    return (train_pos,train_neg)

In [None]:
train_pos,train_neg = label_data(train_pos,train_neg)

## Impact of \<user> and \<url> on the classification

In [None]:
def user_tag_impact(train_pos,train_neg):
    
    user = "<user>"
    user_count_pos = 0
    user_count = 0
    
    for i in range(len(train_pos)):
        if user in train_pos[i] :
            user_count += 1
            user_count_pos += 1
            
    for i in range(len(train_neg)):
        if user in train_neg[i] :
            user_count += 1
            
    user_count_neg = user_count - user_count_pos
    counts = np.array([user_count,user_count_pos,user_count_neg])

    user_dict = {"Positive Sentiment Tweet":user_count_pos,"Negative Sentiment Tweet":user_count_neg}
    keys = list(user_dict.keys())
    vals = [user_dict[k] for k in keys]
    ax1 = sns.barplot(x=keys, y=vals)   
    ax1.set_xlabel("Sentiment type", fontsize = 10)
    ax1.set_ylabel("Number of Tweets", fontsize = 10)
    ax1.set_title("User Tag Presence impact on Tweet Sentiment",fontsize = 20,pad=25)

    return counts

In [None]:
url_tag_counts = url_impact(train_pos,train_neg)

In [None]:
def url_impact(train_pos,train_neg):
    
    url = "<url>"
    url_count_pos = 0
    url_count = 0
    
    for i in range(len(train_pos)):
        if url in train_pos[i] :
            url_count += 1
            url_count_pos += 1
            
    for i in range(len(train_neg)):
        if url in train_neg[i] :
            url_count += 1
            
    url_count_neg = url_count - url_count_pos
    counts = np.array([url_count,url_count_pos,url_count_neg])
    
    url_dict = {"Positive Sentiment Tweet":url_count_pos,"Negative Sentiment Tweet":url_count_neg}
    keys = list(url_dict.keys())
    vals = [url_dict[k] for k in keys]
    ax = sns.barplot(x=keys, y=vals)   
    ax.set_xlabel("Sentiment type", fontsize = 10)
    ax.set_ylabel("Number of Tweets", fontsize = 10)
    ax.set_title("Url Presence impact on Tweet Sentiment",fontsize = 20,pad=25)
    
    return counts

In [None]:
user_tag_counts = user_tag_impact(train_pos,train_neg)