# Importing Important Libraraies

In [1]:
#Importing libraries for data manipulation
import pandas as pd
import numpy as np
import re,string

#For preprocessing of text
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

#For building classifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score
from sklearn.svm import SVC

## Loading Dataset

In [2]:
def dataset_load(filepath):
    df = pd.read_csv(filepath, sep='\t', header=None)
    return df

## Preprocessing

In [3]:
# To convert contrcations to normal meaningful words
# Dictionary is taken from kaggle
contractions_dict = {"ain't": "is not", "aren't": "are not","can't": "cannot", 
                    "'cause": "because", "could've": "could have", "couldn't": "could not", "didn't": "did not",  
                    "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not", 
                    "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will",
                    "how's": "how is",  "I'd": "I would", "I'd've": "I would have", "I'll": "I will", 
                    "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have",
                    "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", 
                    "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have",
                    "it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", 
                    "might've": "might have","mightn't": "might not","mightn't've": "might not have",
                    "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", 
                    "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock", 
                    "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", 
                    "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", 
                    "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", 
                    "she's": "she is", "should've": "should have", "shouldn't": "should not", 
                    "shouldn't've": "should not have", "so've": "so have","so's": "so as", "this's": "this is",
                    "that'd": "that would", "that'd've": "that would have", "that's": "that is",
                    "there'd": "there would", "there'd've": "there would have", "there's": "there is", 
                    "here's": "here is","they'd": "they would", "they'd've": "they would have", 
                    "they'll": "they will", "they'll've": "they will have", "they're": "they are",
                    "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we would", 
                    "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", 
                    "we're": "we are", "we've": "we have", "weren't": "were not", "what'll": "what will", 
                    "what'll've": "what will have", "what're": "what are",  "what's": "what is", 
                    "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", 
                    "where's": "where is", "where've": "where have", "who'll": "who will", 
                    "who'll've": "who will have", "who's": "who is", "who've": "who have", 
                    "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not",
                    "won't've": "will not have", "would've": "would have", "wouldn't": "would not", 
                    "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would",
                    "y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have",
                    "you'd": "you would", "you'd've": "you would have", "you'll": "you will", 
                    "you'll've": "you will have", "you're": "you are", "you've": "you have"}

def convert_contraction_to_full(tweet):
    if type(tweet) is str:
        for key in contractions_dict:
            value = contractions_dict[key]
            tweet = tweet.replace(key, value)
        return tweet
    else:
        return tweet
    

def preprocess_tweets(tweet):
    
    #Convert to lower case
    tweet = tweet.lower()
    
    #Remove numbers
    tweet = re.sub(r'\b([0-9.]+)\b', " ", tweet)
    
    #Replace usermentions
    r = re.findall("@[\w]*", tweet)
    for word in r:
        tweet = re.sub(word, "<usermention>", tweet)
    
    #Replace hashtags
    tweet = re.sub(r'#([a-z0-9]+)(?:\b|$)', "<hashtag>", tweet)

    #Remove alphanumeric
    tweet = re.sub(r'([^a-z0-9\s\<\>]|[\s]{2,})'," ", tweet)
    
    #Remove punctuations
    tweet = re.sub(r'[#\(\)\[\]\{\}\.!\?:\-]'," ", tweet)
    
    #remove shortwords
    tweet = " ".join([w for w in tweet.split() if len(w)>2])
    
    #Remove urls
    tweet = re.sub(r"http\S+|www\S+|https\S+", " ", tweet, flags = re.MULTILINE)
    
    #Remove stopwords
    tweet_tokens = word_tokenize(tweet)
    stopwords_english = stopwords.words('english')
    filtered_words = [word for word in tweet_tokens if word not in stopwords_english]
    
    #Lemmatizing
    lemmatizer = WordNetLemmatizer()
    lemma_words = [lemmatizer.lemmatize(w, pos='a') for w in filtered_words]
    
    return " ". join(lemma_words)

In [4]:
df = dataset_load(r"C:/Users/harsh/Downloads/Warwick/NLP/Assignment/semeval-tweets.tar/semeval-tweets/twitter-training-data.txt")
df_test1 = dataset_load(r"C:/Users/harsh/Downloads/Warwick/NLP/Assignment/semeval-tweets.tar/semeval-tweets/twitter-test1.txt")
df_test2 = dataset_load(r"C:/Users/harsh/Downloads/Warwick/NLP/Assignment/semeval-tweets.tar/semeval-tweets/twitter-test2.txt")
df_test3 = dataset_load(r"C:/Users/harsh/Downloads/Warwick/NLP/Assignment/semeval-tweets.tar/semeval-tweets/twitter-test3.txt")

df = df.rename({0: 'Id', 1: 'label' , 2: 'tweet'}, axis='columns')
df['clean_tweet'] = np.vectorize(convert_contraction_to_full)(df['tweet'])
df['clean_tweet'] = df['clean_tweet'].apply(lambda x: preprocess_tweets(x))

df_test1 = df_test1.rename({0: 'Id', 1: 'label' , 2: 'tweet'}, axis='columns')
df_test1['clean_tweet'] = np.vectorize(convert_contraction_to_full)(df_test1['tweet'])
df_test1['clean_tweet'] = df_test1['clean_tweet'].apply(lambda x: preprocess_tweets(x))

df_test2 = df_test2.rename({0: 'Id', 1: 'label' , 2: 'tweet'}, axis='columns')
df_test2['clean_tweet'] = np.vectorize(convert_contraction_to_full)(df_test2['tweet'])
df_test2['clean_tweet'] = df_test2['clean_tweet'].apply(lambda x: preprocess_tweets(x))

df_test3 = df_test3.rename({0: 'Id', 1: 'label' , 2: 'tweet'}, axis='columns')
df_test3['clean_tweet'] = np.vectorize(convert_contraction_to_full)(df_test3['tweet'])
df_test3['clean_tweet'] = df_test3['clean_tweet'].apply(lambda x: preprocess_tweets(x))

df.head()

Unnamed: 0,Id,label,tweet,clean_tweet
0,335104872099066692,positive,Felt privileged to play Foo Fighters songs on ...,felt privileged play foo fighters songs guitar...
1,796528524030124618,positive,@AaqibAfzaal Pakistan may be an Islamic countr...,< usermention > pakistan may islamic country d...
2,760964834217238632,positive,Happy Birthday to the coolest golfer in Bali! ...,happy birthday cool golfer bali < usermention ...
3,147713180324524046,negative,@SimpplyA TMILLS is going to Tucson! But the 2...,< usermention > tmills going tucson 29th thursday
4,732302280474120023,negative,Hmmmmm where are the #BlackLivesMatter when ma...,hmmmmm < hashtag > matters like rise kids disg...


## Vectorizing Tokens and Apply the model

In [5]:
# Classsifier 1 - Naive Bayes
#Use BOW for feature extraction

bow_vectorizer= CountVectorizer(max_df=0.90, min_df=2,max_features =3000,stop_words='english')
bow = bow_vectorizer.fit_transform(df['clean_tweet'])
classifier = MultinomialNB()
classifier.fit(bow,df['label'])
predicted_naive = classifier.predict(bow)
f1score = f1_score(df['label'], predicted_naive, average = 'macro', labels = ['positive', 'negative'])
print('Naive Bayes - For twitter-training-data.text the f1score is:', f1score)

def model_predict(df):
    pred_value = classifier.predict(bow_vectorizer.transform(df['clean_tweet']))
    f1score = f1_score(df['label'], pred_value, average = 'macro', labels = ['positive', 'negative'])
    return f1score

print('Naive Bayes - For twitter-test1.text the f1score is: ', model_predict(df_test1))
print('Naive Bayes - For twitter-test2.text the f1score is: ', model_predict(df_test2))
print('Naive Bayes - For twitter-test3.text the f1score is: ', model_predict(df_test3))

Naive Bayes - For twitter-training-data.text the f1score is: 0.6192223280922295
Naive Bayes - For twitter-test1.text the f1score is:  0.5175122921229576
Naive Bayes - For twitter-test2.text the f1score is:  0.5195061728395061
Naive Bayes - For twitter-test3.text the f1score is:  0.5031187467640597


In [6]:
#Classifier 2 - SVM
#TF-IDF Vectorizer
vectorizer = TfidfVectorizer(max_df=0.90, min_df=2,max_features =3000,stop_words='english')
vec = vectorizer.fit_transform(df['clean_tweet'])

svclassifier = SVC(kernel='linear')
svclassifier.fit(vec, df['label'])
y_pred = svclassifier.predict(vec)
f1score = f1_score(df['label'], y_pred, average = 'macro', labels = ['positive', 'negative'])
print('SVM - For twitter-training-data.text the f1score is: ', f1score)

def model_predict(df):
    pred_value = svclassifier.predict(vectorizer.transform(df['clean_tweet']))
    f1score = f1_score(df['label'], pred_value, average = 'macro', labels = ['positive', 'negative'])
    return f1score

print('SVM - For twitter-test1.text the f1score is: ', model_predict(df_test1))
print('SVM - For twitter-test2.text the f1score is: ', model_predict(df_test2))
print('SVM - For twitter-test3.text the f1score is: ', model_predict(df_test3))

SVM - For twitter-training-data.text the f1score is:  0.6174036774232595
SVM - For twitter-test1.text the f1score is:  0.5204917680621435
SVM - For twitter-test2.text the f1score is:  0.5256331964728098
SVM - For twitter-test3.text the f1score is:  0.4969621357352241
