# Importing Necessary Libraries

In [1]:
# Standard library
import os
import string

# Third-party
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import nltk

from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score

from gensim.utils import simple_preprocess
from gensim.models import Word2Vec
import gensim.downloader

from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

from keras.layers import Dense,Dropout,Input
from keras.models import Sequential

# NLTK resources used by the preprocessing helpers
nltk.download('punkt_tab')  # for tokenization
nltk.download('stopwords')  # for stopword removal
nltk.download('wordnet')    # for lemmatization

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\padma\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\padma\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\padma\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Importing Dataset

In [2]:
# Location of the tweets CSV. Set the TWEETS_CSV environment variable to run the
# notebook on another machine; without it the original local path is used.
filepath = os.environ.get(
    "TWEETS_CSV",
    r"C:\Users\padma\Desktop\Computer_Science\al-ml\Datasets\tweets.csv",
)

# Load the CSV file, using its `id` column as the DataFrame index.
data = pd.read_csv(filepath, index_col='id')

In [3]:
# Show the full text of wide columns instead of truncating it.
pd.set_option('display.max_colwidth', None)

data.head()

# `label` is the sentiment class of each tweet (0 or 1 in the rows displayed).
# NOTE(review): in the rows displayed, the only label-1 tweet is a complaint while
# the label-0 tweets read as positive/neutral, so 1 appears to mark negative
# sentiment - confirm against the dataset description.
# `tweet` holds the raw tweet text; the displayed tweets are about phones and technology companies.

Unnamed: 0_level_0,label,tweet
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0,#fingerprint #Pregnancy Test https://goo.gl/h1MfQV #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone
2,0,Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperias… http://instagram.com/p/YGEt5JC6JM/
3,0,We love this! Would you go? #talk #makememories #unplug #relax #iphone #smartphone #wifi #connect... http://fb.me/6N3LsUpCu
4,0,I'm wired I know I'm George I was made that way ;) #iphone #cute #daventry #home http://instagr.am/p/Li_5_ujS4k/
5,1,What amazing service! Apple won't even talk to me about a question I have unless I pay them $19.95 for their stupid support!


# Creating Necessary Functions

### Creating Necessary functions for text preprocessing

In [4]:
# punctuation removal

def remove_punctuations(document):
    '''Return `document` with every character found in `string.punctuation` removed.

    `document` is a single review/tweet. All other characters (letters, digits,
    whitespace, non-ASCII symbols) are kept in their original order.'''
    return ''.join(char for char in document if char not in string.punctuation)

In [5]:
# Tokenization

def tokenise(document):
    '''Split a document/review into tokens with NLTK's `word_tokenize`.

    Returns the list of tokens produced for `document`.'''
    return nltk.word_tokenize(document)

In [6]:
# stopwords removal

stopwords = nltk.corpus.stopwords.words('english')

# Hashed copy of the list above so each membership test is O(1) instead of a
# scan over every stop word. Rebuild it if `stopwords` is edited later.
_STOPWORD_SET = frozenset(stopwords)

def remove_stopwords(words_list):
    '''Drop English stop words from a tokenised document.

    Parameters
    ----------
    words_list : iterable of str
        Tokens of one document (the output of `tokenise`).

    Returns
    -------
    list of str
        The tokens that are not in NLTK's English stop-word list, in their
        original order. The comparison is case-sensitive.'''
    return [word for word in words_list if word not in _STOPWORD_SET]

In [7]:
# Lemmatization

def lemmatizer(words_list):
    '''Lemmatize every word of a tokenised document.

    Parameters
    ----------
    words_list : iterable of str
        Tokens of one document.

    Returns
    -------
    list of str
        One lemma per input token, in the same order, as produced by
        `nltk.WordNetLemmatizer().lemmatize` with its default arguments.'''
    # Build the lemmatizer once per call rather than once per word.
    wordnet_lemmatizer = nltk.WordNetLemmatizer()
    return [wordnet_lemmatizer.lemmatize(word) for word in words_list]


In [8]:
# Combining all the functions

def text_preprocessor(corpus):
    '''Run the complete cleaning pipeline over every document in `corpus`.

    For each document, in this order: punctuation is removed, the text is
    lower-cased, tokenised, stripped of stop words and lemmatized. The
    remaining tokens are joined back together with single spaces.

    Returns a list holding one cleaned string per input document.'''

    def _clean(document):
        # NOTE(review): punctuation is stripped before stop-word removal, which is
        # presumably why tokens such as "wont"/"dont" survive in the output.
        lowered = remove_punctuations(document).lower()
        tokens = lemmatizer(remove_stopwords(tokenise(lowered)))
        return ' '.join(tokens)

    return [_clean(document) for document in corpus]

In [9]:
# Smoke-test the preprocessing pipeline on a single sentence.
# The sample is called `sample_corpus` rather than `input` so the built-in
# input() is not shadowed for the rest of the kernel session.
sample_corpus = ['This is the best joke i have ever heard in my entire life and is so mesmerising']

text_preprocessor(sample_corpus)

['best joke ever heard entire life mesmerising']

### Function for multiple model fitting and selecting the best

In [10]:
def multiple_model_fit(x_train, x_test, y_train, y_test, cv=10):
    '''Fit a fixed set of classifiers and collect their accuracies.

    Each model is created with default hyper-parameters (AdaBoost with
    algorithm='SAMME'), fitted on the training split and scored three ways.

    Parameters
    ----------
    x_train, x_test
        Feature matrices for the training and test splits.
    y_train, y_test
        Labels matching `x_train` and `x_test`.
    cv : int, default 10
        Number of folds passed to `cross_val_score` on the training split.

    Returns
    -------
    dict
        Maps each classifier's class name to a dict with the keys
        'train_acc', 'cross_val_acc' (mean over the folds) and 'test_acc'.
    '''
    model_list = [RandomForestClassifier(),
                  DecisionTreeClassifier(),
                  LogisticRegression(),
                  KNeighborsClassifier(),
                  AdaBoostClassifier(algorithm='SAMME'),
                  BaggingClassifier(),
                  XGBClassifier()]

    accuracy_comparison = {}
    for classifier in model_list:
        classifier.fit(x_train, y_train)

        # accuracy_score expects the ground truth first and the predictions second.
        train_acc = accuracy_score(y_train, classifier.predict(x_train))
        test_acc = accuracy_score(y_test, classifier.predict(x_test))

        # Cross-validation is run on the training split only, so the test split
        # stays untouched.
        cross_val_acc = cross_val_score(classifier, x_train, y_train, cv=cv, scoring='accuracy')

        accuracy_comparison[type(classifier).__name__] = {'train_acc': train_acc,
                                                          'cross_val_acc': np.average(cross_val_acc),
                                                          'test_acc': test_acc}

    return accuracy_comparison

# Preprocessing

### Preprocessing the corpus of data using the above functions

In [11]:
#Checking if there is any nan values in the data
data.isna().sum()

label    0
tweet    0
dtype: int64

In [12]:
data.shape

(7920, 2)

In [13]:
data['preprocessed'] = text_preprocessor(data['tweet'])

In [14]:
data

Unnamed: 0_level_0,label,tweet,preprocessed
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0,#fingerprint #Pregnancy Test https://goo.gl/h1MfQV #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone,fingerprint pregnancy test httpsgooglh1mfqv android apps beautiful cute health igers iphoneonly iphonesia iphone
2,0,Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperias… http://instagram.com/p/YGEt5JC6JM/,finally transparant silicon case thanks uncle yay sony xperia sonyexperias… httpinstagramcompyget5jc6jm
3,0,We love this! Would you go? #talk #makememories #unplug #relax #iphone #smartphone #wifi #connect... http://fb.me/6N3LsUpCu,love would go talk makememories unplug relax iphone smartphone wifi connect httpfbme6n3lsupcu
4,0,I'm wired I know I'm George I was made that way ;) #iphone #cute #daventry #home http://instagr.am/p/Li_5_ujS4k/,im wired know im george made way iphone cute daventry home httpinstagrampli5ujs4k
5,1,What amazing service! Apple won't even talk to me about a question I have unless I pay them $19.95 for their stupid support!,amazing service apple wont even talk question unless pay 1995 stupid support
...,...,...,...
7916,0,Live out loud #lol #liveoutloud #selfie #smile #sony #music #headphones https://instagram.com/p/5spiNsJ_c9/,live loud lol liveoutloud selfie smile sony music headphone httpsinstagramcomp5spinsjc9
7917,0,"We would like to wish you an amazing day! Make every minute count #tls #today #iphone #accessories #news #life February 23, 2017 at 0…",would like wish amazing day make every minute count tl today iphone accessory news life february 23 2017 0…
7918,0,Helping my lovely 90 year old neighbor with her iPad this morning has just made me realise that 'I' don't actually need an I pad!,helping lovely 90 year old neighbor ipad morning made realise dont actually need pad
7919,0,"Finally got my #smart #pocket #wifi stay connected anytime,anywhere! #ipad and #samsung #s3 #gadget # http://instagr.am/p/U-53G_vJU8/",finally got smart pocket wifi stay connected anytimeanywhere ipad samsung s3 gadget httpinstagrampu53gvju8


# BOW Model

In [15]:
# Bag-of-words features built from the preprocessed tweets.
# NOTE(review): the vectorizer is fitted on the whole corpus before the
# train/test split, so its vocabulary is learned from the test tweets too -
# consider fitting on the training split only and transforming the test split.
x_bow = CountVectorizer().fit_transform(data['preprocessed'])  #this is a sparse matrix
y = data['label']

## Train Test Split


In [16]:
# 80/20 train/test split of the bag-of-words features.
# NOTE(review): the TF-IDF, Word2Vec and GloVe sections split with different
# random_state values (and Word2Vec with stratify=y), so accuracies across
# sections are measured on different test tweets - consider one shared split.
x_train_bow , x_test_bow, y_train, y_test = train_test_split(x_bow,y,test_size = 0.2, random_state = 30)

## Fitting the data to a model

In [17]:
classifier = RandomForestClassifier(n_estimators=100)
classifier.fit(x_train_bow,y_train)

## Prediction and Checking accuracy

In [18]:
y_pred_bow = classifier.predict(x_test_bow)

In [19]:
# accuracy_score takes (y_true, y_pred): ground truth first, predictions second.
accuracy_score(y_test, y_pred_bow)

0.8693181818181818

In [20]:
results = multiple_model_fit(x_train_bow,x_test_bow,y_train,y_test)


In [21]:
for key, value in results.items():
    print(key,';', value)


RandomForestClassifier ; {'train_acc': 1.0, 'cross_val_acc': 0.863478204534015, 'test_acc': 0.8674242424242424}
DecisionTreeClassifier ; {'train_acc': 1.0, 'cross_val_acc': 0.8434314590279127, 'test_acc': 0.8396464646464646}
LogisticRegression ; {'train_acc': 0.9804292929292929, 'cross_val_acc': 0.882728332859898, 'test_acc': 0.8768939393939394}
KNeighborsClassifier ; {'train_acc': 0.8746843434343434, 'cross_val_acc': 0.7548980619054027, 'test_acc': 0.7619949494949495}
AdaBoostClassifier ; {'train_acc': 0.7689393939393939, 'cross_val_acc': 0.764359790891105, 'test_acc': 0.7569444444444444}
BaggingClassifier ; {'train_acc': 0.9876893939393939, 'cross_val_acc': 0.8579574506256822, 'test_acc': 0.8680555555555556}
XGBClassifier ; {'train_acc': 0.9187184343434344, 'cross_val_acc': 0.8727899292837173, 'test_acc': 0.88510101010101}


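Random forest, decision tree, bagging and logistic regression reach 98–100% training accuracy but only about 84–88% in cross-validation and on the test set, which indicates overfitting. KNN also drops sharply (about 87% train vs. 76% test), while AdaBoost is consistent but weaker (about 76–77% throughout). XGBoost and logistic regression give the best test accuracy (about 88%).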

# TF-IDF Model

In [22]:
# TF-IDF features built from the preprocessed tweets.
# NOTE(review): the vectorizer is fitted on the whole corpus before the
# train/test split, so both the vocabulary and the IDF weights are learned from
# the test tweets too - consider fitting on the training split only.
x_tfidf = TfidfVectorizer().fit_transform(data['preprocessed'])  #this is a sparse matrix
y = data['label']

## Train test split

In [23]:
# 80/20 train/test split of the TF-IDF features. This overwrites y_train/y_test
# from the bag-of-words section.
# NOTE(review): random_state differs from the bag-of-words (30) and GloVe (0)
# splits, so the sections are evaluated on different test tweets.
x_train_tfidf , x_test_tfidf, y_train, y_test = train_test_split(x_tfidf,y,test_size = 0.2, random_state =42 )

## Model fitting and prediction

In [24]:
# `classifier` is the RandomForestClassifier(n_estimators=100) created in the
# BOW section; this cell relies on that earlier cell having been run.
# Fitting it again here replaces what it learned from the bag-of-words features.
classifier.fit(x_train_tfidf,y_train)
y_pred = classifier.predict(x_test_tfidf)

In [25]:
# accuracy_score takes (y_true, y_pred): ground truth first, predictions second.
accuracy_score(y_test, y_pred)

0.8674242424242424

# Word 2 Vec Model

### Trying another preprocessing method

In [26]:
# Tokenise the raw tweets with gensim's simple_preprocess: every tweet becomes
# a list of lower-case tokens (see the output of the next cell).
x = data['tweet'].apply(simple_preprocess)
y = data['label']

In [27]:
x

id
1                 [fingerprint, pregnancy, test, https, goo, gl, mfqv, android, apps, beautiful, cute, health, igers, iphoneonly, iphonesia, iphone]
2                  [finally, transparant, silicon, case, thanks, to, my, uncle, yay, sony, xperia, sonyexperias, http, instagram, com, yget, jc, jm]
3                       [we, love, this, would, you, go, talk, makememories, unplug, relax, iphone, smartphone, wifi, connect, http, fb, me, lsupcu]
4                                                  [wired, know, george, was, made, that, way, iphone, cute, daventry, home, http, instagr, am, li_]
5                    [what, amazing, service, apple, won, even, talk, to, me, about, question, have, unless, pay, them, for, their, stupid, support]
                                                                            ...                                                                     
7916                                    [live, out, loud, lol, liveoutloud, selfie, smile, sony, music,

### Applying W2V

In [28]:
# Train 100-dimensional Word2Vec embeddings on the tokenised tweets.
# min_count=1 keeps every token, including those that occur only once.
# workers=1 makes training repeatable: with several worker threads the update
# order varies between runs. Bit-identical results across interpreter launches
# additionally need a fixed PYTHONHASHSEED.
w2v = Word2Vec(x, min_count=1, vector_size=100, workers=1)

In [29]:
# Print the first 10 vocabulary entries next to their embedding vectors.
# index_to_key[i] is the word stored at integer index i; wv[i] is used here as
# the vector stored at that same index.
for i in range(10):
    print(w2v.wv.index_to_key[i], w2v.wv[i])

iphone [-0.4677379   1.0345596   0.16269268  0.3365102  -0.02488126 -1.2513467
  0.9100487   3.2390327  -1.4216112  -0.9046809  -0.14559744 -1.5344481
 -0.3128335   0.41446143 -0.12406942 -0.7105876   0.6762021  -1.1055255
 -0.818228   -2.217816    0.81295246  0.27484772  0.9846966  -0.6422678
 -0.32317358  0.16323045 -0.5527937  -0.5150499  -0.03225864  0.7850146
  1.0603211   0.08075164  0.5548768  -1.7936219  -0.35320923  1.142362
  0.1726364  -0.71893185 -0.63620234 -2.076524    0.2609849  -0.41180578
 -0.32335415  0.55975825  0.61651623 -0.05456543 -1.2998763  -0.04069421
  0.81606174  0.91721505  0.23087724 -1.0206473  -0.2810538   0.139958
 -0.6365948  -0.04925267  0.66202253 -0.25269008 -0.9787175   0.7062921
 -0.12404737  0.040871    0.34122884 -0.26930675 -1.8728644   1.1735011
  0.2651269   0.8629392  -1.136658    0.90629184 -0.4684563   0.358729
  1.118378   -0.45654884  1.0941231  -0.32067513  0.6411822   0.04849484
 -1.1220155   0.21352488 -1.0930827  -0.42299268 -1.25327

In [30]:
# Represent every tweet by the mean of the Word2Vec vectors of its tokens.

# key_to_index is a dict, so each membership test is O(1); testing against the
# index_to_key list would scan the whole vocabulary for every token.
words = w2v.wv.key_to_index

doc_vectors = []
for doc in x:
    known_vectors = [w2v.wv[word] for word in doc if word in words]
    if known_vectors:
        doc_vectors.append(np.mean(known_vectors, axis=0))
    else:
        # The tweet has no token in the vocabulary (e.g. it tokenised to an empty
        # list): use a zero vector so the row keeps the same length instead of
        # turning into NaN.
        doc_vectors.append(np.zeros(w2v.wv.vector_size, dtype=np.float32))

x_w2v = np.array(doc_vectors)
x_w2v.shape



(7920, 100)

### Train test split

In [31]:
# 80/20 train/test split of the averaged Word2Vec features, stratified on the label.
# NOTE(review): this is the only split in the notebook that uses stratify=y, and
# the bag-of-words (30) and GloVe (0) splits use other random_state values, so
# the sections are evaluated on different test tweets.
x_train_w2v , x_test_w2v, y_train, y_test = train_test_split(x_w2v,y,test_size = 0.2, random_state = 42,stratify=y)
x_train_w2v.shape

(6336, 100)

### Model fitting and Prediction

In [32]:
classifier.fit(x_train_w2v,y_train)
y_pred = classifier.predict(x_test_w2v)

In [33]:
# accuracy_score takes (y_true, y_pred): ground truth first, predictions second.
accuracy_score(y_test, y_pred)

0.8636363636363636

In [34]:
results = multiple_model_fit(x_train_w2v,x_test_w2v,y_train,y_test)

In [35]:
for key,value in results.items():
    print(key,':',value)

RandomForestClassifier : {'train_acc': 0.9996843434343434, 'cross_val_acc': 0.8628465421781014, 'test_acc': 0.8674242424242424}
DecisionTreeClassifier : {'train_acc': 1.0, 'cross_val_acc': 0.8154927963082015, 'test_acc': 0.82260101010101}
LogisticRegression : {'train_acc': 0.8726325757575758, 'cross_val_acc': 0.8727899292837173, 'test_acc': 0.8712121212121212}
KNeighborsClassifier : {'train_acc': 0.8915719696969697, 'cross_val_acc': 0.8518005491849439, 'test_acc': 0.8529040404040404}
AdaBoostClassifier : {'train_acc': 0.8672664141414141, 'cross_val_acc': 0.8655286777201351, 'test_acc': 0.865530303030303}
BaggingClassifier : {'train_acc': 0.9895833333333334, 'cross_val_acc': 0.8472179446927903, 'test_acc': 0.8566919191919192}
XGBClassifier : {'train_acc': 1.0, 'cross_val_acc': 0.8622151289986595, 'test_acc': 0.865530303030303}


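Random forest, decision tree, bagging and XGBoost reach 99–100% training accuracy but only about 82–87% in cross-validation and on the test set, which indicates overfitting. Logistic regression, AdaBoost and KNN show a much smaller train/test gap (at most about 4 points); logistic regression gives the best test accuracy (about 87%).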

# Pre Trained Models


In [36]:
glove_vectors = gensim.downloader.load('glove-twitter-25')

In [37]:
# Converting the words to vectors and taking avg to represent sentences

# key_to_index is a dict, so each membership test is O(1); testing against the
# index_to_key list would scan the entire pretrained vocabulary for every token.
words = glove_vectors.key_to_index

doc_vectors = []
for doc in x:
    known_vectors = [glove_vectors[word] for word in doc if word in words]
    if known_vectors:
        doc_vectors.append(np.mean(known_vectors, axis=0))
    else:
        # None of the tweet's tokens has a pretrained vector: use a zero vector so
        # the row keeps the same length instead of turning into NaN.
        doc_vectors.append(np.zeros(glove_vectors.vector_size, dtype=np.float32))

x_glove = np.array(doc_vectors)
x_glove.shape

(7920, 25)

### Train test split

In [38]:
# 80/20 train/test split of the averaged GloVe features. This overwrites
# y_train/y_test from the Word2Vec section.
# NOTE(review): random_state differs from the bag-of-words (30) and
# TF-IDF/Word2Vec (42) splits, so the sections are evaluated on different test tweets.
x_train_glove , x_test_glove, y_train, y_test = train_test_split(x_glove,y,test_size =0.2 , random_state = 0)

### Model fitting and prediction

In [39]:
# Re-fit the random forest on the GloVe features and score it on the test split.
classifier.fit(x_train_glove, y_train)
y_pred = classifier.predict(x_test_glove)
# accuracy_score takes (y_true, y_pred): ground truth first, predictions second.
accuracy_score(y_test, y_pred)

0.8863636363636364

In [40]:
results = multiple_model_fit(x_train_glove,x_test_glove,y_train,y_test)

In [41]:
for key,value in results.items():
    print(key,':',value)

RandomForestClassifier : {'train_acc': 0.9996843434343434, 'cross_val_acc': 0.8781579380148609, 'test_acc': 0.8787878787878788}
DecisionTreeClassifier : {'train_acc': 0.9998421717171717, 'cross_val_acc': 0.8125039245294303, 'test_acc': 0.8314393939393939}
LogisticRegression : {'train_acc': 0.8849431818181818, 'cross_val_acc': 0.8836794394526091, 'test_acc': 0.8838383838383839}
KNeighborsClassifier : {'train_acc': 0.9078282828282829, 'cross_val_acc': 0.8694798690328465, 'test_acc': 0.875}
AdaBoostClassifier : {'train_acc': 0.8779987373737373, 'cross_val_acc': 0.867111197492288, 'test_acc': 0.8699494949494949}
BaggingClassifier : {'train_acc': 0.9917929292929293, 'cross_val_acc': 0.8593772581617752, 'test_acc': 0.8705808080808081}
XGBClassifier : {'train_acc': 0.9998421717171717, 'cross_val_acc': 0.8789460831950404, 'test_acc': 0.8781565656565656}


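Random forest, decision tree, bagging and XGBoost reach 99–100% training accuracy but only about 81–88% in cross-validation and on the test set, which indicates overfitting. Logistic regression, AdaBoost and KNN show a much smaller train/test gap (at most about 3 points); logistic regression gives the best test accuracy (about 88%).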