In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk

In [7]:
# Load the labelled tweets; the path is relative to the notebook's working directory.
data = pd.read_csv("./../datasets/sentiment_analysis.csv")
# Bare last expression: the notebook renders the frame as this cell's output.
data

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...
1,2,0,Finally a transparant silicon case ^^ Thanks t...
2,3,0,We love this! Would you go? #talk #makememorie...
3,4,0,I'm wired I know I'm George I was made that wa...
4,5,1,What amazing service! Apple won't even talk to...
...,...,...,...
7915,7916,0,Live out loud #lol #liveoutloud #selfie #smile...
7916,7917,0,We would like to wish you an amazing day! Make...
7917,7918,0,Helping my lovely 90 year old neighbor with he...
7918,7919,0,Finally got my #smart #pocket #wifi stay conne...


In [8]:
# (rows, columns) of the freshly loaded frame.
data.shape

(7920, 3)

In [9]:
# Number of rows that are exact duplicates of an earlier row.
data.duplicated().sum()

0

In [10]:
# Missing-value count per column.
data.isnull().sum()

id       0
label    0
tweet    0
dtype: int64

In [11]:
import re
import string

In [12]:
def to_lowercase(text):
    """Return ``text`` lower-cased, with every run of whitespace collapsed to one space.

    Leading and trailing whitespace is dropped as a side effect of the
    split/join round trip.
    """
    return " ".join(text.lower().split())
    

In [13]:
# Preview the tweet column before the cleaning steps below overwrite it.
data["tweet"]

0       #fingerprint #Pregnancy Test https://goo.gl/h1...
1       Finally a transparant silicon case ^^ Thanks t...
2       We love this! Would you go? #talk #makememorie...
3       I'm wired I know I'm George I was made that wa...
4       What amazing service! Apple won't even talk to...
                              ...                        
7915    Live out loud #lol #liveoutloud #selfie #smile...
7916    We would like to wish you an amazing day! Make...
7917    Helping my lovely 90 year old neighbor with he...
7918    Finally got my #smart #pocket #wifi stay conne...
7919    Apple Barcelona!!! #Apple #Store #BCN #Barcelo...
Name: tweet, Length: 7920, dtype: object

In [14]:
# Lower-case every tweet and normalise its whitespace; the column is overwritten in place.
data["tweet"] = data["tweet"].apply(to_lowercase)

In [15]:
# Spot-check the first rows after lower-casing.
data.head()

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #pregnancy test https://goo.gl/h1...
1,2,0,finally a transparant silicon case ^^ thanks t...
2,3,0,we love this! would you go? #talk #makememorie...
3,4,0,i'm wired i know i'm george i was made that wa...
4,5,1,what amazing service! apple won't even talk to...


In [16]:
def remove_link(text):
    """Return ``text`` without any whitespace-separated token that contains an http(s) URL.

    A token is dropped as a whole when the URL pattern matches anywhere inside
    it; the surviving tokens are re-joined with single spaces.
    """
    url_regex = r'https?://(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)'
    kept_tokens = [token for token in text.split() if re.search(url_regex, token) is None]
    return " ".join(kept_tokens)

In [17]:
# Drop URL tokens from every tweet; the column is overwritten in place.
data["tweet"] = data["tweet"].apply(remove_link)

In [18]:
# Show the frame after link removal.
data

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #pregnancy test #android #apps #b...
1,2,0,finally a transparant silicon case ^^ thanks t...
2,3,0,we love this! would you go? #talk #makememorie...
3,4,0,i'm wired i know i'm george i was made that wa...
4,5,1,what amazing service! apple won't even talk to...
...,...,...,...
7915,7916,0,live out loud #lol #liveoutloud #selfie #smile...
7916,7917,0,we would like to wish you an amazing day! make...
7917,7918,0,helping my lovely 90 year old neighbor with he...
7918,7919,0,finally got my #smart #pocket #wifi stay conne...


In [19]:
def remove_punctuations(text):
    """Return ``text`` with every character of ``string.punctuation`` deleted.

    Only the ASCII punctuation listed in ``string.punctuation`` is removed;
    other symbols (for example the Unicode ellipsis) are left untouched.
    """
    # A translation table mapping each punctuation character to "delete".
    deletion_table = str.maketrans("", "", string.punctuation)
    return text.translate(deletion_table)

In [20]:
# Strip ASCII punctuation from every tweet; the column is overwritten in place.
data["tweet"] = data["tweet"].apply(remove_punctuations)

In [21]:
# Show the frame after punctuation removal.
data

Unnamed: 0,id,label,tweet
0,1,0,fingerprint pregnancy test android apps beauti...
1,2,0,finally a transparant silicon case thanks to ...
2,3,0,we love this would you go talk makememories un...
3,4,0,im wired i know im george i was made that way ...
4,5,1,what amazing service apple wont even talk to m...
...,...,...,...
7915,7916,0,live out loud lol liveoutloud selfie smile son...
7916,7917,0,we would like to wish you an amazing day make ...
7917,7918,0,helping my lovely 90 year old neighbor with he...
7918,7919,0,finally got my smart pocket wifi stay connecte...


In [22]:
# Delete every run of digits from the tweets (the column is overwritten in place).
# The pattern is a raw string: in a normal string literal "\d" is an invalid
# escape sequence, which Python warns about and plans to reject.
data["tweet"] = data["tweet"].str.replace(r"\d+", "", regex=True)

In [23]:
# Show the frame after digit removal.
data

Unnamed: 0,id,label,tweet
0,1,0,fingerprint pregnancy test android apps beauti...
1,2,0,finally a transparant silicon case thanks to ...
2,3,0,we love this would you go talk makememories un...
3,4,0,im wired i know im george i was made that way ...
4,5,1,what amazing service apple wont even talk to m...
...,...,...,...
7915,7916,0,live out loud lol liveoutloud selfie smile son...
7916,7917,0,we would like to wish you an amazing day make ...
7917,7918,0,helping my lovely year old neighbor with her ...
7918,7919,0,finally got my smart pocket wifi stay connecte...


In [24]:
# Fetch the NLTK stopwords corpus into ./../static/model so it can be read straight from that directory.
nltk.download('stopwords', download_dir="./../static/model")
    

[nltk_data] Downloading package stopwords to ./../static/model...
[nltk_data]   Package stopwords is already up-to-date!


True

In [25]:
# Read the English stopword list (one word per line) from the corpus directory
# that nltk.download populated. The encoding is given explicitly so decoding
# does not depend on the machine's locale default.
# NOTE(review): entries containing apostrophes (e.g. "you're") cannot match tweets
# whose punctuation has already been stripped — confirm whether that is intended.
with open("./../static/model/corpora/stopwords/english", "r", encoding="utf-8") as file:
    sw = file.read().splitlines()
sw

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [26]:
def remove_stopwords(text):
    """Return ``text`` without the whitespace-separated tokens found in the global ``sw``.

    Surviving tokens keep their order and are re-joined with single spaces.
    """
    kept_tokens = [token for token in text.split() if token not in sw]
    return " ".join(kept_tokens)

In [27]:
# Drop stopwords from every tweet; the column is overwritten in place.
data["tweet"] = data["tweet"].apply(remove_stopwords)

In [28]:
# Show the frame after stopword removal.
data

Unnamed: 0,id,label,tweet
0,1,0,fingerprint pregnancy test android apps beauti...
1,2,0,finally transparant silicon case thanks uncle ...
2,3,0,love would go talk makememories unplug relax i...
3,4,0,im wired know im george made way iphone cute d...
4,5,1,amazing service apple wont even talk question ...
...,...,...,...
7915,7916,0,live loud lol liveoutloud selfie smile sony mu...
7916,7917,0,would like wish amazing day make every minute ...
7917,7918,0,helping lovely year old neighbor ipad morning ...
7918,7919,0,finally got smart pocket wifi stay connected a...


In [29]:
from nltk.stem import WordNetLemmatizer
# Lemmatizer instance used by lemmatize_text below.
wnl = WordNetLemmatizer()
# Fetch the WordNet corpus into the same project directory as the stopwords.
nltk.download('wordnet', download_dir="./../static/model")

[nltk_data] Downloading package wordnet to ./../static/model...
[nltk_data]   Package wordnet is already up-to-date!


True

In [30]:
# Register the custom download directory on NLTK's search path so corpora stored there can be found.
wordnet_path = "./../static/model/"
nltk.data.path.append(wordnet_path)

In [31]:
def lemmatize_text(text):
    """Return ``text`` with each whitespace-separated token passed through ``wnl.lemmatize``.

    Tokens keep their order and are re-joined with single spaces.
    """
    return " ".join(wnl.lemmatize(token) for token in text.split())

In [32]:
# Lemmatize every token of every tweet; the column is overwritten in place.
data["tweet"] = data["tweet"].apply(lemmatize_text)

In [33]:
# Show the fully cleaned frame.
data

Unnamed: 0,id,label,tweet
0,1,0,fingerprint pregnancy test android apps beauti...
1,2,0,finally transparant silicon case thanks uncle ...
2,3,0,love would go talk makememories unplug relax i...
3,4,0,im wired know im george made way iphone cute d...
4,5,1,amazing service apple wont even talk question ...
...,...,...,...
7915,7916,0,live loud lol liveoutloud selfie smile sony mu...
7916,7917,0,would like wish amazing day make every minute ...
7917,7918,0,helping lovely year old neighbor ipad morning ...
7918,7919,0,finally got smart pocket wifi stay connected a...


# Vocabulary Building

In [34]:
from collections import Counter
# Token -> occurrence count across all cleaned tweets; filled by the loop in the next cell.
vocabulary = Counter()

In [35]:
# Count every whitespace-separated token of every tweet.
# Re-running this cell without re-creating `vocabulary` adds the counts again.
for tweet in data["tweet"]:
    vocabulary.update(tweet.split())

In [36]:
# Inspect the token counts.
vocabulary

Counter({'iphone': 3943,
         'apple': 2875,
         'samsung': 1385,
         'new': 1137,
         'phone': 1014,
         'sony': 818,
         '…': 751,
         'follow': 719,
         'ipad': 522,
         'love': 451,
         'like': 444,
         'day': 426,
         'life': 418,
         'android': 414,
         'photo': 394,
         'io': 383,
         'rt': 378,
         'galaxy': 360,
         'instagram': 346,
         'case': 342,
         'cute': 321,
         'get': 317,
         'gain': 311,
         'today': 309,
         'im': 294,
         'photography': 292,
         'back': 289,
         'got': 280,
         'fun': 277,
         'news': 265,
         'music': 260,
         'time': 245,
         'app': 242,
         'p': 236,
         'happy': 235,
         'work': 230,
         'beautiful': 227,
         'instagood': 226,
         'smile': 226,
         'funny': 223,
         'one': 221,
         'girl': 220,
         'lol': 215,
         'fashion': 215,
  

In [37]:
# Confirm the cleaning steps did not change the frame's dimensions.
data.shape

(7920, 3)

In [38]:
# Keep only tokens seen more than 10 times; the order follows the Counter's iteration order.
tokens = [word for word, count in vocabulary.items() if count > 10]

In [39]:
# Size of the retained vocabulary, i.e. the width of each feature vector built from `tokens`.
len(tokens)

1124

In [40]:
def save_vocabulary(vocab, filename):
    """Write the vocabulary to ``filename`` as UTF-8 text, one token per line.

    Args:
        vocab: Iterable of string tokens, written in iteration order.
        filename: Destination path; an existing file is overwritten.

    Tokens are separated by newline characters and no trailing newline is
    written, so an empty ``vocab`` produces an empty file.
    """
    contents = "\n".join(vocab)
    # The context manager closes the handle even if the write raises.
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(contents)

# Persist the retained tokens; the line order in the file matches the column order of the feature vectors.
save_vocabulary(tokens, "./../static/vocabulary.txt")

# Vectorizing

In [41]:
from sklearn.model_selection import train_test_split

In [42]:
# Features: cleaned tweet text. Target: the label column.
x = data["tweet"]
y = data["label"]

In [43]:
# Hold out 20% of the tweets for evaluation. The fixed random_state makes the
# split, and therefore every score reported below, reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [44]:
def vectorize(dataset, vocabulary):
    """Encode each tweet as a binary bag-of-words row over ``vocabulary``.

    Args:
        dataset: Iterable of strings (one per tweet); each is split on whitespace.
        vocabulary: Sequence of tokens; column ``i`` of the result corresponds
            to ``vocabulary[i]``.

    Returns:
        ``np.ndarray`` of dtype ``float32`` with one row per tweet and one
        column per vocabulary entry. Entry ``[r, i]`` is 1.0 when
        ``vocabulary[i]`` occurs among the tokens of tweet ``r``, else 0.0.
        An empty ``dataset`` yields an array of shape ``(0, len(vocabulary))``.
    """
    vocab_size = len(vocabulary)
    vectors = []
    for tweet in dataset:
        # A set gives O(1) membership tests; a list would be rescanned for
        # every vocabulary entry.
        words = set(tweet.split())
        row = np.zeros(vocab_size, dtype=np.float32)
        for i in range(vocab_size):
            if vocabulary[i] in words:
                row[i] = 1
        vectors.append(row)

    if not vectors:
        # Keep the feature dimension even when there is nothing to encode.
        return np.zeros((0, vocab_size), dtype=np.float32)
    return np.array(vectors, dtype=np.float32)
                

In [45]:
# Encode the training tweets against the retained vocabulary.
x_train_vectorized = vectorize(x_train, tokens)

In [46]:
# Inspect the encoded training matrix.
x_train_vectorized

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [47]:
# Encode the held-out tweets with the same vocabulary so the columns line up with the training matrix.
x_test_vectorized = vectorize(x_test, tokens)

In [48]:
# Inspect the encoded test matrix.
x_test_vectorized

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [49]:
# Class balance of the training labels.
y_train.value_counts()

label
0    4724
1    1612
Name: count, dtype: int64

In [50]:
from imblearn.over_sampling import SMOTE
# Oversample the training split only; the test split is left as drawn.
# The fixed random_state makes the resampled set, and the scores that depend
# on it, reproducible across runs.
smote = SMOTE(random_state=42)
vectorized_x_train_smote, y_train_smote = smote.fit_resample(x_train_vectorized, y_train)
print(vectorized_x_train_smote.shape, y_train_smote.shape)

(9448, 1124) (9448,)


In [51]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [52]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

def _print_scores(title, y_act, y_pred):
    """Print accuracy, precision, recall and F1 (each rounded to 3 places) under ``title``."""
    acc = round(accuracy_score(y_act, y_pred), 3)
    pr = round(precision_score(y_act, y_pred), 3)
    rec = round(recall_score(y_act, y_pred), 3)
    f1 = round(f1_score(y_act, y_pred), 3)
    print(f'{title}:\n\tAccuracy = {acc}\n\tPrecision = {pr}\n\tRecall = {rec}\n\tF1-Score = {f1}')


def training_scores(y_act, y_pred):
    """Print the metric report for predictions made on the training data.

    Args:
        y_act: True labels.
        y_pred: Predicted labels, aligned with ``y_act``.
    """
    _print_scores('Training Scores', y_act, y_pred)


def validation_scores(y_act, y_pred):
    """Print the metric report for predictions made on the held-out data.

    Args:
        y_act: True labels.
        y_pred: Predicted labels, aligned with ``y_act``.
    """
    _print_scores('Testing Scores', y_act, y_pred)

In [53]:
# Logistic regression with library defaults, fitted on the SMOTE-resampled training set.
lr = LogisticRegression()
lr.fit(vectorized_x_train_smote, y_train_smote)

# Report on the data the model was fitted on ...
y_train_pred = lr.predict(vectorized_x_train_smote)
training_scores(y_train_smote, y_train_pred)

# ... and then on the held-out split.
y_test_pred = lr.predict(x_test_vectorized)
validation_scores(y_test, y_test_pred)

Training Scores:
	Accuracy = 0.942
	Precision = 0.917
	Recall = 0.973
	F1-Score = 0.944
Testing Scores:
	Accuracy = 0.852
	Precision = 0.681
	Recall = 0.819
	F1-Score = 0.743


In [54]:
# Multinomial naive Bayes with library defaults, fitted on the SMOTE-resampled training set.
mnb = MultinomialNB()
mnb.fit(vectorized_x_train_smote, y_train_smote)

# Report on the data the model was fitted on ...
y_train_pred = mnb.predict(vectorized_x_train_smote)
training_scores(y_train_smote, y_train_pred)

# ... and then on the held-out split.
y_test_pred = mnb.predict(x_test_vectorized)
validation_scores(y_test, y_test_pred)

Training Scores:
	Accuracy = 0.905
	Precision = 0.869
	Recall = 0.955
	F1-Score = 0.91
Testing Scores:
	Accuracy = 0.852
	Precision = 0.655
	Recall = 0.92
	F1-Score = 0.765


In [55]:
# Decision tree fitted on the SMOTE-resampled training set. The fixed
# random_state makes the fitted tree and its scores reproducible across runs.
dt = DecisionTreeClassifier(random_state=42)

dt.fit(vectorized_x_train_smote, y_train_smote)

y_train_pred = dt.predict(vectorized_x_train_smote)

y_test_pred = dt.predict(x_test_vectorized)

training_scores(y_train_smote, y_train_pred)

validation_scores(y_test, y_test_pred)

Training Scores:
	Accuracy = 0.999
	Precision = 1.0
	Recall = 0.999
	F1-Score = 0.999
Testing Scores:
	Accuracy = 0.813
	Precision = 0.646
	Recall = 0.63
	F1-Score = 0.638


In [56]:
# Random forest fitted on the SMOTE-resampled training set. The fixed
# random_state makes the fitted forest and its scores reproducible across runs.
rf = RandomForestClassifier(random_state=42)

rf.fit(vectorized_x_train_smote, y_train_smote)

y_train_pred = rf.predict(vectorized_x_train_smote)

y_test_pred = rf.predict(x_test_vectorized)

training_scores(y_train_smote, y_train_pred)

validation_scores(y_test, y_test_pred)

Training Scores:
	Accuracy = 0.999
	Precision = 0.999
	Recall = 0.999
	F1-Score = 0.999
Testing Scores:
	Accuracy = 0.846
	Precision = 0.716
	Recall = 0.681
	F1-Score = 0.698


In [59]:
import pickle

# Persist the fitted logistic-regression model `lr` alongside the NLTK data.
# NOTE: pickle files execute code when loaded — only unpickle this file from a trusted location.
with open('./../static/model/model.pickle', 'wb') as file:
    pickle.dump(lr, file)