In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# laod the data file
data = pd.read_csv('../Artifact/sentiment_analysis.csv')

In [3]:
data.head()


Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...
1,2,0,Finally a transparant silicon case ^^ Thanks t...
2,3,0,We love this! Would you go? #talk #makememorie...
3,4,0,I'm wired I know I'm George I was made that wa...
4,5,1,What amazing service! Apple won't even talk to...


## Data Preprocessing

In [4]:
data.shape


(7920, 3)

In [5]:
data.duplicated().sum()

0

In [6]:
data.isnull().sum()

id       0
label    0
tweet    0
dtype: int64

#### Text Preprocessing

In [7]:
import re
import string

In [8]:
data["tweet"].head(5)

0    #fingerprint #Pregnancy Test https://goo.gl/h1...
1    Finally a transparant silicon case ^^ Thanks t...
2    We love this! Would you go? #talk #makememorie...
3    I'm wired I know I'm George I was made that wa...
4    What amazing service! Apple won't even talk to...
Name: tweet, dtype: object

In [9]:
#convert upercase to lowercase
data ["tweet"] = data["tweet"].apply(lambda x: " ". join(x.lower() for x in x.split()))

In [10]:
data["tweet"].head(5)

0    #fingerprint #pregnancy test https://goo.gl/h1...
1    finally a transparant silicon case ^^ thanks t...
2    we love this! would you go? #talk #makememorie...
3    i'm wired i know i'm george i was made that wa...
4    what amazing service! apple won't even talk to...
Name: tweet, dtype: object

In [11]:
# remove links
data ["tweet"] = data["tweet"].apply(lambda x: " ". join(re.sub(r'^https?:\/\/.*[\r\n]*', '',x, flags = re.MULTILINE) for x in x.split()))

In [12]:
data["tweet"].head(5)

0    #fingerprint #pregnancy test  #android #apps #...
1    finally a transparant silicon case ^^ thanks t...
2    we love this! would you go? #talk #makememorie...
3    i'm wired i know i'm george i was made that wa...
4    what amazing service! apple won't even talk to...
Name: tweet, dtype: object

In [13]:
#remove puncuations

def remove_punctuation(text):
    for punctuation in string.punctuation:
        text = text.replace(punctuation, '')
    return text

data["tweet"] = data["tweet"].apply(remove_punctuation)

In [14]:
data["tweet"].head(5)

0    fingerprint pregnancy test  android apps beaut...
1    finally a transparant silicon case  thanks to ...
2    we love this would you go talk makememories un...
3    im wired i know im george i was made that way ...
4    what amazing service apple wont even talk to m...
Name: tweet, dtype: object

In [15]:
data["tweet"].tail(10)

7910    perfect match instagood applewatch red instagr...
7911    i am completely in love with the new iphone em...
7912    tune in turn on drop out  gtd in one app  mobi...
7913    ok so my galaxy crashed after one day now i ha...
7914    gain followers rt this must follow me i follow...
7915    live out loud lol liveoutloud selfie smile son...
7916    we would like to wish you an amazing day make ...
7917    helping my lovely 90 year old neighbor with he...
7918    finally got my smart pocket wifi stay connecte...
7919    apple barcelona apple store bcn barcelona trav...
Name: tweet, dtype: object

In [16]:
#remove numbers
data["tweet"] = data["tweet"].str.replace('\d+','',regex=True)

In [17]:
data["tweet"].tail(10)

7910    perfect match instagood applewatch red instagr...
7911    i am completely in love with the new iphone em...
7912    tune in turn on drop out  gtd in one app  mobi...
7913    ok so my galaxy crashed after one day now i ha...
7914    gain followers rt this must follow me i follow...
7915    live out loud lol liveoutloud selfie smile son...
7916    we would like to wish you an amazing day make ...
7917    helping my lovely  year old neighbor with her ...
7918    finally got my smart pocket wifi stay connecte...
7919    apple barcelona apple store bcn barcelona trav...
Name: tweet, dtype: object

### remove stopwords

In [18]:
import nltk

In [20]:
nltk.download('stopwords',download_dir='../Static/Model')

[nltk_data] Downloading package stopwords to ../Static/Model...
[nltk_data]   Package stopwords is already up-to-date!


True

In [21]:
with open('../Static/Model\corpora\stopwords/english','r') as file: 
  sw = file.read().splitlines()

In [22]:
sw

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [23]:
data["tweet"]= data["tweet"].apply(lambda x: " ".join(x for x in x.split() if x not in sw))

In [24]:
data["tweet"].head()

0    fingerprint pregnancy test android apps beauti...
1    finally transparant silicon case thanks uncle ...
2    love would go talk makememories unplug relax i...
3    im wired know im george made way iphone cute d...
4    amazing service apple wont even talk question ...
Name: tweet, dtype: object

###### stemming

In [25]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()

In [26]:
data["tweet"]= data["tweet"].apply(lambda x: " ".join(ps.stem(x) for x in x.split()))

In [27]:
data["tweet"].head()

0    fingerprint pregnanc test android app beauti c...
1    final transpar silicon case thank uncl yay son...
2    love would go talk makememori unplug relax iph...
3    im wire know im georg made way iphon cute dave...
4    amaz servic appl wont even talk question unles...
Name: tweet, dtype: object

### Building Vocabulary

In [29]:
from collections import Counter
vocab = Counter()

In [30]:
for sentence in data["tweet"]:
  vocab.update(sentence.split())

In [32]:
len(vocab)

15949

In [33]:
tokens = [key for key in vocab if vocab[key] > 10]

In [34]:
len(tokens)

1145

In [35]:
def save_vocabulary(lines,filename):
  data= '\n'.join(lines)
  file = open(filename,'w', encoding="utf-8")
  file.write(data)
  file.close()
save_vocabulary(tokens, r'E:\MY_PROJECTS\sentiment_analysis-ML-project\Static\Model/Vocabulary.txt')


### Divide DataSet

In [36]:
X= data["tweet"]
Y=data["label"]


In [37]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test= train_test_split(X,Y,test_size=0.2)

In [38]:
x_train.shape

(6336,)

In [39]:
x_test.shape

(1584,)

In [40]:
y_train.shape

(6336,)

In [41]:
y_test.shape

(1584,)

### Vectorization

In [43]:
def vectorizer(ds, vocabulary):
    vectorized_lst = []

    for sentence in ds:
        sentence_lst = np.zeros(len(vocabulary))

        for i in range(len(vocabulary)):
            if vocabulary[i] in sentence.split():
                sentence_lst[i] = 1

        vectorized_lst.append(sentence_lst)

    vectorized_lst_new = np.asarray(vectorized_lst, dtype=np.float32)

    return vectorized_lst_new

In [44]:
vectorized_x_train = vectorizer(x_train,tokens)

In [46]:
vectorized_x_test = vectorizer(x_test,tokens)

In [47]:
y_train.value_counts()

label
0    4721
1    1615
Name: count, dtype: int64

##### vectorization of labels

In [48]:
from sklearn.preprocessing import LabelBinarizer

label_binarizer = LabelBinarizer()

vectorized_y_train = label_binarizer.fit_transform(y_train)
vectorized_y_test = label_binarizer.transform(y_test)



In [49]:
print(vectorized_y_train.shape)

(6336, 1)


In [50]:
y_train.value_counts()

label
0    4721
1    1615
Name: count, dtype: int64

In [51]:
class_counts = y_train.value_counts()

### Handled Imbalanced Data

In [52]:
print(vectorized_x_train.shape, y_train.shape)

(6336, 1145) (6336,)


In [53]:
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE


vectorizer = TfidfVectorizer()
vectorized_x_train = vectorizer.fit_transform(x_train)
print(vectorized_x_train.shape, y_train.shape)

smote = SMOTE()
vectorized_x_train_smote, y_train_smote = smote.fit_resample(vectorized_x_train, y_train)

print(vectorized_x_train_smote.shape, y_train_smote.shape)


(6336, 13244) (6336,)
(9442, 13244) (9442,)


In [54]:
y_train_smote.value_counts()

label
0    4721
1    4721
Name: count, dtype: int64

In [55]:
vectorized_x_train_smote

<9442x13244 sparse matrix of type '<class 'numpy.float64'>'
	with 118127 stored elements in Compressed Sparse Row format>

In [56]:
y_train_smote

0       0
1       1
2       0
3       0
4       1
       ..
9437    1
9438    1
9439    1
9440    1
9441    1
Name: label, Length: 9442, dtype: int64

In [57]:
vectorized_x_test

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [58]:
y_test  

6873    0
5059    0
5413    0
7436    0
5710    0
       ..
4573    0
4775    0
6779    0
317     0
652     0
Name: label, Length: 1584, dtype: int64

## Model Training and Evaluation

In [59]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC 

In [62]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

def print_scores(prefix, y_act, y_pred):
    acc = round(accuracy_score(y_act, y_pred), 3)
    pr = round(precision_score(y_act, y_pred), 3)
    rec = round(recall_score(y_act, y_pred), 3)
    f1 = round(f1_score(y_act, y_pred), 3)

    print(f'{prefix} Scores:\n\tAccuracy = {acc}\n\tPrecision = {pr}\n\tRecall = {rec}\n\tF1.Score = {f1}')

def training_scores(y_train, y_train_pred):
    print_scores('Training', y_train, y_train_pred)

def validation_scores(y_test, y_test_pred):
    print_scores('Validation', y_test, y_test_pred)


## Logistic Regression

In [63]:
lr= LogisticRegression()
lr.fit(vectorized_x_train_smote, y_train_smote);

y_train_pred = lr.predict(vectorized_x_train_smote)
training_scores(y_train_smote, y_train_pred)

vectorizer = TfidfVectorizer()
vectorized_x_train = vectorizer.fit_transform(x_train)
vectorized_x_test = vectorizer.transform(x_test)
smote_test = SMOTE()
vectorized_x_test_smote, y_test_smote = smote_test.fit_resample(vectorized_x_test, y_test)

y_test_pred = lr.predict(vectorized_x_test)
validation_scores(y_test, y_test_pred)



Training Scores:
	Accuracy = 0.955
	Precision = 0.938
	Recall = 0.974
	F1.Score = 0.956
Validation Scores:
	Accuracy = 0.879
	Precision = 0.73
	Recall = 0.849
	F1.Score = 0.785


## Naive Bayes

In [64]:

mnb= MultinomialNB()
mnb.fit(vectorized_x_train_smote, y_train_smote);

y_train_pred = mnb.predict(vectorized_x_train_smote)
training_scores(y_train_smote, y_train_pred)

vectorizer = TfidfVectorizer()
vectorized_x_train = vectorizer.fit_transform(x_train)
vectorized_x_test = vectorizer.transform(x_test)
smote_test = SMOTE()
vectorized_x_test_smote, y_test_smote = smote_test.fit_resample(vectorized_x_test, y_test)

y_test_pred = mnb.predict(vectorized_x_test)
validation_scores(y_test, y_test_pred)

Training Scores:
	Accuracy = 0.937
	Precision = 0.893
	Recall = 0.994
	F1.Score = 0.94
Validation Scores:
	Accuracy = 0.866
	Precision = 0.67
	Recall = 0.956
	F1.Score = 0.788


## Decision Trees

In [65]:
dt= DecisionTreeClassifier()
dt.fit(vectorized_x_train_smote, y_train_smote);

y_train_pred = dt.predict(vectorized_x_train_smote)
training_scores(y_train_smote, y_train_pred)

vectorizer = TfidfVectorizer()
vectorized_x_train = vectorizer.fit_transform(x_train)
vectorized_x_test = vectorizer.transform(x_test)
smote_test = SMOTE()
vectorized_x_test_smote, y_test_smote = smote_test.fit_resample(vectorized_x_test, y_test)

y_test_pred = dt.predict(vectorized_x_test)
validation_scores(y_test, y_test_pred)


Training Scores:
	Accuracy = 1.0
	Precision = 1.0
	Recall = 0.999
	F1.Score = 1.0
Validation Scores:
	Accuracy = 0.829
	Precision = 0.661
	Recall = 0.701
	F1.Score = 0.68


## Random Forest

In [66]:
rf= RandomForestClassifier()
rf.fit(vectorized_x_train_smote, y_train_smote);

y_train_pred = rf.predict(vectorized_x_train_smote)
training_scores(y_train_smote, y_train_pred)

vectorizer = TfidfVectorizer()
vectorized_x_train = vectorizer.fit_transform(x_train)
vectorized_x_test = vectorizer.transform(x_test)
smote_test = SMOTE()
vectorized_x_test_smote, y_test_smote = smote_test.fit_resample(vectorized_x_test, y_test)

y_test_pred = rf.predict(vectorized_x_test)
validation_scores(y_test, y_test_pred)

Training Scores:
	Accuracy = 1.0
	Precision = 1.0
	Recall = 1.0
	F1.Score = 1.0
Validation Scores:
	Accuracy = 0.89
	Precision = 0.791
	Recall = 0.783
	F1.Score = 0.787


## Support Vector Machine

In [67]:
svm= SVC()
svm.fit(vectorized_x_train_smote, y_train_smote);

y_train_pred = svm.predict(vectorized_x_train_smote)
training_scores(y_train_smote, y_train_pred)

vectorizer = TfidfVectorizer()
vectorized_x_train = vectorizer.fit_transform(x_train)
vectorized_x_test = vectorizer.transform(x_test)
smote_test = SMOTE()
vectorized_x_test_smote, y_test_smote = smote_test.fit_resample(vectorized_x_test, y_test)

y_test_pred = svm.predict(vectorized_x_test)
validation_scores(y_test, y_test_pred)

Training Scores:
	Accuracy = 0.994
	Precision = 0.991
	Recall = 0.999
	F1.Score = 0.995
Validation Scores:
	Accuracy = 0.888
	Precision = 0.809
	Recall = 0.742
	F1.Score = 0.774


In [None]:
import pickle
with open('../Static/Model/model.pickel','wb') as file:
  pickle.dump(lr,file)