In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
data=pd.read_csv("../data/sentiment_analysis.csv")
data.head()

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...
1,2,0,Finally a transparant silicon case ^^ Thanks t...
2,3,0,We love this! Would you go? #talk #makememorie...
3,4,0,I'm wired I know I'm George I was made that wa...
4,5,1,What amazing service! Apple won't even talk to...


## Data Preprocessing

In [4]:
# check duplicates
data.duplicated().sum()

np.int64(0)

In [5]:
# check null values
data.isnull().sum()

id       0
label    0
tweet    0
dtype: int64

### Text Preprocessing

In [6]:
# Convert Uppercase to Lowercase
# Remove Links
# Remove Puncatuations (?><"")
# Remove Numbers
# Remove Stopwords ("and, by ,the ,is ,in ,about")
# Stemming (create ,careating ,creates careated ) get the base word

In [7]:
import re
import string

##### Convert uppercaset to lowercase

In [8]:
data["tweet"]=data["tweet"].apply(lambda x: " ".join(x.lower() for x in x.split()))
data.head()

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #pregnancy test https://goo.gl/h1...
1,2,0,finally a transparant silicon case ^^ thanks t...
2,3,0,we love this! would you go? #talk #makememorie...
3,4,0,i'm wired i know i'm george i was made that wa...
4,5,1,what amazing service! apple won't even talk to...


##### Remove Links

In [9]:
data["tweet"]=data["tweet"].apply(lambda x: " ".join(re.sub(r'^https?:\/\/.*[\r\n]*','',x,flags=re.MULTILINE) for x in x.split()))
data.head()

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #pregnancy test #android #apps #...
1,2,0,finally a transparant silicon case ^^ thanks t...
2,3,0,we love this! would you go? #talk #makememorie...
3,4,0,i'm wired i know i'm george i was made that wa...
4,5,1,what amazing service! apple won't even talk to...


##### Remove Punctuations

In [10]:
def remove_functuations (text):
    for punctuation in string.punctuation:
        text = text.replace(punctuation,'')
    return text

data["tweet"]=data["tweet"].apply(remove_functuations)
data.head()

#string.punctuation
#'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

Unnamed: 0,id,label,tweet
0,1,0,fingerprint pregnancy test android apps beaut...
1,2,0,finally a transparant silicon case thanks to ...
2,3,0,we love this would you go talk makememories un...
3,4,0,im wired i know im george i was made that way ...
4,5,1,what amazing service apple wont even talk to m...


##### Remove Numbers

In [11]:
data["tweet"] = data["tweet"].str.replace(r'\d+', '', regex=True)
data["tweet"].tail()

7915    live out loud lol liveoutloud selfie smile son...
7916    we would like to wish you an amazing day make ...
7917    helping my lovely  year old neighbor with her ...
7918    finally got my smart pocket wifi stay connecte...
7919    apple barcelona apple store bcn barcelona trav...
Name: tweet, dtype: object

##### Remove Stopwords

In [12]:
# !pip install nltk
import nltk
nltk.download('stopwords',download_dir='../static/module')

[nltk_data] Downloading package stopwords to ../static/module...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [13]:
with open('../static/model/corpora/stopwords/english','r') as file :
    sw =file.read().splitlines()


In [14]:
data["tweet"]=data["tweet"].apply(lambda x: " ".join(x for x in x.split() if x not in sw))
data.head()

Unnamed: 0,id,label,tweet
0,1,0,fingerprint pregnancy test android apps beauti...
1,2,0,finally transparant silicon case thanks uncle ...
2,3,0,love would go talk makememories unplug relax i...
3,4,0,im wired know im george made way iphone cute d...
4,5,1,amazing service apple wont even talk question ...


##### Stemming

In [15]:
from nltk.stem import PorterStemmer
ps=PorterStemmer()

data["tweet"]=data["tweet"].apply(lambda x: " ".join(ps.stem(x) for x in x.split()))
data.head()

Unnamed: 0,id,label,tweet
0,1,0,fingerprint pregnanc test android app beauti c...
1,2,0,final transpar silicon case thank uncl yay son...
2,3,0,love would go talk makememori unplug relax iph...
3,4,0,im wire know im georg made way iphon cute dave...
4,5,1,amaz servic appl wont even talk question unles...


In [16]:
#   Text 
#    |
# Text Preprocessing
#    |
# Vectorizations(Create Vocubulary then convert vector)

#### Create Vocubulary

In [17]:
from collections import Counter
vocab=Counter()

In [18]:
# vocab.update(['java',"python",'C++',"js",'python'])
# vocab

# output :  Counter({'python': 2, 'java': 1, 'C++': 1, 'js': 1})

In [31]:
for sentence in data['tweet']:
    vocab.update(sentence.split())

In [20]:
# vocab

In [32]:
len(vocab)

15949

In [22]:
data.shape      # vector has 15952 features ,it is affected to overfitting
# records >features

(7920, 3)

In [23]:
# Get features more than 10 existence
tokens =[key for key in vocab if vocab[key]>10]
len(tokens)

1145

In [33]:
def save_vocabulary(lines,filename):
    data = '\n'.join(lines)
    file = open(filename,'w',encoding ="utf-8")
    file.write(data)
    file.close()

save_vocabulary(tokens,'../static/model/vocabulary.txt')

### Divide Dataset

In [34]:
# !pip install scikit-learn
X=data["tweet"]
Y=data["label"]
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test =train_test_split(X,Y,test_size=0.2)

In [35]:
x_train.shape
x_test.shape

(1584,)

### Vectorization

In [36]:
def vectorizar(ds,vocabulary) :
    vectorized_list =[]

    for sentence in ds :
        sentence_lst = np.zeros(len(vocabulary))

        for i in range(len(vocabulary)):
            if vocabulary[i] in sentence.split():
                sentence_lst[i] =1

        vectorized_list.append((sentence_lst))
    vectorized_lst_new =np.asarray(vectorized_list,dtype=np.float32)
    return vectorized_lst_new

In [37]:
vectorized_x_train = vectorizar(x_train,tokens)
vectorized_x_test = vectorizar(x_test,tokens)

### Handle Imabalanced Dataset

In [39]:
y_train.value_counts() # imbalanced 

label
0    4700
1    1636
Name: count, dtype: int64

In [41]:
#Smote technique
# ! pip install imbalanced-learn

from imblearn.over_sampling import SMOTE
smote = SMOTE()
vectorized_x_train_smote , y_train_smote=smote.fit_resample(vectorized_x_train,y_train)

In [42]:
y_train_smote.value_counts()

label
0    4700
1    4700
Name: count, dtype: int64

In [None]:
#vectorized_x_train_smote , y_train_smote
#vectorized_x_test,y_test

## Model Training and Evaluation

In [43]:
from  sklearn.linear_model import LogisticRegression
from  sklearn.naive_bayes import MultinomialNB
from  sklearn.tree import DecisionTreeClassifier
from  sklearn.ensemble import RandomForestClassifier
from  sklearn.svm  import SVC

In [44]:
from sklearn.metrics import accuracy_score ,f1_score,precision_score,recall_score

def training_scores (y_act,y_pred) :
    acc =round(accuracy_score(y_act,y_pred),3)
    pr =round(precision_score(y_act,y_pred),3)
    rec =round(recall_score(y_act,y_pred),3)
    f1=round(f1_score(y_act,y_pred),3)
    print(f"Training Scores :\n\tAccuracy ={acc}\n\tPrecision ={pr}\n\tRecall ={rec}\n\tF1-score ={f1}")

def validation_scores (y_act,y_pred) :
    acc =round(accuracy_score(y_act,y_pred),3)
    pr =round(precision_score(y_act,y_pred),3)
    rec =round(recall_score(y_act,y_pred),3)
    f1=round(f1_score(y_act,y_pred),3)
    print(f"Testing  Scores :\n\tAccuracy ={acc}\n\tPrecision ={pr}\n\tRecall ={rec}\n\tF1-score ={f1}")

### Logistic Regression

In [45]:
lr=LogisticRegression()
lr.fit(vectorized_x_train_smote,y_train_smote)

y_train_pred=lr.predict(vectorized_x_train_smote)
y_test_pred=lr.predict(vectorized_x_test)

#training accuracy
training_scores(y_train_smote,y_train_pred)

# test accuracy
validation_scores(y_test,y_test_pred)

Training Scores :
	Accuracy =0.939
	Precision =0.914
	Recall =0.968
	F1-score =0.941
Testing  Scores :
	Accuracy =0.871
	Precision =0.697
	Recall =0.838
	F1-score =0.761


### Naive Bayes 

In [46]:

mnb=MultinomialNB()
mnb.fit(vectorized_x_train_smote,y_train_smote)

y_train_pred=mnb.predict(vectorized_x_train_smote)
y_test_pred=mnb.predict(vectorized_x_test)

#training accuracy
training_scores(y_train_smote,y_train_pred)

# test accuracy
validation_scores(y_test,y_test_pred)


Training Scores :
	Accuracy =0.904
	Precision =0.869
	Recall =0.951
	F1-score =0.908
Testing  Scores :
	Accuracy =0.878
	Precision =0.681
	Recall =0.949
	F1-score =0.793


### Decsion Tree

In [48]:
dt=DecisionTreeClassifier()
dt.fit(vectorized_x_train_smote,y_train_smote)

y_train_pred=dt.predict(vectorized_x_train_smote)
y_test_pred=dt.predict(vectorized_x_test)

#training accuracy
training_scores(y_train_smote,y_train_pred)

# test accuracy
validation_scores(y_test,y_test_pred)


Training Scores :
	Accuracy =1.0
	Precision =1.0
	Recall =0.999
	F1-score =1.0
Testing  Scores :
	Accuracy =0.841
	Precision =0.672
	Recall =0.692
	F1-score =0.682


### RandomForest 

In [49]:
rf = RandomForestClassifier()
rf.fit(vectorized_x_train_smote,y_train_smote)

y_train_pred=rf.predict(vectorized_x_train_smote)
y_test_pred=rf.predict(vectorized_x_test)

#training accuracy
training_scores(y_train_smote,y_train_pred)

# test accuracy
validation_scores(y_test,y_test_pred)


Training Scores :
	Accuracy =1.0
	Precision =1.0
	Recall =1.0
	F1-score =1.0
Testing  Scores :
	Accuracy =0.878
	Precision =0.763
	Recall =0.733
	F1-score =0.748


### Support Vector Machine

In [50]:
sv = RandomForestClassifier()
sv.fit(vectorized_x_train_smote,y_train_smote)

y_train_pred=sv.predict(vectorized_x_train_smote)
y_test_pred=sv.predict(vectorized_x_test)

#training accuracy
training_scores(y_train_smote,y_train_pred)

# test accuracy
validation_scores(y_test,y_test_pred)

Training Scores :
	Accuracy =1.0
	Precision =1.0
	Recall =1.0
	F1-score =1.0
Testing  Scores :
	Accuracy =0.871
	Precision =0.75
	Recall =0.715
	F1-score =0.732


In [51]:
import pickle 

with open("../static/model/model.pickle",'wb') as file :
    pickle.dump(lr,file)