# Step 1: Open Tweets.csv & extract features and labels

In [1]:
import os
# "__file__" is a plain string literal here, not the module attribute (a
# notebook has none): abspath() resolves it against the current working
# directory and taking the head of the split drops it again, leaving that
# directory.
ROOT_DIR = os.path.split(os.path.abspath("__file__"))[0]  # Project Root
print(ROOT_DIR)

C:\Users\voice\ml_webapp


In [2]:
import numpy
import csv

def TweetExtractor(csv_file, min_confidence=0.80):
    """Read tweet texts and sentiment labels from a CSV file.

    The first row is treated as a header and skipped. For every other row the
    sentiment label is read from column 1, its confidence from column 2 and
    the tweet text from column 10; a tweet is kept only when its confidence
    is at least ``min_confidence``.

    Rows that are too short, or whose confidence is not a number, are skipped
    and counted instead of aborting the read, so a single malformed row does
    not discard every row that follows it.

    Args:
        csv_file: Path of the UTF-8 encoded, comma-separated file to read.
        min_confidence: Minimum sentiment confidence for a tweet to be kept.

    Returns:
        A ``(tweets, labels)`` tuple of numpy arrays of equal length.

    Raises:
        OSError: If ``csv_file`` cannot be opened.
    """
    SENTIMENT_COL, CONFIDENCE_COL, TEXT_COL = 1, 2, 10

    tweets = []
    labels = []
    skipped_rows = 0

    # newline='' hands line-ending handling to the csv module, which keeps
    # quoted fields containing embedded newlines intact.
    with open(csv_file, encoding='utf-8', newline='') as handle:
        csv_reader = csv.reader(handle, delimiter=',')
        try:
            next(csv_reader, None)  # skip the header row
            for row in csv_reader:
                try:
                    sentiment = row[SENTIMENT_COL]
                    sentiment_conf = float(row[CONFIDENCE_COL])
                    transcript = row[TEXT_COL]
                except (IndexError, ValueError):
                    # Short/blank row or non-numeric confidence: skip it only.
                    skipped_rows += 1
                    continue

                if sentiment_conf >= min_confidence:
                    tweets.append(transcript)
                    labels.append(sentiment)
        except (csv.Error, UnicodeDecodeError) as e:
            # The reader itself could not continue; keep what was read so far.
            print("Exception: {}".format(e))

    if skipped_rows:
        print("Skipped {} malformed row(s)".format(skipped_rows))

    tweets = numpy.array(tweets)
    labels = numpy.array(labels)

    return tweets, labels

# Read the tweets and their labels from the dataset folder under the project root.
tweets_csv_path = ROOT_DIR + "/dataset/Tweets.csv"
tweets, labels = TweetExtractor(tweets_csv_path)

# Plot label distribution

In [3]:
import matplotlib.pyplot as plt

# Count how many tweets carry each distinct label.
label_dist, label_dist_count = numpy.unique(labels, return_counts = True)
label_dist = label_dist.astype("str")

print(label_dist_count)
print(label_dist)

# plt.subplots() gives axes with the default margins. Axes stretched over the
# whole figure (add_axes([0, 0, 1, 1])) leave no room on the canvas for the
# title, the y-label or the tick labels.
fig, ax = plt.subplots()
ax.bar(label_dist, label_dist_count)
ax.set_ylabel('Count')
ax.set_title('Label distribution')
plt.show()

[7392 1550 1517]
['negative' 'neutral' 'positive']


<Figure size 640x480 with 1 Axes>

# Step 2: Remove duplicates

In [4]:
def DuplicateRemover(tweets, labels):
    """Drop repeated tweets, keeping the first occurrence of each.

    Args:
        tweets: Sequence of tweet texts.
        labels: Sequence of labels aligned position-by-position with ``tweets``.

    Returns:
        A ``(unique_tweets, unique_labels)`` tuple of numpy arrays holding the
        first occurrence of every distinct tweet, in the original order,
        together with the label found at the same position.

    Raises:
        ValueError: If ``tweets`` and ``labels`` differ in length.
    """
    tweets = numpy.asarray(tweets)
    labels = numpy.asarray(labels)
    if len(tweets) != len(labels):
        raise ValueError(
            "tweets and labels must have the same length ({} != {})".format(
                len(tweets), len(labels)
            )
        )

    # return_index gives the position of the first occurrence of each distinct
    # tweet, ordered by tweet value; sorting restores the original order.
    first_seen = numpy.unique(tweets, return_index=True, axis=0)[1]
    keep = numpy.sort(first_seen)

    # One indexing step replaces a per-row membership test against the index
    # array, which made the original loop quadratic in the number of tweets.
    return tweets[keep], labels[keep]

# Keep one copy of every tweet together with the label at the same position.
unique_tweets, unique_labels = DuplicateRemover(tweets=tweets, labels=labels)

# Step 3: Clean tweets
- lowercase
- remove punctuation
- collapse repeated spaces and trim surrounding whitespace

In [5]:
import string
import re 

def processor(X_data_array):
    """Normalise raw sentences before vectorisation.

    Each sentence is lowercased, stripped of the characters in
    ``string.punctuation``, and has every run of whitespace (spaces, tabs,
    newlines) collapsed to a single space, with leading and trailing
    whitespace removed.

    Args:
        X_data_array: Iterable of strings to clean.

    Returns:
        numpy array with one cleaned string per input sentence, in input order.
    """
    # Build the punctuation-removal table once instead of once per sentence.
    strip_punctuation = str.maketrans('', '', string.punctuation)

    clean_data_array = []
    for sentence in X_data_array:
        sentence = sentence.lower().translate(strip_punctuation)

        # split() without arguments splits on any whitespace run and drops
        # empty pieces, so tabs and newlines are collapsed as well as spaces.
        clean_data_array.append(" ".join(sentence.split()))

    return numpy.array(clean_data_array)


# Replace the de-duplicated tweets with their cleaned versions.
unique_tweets = processor(unique_tweets)

# Show the second cleaned tweet as a spot check.
example_tweet = unique_tweets[1]
print("training phrase example after data cleaning: \n{}".format(example_tweet))

training phrase example after data cleaning: 
virginamerica its really aggressive to blast obnoxious entertainment in your guests faces amp they have little recourse


# Step 4: TF-IDF Vectorizer

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Fit a TF-IDF vectorizer limited to 5000 features on the cleaned tweets, then
# encode those same tweets as a dense array.
# NOTE(review): the vectorizer is fitted on all tweets before the
# train/validation split, so its IDF weights also see the validation tweets -
# confirm this is intended.
vectorizer = TfidfVectorizer(max_features=5000).fit(unique_tweets)
train_features = vectorizer.transform(unique_tweets).toarray()

# Save fitted vectorizer to disk

In [7]:
import pickle

filename = 'TFIDF_Vectorizer'

# The context manager closes (and so flushes) the file as soon as the dump
# finishes, rather than leaving the handle open until garbage collection.
with open(ROOT_DIR + "/app/processors/" + filename, 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

# Checkpoint 

In [8]:
# Sanity check: dimensions of the feature matrix and its first row.
feature_shape = train_features.shape
first_row = train_features[0]
print("shape of train_features: {}".format(feature_shape))
print("example of vectorized training data: {}".format(first_row))

shape of train_features: (10356, 5000)
example of vectorized training data: [0. 0. 0. ... 0. 0. 0.]


# Step 5: Split data into training and validation sets

In [9]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for validation. Stratifying on the labels keeps the
# class proportions the same in both parts; the fixed seed makes the split
# repeatable.
split_options = dict(test_size=0.20, random_state=1, stratify=unique_labels)
x_train, x_val, y_train, y_val = train_test_split(
    train_features, unique_labels, **split_options
)

# Step 6: Classification algorithms

# SVM model

In [11]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# probability=True fits an internal cross-validated calibration whose data
# shuffling is random; random_state pins it so that refitting produces the
# same probability estimates in the saved model.
clf = SVC(kernel='linear', probability=True, random_state=1)
clf.fit(x_train, y_train)

# Score the held-out validation rows.
y_pred = clf.predict(x_val)
print(accuracy_score(y_val, y_pred))

0.8798262548262549


# Model evaluation

In [12]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Per-class precision/recall/F1, then the confusion matrix, then overall accuracy.
report = classification_report(y_val, y_pred)
matrix = confusion_matrix(y_val, y_pred)
overall_accuracy = accuracy_score(y_val, y_pred)

print(report)
print(matrix)
print()
print(overall_accuracy)

              precision    recall  f1-score   support

    negative       0.90      0.97      0.93      1466
     neutral       0.75      0.62      0.68       308
    positive       0.91      0.73      0.81       298

    accuracy                           0.88      2072
   macro avg       0.85      0.77      0.81      2072
weighted avg       0.88      0.88      0.87      2072

[[1415   41   10]
 [ 107  190   11]
 [  58   22  218]]

0.8798262548262549


# Save model to disk

In [13]:
filename = 'SVM_model_tfidf'

# The context manager closes (and so flushes) the file as soon as the dump
# finishes, rather than leaving the handle open until garbage collection.
with open(ROOT_DIR + "/app/models/" + filename, 'wb') as model_file:
    pickle.dump(clf, model_file)