Data: Sentiment140 dataset — https://www.kaggle.com/kazanova/sentiment140
# Tweet Sentiment Analysis

# Data Prep

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
import pickle
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest, chi2

import warnings
warnings.filterwarnings("ignore")

# The CSV is read without a header row, so the column names are supplied here.
df = pd.read_csv(
    'sentiment140data.csv',
    encoding='latin1',
    header=None,
    names=['target', 'id', 'date', 'query', 'user_name', 'tweet'],
)
# Optional: uncomment to iterate on a reproducible 20% subsample.
#df = df.sample(frac=0.2, random_state=1)

df.head()

Unnamed: 0,target,id,date,query,user_name,tweet
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [2]:
# Keep only the label and the raw text; no other column is read later in this notebook.
df = df[['target', 'tweet']]
# Class balance check. This is the cell's displayed value (a notebook cell only
# renders its last expression, so a bare df.head() placed before it showed nothing).
df['target'].value_counts()

4    800000
0    800000
Name: target, dtype: int64

In [3]:
# Hold out 20% of the tweets for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df['tweet'],
    df['target'],
    random_state=0,
    test_size=0.2,
)

In [4]:
# Show one raw training tweet and the number of training rows.
first_tweet = X_train.iloc[0]
print('X_train first entry:\n\n', first_tweet)
print('\n\nX_train shape: ', X_train.shape)

X_train first entry:

 @stefany_nyappy thx  i like ur pic as well!!


X_train shape:  (1280000,)


# Bag of Words (TF-IDF over unigrams and bigrams)

In [5]:
# TF-IDF over unigrams and bigrams: English stop words are dropped, accents are
# stripped, and terms that occur in fewer than 10 training tweets are discarded.
vectorizer = TfidfVectorizer(stop_words='english',
                             ngram_range=(1,2),
                             min_df=10,
                             strip_accents='unicode')
# Learn the vocabulary/IDF weights from the training split only, then apply the
# same fitted transformation to the held-out split.
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)
# Number of features kept. `vocabulary_` is used instead of
# `get_feature_names()` because the latter was removed from newer scikit-learn
# releases and would materialise the whole name list just to count it.
len(vectorizer.vocabulary_)

109212

# Training classifiers

In [6]:
def classify(classifier, vectorizer):
    """Fit ``classifier`` on the training matrix and print train/test metrics.

    The data is taken from the notebook-level variables
    ``X_train_vectorized``/``y_train`` and ``X_test_vectorized``/``y_test``.
    For each split the accuracy and the confusion matrix are printed, training
    split first.

    Parameters
    ----------
    classifier : estimator with ``fit`` and ``predict``
        Fitted in place as a side effect of this call.
    vectorizer : object
        Accepted for call-site compatibility; not used by this function.
    """
    classifier.fit(X_train_vectorized, y_train)

    evaluation_sets = (
        ('Train Accuracy: ', X_train_vectorized, y_train),
        ('Test Accuracy: ', X_test_vectorized, y_test),
    )
    for heading, features, truth in evaluation_sets:
        predicted = classifier.predict(features)
        print(heading, accuracy_score(truth, predicted))
        print(confusion_matrix(truth, predicted))

In [7]:
def find_best_features(classifier, vactorizer):
    """Print the ten features with the smallest and the ten with the largest coefficients.

    Parameters
    ----------
    classifier : fitted estimator
        Must expose ``coef_`` (first row is used). If ``coef_`` is missing but
        ``feature_log_prob_`` is present (naive Bayes models in newer
        scikit-learn releases), the row that ``coef_[0]`` used to mirror is
        used instead.
    vactorizer : fitted vectorizer
        Source of the feature names, via ``get_feature_names_out()`` when
        available and ``get_feature_names()`` otherwise. The parameter name is
        misspelled in the original signature and is kept as-is so existing
        keyword callers keep working.

    Returns
    -------
    None
        The two rankings are printed, smallest coefficients first.
    """
    # Use the vectorizer that was passed in. Previously the body read the
    # notebook-level global `vectorizer`, silently ignoring the argument.
    if hasattr(vactorizer, 'get_feature_names_out'):
        feature_names = np.asarray(vactorizer.get_feature_names_out())
    else:
        feature_names = np.asarray(vactorizer.get_feature_names())

    if hasattr(classifier, 'coef_'):
        coefficients = classifier.coef_
    else:
        # Naive Bayes without `coef_`: for two classes the old attribute was
        # `feature_log_prob_[1:]`, otherwise `feature_log_prob_` itself.
        log_prob = np.asarray(classifier.feature_log_prob_)
        coefficients = log_prob[1:] if log_prob.shape[0] == 2 else log_prob

    sorted_coef_index = coefficients[0].argsort()
    print('Smallest Coefs:\n{}\n'.format(
        feature_names[sorted_coef_index[:10]]))
    # Walk the sorted indices backwards to list the largest ten in descending order.
    print('Largest Coefs: \n{}'.format(
        feature_names[sorted_coef_index[:-11:-1]]))

In [8]:
# Multinomial naive Bayes baseline: fit and print train/test metrics, then
# list the features at both ends of the model's coefficient ranking.
classifier_mnb = MultinomialNB()
classify(classifier_mnb, vectorizer)
find_best_features(classifier_mnb, vectorizer)

Train Accuracy:  0.79225
[[507966 132219]
 [133701 506114]]
Test Accuracy:  0.77249375
[[123585  36230]
 [ 36572 123613]]
Smallest Coefs:
['feeling low' 'shit miss' 'having issue' 'camera charger' 'sucks haven'
 'sigh hate' 'longer just' 'longer sunny' 'came bad' 'wishing watch']

Largest Coefs: 
['good' 'thanks' 'love' 'just' 'http' 'day' 'quot' 'lol' 'com' 'like']


In [9]:
# Logistic regression with the solver iteration limit set to 1000
# (presumably raised so the solver converges on this data — confirm).
classifier_log = LogisticRegression(max_iter=1000)
classify(classifier_log, vectorizer)

Train Accuracy:  0.8125171875
[[507300 132885]
 [107093 532722]]
Test Accuracy:  0.789821875
[[122813  37002]
 [ 30255 129930]]


In [10]:
# Persist the fitted logistic-regression model and the fitted vectorizer.
# The `with` blocks guarantee each file is flushed and closed even if pickling
# fails; the bare `open(...)` calls used before never closed their handles.
# NOTE: the `models/` directory must already exist.
with open('models/logistic_model.sav', 'wb') as model_file:
    pickle.dump(classifier_log, model_file)
with open('models/vectorizer.sav', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)