In [117]:
# SWAMI KARUPPASWAMI THUNNAI

In [118]:
# Import all the necessary libararies
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import train_test_split
from nltk.tokenize import TweetTokenizer
import re
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [119]:
# Get the data-set
data_set = pd.read_csv("processed.csv")

In [120]:
# This is our independent variable - The tweet data
# Independent variable does not depend on anything.
# Here in our case the Tweet text does not depend on anything.
X = data_set.iloc[:, :-1]

In [121]:
# This is our dependent variable - Positive Negative
# It is a dependent variable because the sentiment is dependant on the tweet
y = data_set.iloc[:, -1]

In [122]:
# Convert the Categroical data (negative and positive) to numerical data 0 and 1
# Generally any equations cannot process text so we need to encode the text to numbers.
# LabelEncoder does encodes the categorical data here two categories(negative, positive)
# to numerical data 0 and 1. The categorical data is converted into numerical data alphabetically.
# There are three common methods used here: fit, fit_transfrom, transform in this LabelEncoder class
# method: fit = > will not modify the data but will convert them into numerical value into memory.
# method: transform => will use numerical values from the memory to convert the text data to numerical data.
# method: fit_transform  => does both fit and transform
encoder = LabelEncoder()
y = encoder.fit_transform(y)

In [123]:
# The list of preprocessed tweets
corpus = []

In [124]:
def tweet_preprocessor(tweet_text):
    """
    Will preprocess the tweets like remove the usernames, hashtags, urls and
    will preprocess the content suitable for NLP.
    :return: preprocessed tweet
    """
    # convert the tweet to the lower case
    tweet_text = tweet_text.lower()
    tokenizer = TweetTokenizer()
    words = tokenizer.tokenize(tweet_text)
    preprocessed_words = []
    for word in words:
        if word.startswith("@") or word.startswith("#") or word.startswith("https://") or word.startswith("http://"):
            pass
        else:
            preprocessed_words.append(word)
    # remove the stop words
    stopwords_removed = [word for word in preprocessed_words if not word in stopwords.words("english")]
    # make sure they are words
    punc_remover = lambda word: re.sub("[^A-Za-z]", " ", word)
    # get the pure words without punctuation
    pure_words = list(map(punc_remover, stopwords_removed))
    # Stem the words
    # Stemming is the process of converting words into their root form
    # For example: loving, loved will be converted to love
    # The resulting root word may not be English word
    stemmer = PorterStemmer()
    stemmed_words = [stemmer.stem(word) for word in pure_words]
    # strip the words
    stripper = lambda word: word.strip()
    stemmed_words = list(map(stripper, stemmed_words))
    stemmed_words = filter(None, stemmed_words)
    # Get our processed tweet
    processed_tweet = " ".join(stemmed_words)
    return processed_tweet
    

In [125]:
result = list(X["Tweet"]) # we are getting as a list
    

In [126]:
for i in result:
    value = tweet_preprocessor(i)
    corpus.append(value)
print("Completed")

Completed


In [127]:
# Count vectorizer creates the bag of words model
# 1500 most common words is taken
cv = CountVectorizer(max_features=1500)

In [128]:
# All the text data is converted into bag of words model
X = cv.fit_transform(corpus).toarray()
# Scale the model
sc = StandardScaler()
X = sc.fit_transform(X)



In [129]:
# Split 80% to traning and 20% for testing
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [130]:
SVMClassifier=SVC()

In [131]:
SVMClassifier.fit(X_train, Y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [132]:
Y_pred = SVMClassifier.predict(X_test)

In [133]:
from sklearn.metrics import accuracy_score
accuracy_score(Y_test, Y_pred) 

0.6822572445348246

In [134]:
from sklearn.metrics import confusion_matrix
confusion_matrix(Y_test, Y_pred)

array([[659, 296],
       [329, 683]])

In [138]:
print("Predict The sentiment")
data = input("Enter your data to get the sentiment: ")
data = [data,]
# Notice here: we are using transform. as said transform will use the memory
# created by fit to change the data. So here we are using the existing bag of words model.
array = cv.transform(data).toarray()

Predict The sentiment
Enter your data to get the sentiment: I am happy


In [139]:
r = SVMClassifier.predict(array)

In [140]:
print(r)

[1]
