In [1]:
#Standard Imports

import re
import numpy as np
import pandas as pd
from nltk.stem import SnowballStemmer
from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS

In [3]:
# Data Loading
data = pd.read_csv('train.csv', encoding="ISO-8859-1")

data.head()

Unnamed: 0,ItemID,Sentiment,SentimentText
0,1,0,is so sad for my APL frie...
1,2,0,I missed the New Moon trail...
2,3,1,omg its already 7:30 :O
3,4,0,.. Omgaga. Im sooo im gunna CRy. I'...
4,5,0,i think mi bf is cheating on me!!! ...


In [6]:
# Data Cleaning
data = data.dropna()

In [125]:
import nltk
nltk.data.path = ['nltk_data']

#Creating a set of stopwords
from nltk.corpus import stopwords

STOPLIST = set(stopwords.words('english') + list(ENGLISH_STOP_WORDS))

In [139]:
# Creating all the functions required for cleaning data: 
# Lower casing, appostrophe Lookup, removing links, parsing hashtags, parsing mentions

APPOSTROPHES = {
    "don't"   : "do not",
    "won't"   : "will not",
    "it's"    : "it is",
    "can't"   : "can not",
    "i'll"    : "i will",
    "i've"    : "i have",
    "you're"  : "you are",
    "didn't"  : "did not",
    "she's"   : "she is",
    "they're" : "they are",
    "we're"   : "we are",
    "you've"  : "you have",
    "aren't"  : "are not",
    "she'd"   : "she would",
    "he'd"    : "he would",
    "let's"   : "let us",
    "we've"   : "we have",
    "couldn't": "could not",
    "who's"   : "who is",
    "i'd"     : "i would",
    "i'm"     : "i am",
    "you'll"  : "you will",
    "isn't"   : "is not",
    "that's"  : "that is",
    "wouldn't": "would not",
    "doesn't" : "does not",
    "there's" : "there is",
    "we'll"   : "we will",
    "dont"    : "do not"
    }


def appostropheLookup(tweet):
    words = tweet.split()
    reformed = [APPOSTROPHES[word] if word in APPOSTROPHES else word for word in words]
    reformed = " ".join(reformed)
    return reformed


def removeLinks(tweet):
    links_re = re.compile("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)")
    tweet = links_re.sub(" ",tweet)
    tweet = ' '.join(tweet.split())
    return tweet



# Returns a list of common english terms (words)
def initialize_words():
    content = None
    with open('words_alpha.txt') as f: # A file containing common english words
        content = f.readlines()
    return [word.rstrip('\n') for word in content]


def parse_sentence(sentence, wordlist):
    new_sentence = "" # output    
    terms = sentence.split()    
    for term in terms:
        if term[0] == '#': # this is a hashtag, parse it
            new_sentence += parse_tag(term, wordlist)
        else: # Just append the word
            new_sentence += term
        new_sentence += " "

    return new_sentence 


def parse_tag(term, wordlist):
    words = []
    # Remove hashtag, split by dash
    tags = term[1:].split('-')
    for tag in tags:
        word = find_word(tag, wordlist)    
        while word != None and len(tag) > 0:
            words.append(word)            
            if len(tag) == len(word): # Special case for when eating rest of word
                break
            tag = tag[len(word):]
            word = find_word(tag, wordlist)
    return " ".join(words)

def find_word(token, wordlist):
    i = len(token) + 1
    while i > 1:
        i -= 1
        if token[:i] in wordlist:
            return token[:i]
    return None 


In [140]:
# Creating a preprocessor function to call all the the individual functions for cleaning the data

def preprocess(tweet):
    clean_tweet = tweet
    
    clean_tweet = removeLinks(clean_tweet)
    
    wordlist = initialize_words()
    
    clean_tweet = parse_sentence(clean_tweet, wordlist)
    
    mention_finder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE)
    clean_tweet = mention_finder.sub("@MENTION", clean_tweet)
    
    clean_tweet = clean_tweet.lower()
     
    clean_tweet = appostropheLookup(clean_tweet)
    
    #clean_tweet = clean_tweet.decode("utf8").encode('ascii', 'ignore')
    
    return clean_tweet


In [141]:
def tokenize(text): 
    tknzr = TweetTokenizer()
    return tknzr.tokenize(text)

In [142]:
stemmer = SnowballStemmer("english")

def stem(doc):
    return (stemmer.stem(w) for w in doc)

In [128]:
en_stopwords = set(stopwords.words("english")) 

vectorizer = CountVectorizer(
    analyzer = 'word',
    preprocessor = preprocess,
    lowercase = True,
    ngram_range=(1, 2),
    stop_words = en_stopwords)

In [143]:
# Machine Learning Model

train, test = train_test_split(data, test_size=0.2, random_state=1)
X_train = train['SentimentText'].values
X_test = test['SentimentText'].values
y_train = train['Sentiment']
y_test = test['Sentiment']

In [144]:
pipe = make_pipeline(vectorizer, SVC(probability=True,
                            kernel="linear", class_weight="balanced"))
pipe.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('countvectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2),
        preprocessor=<function preproc...',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [146]:
predicted = pipe.predict(X_test)

In [147]:
from sklearn.metrics import accuracy_score

print("Accuracy score: ", accuracy_score(y_test, predicted))

Accuracy score:  0.734062469208789


In [95]:
pipe.predict(["Terrible experience @HomeDepot Hollywood for kitchen remodel, with multiple problems and delays! No sense of urgency"])

array([0.])

In [96]:
pipe.predict(["I had a good experience at @HomeDepot yesterday. Amazing customer service"])

array([1.])

In [97]:
pipe.predict(["OK OK experience at @HomeDepot "])

array([0.])

In [102]:
pipe.predict(["Rude staff "])

array([0.])

In [98]:
pipe.predict(["FINE experience at @HomeDepot "])

array([0.])

In [99]:
pipe.predict(["fine experience at @HomeDepot "])

array([0.])

In [100]:
pipe.predict(["fine "])

array([0.])

In [101]:
pipe.predict(["moderate excellent bad experience at @HomeDepot "])

array([0.])

In [105]:
pipe.predict(["friends are leaving me 'cause of this stupid love  http://bit.ly/ZoxZC"])

array([0.])

In [106]:
pipe.predict(["Thanks, I need all the help i can get."])

array([1.])

In [108]:
pipe.predict(["@haugern The servers are now backup, if you experience any more problems then please let me know  Sorry about the delay..."])

array([0.])

In [110]:
pipe.predict_proba(["I wanted to sleep in this morning but a mean kid through a popsicle stick at me head.I wish I could fly away like those squirrels"])

array([[0.97112116, 0.02887884]])

In [201]:
import tweepy
import datetime
from datetime import *

consumerKey = ''
consumerSecret = ''
accessToken = ''
accessTokenSecret = ''

auth = tweepy.OAuthHandler(consumerKey, consumerSecret)
auth.set_access_token(accessToken, accessTokenSecret)
startDate=(datetime.now().replace(microsecond=0)) + timedelta(hours=4)
endDate =   startDate-timedelta(minutes=5)
api = tweepy.API(auth)
list_tweets = []
for mentions in tweepy.Cursor(api.mentions_timeline).items():
    if mentions.created_at>endDate and  mentions.created_at <startDate:
        tweet = {'id': mentions.user.id_str, 'name': mentions.user.screen_name, 'text':mentions.text }
        print(tweet)
        list_tweets.append(tweet)

{'id': '1038232723570212864', 'name': 'AlokitaG', 'text': '@BrarRohin : Worst and pathetic service ever, I ordered headset and it is not working. I have been trying to be in… https://t.co/0OOXxlLN6c'}


In [203]:
for item in list_tweets:
    prediction = pipe.predict([item['text']])
    if 1 in prediction:
        api.update_status("@" + item['name'] + " Thank you for your feedback. We look forward to serve you again soon. Meanwhile, please follow the link to help us serve you better. https://bit.ly/2N36dgd <3", in_reply_to_status_id = item['id'])
    elif 0 in prediction:
        api.update_status("@" + item['name'] + " We are terribly sorry to hear your experience with us. We value you and our customer support team will try to resolve the isssue as soon as possible. Meanwhile, please follow the link to help us serve you better. https://bit.ly/2N36dgd", in_reply_to_status_id = item['id'])
        