In [None]:
import pandas as pd
import numpy as np
from nltk.stem.porter import PorterStemmer
import re
import string
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

In [None]:
# Definitions
def remove_pattern(input_txt,pattern):
    """Delete every substring of ``input_txt`` that matches ``pattern``.

    Parameters
    ----------
    input_txt : str
        Text to clean.
    pattern : str
        Regular expression describing the substrings to remove.

    Returns
    -------
    str
        ``input_txt`` with all matches of ``pattern`` removed.
    """
    # Substitute with the pattern itself. Feeding each matched string back
    # into re.sub as a *new* regex would reinterpret metacharacters found in
    # the text (or raise re.error) and lets a short match such as a lone "@"
    # corrupt a longer one such as "@bob".
    return re.sub(pattern, '', input_txt)
def count_punct(text):
    """Return the share of non-space characters in ``text`` that are punctuation.

    Punctuation means membership in ``string.punctuation``. The ratio is
    rounded to three decimals and then scaled to a percentage.

    Parameters
    ----------
    text : str
        Text to measure.

    Returns
    -------
    float
        Punctuation percentage; ``0.0`` when ``text`` has no non-space
        characters (empty or all-space input).
    """
    non_space_chars = len(text) - text.count(" ")
    # Empty or all-space text would otherwise divide by zero.
    if non_space_chars == 0:
        return 0.0
    punct_chars = sum(1 for char in text if char in string.punctuation)
    return round(punct_chars / non_space_chars, 3) * 100

In [None]:
# Read the tab-separated sentiment file, then give its two columns fixed names.
data = pd.read_csv("sentiment.tsv", delimiter="\t")
data.columns = ["label", "body_text"]

In [None]:
# Features and Labels

# Encode the string labels as integers: 'pos' -> 0, 'neg' -> 1.
# NOTE: any value that is not 'pos'/'neg' becomes NaN, so re-running this
# cell on already-encoded labels wipes them; reload `data` first.
data['label'] = data['label'].map({'pos': 0, 'neg': 1})

# Strip @handles from every tweet. The pattern is a raw string so that `\w`
# reaches the regex engine instead of being treated as a string escape.
data['tidy_tweet'] = data['body_text'].apply(lambda text: remove_pattern(text, r"@[\w]*"))

# Split each tweet on whitespace, stem every token, and join the stems back
# into one space-separated string. Done per row with apply, so it does not
# depend on the frame having a 0..n-1 index.
stemmer = PorterStemmer()
tokenized_tweet = data['tidy_tweet'].apply(
    lambda text: ' '.join(stemmer.stem(token) for token in text.split())
)

# Save the stemmed tweet plus two hand-made features computed on the raw text:
# the number of non-space characters and the punctuation percentage.
data['tidy_tweet'] = tokenized_tweet
data['body_len'] = data['body_text'].apply(lambda text: len(text) - text.count(" "))
data['punct%'] = data['body_text'].apply(count_punct)
X = data['tidy_tweet']
y = data['label']
print(type(X))

<class 'pandas.core.series.Series'>


In [None]:
# Extract Feature With CountVectorizer
cv = CountVectorizer()

# Fit on the stemmed tweets column directly (the same Series the previous cell
# bound to X) so this cell can be re-run: X is overwritten further down.
bow_counts = cv.fit_transform(data['tidy_tweet'])

# Persist the fitted vectoriser; the context manager guarantees the file is
# flushed and closed before anything tries to load it.
filename = 'count_vectoriser.sav'
with open(filename, 'wb') as cv_file:
    pickle.dump(cv, cv_file)

# Concatenate the two hand-made features with the dense bag-of-words counts
# into one input matrix.
X = pd.concat([data['body_len'],data['punct%'],pd.DataFrame(bow_counts.toarray())],axis = 1)

# Split the dataset
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
# Using Classifier
# Only the two non-default settings are passed. The previous call also spelled
# out every default (apparently pasted from an estimator repr), including
# `multi_class`, which recent scikit-learn releases deprecate and warn about
# when it is given explicitly. Leaving it unset lets scikit-learn choose the
# scheme itself; NOTE(review): confirm against the installed scikit-learn
# version that this matches the previous explicit 'auto' for these labels.
clf = LogisticRegression(C=0.1, solver='liblinear')
clf.fit(X_train.values,y_train)

In [None]:
# Persist the trained classifier; `with` closes the file so the pickle is
# fully written before the next cell reads it back.
filename = 'model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(clf, model_file)

In [None]:
# load the model from disk
# NOTE: pickle.load executes arbitrary code from the file; only load pickles
# produced by this notebook or another trusted source.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)
result = loaded_model.predict(X_test.values)
print(result)

[0 1 1 1 1 1 0 0 0 1 1 1 1 1 1 0 0 1 1 0 1 1 1 0 1 0 1 0 1 0 1 0 1 0 0 0 0
 0 0 1 1 1 1 1 0 0 1 0 0 1 1 1 1 1 0 0 0 0 0 1 0 1 0 0 1 1 1 1 0 1 0 1 1 1
 1 1 0 0 0 0 1 0 1 0 1 0 0 0 0 1 0 0 1 0 0 1 1 0 0 0 0 0 0 0 1 0 0 1 1 1 1
 0 0 1 1 1 1 1 1 1 1 0 1 0 1 0 0 1 1 0 1 0 0 1 1 0 1 1 0 0 1 0 0 0 1 0 0 1
 0 1 1 0 1 0 1 0 0 1 0 0 0 0 0 0 1 1 0 1 1 0 0 1 1 0 0 0 1 0 0 1 1 1 1 0 1
 0 0 0 0 0 0 0 0 1 0 1 0 0 1 1 0 1 0 0 0 0 0 1 0 0 1 1 0 0 1 1 0 0 1 0 0 0
 1 1 1 0 1 1 1 0 0 0 0 1 0 0 0 1 0 1 1 0 1 1 0 0 1 0 0 1 1 1 0 0 1 0 1 1 1
 1 0 1 0 0 0 0 1 0 0 0 0 1 0 0 1 0 0 0 0 0 1 1 0 0 0 1 0 1 0 0 1 1 1 1 0 0
 1 0 1 0 0 0 1 1 1 0 1 0 0 1 1 1 0 1 0 1 0 0 0 0 0 1 1 0 0 0 1 0 0 1 0 0 1
 1 1 1 0 0 0 0 1 0 1 0 1 0 1 0 0 0 1 1 0 1 1 0 1 0 1 0 1 0 0 0 0 0 1 0 1 1
 1 0 0 1 0 0 0 0 0 0 0 1 1 1 1 0 1 0 0 0 1 1 1 1 1 0 0 1 0 1 0 1 0 1 0 0 1
 1 0 0 0 1 0 1 0 0 1 0 0 1 1 0 0 1 1 0 0 0 1 1 1 0 0 0 0 1 1 0 1 1 1 0 0 1
 1 0 0 0 1 1 1 0 0 0 0 1 1 1 0 0 1 1 1 0 0 1 0 1 0 0 1 0 1 1 1 0 0 1 1 0 0
 1 0 0 0 1 1 1 1 0 0 0 1 

In [None]:
# Score one new sentence with the persisted vectoriser and model, repeating the
# training-time preprocessing: strip @handles, stem, bag-of-words, plus the
# length and punctuation features.
# The sentence list gets its own name so the training DataFrame `data` is not
# overwritten (earlier cells stay re-runnable).
sample_texts = ["Hi, I dont like the you are thinking"]

# Raw-string pattern so `\w` is a regex class rather than a string escape.
tidy_tweet = pd.Series(sample_texts).apply(lambda text: remove_pattern(text, r"@[\w]*"))

# Split on whitespace, stem every token, and re-join into one string per row.
stemmer = PorterStemmer()
tokenized_tweet = tidy_tweet.apply(
    lambda text: ' '.join(stemmer.stem(token) for token in text.split())
)
tokenized_tidy_tweet = tokenized_tweet

# Hand-made features are computed on the raw (uncleaned) text.
body_len = pd.Series(sample_texts).apply(lambda text: len(text) - text.count(" "))
punct = pd.Series(sample_texts).apply(count_punct)

# NOTE: pickle.load executes arbitrary code; only load trusted files.
with open('count_vectoriser.sav', 'rb') as cv_file:
    loaded_cv = pickle.load(cv_file)
X_result = loaded_cv.transform(tokenized_tidy_tweet)

# Same column order as the training matrix: body_len, punct%, then the counts.
# Named `sample_features` rather than `input` to avoid shadowing the builtin.
sample_features = pd.concat([body_len,punct,pd.DataFrame(X_result.toarray())],axis = 1).values
my_prediction = loaded_model.predict(sample_features)
my_prediction

array([0])