In [5]:
#A notebook for Tweet classification by Jonathan Ivy

In [7]:
#Tweet Classifier
#Import Libraries
import numpy as np
import re
import pickle
import nltk
from nltk.corpus import stopwords
from sklearn.datasets import load_files #using the load file function, we import dataset
#nltk.download('stopwords') - Not needed if already up to date
import pandas as pd

In [3]:
#Import Dataset
tweets = load_files('txt_tweetsent/') #when we put / load_files loops inside all sub-directories
#based on our folders, it will create two classes, neg (class zero) and pos (class one)

In [None]:
#Now we need to seperate the class and the document
X,y = tweets.data, tweets.target

In [None]:
#Load_file function for large datasets is time consuming
#To get away, we store X,Y as a pickle file - Save file on disk
#Store pickle file
with open ('X.pickle','wb') as f:#wb, write-byte
    pickle.dump (X,f)

In [None]:
with open ('y.pickle','wb') as f:#wb, write-byte
    pickle.dump (y,f)

In [None]:
#Unpickling the dataset
with open ('X.pickle','rb') as f:#rb, read-byte
    X=pickle.load(f)

with open ('y.pickle','rb') as f:#rb, read-byte
    y=pickle.load(f)

In [None]:
#Preprocessing X
#Creating a Corpus - A list Corpus will contain all pre-processed documents
corpus=[]
for i in range (0,len(X)):
    tweet=re.sub(r"^http://t.co/[a-zA-Z09]*\s"," ",tweet) #remove links
    tweet=re.sub(r"\s+https://t.co/[a-zA-Z0-9]*\s"," ",tweet)
    tweet=re.sub(r"\s+https://t.co/[a-zA-Z0-9]*$"," ",tweet)
    tweet=tweet.lower()
    tweet=re.sub(r"that's","that is", tweet)
    tweet=re.sub(r"there's","there is", tweet)
    tweet=re.sub(r"what's","what is", tweet)
    tweet=re.sub(r"where's","where is", tweet)
    tweet=re.sub(r"it's","it is", tweet)
    tweet=re.sub(r"who's","who is", tweet)
    tweet=re.sub(r"i'm","i am", tweet)
    tweet=re.sub(r"she's","she is", tweet)
    tweet=re.sub(r"he's","he is", tweet)
    tweet=re.sub(r"they're","they are", tweet)
    tweet=re.sub(r"who're","who are", tweet)
    tweet=re.sub(r"ain't","am not", tweet)
    tweet=re.sub(r"wouldn't","would not", tweet)
    tweet=re.sub(r"can't","can not", tweet)
    tweet=re.sub(r"could't","could not", tweet)
    tweet=re.sub(r"won't","wil not", tweet)
    tweet=re.sub(r"\W"," ", tweet)
    tweet=re.sub(r"\d"," ", tweet)
    tweet=re.sub(r"\s+[a-z]\s+"," ", tweet)
    tweet=re.sub(r"\s+[a-z]$"," ", tweet)
    tweet=re.sub(r"^[a-z]\+s"," ", tweet)
    tweet=re.sub(r"\s+"," ", tweet)
    corpus.append(tweet) #Append each
norm_corpus=corpus

In [None]:
#Create Bag of Words Model
from sklearn.feature_extraction.text import CountVectorizer
#Create an object of count vectorizer, max_features is top n most frequent words as features
#min_df if a word appear in less than 2, it would be removed
#max_df exlucde all words that appear in 60% of documents or more than than (e.g. the) - focus only on important words
#finally exclude all stop words that are in the list specified
vectorizer = CountVectorizer(max_features=2000,min_df=2, max_df=0.6, stop_words=stopwords.words('english'))
#now create BOW model
X = vectorizer.fit_transform(norm_corpus).toarray()

In [4]:
X.shape

NameError: name 'X' is not defined

In [None]:
#Convert BOW to TF_IDF
from sklearn.feature_extraction.text import TfidfTransformer
transformer = TfidfTransformer()

In [None]:
X=transformer.fit_transform(X).toarray()

In [None]:
X

In [None]:
#Split Dataset into Train and Test
from sklearn.model_selection import train_test_split
text_train, text_test,sent_train, sent_test = train_test_split (X,y,test_size=0.2, random_state=0) 
#X tfidf mode, y list of classes (0,1), for each X we have a y, random state helps all get same results
#80% for training, 20% for test, 
#It returns four arguments that we need to save
#text_train list of documents we will use for model training, text_test document to test model performance
#sent_train diff sentiment classes associated with text_train

In [None]:
text_train.shape

In [None]:
text_test.shape

In [None]:
sent_train.shape

In [None]:
sent_test.shape

In [None]:
#Logistic Regression - Training Classifier
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver='newton-cg')
classifier.fit(text_train,sent_train) # we fit the model

In [None]:
#Logistic Regression - Test Classifier - Evaluate Model Performance
sent_pred = classifier.predict(text_test) #list of all preditions

In [None]:
#Now we need to compare sent_pred with sent_test
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(sent_test,sent_pred)

In [None]:
cm

In [None]:
#total correct predictions out of 400
tot=cm[0][0]+cm[1][1]
print (tot)
#Model accuracy
print("Accuracy:",(tot/cm.sum()))

In [None]:
#Model precision
prec=round((cm[1][1]/(cm[1][1]+cm[0][1])),4)
print ("Precision:",prec)

In [None]:
#Model recall - hit rate
rec=round((cm[1][1]/(cm[1][1]+cm[1][0])),4)
print ("Recall:",rec)

In [None]:
#Model F1-score - hit rate
F1_score = round(2*prec*rec/(prec+rec),4)
print ("F1 Score:",F1_score)

In [None]:
#Save the model to re-use!
#Pickling the Classifier
with open ('classifier.pickle','wb') as f:#rb, write-byte
    pickle.dump(classifier,f)
#First we need to use TfidVectorizer

In [8]:
#Directly use the TfidVectorizer
#Unpickling the dataset
with open ('X.pickle','rb') as f:#rb, read-byte
    X=pickle.load(f)

with open ('y.pickle','rb') as f:#rb, read-byte
    y=pickle.load(f)

#Creating a Corpus - A list Corpus will contain all pre-processed documents
corpus=[]
for i in range (0,len(X)):
    tweet=re.sub(r"^http://t.co/[a-zA-Z09]*\s"," ",tweet) #remove links
    tweet=re.sub(r"\s+https://t.co/[a-zA-Z0-9]*\s"," ",tweet)
    tweet=re.sub(r"\s+https://t.co/[a-zA-Z0-9]*$"," ",tweet)
    tweet=tweet.lower()
    tweet=re.sub(r"that's","that is", tweet)
    tweet=re.sub(r"there's","there is", tweet)
    tweet=re.sub(r"what's","what is", tweet)
    tweet=re.sub(r"where's","where is", tweet)
    tweet=re.sub(r"it's","it is", tweet)
    tweet=re.sub(r"who's","who is", tweet)
    tweet=re.sub(r"i'm","i am", tweet)
    tweet=re.sub(r"she's","she is", tweet)
    tweet=re.sub(r"he's","he is", tweet)
    tweet=re.sub(r"they're","they are", tweet)
    tweet=re.sub(r"who're","who are", tweet)
    tweet=re.sub(r"ain't","am not", tweet)
    tweet=re.sub(r"wouldn't","would not", tweet)
    tweet=re.sub(r"can't","can not", tweet)
    tweet=re.sub(r"could't","could not", tweet)
    tweet=re.sub(r"won't","wil not", tweet)
    tweet=re.sub(r"\W"," ", tweet)
    tweet=re.sub(r"\d"," ", tweet)
    tweet=re.sub(r"\s+[a-z]\s+"," ", tweet)
    tweet=re.sub(r"\s+[a-z]$"," ", tweet)
    tweet=re.sub(r"^[a-z]\+s"," ", tweet)
    tweet=re.sub(r"\s+"," ", tweet)
    corpus.append(tweet) #Append each
norm_corpus=corpus

from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=2000, min_df=2,max_df=0.6,stop_words=stopwords.words('english'))
X = vectorizer.fit_transform(norm_corpus).toarray()

NameError: name 'tweet' is not defined

In [None]:
#Pickling the Vectorizer
with open ('tfidfmodel.pickle','wb') as f:#rb, write-byte
    pickle.dump(vectorizer,f)

In [None]:
#Importing the Model (Classifier/Vectorizer)
#For a sentence we will say if its polarity is negative or positive
#Unpickling the classifier - Vectorizer
with open ('classifier.pickle','rb') as f:#rb, read-byte
    clf=pickle.load(f)
with open ('tfidfmodel.pickle','rb') as f:#rb, read-byte
    tfidf=pickle.load(f)

In [None]:
#Now, use it
sample = ["Enter Tweet here"]
#We do not want to fit, we just want to transform based on an already trained model
sample = tfidf.transform(sample).toarray()

In [None]:
print("0 is negative & 1 is positive:", clf.predict(sample)) 