### Project Task : WEEK 1

#### Class Imbalance Problem

In [None]:
# Load the data
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt 
%matplotlib inline

# Read the three CSV files (train_data, test_data, test_data_hidden) in one
# pass and bind them to train / test / test_val respectively.
train, test, test_val = (
    pd.read_csv(csv_name)
    for csv_name in ('train_data.csv', 'test_data.csv', 'test_data_hidden.csv')
)
# Show the first rows of the training frame.
train.head()

In [None]:
# One frame per sentiment label, each restricted to the columns at
# positions 5, 6 and 7 of the training frame.
Positive, Negative, Neutral = (
    train.loc[train['sentiment'] == label].iloc[:, [5, 6, 7]]
    for label in ("Positive", "Negative", "Neutral")
)

In [None]:
Positive['sentiment'].value_counts()

In [None]:
Negative['sentiment'].value_counts()

In [None]:
Neutral['sentiment'].value_counts()

#### Converting the Reviews to TF-IDF Scores

In [None]:
# Keep only the label and the raw review text for further exploration.
# .copy() gives `data` its own storage: a new column is assigned to it later,
# and without the copy that assignment would target a slice of `train`
# (the situation pandas reports as SettingWithCopyWarning).
data = train[['sentiment','reviews.text']].copy()
data.head()

In [None]:
# Count the reviews per sentiment label, print the table, and draw the same
# counts as a bar chart.
sentiment_counts = data['sentiment'].value_counts()
print(sentiment_counts)
sentiment_axes = sentiment_counts.plot.bar()
sentiment_axes.set_title('Distribution of Reviews sentiment', size=10)

In [None]:
print(data.shape)

#### Creating and applying Preprocessing on the Data

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
# Fetch the two NLTK resources used below: WordNet and the stop-word lists.
for corpus_name in ('wordnet', 'stopwords'):
    nltk.download(corpus_name)

# Shared helpers read by preprocess(): the English stop-word set, a tokenizer
# that keeps runs of lower-case letters only, and the WordNet lemmatizer.
stop_words = set(stopwords.words('english'))
tokenizer = RegexpTokenizer(r'[a-z]+')
wordnet_lematizer = WordNetLemmatizer()

def preprocess(document):
    """Normalise one review into a space-joined string of lemmas.

    The text is lower-cased, split into tokens with the notebook-level
    ``tokenizer``, filtered against ``stop_words``, and then lemmatized four
    times in a row (noun, verb, adjective, adverb), each pass working on the
    output of the previous one.

    Parameters
    ----------
    document : str
        Raw review text.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces.
    """
    tokens = tokenizer.tokenize(document.lower())
    lemmas = [token for token in tokens if token not in stop_words]
    for part_of_speech in (wordnet.NOUN, wordnet.VERB, wordnet.ADJ, wordnet.ADV):
        lemmas = [wordnet_lematizer.lemmatize(token, part_of_speech) for token in lemmas]
    return " ".join(lemmas)

In [None]:
data['Processed_Review'] = data['reviews.text'].apply(preprocess)
data.head()

In [None]:
# Working frame for modelling: everything in `data` except the raw review text.
data1 = data.drop(columns=['reviews.text'])
data1.head()

#### Creating TF-IDF & multinomial Naive Bayes classifier

In [None]:
def TextPreprocessing(data2):
    """Strip punctuation and English stop words from one piece of text.

    Parameters
    ----------
    data2 : str
        Text of a single document.

    Returns
    -------
    list of str
        The whitespace-separated tokens of ``data2`` after every
        ``string.punctuation`` character has been deleted, minus the tokens
        whose lower-cased form is an NLTK English stop word. Kept tokens retain
        their original casing.
    """
    import string
    from nltk.corpus import stopwords

    # Drop punctuation characters and glue the remaining ones back together.
    sentenceWithoutPunctuations = ''.join(
        char for char in data2 if char not in string.punctuation
    )
    words = sentenceWithoutPunctuations.split()

    # Load the stop-word list once per call and keep it as a set. The previous
    # version called stopwords.words('english') inside the comprehension, i.e.
    # once for every word of every document.
    english_stopwords = set(stopwords.words('english'))
    return [word for word in words if word.lower() not in english_stopwords]

In [None]:
data1.groupby('sentiment').describe()

In [None]:
#Text preprocessing
data1['Processed_Review'].head(2).apply(TextPreprocessing)

In [None]:
#Creating Bag of Words
from sklearn.feature_extraction.text import CountVectorizer
BOW = CountVectorizer(analyzer=TextPreprocessing).fit(data1['Processed_Review'])

In [None]:
len(BOW.vocabulary_)

In [None]:
reviews_BOW = BOW.transform(data1['Processed_Review'])

In [None]:
reviews_BOW

#### TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer 
Tfidfdata = TfidfTransformer()
Tfidfdata.fit(reviews_BOW)
TfidfdataFinal = Tfidfdata.transform(reviews_BOW)

In [None]:
TfidfdataFinal.shape

In [None]:
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
model.fit(TfidfdataFinal,data1['sentiment'])

In [None]:
inputdata = 'very bad i dont like it at all it sucks !'
# Clean the sentence the same way the 'Processed_Review' column was produced,
# since that column is what the vectorizer and the model were fitted on.
l1 = preprocess(inputdata)
# transform() expects an iterable of documents. Passing the token list returned
# by TextPreprocessing turned every word into its own one-word document, and
# only the first of those was then classified. BOW already applies
# TextPreprocessing (it is the vectorizer's analyzer), so the whole sentence is
# passed as a single document.
l2 = BOW.transform([l1])
l3 = Tfidfdata.transform(l2)
prediction = model.predict(l3)
prediction

After running the Multinomial Naive Bayes classifier, everything is classified as Positive because of the class imbalance seen above.

### Tackling Class Imbalance Problem

In [None]:
# Name of the column being predicted.
target = 'sentiment'
# Every other column of data1 is treated as an independent feature.
columns = [name for name in data1.columns.tolist() if name != target]
# Seeded NumPy random state.
state = np.random.RandomState(42)
# Feature frame and label series.
X, Y = data1[columns], data1[target]
print(X.shape, Y.shape, sep='\n')

In [None]:
columns

In [None]:
print(data1['sentiment'].value_counts())

In [None]:
print(Positive.shape, Negative.shape,Neutral.shape)

Apply Oversampling or Undersampling

In [None]:
# Balance the classes with imblearn's RandomOverSampler (seeded with 0).
# NOTE(review): the resampling runs on the full data set, before the
# train/validation split performed further down. Rows duplicated here can
# therefore end up on both sides of that split, which would make the
# validation scores optimistic — consider splitting first and resampling only
# the training part.
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=0)
X_res,Y_res = ros.fit_resample(X,Y)

In [None]:
from collections import Counter
print(sorted(Counter(Y_res).items()))

In [None]:
print(X_res.shape,Y_res.shape)

In [None]:
# Rebuild one frame from the resampled output: a column of review text and a
# column of labels, placed side by side.
X1 = pd.DataFrame(X_res, columns=['Processed_Review'])
Y1 = pd.DataFrame(Y_res, columns=['sentiment'])
Final_data = pd.concat((X1, Y1), axis='columns')
Final_data.head()

In [None]:
Final_data.info()

In [None]:
df = Final_data.sample(frac=0.1,random_state=0)
df.info()

In [None]:
# We have to drop these missing values
df.dropna(inplace=True)
df.head()

##### Train & Test split the Data 

In [None]:
# Hold out 20% of the sampled rows as a validation set (seeded split).
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(
    df['Processed_Review'],
    df['sentiment'],
    test_size=0.2,
    random_state=0,
)
print(*(part.shape for part in (X_train, X_test, Y_train, Y_test)))

In [None]:
print('Show a review in a training set: \n',X_train.iloc[10])

In [None]:
from bs4 import BeautifulSoup  
import re
from nltk.corpus import stopwords 
from nltk.stem import SnowballStemmer

In [None]:
def cleanText(raw_text, remove_stopwords=False, stemming=False, split_text=False,):
    """Reduce a raw review to lower-case alphabetic words.

    HTML markup is stripped with BeautifulSoup (``lxml`` parser), every
    character outside ``a-zA-Z`` is replaced by a space, and the result is
    lower-cased and split on whitespace.

    Parameters
    ----------
    raw_text : str
        Review text, possibly containing HTML.
    remove_stopwords : bool, default False
        When truthy, drop NLTK English stop words.
    stemming : bool, default False
        When equal to True, stem each word with the English Snowball stemmer.
    split_text : bool, default False
        When equal to True, return the list of words instead of a string.

    Returns
    -------
    list of str or str
        The word list if ``split_text`` is set, otherwise the words joined by
        single spaces.
    """
    plain = BeautifulSoup(raw_text, 'lxml').get_text()
    tokens = re.sub("[^a-zA-Z]", " ", plain).lower().split()

    if remove_stopwords:
        english_stops = set(stopwords.words("english"))
        tokens = [tok for tok in tokens if tok not in english_stops]

    if stemming == True:
        stem = SnowballStemmer('english').stem
        tokens = [stem(tok) for tok in tokens]

    return tokens if split_text == True else " ".join(tokens)

In [None]:
# Run cleanText over every review of the training and validation splits,
# printing one cleaned example from each.
X_train_cleaned = [cleanText(review) for review in X_train]
print('Show a cleaned review in training set :\n',X_train_cleaned[10])
X_test_cleaned = [cleanText(review) for review in X_test]
print('Show a cleaned review in validation set :\n',X_test_cleaned[10])

In [None]:
# Fit and transform the training data to a document-term matrix using CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer
countvect = CountVectorizer()
X_train_countvect = countvect.fit_transform(X_train_cleaned)

In [None]:
print("Number of features:",len(countvect.get_feature_names()))
print("Show some features : \n",countvect.get_feature_names()[::1000])

In [None]:
from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB()
mnb.fit(X_train_countvect,Y_train)

In [None]:
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix,roc_auc_score
def modelEvaluation(predictions, y_true=None):
    """Print accuracy, classification report and confusion matrix.

    Parameters
    ----------
    predictions : array-like
        Predicted labels, aligned row-for-row with the true labels.
    y_true : array-like, optional
        True labels to score against. Defaults to the notebook-level
        ``Y_test``, so existing one-argument calls behave exactly as before.
        Pass it explicitly when the predictions belong to a different split.

    Returns
    -------
    None
        The three metrics are printed, not returned.
    """
    if y_true is None:
        # Backward-compatible default: the validation labels of the first split.
        y_true = Y_test
    print('Accuracy on validation set :{:.4f}'.format(accuracy_score(y_true,predictions)))
    print('\nClassification report :\n',classification_report(y_true,predictions))
    print('\nConfusion matrix :\n',confusion_matrix(y_true,predictions))

In [None]:
# Evaluate the model on validation set
X_test_countvect = countvect.transform(X_test_cleaned)
predictions = mnb.predict(X_test_countvect)

In [None]:
modelEvaluation(predictions)

Tfidf Vectorizer with Logistic Regression

In [None]:
# Fit a TF-IDF vocabulary on the cleaned training reviews and turn them into a
# document-term matrix. The cleaned text is used because every evaluation cell
# transforms X_test_cleaned with this vectorizer; fitting on X_train instead
# would build the vocabulary from text that went through different preparation
# than the text being scored.
from sklearn.feature_extraction.text import TfidfVectorizer
Tfidf = TfidfVectorizer(min_df=5)
X_train_Tfidf = Tfidf.fit_transform(X_train_cleaned)

In [None]:
print("Number of features:",len(Tfidf.get_feature_names()))
print("Show some feature names :", Tfidf.get_feature_names()[::1000])

In [None]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(X_train_Tfidf,Y_train)

In [None]:
# Evaluating on Validation set 
predictions = lr.predict(Tfidf.transform(X_test_cleaned))
modelEvaluation(predictions)

TfidfVectorizer with Linear SVM by using SGD

In [None]:
# Fit a TF-IDF vocabulary on the cleaned training reviews. The cleaned text is
# used so that training matches the evaluation cells, which transform
# X_test_cleaned with this same vectorizer.
Tfidf = TfidfVectorizer(min_df=5)
X_train_Tfidf = Tfidf.fit_transform(X_train_cleaned)
print("Number of features:",len(Tfidf.get_feature_names()))
print("Show some feature names :", Tfidf.get_feature_names()[::1000])
# Linear classifier trained with stochastic gradient descent. random_state is
# fixed so that repeated runs of the notebook give the same model.
from sklearn.linear_model import SGDClassifier
clf = SGDClassifier(random_state=0)
clf.fit(X_train_Tfidf,Y_train)

In [None]:
# Evaluationg on the Validation set 
predictions = clf.predict(Tfidf.transform(X_test_cleaned))
modelEvaluation(predictions)

Using XGBoost Classifier

In [None]:
# XGBoost
from xgboost import XGBClassifier
xgb = XGBClassifier()
xgb.fit(X_train_Tfidf,Y_train)

In [None]:
# Evaluating on the Validation set 
predictions = xgb.predict(Tfidf.transform(X_test_cleaned))
modelEvaluation(predictions)

Pipeline and GridSearchCV

In [None]:
# Pipeline: TF-IDF features feeding a logistic-regression classifier.
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
estimators = [("tfidf", TfidfVectorizer()), ("lr", LogisticRegression())]
model = Pipeline(steps=estimators)

# Search grid, keyed as <step name>__<parameter name>.
params = dict(
    lr__C=[0.1, 1, 10],                    # regularization strength of the logistic regression
    tfidf__min_df=[1, 3],                  # minimum document frequency of a term
    tfidf__max_features=[1000, None],      # cap on vocabulary size (None = no cap)
    tfidf__ngram_range=[(1,1), (1,2)],     # unigrams only, or unigrams plus bigrams
    tfidf__stop_words=[None, "english"],   # keep or drop English stop words
)

# Exhaustive search over the grid, scored by accuracy, using all CPU cores.
grid = GridSearchCV(model, params, scoring="accuracy", n_jobs=-1)
grid.fit(X_train_cleaned, Y_train)
print("The best paramenter set is : \n", grid.best_params_)

In [None]:
# Evaluate on the validation set 
predictions = grid.predict(X_test_cleaned)
modelEvaluation(predictions)

### Project Task: Week 2

#### Model Selection:

In [None]:
# RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
rdc = RandomForestClassifier()
rdc.fit(X_train_Tfidf,Y_train)

In [None]:
# Evaluating on the validation set 
predictions = rdc.predict(Tfidf.transform(X_test_cleaned))
modelEvaluation(predictions)

In [None]:
# Take a seeded 10% sample of the balanced data and drop rows with missing
# values. .copy() makes df an independent frame before a column is reassigned.
df = Final_data.sample(frac=0.1,random_state=0).dropna().copy()
# Encode the sentiment labels as integers.
# The column is reassigned instead of calling replace(..., inplace=True) through
# attribute access: that chained in-place call operates on an intermediate
# Series and is not guaranteed to update df (it has no effect under pandas
# copy-on-write). astype(int) fails loudly if a label is missing from the map.
sentiment_codes = {'Positive': 1, 'Negative': 0, 'Neutral': 2}
df['sentiment'] = df['sentiment'].map(sentiment_codes).astype(int)
df.head()

In [None]:
# Splitting data into trainning data and validation 
X_train,X_test,y_train,y_test = train_test_split(df['Processed_Review'],df['sentiment'],test_size=0.2,random_state=1)

In [None]:
top_words = 20000
maxlen = 100
batch_size = 32
nb_classes = 3
epoch = 5
from keras.preprocessing.text import Tokenizer
# Vectorize X_train and X_test into 2D integer tensors.
# The Keras tokenizer gets its own name. Binding it to `tokenizer` would
# overwrite the NLTK RegexpTokenizer that preprocess() looks up at call time,
# so preprocess() would fail if it were run again after this cell.
keras_tokenizer = Tokenizer(num_words=top_words) # vocabulary limited to top_words entries
keras_tokenizer.fit_on_texts(X_train)

sequences_train = keras_tokenizer.texts_to_sequences(X_train)
sequences_test = keras_tokenizer.texts_to_sequences(X_test)
from keras.preprocessing import sequence
# Pad/truncate every sequence to exactly maxlen tokens.
X_train_seq = sequence.pad_sequences(sequences_train, maxlen=maxlen)
X_test_seq = sequence.pad_sequences(sequences_test, maxlen=maxlen)

# One-hot encode y_test and y_train into nb_classes columns.
from keras.utils import np_utils
y_test_seq = np_utils.to_categorical(y_test,nb_classes)
y_train_seq = np_utils.to_categorical(y_train,nb_classes)
print('X_train shape :', X_train_seq.shape)
print('X_test shape :', X_test_seq.shape)
print('y_train shape :', y_train_seq.shape)
print('y_test shape :', y_test_seq.shape)

#### Apply LSTM

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.layers import LSTM

In [None]:
# Simple LSTM classifier: embedding -> LSTM -> dropout -> softmax over nb_classes.
model1 = Sequential()
model1.add(Embedding(top_words, 128))
model1.add(LSTM(128))
model1.add(Dropout(0.2))
model1.add(Dense(nb_classes))
model1.add(Activation('softmax'))
model1.summary()

# Compile the model.
# The targets are one-hot vectors over nb_classes mutually exclusive classes and
# the output layer is a softmax, so the matching loss is categorical
# cross-entropy. binary_crossentropy would treat each output unit as an
# independent yes/no problem, which does not match this setup.
model1.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model1.fit(X_train_seq, y_train_seq, batch_size=batch_size, epochs=epoch, verbose=1)

# Model evaluation on the held-out sequences: score is [loss, accuracy].
score = model1.evaluate(X_test_seq, y_test_seq, batch_size=batch_size)
print('Test loss : {:.4f}'.format(score[0]))
print('Test accuracy : {:.4f}'.format(score[1]))

In [None]:
# First weight array of the embedding layer (layer 0).
embedding_weights = model1.layers[0].get_weights()[0]
print("Size of weight matrix in the embedding layer : ", embedding_weights.shape)

# First weight array of the LSTM layer (layer 1).
print("Size of weight matrix in the hidden layer : ",model1.layers[1].get_weights()[0].shape)

# First weight array of the output Dense layer. model1 was built as
# Embedding, LSTM, Dropout, Dense, Activation, so Dense is layer 3; layer 2 is
# the Dropout layer, which is why indexing layer 2 here did not work.
print("Size of weight matrix in the output layer : ", model1.layers[3].get_weights()[0].shape)

#### Topic Modeling 

Latent Dirichlet Allocation (LDA)

In [None]:
import nltk 
nltk.download('wordnet')

doc_complete = data1['Processed_Review'].tolist()
doc_clean = [cleanText(doc).split() for doc in doc_complete]


In [None]:
import gensim
from gensim import corpora

In [None]:
dictionary =  corpora.Dictionary(doc_clean)
print(dictionary)

In [None]:
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]
print(len(doc_term_matrix))

In [None]:
from gensim.models import LdaModel
NUM_TOPICS = 9
# Seed for LDA training, so the discovered topics (and everything derived from
# them below) are the same on every run of the notebook.
LDA_SEED = 42
ldamodel = LdaModel(doc_term_matrix,num_topics=NUM_TOPICS,id2word=dictionary,passes=30,random_state=LDA_SEED)

In [None]:
topics = ldamodel.show_topics()
for topic in topics:
    print(topic)
    print()

In [None]:
# For every topic, collect the first element (the word) of each of the 20
# entries returned by show_topic, keyed by a "Topic # <n>" label.
word_dict = {
    "Topic # " + "{}".format(topic_id): [
        entry[0] for entry in ldamodel.show_topic(topic_id, topn = 20)
    ]
    for topic_id in range(NUM_TOPICS)
}

In [None]:
pd.DataFrame(word_dict)

In [None]:
ldamodel.show_topic(0, topn=20)

Creating a Wordcloud

In [None]:
from wordcloud import WordCloud, STOPWORDS
txt = data1["Processed_Review"].values
# Join all reviews into one string for the word cloud. str(txt) would instead
# produce NumPy's printed form of the array — brackets, quotes and, for long
# arrays, an abbreviated "..." in place of most elements — so the cloud would
# be built from only a handful of reviews.
corpus_text = " ".join(txt)
wc = WordCloud(width=200, height=100, background_color="white", stopwords=STOPWORDS).generate(corpus_text)
fig = plt.figure(figsize=(20,20), facecolor='k', edgecolor='w')
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.tight_layout()
plt.show()

Displaying Results & Getting Insights

In [None]:
!pip install pyLDAvis

In [None]:
# Newer pyLDAvis releases ship the gensim adapter as `pyLDAvis.gensim_models`
# instead of `pyLDAvis.gensim` (renamed in 3.3 per its changelog — confirm
# against the installed version). Try the old module name first; if it is
# missing, expose the renamed module under the old attribute so calls to
# `pyLDAvis.gensim.prepare(...)` keep working.
try:
    import pyLDAvis.gensim
except ImportError:
    import pyLDAvis
    import pyLDAvis.gensim_models
    pyLDAvis.gensim = pyLDAvis.gensim_models

In [None]:
Lda_display = pyLDAvis.gensim.prepare(ldamodel, doc_term_matrix, dictionary, sort_topics=False)
pyLDAvis.display(Lda_display)