In [2]:
# import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import string
import time
import nltk
import re
from sklearn.svm import SVC
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
import tensorflow as tf
from keras.layers.core import Dense
from keras.models import Sequential
from keras.layers.recurrent import LSTM
from keras.callbacks import EarlyStopping 
from keras.metrics import categorical_accuracy
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

In [None]:
# Load the training data and build the combined review text used downstream.
train = pd.read_csv('train.csv') #Training Data File
# Assign the filled columns back instead of calling fillna(inplace=True) on an
# attribute-accessed column: that chained in-place form acts on a temporary
# object under pandas Copy-on-Write and can leave the NaNs in `train` untouched.
train['Summary'] = train['Summary'].fillna('')
train['Text'] = train['Text'].fillna('')
# Combine the summary and the body text of each review into one field.
train['summary_text'] = train['Summary'] + ' ' + train['Text']

In [2]:
def countPunctuations(text):
    """Return how many characters of `text` appear in `string.punctuation`."""
    punctuation = string.punctuation
    return sum(1 for symbol in text if symbol in punctuation)
def toBin(row):
    """One-hot encode a row: 1 at the label of its largest value, 0 elsewhere.

    Parameters
    ----------
    row : pandas.Series
        Values to binarise (e.g. one row of per-class prediction scores).

    Returns
    -------
    pandas.Series
        A new Series with the same index, dtype and name as `row`. The input
        is left unmodified.
    """
    winner = row.idxmax()
    # Build a fresh Series instead of zeroing `row.values` in place: the old
    # code clobbered the caller's data (and its "before" debug print was only
    # an alias of the already-mutated row).
    one_hot = pd.Series(0, index=row.index, dtype=row.dtype, name=row.name)
    one_hot.loc[winner] = 1
    return one_hot

In [3]:
#Basic Data Analysis
# Character-, token- and punctuation-level statistics of the combined text.
train['char_count'] = train['summary_text'].apply(len)

# Whitespace-separated tokens of every review, then the length of each token.
tokens = train['summary_text'].apply(lambda text: text.split())
train['word_count'] = tokens.apply(len)
train['punctuation_count'] = train['summary_text'].apply(countPunctuations)
word_lengths = tokens.apply(lambda words: [len(word) for word in words])

# A review whose summary and text are both empty yields no tokens; max()/min()
# raise ValueError on an empty list, so such rows report 0 for all three stats.
train['max_word_length'] = word_lengths.apply(lambda lengths: max(lengths, default=0))
train['min_word_length'] = word_lengths.apply(lambda lengths: min(lengths, default=0))
train['avg_word_length'] = word_lengths.apply(lambda lengths: np.mean(lengths) if lengths else 0.0)

# Columns kept for the exploratory view vs. columns set aside.
helpfulCols=[ 'HelpfulnessNumerator','HelpfulnessDenominator', 'Score', 'Summary', 'Text', 'summary_text', 'char_count', 'word_count', 'punctuation_count',
       'max_word_length', 'min_word_length', 'avg_word_length']
uselessCols=['Id', 'ProductId', 'UserId', 'ProfileName', 'Time']
trainH=train[helpfulCols]

In [4]:
# A notebook cell only renders its last expression, so every summary is shown
# explicitly; previously the train.describe() result was silently discarded.
train.info()
display(train.describe())  # `display` is provided by the IPython/Jupyter kernel
display(trainH.groupby(train['Score']).describe())

In [5]:
# Balance the data set: keep the same number of reviews for every score (1-5),
# namely the size of the rarest score, taking the first rows of each class.
classes=[train["Score"]==i for i in range(1,6)]
classCounts=[mask.sum() for mask in classes]
minimum=min(classCounts)

# Stack the per-class slices directly instead of pre-allocating a zero-filled
# frame and growing NumPy arrays with repeated np.append calls.
d = pd.concat(
    [train.loc[mask, ["Score", "summary_text"]].head(minimum) for mask in classes],
    ignore_index=True,
)

# binary categories: one 0/1 indicator column per score, named '1'..'5'
for i in range(1,6):
    d[str(i)] = (d["Score"] == i).astype(int)
d=d.drop(columns="Score")

# trainX,testX,trainY,testY=train_test_split(d['summary_text'], d[['1','2','3','4','5']])

In [7]:
# optimizations
# Instantiate the lemmatizer: the bare class cannot be used to call
# lemmatize() as a bound method.
lemmatizer = WordNetLemmatizer()
noStem=lambda x:x  # identity function, passed where no stemming is wanted
trans=str.maketrans('','',string.punctuation)  # translation table that deletes punctuation
# English stop words with punctuation removed (e.g. "don't" -> "dont"), stored
# as a set because membership is tested once per token of every review.
stopwordset={word.translate(trans) for word in stopwords.words('english')}
stemmer =nltk.stem.SnowballStemmer('english',ignore_stopwords=True)

# preprocessing regexes
# Raw string literals: sequences such as \/ \d \. \w and "\ " are not valid
# Python string escapes, so the non-raw form triggers invalid-escape warnings
# at compile time. The pattern text itself is unchanged.
stripLinks=r'((https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})([\/\w\.-]*))'#matches links
stripSpecialChars=r'([^\ a-zA-Z]+)'#matches runs of anything but a space or an ASCII letter
stripNums="([0-9])"#matches a single digit
# Alternation of the three patterns, tried in this order at each position.
regex=stripLinks+"|"+stripSpecialChars+"|"+stripNums

#functions 
def filterString(x):
    """Return `x` with every match of the module-level `regex` pattern deleted."""
    pattern = re.compile(regex)
    return pattern.sub('', x)
def lowerStemStopwordsRemove(x,stemmer=stemmer.stem):
    """Lower-case `x`, drop stop words and apply `stemmer` to what remains.

    Parameters
    ----------
    x : str
        Text to process; it is split on whitespace after lower-casing.
    stemmer : callable
        Called on each kept token. Defaults to the `stem` method of the
        module-level stemmer that exists when this function is defined.

    Returns
    -------
    str
        The processed tokens joined by single spaces.
    """
    kept = []
    for token in x.lower().split():
        if token in stopwordset:
            continue
        kept.append(stemmer(token))
    return ' '.join(kept)
def filterWithTensor(input_data):#slower than ^^
    """TensorFlow-ops variant of the text filter.

    Lower-cases `input_data`, deletes every match of the module-level `regex`,
    strips leading/trailing whitespace, collapses runs of two or more spaces
    into one and converts the result with `tf.compat.as_str_any`.

    Note that, unlike `filterString`, the text is lower-cased before `regex`
    is applied.
    NOTE(review): `.numpy()` requires eager execution; presumably `input_data`
    is a single string rather than a batch - confirm against callers.
    """
    lowered = tf.strings.lower(input_data)
    trimmed = tf.strings.strip(tf.strings.regex_replace(lowered, regex, ''))
    # Raw string: "\ " is not a valid Python escape sequence in a normal literal.
    collapsed = tf.strings.regex_replace(trimmed, r"(\ {2,})", ' ')
    return tf.compat.as_str_any(collapsed.numpy())


In [8]:
# Clean every review (regex filter, lower-casing, stop-word removal, no stemming).
d["summary_text"]= d["summary_text"].apply(lambda x:lowerStemStopwordsRemove(filterString(x),noStem))
# Fixed random_state so the train/test partition - and every score computed on
# it - is reproducible across kernel restarts.
trainX,testX,trainY,testY=train_test_split(d['summary_text'], d[['1','2','3','4','5']], random_state=42)

In [9]:
ngramConfig=(1,2)    #NGram setup for the models
vocab_size = 10000   #Vocabulary size
sequence_length = 50 #max sentence length for the NN models 
embedding_dim=16     #Embedding layer dims

# def standardize(input_data):
#     lowercase = tf.strings.lower(input_data)
#     filtered=tf.strings.regex_replace(lowercase,regex,'')
#     trimmed=tf.strings.strip(filtered)
#     return tf.strings.regex_replace(trimmed,"(\ {2,})",' ')

vectorize_layer = TextVectorization(
#     standardize=standardize,
    standardize=None,     #don't perform any standardization since we have already done it
    max_tokens=vocab_size, 
    output_mode='int',    #"int", "binary", "count" or "tf-idf"
    ngrams=ngramConfig,
    output_sequence_length=sequence_length#output sequence length
    )

# Build the vocabulary from the cleaned training split. With standardize=None
# the layer does no cleaning of its own, so adapting on the raw
# train["summary_text"] would learn tokens (mixed case, punctuation) that never
# occur in the cleaned text fed to the models, and would also expose the
# held-out split to the vocabulary.
vectorize_layer.adapt(trainX.tolist())
#NN Call back configurations with early stopping implemented to avoid overfitting 
# callbacks=[tf.keras.callbacks.TensorBoard(log_dir="logs") ,EarlyStopping(monitor='val_loss', min_delta=0, patience=3)]
callbacks=[EarlyStopping(monitor='val_loss', min_delta=0, patience=3)]

In [10]:
# predictions.columns=['1','2','3','4','5']
# testY.columns=['1','2','3','4','5']
# predictions=predictions.apply(toBin,axis=1)
# testcompare=testY.idxmax(axis=1).reset_index().drop(columns="index")
# testcompare==predictions
# pre=predictions.idxmax(axis=1)

# Models 

In [11]:
# Averaged-embedding classifier: token ids -> embeddings -> mean over the
# sequence -> dense hidden layer -> softmax over the five scores.
# Built entirely from tf.keras classes so the layers and the container come
# from the same Keras implementation.
model = tf.keras.Sequential([
  vectorize_layer,
  Embedding(vocab_size, embedding_dim),
  GlobalAveragePooling1D(),
  tf.keras.layers.Dense(16, activation='relu'),
  tf.keras.layers.Dense(5, activation='softmax')
])

# The output layer already applies softmax and the targets are one-hot over
# five mutually exclusive scores, so the loss is categorical cross-entropy on
# probabilities (from_logits left at its default of False). The previous
# BinaryCrossentropy(from_logits=True) treated the probabilities as logits and
# scored each class as an independent binary problem.
model.compile(optimizer='adam',loss=tf.keras.losses.CategoricalCrossentropy(),metrics=["accuracy"])
# H=model.fit(
#    trainX, trainY,
#     epochs=15,
#     batch_size=50,
#     validation_data=(testX, testY),
#     callbacks=callbacks)

In [17]:
# LSTM classifier: token ids -> embeddings -> LSTM -> dense hidden layer ->
# softmax over the five scores. NOTE: this rebinds `model`, replacing the
# model defined in the previous cell.
# The layers come from tf.keras so they match the tf.keras.Sequential container.
model = tf.keras.Sequential([
    vectorize_layer,
    Embedding(
              input_dim=len(vectorize_layer.get_vocabulary()),
              output_dim=embedding_dim
            ),
    tf.keras.layers.LSTM(embedding_dim,activation='relu'),
    tf.keras.layers.Dense(embedding_dim, activation='relu'),
    tf.keras.layers.Dense(5,activation='softmax')
])
# The final layer outputs softmax probabilities, so from_logits must stay at
# its default (False); from_logits=True would apply a second softmax inside
# the loss.
model.compile(optimizer='adam',
              loss=tf.keras.losses.CategoricalCrossentropy(),
              metrics=['accuracy'])
# model.fit(
#     trainX, trainY,
#     epochs=15,
#     batch_size=50,
#     validation_data=(testX, testY),
#     callbacks=callbacks)
# predictions =pd.DataFrame(model.predict(testX))
# predictions.plot.hist()



In [16]:
# Classical baseline: word n-gram counts -> TF-IDF weights -> SVM classifier.
pipeline = Pipeline(steps=[
    (
        'wordbag',  # strings to token integer counts
        CountVectorizer(
            analyzer="word",
            ngram_range=ngramConfig,
            stop_words="english",
            binary=False,
        ),
    ),
    (
        'tfidf',  # integer counts to weighted TF-IDF scores
        TfidfTransformer(),
    ),
    (
        'classifier',  # support vector classifier fitted on the TF-IDF vectors
        SVC(gamma='auto'),
    ),
])

# history = model.fit(trainX, trainY, epochs=2000, batch_size=72, verbose=1, shuffle=False)
# predictions=model.predict(testX)