In [2]:
# imports 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
stopwords = set(stopwords.words('english'))
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from nltk.stem.snowball import SnowballStemmer
import random
from nltk.sentiment import SentimentIntensityAnalyzer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer
import string
string.punctuation
import re
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler


def process(df: pd.DataFrame) -> pd.DataFrame:
    """Derive the review features and return them on a filtered copy of ``df``.

    Rows whose ``HelpfulnessNumerator`` exceeds ``HelpfulnessDenominator`` are
    dropped.  Every remaining row gains the feature columns listed below, and
    the ``Summary`` and ``Text`` columns are replaced by lists of tokens that
    were tokenized, stop-word filtered and stemmed.

    Args:
        df: Review table providing the columns ``HelpfulnessNumerator``,
            ``HelpfulnessDenominator``, ``UserId``, ``Score``, ``Time``,
            ``Summary`` and ``Text``.  ``Summary`` and ``Text`` must already
            hold strings, because the tokenizer and the TF-IDF vectorizer are
            applied to them directly.

    Returns:
        A new DataFrame (the argument is not modified) with the added columns
        ``Helpfulness``, ``UnHelpfulness``, ``UserAvgScore``,
        ``NumUppercase_T``, ``NumUppercase_S``, ``ReviewLength``,
        ``SummaryLength``, ``Date``, ``Month``, ``Year``,
        ``numExclamation_S``, ``numExclamation_T``, ``TFIDFText``,
        ``CleanSummaryLength`` and ``CleanReviewLength``.
    """
    # (0) Drop rows where the helpfulness numerator exceeds the denominator.
    # .copy() makes the result an independent frame, so the column
    # assignments below never write into a slice of the caller's data.
    valid_votes = df['HelpfulnessNumerator'] <= df['HelpfulnessDenominator']
    df = df[valid_votes].copy()

    # (1) Helpfulness - share of helpful votes; 0/0 yields NaN, stored as 0.
    df['Helpfulness'] = (
        df['HelpfulnessNumerator'] / df['HelpfulnessDenominator']
    ).fillna(0)

    # (2) Unhelpfulness - share of unhelpful votes.  The NaN fill has to be
    # applied to this column itself; filling from 'Helpfulness' would simply
    # duplicate feature (1).
    df['UnHelpfulness'] = (
        (df['HelpfulnessDenominator'] - df['HelpfulnessNumerator'])
        / df['HelpfulnessDenominator']
    ).fillna(0)

    # (3) Average Score written by each UserId
    df['UserAvgScore'] = (
        df.groupby('UserId')['Score'].transform('mean').fillna(0)
    )

    # Disabled: average Score for each product
    # df['ProductAvgScore'] = df.groupby('ProductId')['Score'].transform('mean')

    # (4) Number of uppercase ASCII letters in the review text and summary
    df['NumUppercase_T'] = df['Text'].str.findall(r'[A-Z]').str.len()
    df['NumUppercase_S'] = df['Summary'].str.findall(r'[A-Z]').str.len()

    # (6) ReviewLength - number of whitespace-separated words in Text
    df['ReviewLength'] = df['Text'].str.split().str.len().fillna(0)

    # (7) SummaryLength - number of whitespace-separated words in Summary
    df['SummaryLength'] = df['Summary'].str.split().str.len().fillna(0)

    # (8) Time stamps - 'Time' is parsed as seconds since the Unix epoch.
    # Read it from the frame being processed rather than from a module-level
    # variable, so the function works for any input frame.
    df['Date'] = pd.to_datetime(df['Time'], unit='s')
    # df['Hour'] = df['Date'].dt.hour
    df['Month'] = df['Date'].dt.month
    df['Year'] = df['Date'].dt.year

    # (9) Number of exclamation marks in the summary and in the review text
    exclamation = "!"
    df['numExclamation_S'] = df['Summary'].str.count(exclamation)
    df['numExclamation_T'] = df['Text'].str.count(exclamation)

    # TF-IDF vectorization of the raw review text (must run before Text is
    # tokenized below).  Each row stores its own dense vector.
    # NOTE(review): toarray() materializes the full rows x vocabulary matrix;
    # confirm this fits in memory for the sample size in use.
    tfidf = TfidfVectorizer(
        ngram_range=(1, 3),
        input='content',
        analyzer='word',
        stop_words='english',
        min_df=0.00001,
        max_df=0.5,
        max_features=150000,
    )
    tfidf_encodings = tfidf.fit_transform(df['Text'])
    df['TFIDFText'] = list(tfidf_encodings.toarray())

    # Disabled steps, kept for reference:
    # (10) question-mark counts.  "?" alone is not a valid regular
    #      expression; escape it as r"\?" before re-enabling.
    # df['numQuestions_S'] = df['Summary'].apply(lambda x: len(re.findall("?", x)))
    # df['numQuestion_T'] = df['Text'].apply(lambda x: len(re.findall("?", x)))
    # (11) removing punctuation
    # df['Summary'] = df['Summary'].apply(lambda x: remove_punctuation(x))
    # df['Text'] = df['Text'].apply(lambda x: remove_punctuation(x))
    # (12) converting everything to lower case
    # df['Summary'] = df['Summary'].apply(lambda x: x.lower())
    # df['Text'] = df['Text'].apply(lambda x: x.lower())

    # (13)-(15) Tokenization, stop-word removal and stemming.  After this
    # loop both columns hold lists of stemmed tokens instead of strings.
    for column in ('Summary', 'Text'):
        tokens = df[column].apply(word_tokenize)
        tokens = tokens.apply(remove_stopwords)
        df[column] = tokens.apply(stemming_text)

    # (16) Cleaned text length - token counts after stop-word removal
    df['CleanSummaryLength'] = df['Summary'].apply(len)
    df['CleanReviewLength'] = df['Text'].apply(len)

    # Disabled: (17) lemmatization of the token lists
    # df['Summary'] = df['Summary'].apply(lambda x: lemmatizing_text(x))
    # df['Text'] = df['Text'].apply(lambda x: lemmatizing_text(x))

    return df


# Load the dataset
trainingSet = pd.read_csv("./data/train.csv")

# Keep a random 10% sample of the rows
trainingSet = trainingSet.sample(frac=0.1)
#trainingSet = trainingSet.head(50)

# Handling missing values
# (1) Discard every row that has no Score
trainingSet = trainingSet.dropna(subset=['Score'])


# text pre-processing :

# (1) cast every Summary and Text value to a string, including any float entries
for text_column in ("Summary", "Text"):
    trainingSet[text_column] = trainingSet[text_column].values.astype('str')

# (2) removing punctuations 
def remove_punctuation(text):
    """Return ``text`` joined back together without its punctuation items.

    Every item produced by iterating ``text`` is kept unless it occurs in
    ``string.punctuation``; the kept items are concatenated into one string.
    """
    punctuation_marks = string.punctuation
    return "".join(ch for ch in text if ch not in punctuation_marks)

# (4) removing stop words
# Stored as a set so that each token lookup in remove_stopwords is a hash
# lookup instead of a scan over the whole word list.
stopwords = set(nltk.corpus.stopwords.words('english'))
def remove_stopwords(text):
    """Return the tokens of ``text`` that are not English stop words.

    Args:
        text: Iterable of tokens (process() passes the token lists produced
            by ``word_tokenize``).

    Returns:
        A list with the remaining tokens in their original order.  The match
        is exact, so it is case-sensitive.
    """
    return [token for token in text if token not in stopwords]

# (5) stemming of text
snow = SnowballStemmer(language='english')
def stemming_text(text):
    """Return a list holding the Snowball stem of every item in ``text``."""
    return list(map(snow.stem, text))
"""

# (6) Lemmatization of text
lemmatizer = WordNetLemmatizer()
def lemmatizing_text(text):
    lemmatized_text = [lemmatizer.lemmatize(word) for word in text]
    return lemmatized_text

"""

# further feature extraction - sentiment analysis and tf-idf vectorization

# sentiment analysis on Review Text: one polarity_scores dict per review Id
sia = SentimentIntensityAnalyzer()
res = {}
for myid, text in zip(trainingSet['Id'], trainingSet['Text']):
    res[myid] = sia.polarity_scores(text)

# Transpose so that each Id becomes a row, then attach the scores by Id
vaders = pd.DataFrame(res).T.reset_index().rename(columns={'index': "Id"})
trainingSet = trainingSet.merge(vaders, on='Id')

# tf-idf vectorization
# NOTE(review): process() fits a vectorizer with the same settings again and
# overwrites 'TFIDFText'; confirm whether this module-level fit is still needed.
tfidf = TfidfVectorizer(
    ngram_range=(1, 3),
    input='content',
    analyzer='word',
    stop_words='english',
    min_df=0.00001,
    max_df=0.5,
    max_features=150000,
)
tfidf_encodings = tfidf.fit_transform(trainingSet['Text'])
trainingSet['TFIDFText'] = list(tfidf_encodings.toarray())



# Standardize the numerical features

# using StandardScaler - puts the features on a comparable scale (zero mean, unit variance), which helps many models converge faster
"""
scaler = StandardScaler()
trainingSet[['Helpfulness', 'UnHelpfulness', 'ReviewLength', 'SummaryLength']] = scaler.fit_transform(trainingSet[['Helpfulness', 'UnHelpfulness', 'ReviewLength', 'SummaryLength']])
trainingSet[['Helpfulness', 'UnHelpfulness', 'ReviewLength', 'SummaryLength']] = scaler.fit_transform(trainingSet[['Helpfulness', 'UnHelpfulness', 'ReviewLength', 'SummaryLength']])
"""

# Process the DataFrame
train_processed = process(trainingSet)


# Load test set
submissionSet = pd.read_csv("./data/test.csv")


# Merge on Id so that the test set can have feature columns as well.
# Both frames carry a Score column; only the one from the test set is kept.
# NOTE(review): rows with a null Score were dropped before process(), and this
# is an inner merge, so only test Ids that also appear among the processed
# training rows survive - confirm that this is the intended test set.
testX = (
    pd.merge(train_processed, submissionSet, on='Id')
    .drop(columns=['Score_x'])
    .rename(columns={'Score_y': 'Score'})
)

# The training set is where the score is not null
trainX = train_processed.dropna(subset=['Score'])

# X_test.csv is test.csv with features extracted from train.csv and other features added while generating features
testX.to_csv("./data/X_test.csv", index=False)
trainX.to_csv("./data/X_train.csv", index=False)

# runtime(f 1-1) : 1m 25.9s
# runtime(f 1-3) : 15m 37.4s
# runtime(f 1-3) : 26m 7.5s
# runtime with sentiment analysis : 24m 54.2s

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ShwethaKrishnan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/ShwethaKrishnan/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [110]:
# Inspect the processed dataset: display the first 25 entries of the
# 'TFIDFText' column (the alternatives below are disabled).
#trainingSet.head(25)
#train_processed.head(25)
train_processed['TFIDFText'].head(25)
#submissionSet.shape
#test = pd.read_csv("./data/test.csv")
#test.shape
#testX.shape
#trainX.shape
#ts.shape
#train_processed.shape
#submissionSet.shape
# runtime 0.1s

#train_processed['ProductAvgScore'].head(50)

0     [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
1     [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
2     [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
3     [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
4     [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
5     [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
6     [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
7     [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
8     [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
9     [0.09490090756715851, 0.09490090756715851, 0.0...
10    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
11    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
12    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
13    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
14    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
15    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
16    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
17    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0